[torqueusers] simple startup troubleshooting

Christina Salls christina.salls at noaa.gov
Thu Feb 9 12:47:05 MST 2012


Hi all,

      I am new to Torque.  In fact, I have just installed torque-2.5.9
(server) on the head node of a 20 node cluster and torque client and mom
packages on the compute nodes.  I used the Torque Administrator's Guide and
the installation process seemed to proceed smoothly (on my second attempt).
My first attempt was complicated by the fact that PBS was pre-installed on
both the head node and server and seemed to be getting in my way because of
processes that were already running and ports that were already in use.  I
removed everything I could find of the PBS installation and started from
scratch.  I am stuck at the point where I should be seeing my nodes as
free, but they are showing up as down.  I am looking for any clues in
troubleshooting this problem.  I don't know where to start.  I am including
some information to illustrate my setup.

Thanks in advance,

Christina

Here is the output of the pbsnodes command

[root@wings torque-packages]# pbsnodes -a
n001
     state = down
     np = 1
     ntype = cluster
     gpus = 0

n002
     state = down
     np = 1
     ntype = cluster
     gpus = 0

n003
     state = down
     np = 1
     ntype = cluster
     gpus = 0

.....

It is the same for all 20 nodes.  I truncated it for the sake of brevity.

On the headnode:

[root@wings server_priv]# ping n001
PING n001.default.domain (10.0.1.1) 56(84) bytes of data.
64 bytes from n001.default.domain (10.0.1.1): icmp_seq=1 ttl=64 time=0.193 ms
64 bytes from n001.default.domain (10.0.1.1): icmp_seq=2 ttl=64 time=0.189 ms

[root@wings server_priv]# qmgr
Max open servers: 10239
Qmgr: list server
Server wings.glerl.noaa.gov
server_state = Active
scheduling = True
total_jobs = 0
state_count = Transit:0 Queued:0 Held:0 Waiting:0 Running:0 Exiting:0
acl_hosts = wings.glerl.noaa.gov
default_queue = batch
log_events = 511
mail_from = adm
scheduler_iteration = 600
node_check_rate = 150
tcp_timeout = 6
mom_job_sync = True
pbs_version = 2.5.9
keep_completed = 300
next_job_number = 0
net_counter = 4 4 4
Qmgr: list node n001
Node n001
state = down
np = 1
ntype = cluster
gpus = 0
Qmgr: print node n001
#
# Create nodes and set their properties.
#
#
# Create and define node n001
#
create node n001
set node n001 state = down
set node n001 np = 1
set node n001 ntype = cluster
set node n001 gpus = 0


[root@wings server_priv]# ps -ef | grep pbs
root      3925     1  0 Feb03 ?        00:03:00 /usr/local/sbin/pbs_mom -q -d /var/spool/torque
root      7056     1  0 11:47 ?        00:00:02 pbs_server
root     29031  7993  0 12:59 pts/29   00:00:00 grep pbs

[root@wings torque-2.5.9]# qmgr -c 'p s'
#
# Create queues and set their attributes.
#
#
# Create and define queue batch
#
create queue batch
set queue batch queue_type = Execution
set queue batch resources_default.nodes = 1
set queue batch resources_default.walltime = 01:00:00
set queue batch enabled = True
set queue batch started = True
#
# Set server attributes.
#
set server scheduling = True
set server acl_hosts = wings.glerl.noaa.gov
set server managers = salls@wings.glerl.noaa.gov
set server operators = salls@wings.glerl.noaa.gov
set server default_queue = batch
set server log_events = 511
set server mail_from = adm
set server scheduler_iteration = 600
set server node_check_rate = 150
set server tcp_timeout = 6
set server mom_job_sync = True
set server keep_completed = 300

From the compute nodes:

root     15891     1  0 11:45 ?        00:00:00 pbs_mom
root     16742 16709  0 13:11 pts/0    00:00:00 grep pbs

[root@n001 ~]# ping wings
PING wings.glerl.noaa.gov (192.94.173.9) 56(84) bytes of data.
64 bytes from wings.glerl.noaa.gov (192.94.173.9): icmp_seq=1 ttl=64 time=0.093 ms
64 bytes from wings.glerl.noaa.gov (192.94.173.9): icmp_seq=2 ttl=64 time=0.165 ms

[root@n001 ~]# qmgr
Max open servers: 10239
Qmgr: list server
Server wings.glerl.noaa.gov
server_state = Active
scheduling = True
total_jobs = 0
state_count = Transit:0 Queued:0 Held:0 Waiting:0 Running:0 Exiting:0
acl_hosts = wings.glerl.noaa.gov
default_queue = batch
log_events = 511
mail_from = adm
scheduler_iteration = 600
node_check_rate = 150
tcp_timeout = 6
mom_job_sync = True
pbs_version = 2.5.9
keep_completed = 300
next_job_number = 0
net_counter = 6 5 4

Qmgr: list node n001
Node n001
state = down
np = 1
ntype = cluster
gpus = 0
[root@wings server_priv]# qmgr
Max open servers: 10239
Qmgr: print node n001
#
# Create nodes and set their properties.
#
#
# Create and define node n001
#
create node n001
set node n001 state = down
set node n001 np = 1
set node n001 ntype = cluster
set node n001 gpus = 0


I am not sure how to proceed at this point.  Any help would be appreciated.
I wasn't sure what other files or output to include.  Let me know if any
other information would be useful.



-- 
Christina A. Salls
GLERL Computer Group
help.glerl at noaa.gov
Help Desk x2127
Christina.Salls at noaa.gov
Voice Mail 734-741-2446
-------------- next part --------------
An HTML attachment was scrubbed...
URL: http://www.supercluster.org/pipermail/torqueusers/attachments/20120209/6e38106e/attachment.html 


More information about the torqueusers mailing list