[torqueusers] simple startup troubleshooting
Christina Salls
christina.salls at noaa.gov
Thu Feb 9 12:47:05 MST 2012
Hi all,
I am new to Torque. In fact, I have just installed torque-2.5.9
(server) on the head node of a 20 node cluster and torque client and mom
packages on the compute nodes. I used the Torque Administrator's Guide and
the installation process seemed to proceed smoothly (on my second attempt).
My first attempt was complicated by the fact that PBS was pre-installed on
both the head node and server and seemed to be getting in my way because of
processes that were already running and ports that were already in use. I
removed everything I could find of the PBS installation and started from
scratch. I am stuck at the point where I should be seeing my nodes as
free, but they are showing up as down. I am looking for any clues in
troubleshooting this problem. I don't know where to start. I am including
some information to illustrate my setup.
Thanks in advance,
Christina
Here is the output of the pbsnodes command
[root@wings torque-packages]# pbsnodes -a
n001
state = down
np = 1
ntype = cluster
gpus = 0
n002
state = down
np = 1
ntype = cluster
gpus = 0
n003
state = down
np = 1
ntype = cluster
gpus = 0
.....
It is the same for all 20 nodes. I truncated it for the sake of brevity.
On the headnode:
[root@wings server_priv]# ping n001
PING n001.default.domain (10.0.1.1) 56(84) bytes of data.
64 bytes from n001.default.domain (10.0.1.1): icmp_seq=1 ttl=64 time=0.193 ms
64 bytes from n001.default.domain (10.0.1.1): icmp_seq=2 ttl=64 time=0.189 ms
[root@wings server_priv]# qmgr
Max open servers: 10239
Qmgr: list server
Server wings.glerl.noaa.gov
server_state = Active
scheduling = True
total_jobs = 0
state_count = Transit:0 Queued:0 Held:0 Waiting:0 Running:0 Exiting:0
acl_hosts = wings.glerl.noaa.gov
default_queue = batch
log_events = 511
mail_from = adm
scheduler_iteration = 600
node_check_rate = 150
tcp_timeout = 6
mom_job_sync = True
pbs_version = 2.5.9
keep_completed = 300
next_job_number = 0
net_counter = 4 4 4
Qmgr: list node n001
Node n001
state = down
np = 1
ntype = cluster
gpus = 0
Qmgr: print node n001
#
# Create nodes and set their properties.
#
#
# Create and define node n001
#
create node n001
set node n001 state = down
set node n001 np = 1
set node n001 ntype = cluster
set node n001 gpus = 0
[root@wings server_priv]# ps -ef | grep pbs
root 3925 1 0 Feb03 ? 00:03:00 /usr/local/sbin/pbs_mom -q -d /var/spool/torque
root 7056 1 0 11:47 ? 00:00:02 pbs_server
root 29031 7993 0 12:59 pts/29 00:00:00 grep pbs
[root@wings torque-2.5.9]# qmgr -c 'p s'
#
# Create queues and set their attributes.
#
#
# Create and define queue batch
#
create queue batch
set queue batch queue_type = Execution
set queue batch resources_default.nodes = 1
set queue batch resources_default.walltime = 01:00:00
set queue batch enabled = True
set queue batch started = True
#
# Set server attributes.
#
set server scheduling = True
set server acl_hosts = wings.glerl.noaa.gov
set server managers = salls@wings.glerl.noaa.gov
set server operators = salls@wings.glerl.noaa.gov
set server default_queue = batch
set server log_events = 511
set server mail_from = adm
set server scheduler_iteration = 600
set server node_check_rate = 150
set server tcp_timeout = 6
set server mom_job_sync = True
set server keep_completed = 300
From the compute nodes:
root 15891 1 0 11:45 ? 00:00:00 pbs_mom
root 16742 16709 0 13:11 pts/0 00:00:00 grep pbs
[root@n001 ~]# ping wings
PING wings.glerl.noaa.gov (192.94.173.9) 56(84) bytes of data.
64 bytes from wings.glerl.noaa.gov (192.94.173.9): icmp_seq=1 ttl=64 time=0.093 ms
64 bytes from wings.glerl.noaa.gov (192.94.173.9): icmp_seq=2 ttl=64 time=0.165 ms
[root@n001 ~]# qmgr
Max open servers: 10239
Qmgr: list server
Server wings.glerl.noaa.gov
server_state = Active
scheduling = True
total_jobs = 0
state_count = Transit:0 Queued:0 Held:0 Waiting:0 Running:0 Exiting:0
acl_hosts = wings.glerl.noaa.gov
default_queue = batch
log_events = 511
mail_from = adm
scheduler_iteration = 600
node_check_rate = 150
tcp_timeout = 6
mom_job_sync = True
pbs_version = 2.5.9
keep_completed = 300
next_job_number = 0
net_counter = 6 5 4
Qmgr: list node n001
Node n001
state = down
np = 1
ntype = cluster
gpus = 0
[root@wings server_priv]# qmgr
Max open servers: 10239
Qmgr: print node n001
#
# Create nodes and set their properties.
#
#
# Create and define node n001
#
create node n001
set node n001 state = down
set node n001 np = 1
set node n001 ntype = cluster
set node n001 gpus = 0
I am not sure how to proceed at this point. Any help would be appreciated.
I wasn't sure what other files or output to include. Let me know if any
other information would be useful.
--
Christina A. Salls
GLERL Computer Group
help.glerl at noaa.gov
Help Desk x2127
Christina.Salls at noaa.gov
Voice Mail 734-741-2446
-------------- next part --------------
An HTML attachment was scrubbed...
URL: http://www.supercluster.org/pipermail/torqueusers/attachments/20120209/6e38106e/attachment.html
More information about the torqueusers
mailing list