[torqueusers] torque pbs spawn jobs on 2 nodes but all processes actually run only on 1 node (OpenMPI)

Steven Truong midair77 at gmail.com
Wed Feb 27 19:02:43 MST 2008


Dear all.  For some strange reason, my user submitted a job spanning
2 nodes and 16 cores, but when I checked, nothing is running on one node,
whereas all 16 processes for this job are running on the other node.

#pbsnodes
compute-0-0.local
state = job-exclusive
 np = 8
ntype = cluster
jobs = 0/558.jupiter.mydomain.com, 1/558.jupiter.mydomain.com,
2/558.jupiter.mydomain.com, 3/558.jupiter.mydomain.com,
4/558.jupiter.mydomain.com, 5/558.jupiter.mydomain.com,
6/558.jupiter.mydomain.com, 7/558.jupiter.mydomain.com
                         status = opsys=linux,uname=Linux
compute-0-0.local 2.6.9-55.0.9.EL_lustre.1.6.4.2smp #1 SMP Wed Jan 16
19:52:57 EST 2008 x86_64,sessions=? 0,nsessions=?
0,nusers=0,idletime=27476,totmem=17439232kb,availmem=17246848kb,physmem=16419116kb,ncpus=8,loadave=0.02,netload=778724723,state=free,jobs=558.jupiter.mydomain.com,rectime=1204162492

compute-0-1.local
state = job-exclusive,busy
np = 8
ntype = cluster
jobs = 0/558.jupiter.mydomain.com, 1/558.jupiter.mydomain.com,
2/558.jupiter.mydomain.com, 3/558.jupiter.mydomain.com,
4/558.jupiter.mydomain.com, 5/558.jupiter.mydomain.com,
6/558.jupiter.mydomain.com, 7/558.jupiter.mydomain.com,status =
opsys=linux,uname=Linux compute-0-1.local
2.6.9-55.0.9.EL_lustre.1.6.4.2smp #1 SMP Wed Jan 16 19:52:57 EST 2008
x86_64,sessions=7046
7083,nsessions=2,nusers=1,idletime=93285,totmem=17439232kb,availmem=11384736kb,physmem=16419116kb,ncpus=8,loadave=16.15,netload=916299888,state=busy,jobs=558.jupiter.mydomain.com,rectime=1204162491

#qstat -f 558
Job Id: 558.jupiter.mydomain.com
Job_Name = MNP
Job_Owner = User at Jupiter.mydomain.com
  resources_used.cput = 00:00:00
  resources_used.mem = 5176kb
  resources_used.vmem = 148792kb
  resources_used.walltime = 02:17:51
  job_state = R
  queue = default
  server = jupiter.mydomain.com
  Checkpoint = u
  ctime = Wed Feb 27 15:31:38 2008
  Error_Path = jupiter.mydomain.com:/mnt/lustre/User/VASP/MNP_test1/55/Pt55/16_core/NPAR16/MNP.e558
 exec_host = compute-0-1.local/7+compute-0-1.local/6+compute-0-1.local/5+compute-0-1.local/4+compute-0-1.local/3+compute-0-1.local/2+compute-0-1.local/1+compute-0-1.local/0+compute-0-0.local/7+compute-0-0.local/6+compute-0-0.local/5+compute-0-0.local/4+compute-0-0.local/3+compute-0-0.local/2+compute-0-0.local/1+compute-0-0.local/0
  Hold_Types = n
  Join_Path = n
  Keep_Files = n
  Mail_Points = a
  mtime = Wed Feb 27 15:32:16 2008
  Output_Path =
jupiter.mydomain.com:/mnt/lustre/User/VASP/MNP_test1/55/Pt55/16_core/NPAR16/MNP.o558
  Priority = 0
  qtime = Wed Feb 27 15:31:38 2008
  Rerunable = True
  Resource_List.neednodes = 2:ppn=8
  Resource_List.nodect = 2
  Resource_List.nodes = 2:ppn=8
  Resource_List.walltime = 360:00:00
  session_id = 7046
  substate = 42
  Variable_List = PBS_O_HOME=/home/User,PBS_O_LANG=en_US.iso885915,
  PBS_O_LOGNAME=User,
  PBS_O_PATH=/opt/amd/gnutools-4.1.2-barcelona/bin/:/usr/kerberos/bin:/usr/java/jdk1.5.0_10/bin:/usr/local/bin:/bin:/usr/bin:/usr/X11R6/bin:/opt/c3-4/:/opt/eclipse:/opt/ganglia/bin:/opt/ganglia/sbin:/opt/maven/bin:/opt/openmpi/bin/:/opt/maui/bin:/opt/torque/bin:/opt/torque/sbin:/opt/rocks/bin:/opt/rocks/sbin:/home/User/bin,
  PBS_O_MAIL=/var/spool/mail/User,PBS_O_SHELL=/bin/bash,
  PBS_O_HOST=jupiter.mydomain.com,
  PBS_O_WORKDIR=/mnt/lustre/User/VASP/MNP_test1/55/Pt55/16_core/NPAR16,PBS_O_QUEUE=default
....

#qmgr -c "print server"
create queue default
set queue default queue_type = Execution
set queue default kill_delay = 90
set queue default enabled = True
set queue default started = True
#
# Create and define queue batch
#
create queue batch
set queue batch queue_type = Execution
set queue batch resources_default.nodes = 1
set queue batch resources_default.walltime = 01:00:00
set queue batch enabled = True
set queue batch started = True
#
# Set server attributes.
#
set server scheduling = True
set server acl_host_enable = False
set server managers = maui at jupiter.mydomain.com
set server managers += root at jupiter.mydomain.com
set server operators = maui at jupiter.mydomain.com
set server operators += root at jupiter.mydomain.com
set server default_queue = default
set server log_events = 511
set server mail_from = adm
set server query_other_jobs = True
set server resources_default.walltime = 336:00:00
set server scheduler_iteration = 60
set server node_ping_rate = 300
set server node_check_rate = 150
set server tcp_timeout = 6
set server pbs_version = 2.1.8

#cat nodes
compute-0-0.local np=8
.....

# grep -v ^# maui.cfg
RMPOLLINTERVAL          00:00:30
SERVERHOST              Jupiter.mydomain.com
SERVERPORT              42559
SERVERMODE              NORMAL
RMCFG[Jupiter.mydomain.com]          TYPE=PBS
ADMIN1                maui root
LOGFILE               maui.log
LOGFILEMAXSIZE        10000000
LOGLEVEL              3
QUEUETIMEWEIGHT       1
BACKFILLPOLICY        FIRSTFIT
RESERVATIONPOLICY     CURRENTHIGHEST
NODEALLOCATIONPOLICY  MINRESOURCE
ENABLEMULTINODEJOBS   TRUE
ENABLEMULTIREQJOBS    TRUE


This is really weird, as this is the first time I have seen such a
problem.  Could anybody tell me what is missing or wrong?

Thank you.


More information about the torqueusers mailing list