[torqueusers] job only runs on 1 cpu

Jan jand at uvic.ca
Tue Feb 12 17:09:36 MST 2008


Thanks again for all the help so far.

I was able to proceed with the installation and have two nodes up now. 
homathko1 is the server and also a node.

I ran into another problem though (one that I reported before). Starting a 
job with a script whose first line is
#PBS -l nodes=1:ppn=8
works perfectly. The job starts on stikine1 on all 8 cpus.
However
#PBS -l nodes=2:ppn=8
will start the job. qstat -f (see output below) tells me that it is 
running on 16 cpus, but checking with "top" shows that the job is only 
running on 1 cpu on homathko1. I could not find anything in the MOM 
scripts.

Thanks, Jan


Here is some system output and tails of logfiles:
root at homathko:/usr/local/torque-2.2.1# pbsnodes -a
stikine1
      state = free
      np = 8
      ntype = cluster
      status = opsys=linux,uname=Linux stikine1 2.6.22-14-server #1 SMP 
Thu Jan 31 23:57:25 UTC 2008 
x86_64,sessions=5157,nsessions=1,nusers=1,idletime=3912,totmem=6936156kb,availmem=6877644kb,physmem=2053352kb,ncpus=8,loadave=0.00,netload=215206,state=free,jobs=,varattr=,rectime=1202858610

homathko1
      state = free
      np = 8
      ntype = cluster
      status = opsys=linux,uname=Linux homathko 2.6.22-14-generic #1 SMP 
Thu Jan 31 23:33:13 UTC 2008 x86_64,sessions=5158 6256 6323 6364 6383 
6401 6419 6441 6462 6480 6500 6518 6536 6573 6591 6609 6627 6652 6670 
6688 6716 6736 6754 6781 6799 14513 14644 18601 21087 
27756,nsessions=30,nusers=3,idletime=118,totmem=9948720kb,availmem=9092600kb,physmem=5065916kb,ncpus=8,loadave=0.00,netload=3009847260,state=free,jobs=,varattr=,rectime=1202858616

Job Id: 16.homathko1.cluster.loc
     Job_Name = fgs_4lay
     Job_Owner = jand at homathko1.cluster.loc
     job_state = R
     queue = batch
     server = homathko1.cluster.loc
     Checkpoint = u
     ctime = Tue Feb 12 15:55:01 2008
     Error_Path = 
homathko.seos.uvic.ca:/home/jand/uni/phd/holland/layer_strip/
         evidence/syn_grad/4lay_second_try/fgs_4lay.e16
     exec_host = homathko1/7+homathko1/6+homathko1/5+homathko1/4+homathko1/3+ho
         mathko1/2+homathko1/1+homathko1/0+stikine1/7+stikine1/6+stikine1/5+sti
         kine1/4+stikine1/3+stikine1/2+stikine1/1+stikine1/0
     Hold_Types = n
     Join_Path = oe
     Keep_Files = n
     Mail_Points = a
     mtime = Tue Feb 12 15:55:01 2008
     Output_Path = 
homathko.seos.uvic.ca:/home/jand/uni/phd/holland/layer_strip
         /evidence/syn_grad/4lay_second_try/fgs_4lay.o16
     Priority = 0
     qtime = Tue Feb 12 15:55:01 2008
     Rerunable = True
     Resource_List.nodect = 2
     Resource_List.nodes = 2:ppn=8
     Resource_List.walltime = 96:00:00
     Variable_List = PBS_O_HOME=/home/jand,PBS_O_LANG=en_CA.UTF-8,
         PBS_O_LOGNAME=jand,
 
PBS_O_PATH=/opt/intel/idbe/10.1.012/bin:/opt/intel/fce/10.1.012/bin:/
 
home/jand/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:
         /bin:/usr/games:/home/jand/bin:/home/jand/oases/bin,
         PBS_O_SHELL=/bin/bash,PBS_SERVER=homathko.seos.uvic.ca,
         PBS_O_HOST=homathko.seos.uvic.ca,
 
PBS_O_WORKDIR=/home/jand/uni/phd/holland/layer_strip/evidence/syn_gra
         d/4lay_second_try,PBS_O_QUEUE=batch
     comment = Job started on Tue Feb 12 at 15:55
     etime = Tue Feb 12 15:55:01 2008
     submit_args = fgs_4lay

homathko1 MOM log:
root at homathko:/var/spool/torque# tail mom_logs/20080212
02/12/2008 15:55:01;0100;   pbs_mom;Req;;Type JobScript request received 
from PBS_Server at homathko1.cluster.loc, sock=10
02/12/2008 15:55:01;0100;   pbs_mom;Req;;Type ReadyToCommit request 
received from PBS_Server at homathko1.cluster.loc, sock=10
02/12/2008 15:55:01;0100;   pbs_mom;Req;;Type Commit request received 
from PBS_Server at homathko1.cluster.loc, sock=10
02/12/2008 15:55:01;0100;   pbs_mom;Req;;Type StatusJob request received 
from PBS_Server at homathko1.cluster.loc, sock=13
02/12/2008 15:55:01;0001;   pbs_mom;Job;TMomFinalizeJob3;job 
16.homathko1.cluster.loc started, pid = 2709
02/12/2008 15:55:34;0100;   pbs_mom;Req;;Type StatusJob request received 
from PBS_Server at homathko1.cluster.loc, sock=10
02/12/2008 15:55:40;0100;   pbs_mom;Req;;Type SignalJob request received 
from PBS_Server at homathko1.cluster.loc, sock=10
02/12/2008 15:55:40;0008; 
pbs_mom;Job;16.homathko1.cluster.loc;kill_task: killing pid 2709 task 1 
with sig 15
02/12/2008 15:55:40;0008; 
pbs_mom;Job;16.homathko1.cluster.loc;kill_task: killing pid 2710 task 1 
with sig 15
02/12/2008 15:55:40;0008; 
pbs_mom;Job;16.homathko1.cluster.loc;kill_task: killing pid 2713 task 1 
with sig 15
02/12/2008 15:55:40;0008; 
pbs_mom;Job;16.homathko1.cluster.loc;kill_task: killing pid 2714 task 1 
with sig 15
02/12/2008 15:55:40;0080; 
pbs_mom;Job;16.homathko1.cluster.loc;scan_for_terminated: job 
16.homathko1.cluster.loc task 1 terminated, sid=2709
02/12/2008 15:55:40;0008;   pbs_mom;Job;16.homathko1.cluster.loc;job was 
terminated
02/12/2008 15:55:40;0080;   pbs_mom;Svr;preobit_reply;top of preobit_reply
02/12/2008 15:55:40;0080; 
pbs_mom;Svr;preobit_reply;DIS_reply_read/decode_DIS_replySvr worked, top 
of while loop
02/12/2008 15:55:40;0080;   pbs_mom;Svr;preobit_reply;in while loop, no 
error from job stat
02/12/2008 15:55:40;0008;   pbs_mom;Job;scan_for_terminated;checking job 
post-processing routine
02/12/2008 15:55:40;0080;   pbs_mom;Job;16.homathko1.cluster.loc;obit 
sent to server
02/12/2008 15:55:40;0100;   pbs_mom;Req;;Type CopyFiles request received 
from PBS_Server at homathko1.cluster.loc, sock=10
02/12/2008 15:55:40;0100;   pbs_mom;Req;;Type DeleteJob request received 
from PBS_Server at homathko1.cluster.loc, sock=10


stikine1 MOM log:
root at stikine1:/var/spool/torque# tail mom_logs/20080212
02/12/2008 15:51:52;0080; 
pbs_mom;Svr;preobit_reply;DIS_reply_read/decode_DIS_replySvr worked, top 
of while loop
02/12/2008 15:51:52;0080;   pbs_mom;Svr;preobit_reply;in while loop, no 
error from job stat
02/12/2008 15:51:52;0100;   pbs_mom;Req;;Type StatusJob request received 
from PBS_Server at homathko1.cluster.loc, sock=13
02/12/2008 15:51:52;0008;   pbs_mom;Job;scan_for_terminated;checking job 
post-processing routine
02/12/2008 15:51:52;0080;   pbs_mom;Job;14.homathko1.cluster.loc;obit 
sent to server
02/12/2008 15:51:52;0100;   pbs_mom;Req;;Type SignalJob request received 
from PBS_Server at homathko1.cluster.loc, sock=14
02/12/2008 15:51:52;0100;   pbs_mom;Req;;Type CopyFiles request received 
from PBS_Server at homathko1.cluster.loc, sock=11
02/12/2008 15:51:56;0100;   pbs_mom;Req;;Type DeleteJob request received 
from PBS_Server at homathko1.cluster.loc, sock=11
02/12/2008 15:55:01;0008;   pbs_mom;Job;16.homathko1.cluster.loc;JOIN 
JOB as node 1
02/12/2008 15:55:39;0100; 
pbs_mom;Job;16.homathko1.cluster.loc;kill_job received



-- 
Jan Dettmer, Postdoctoral Fellow
School of Earth and Ocean Sciences, University of Victoria	
Victoria, BC V8W 3P6
office: (250) 472-4342	email: jand at uvic.ca
http://web.uvic.ca/~jand/


More information about the torqueusers mailing list