[torqueusers] job only runs on 1 cpu
Jan
jand at uvic.ca
Tue Feb 12 17:09:36 MST 2008
Thanks again for all the help so far.
I was able to proceed with the installation and have two nodes up now.
homathko1 is the server and also a node.
I ran into another problem, though (one that I reported before). Starting a
job with a script whose first line is
#PBS -l nodes=1:ppn=8
works perfectly. The job starts on stikine1 on all 8 cpus.
However
#PBS -l nodes=2:ppn=8
will start the job. qstat -f (see output below) tells me that it is
running on 16 cpus, but checking with "top" shows that the job is only
running on 1 cpu on homathko1. I could not find anything in the MOM
scripts.
Thanks, Jan
Here is some system output and tails of logfiles:
root at homathko:/usr/local/torque-2.2.1# pbsnodes -a
stikine1
state = free
np = 8
ntype = cluster
status = opsys=linux,uname=Linux stikine1 2.6.22-14-server #1 SMP
Thu Jan 31 23:57:25 UTC 2008
x86_64,sessions=5157,nsessions=1,nusers=1,idletime=3912,totmem=6936156kb,availmem=6877644kb,physmem=2053352kb,ncpus=8,loadave=0.00,netload=215206,state=free,jobs=,varattr=,rectime=1202858610
homathko1
state = free
np = 8
ntype = cluster
status = opsys=linux,uname=Linux homathko 2.6.22-14-generic #1 SMP
Thu Jan 31 23:33:13 UTC 2008 x86_64,sessions=5158 6256 6323 6364 6383
6401 6419 6441 6462 6480 6500 6518 6536 6573 6591 6609 6627 6652 6670
6688 6716 6736 6754 6781 6799 14513 14644 18601 21087
27756,nsessions=30,nusers=3,idletime=118,totmem=9948720kb,availmem=9092600kb,physmem=5065916kb,ncpus=8,loadave=0.00,netload=3009847260,state=free,jobs=,varattr=,rectime=1202858616
Job Id: 16.homathko1.cluster.loc
Job_Name = fgs_4lay
Job_Owner = jand at homathko1.cluster.loc
job_state = R
queue = batch
server = homathko1.cluster.loc
Checkpoint = u
ctime = Tue Feb 12 15:55:01 2008
Error_Path =
homathko.seos.uvic.ca:/home/jand/uni/phd/holland/layer_strip/
evidence/syn_grad/4lay_second_try/fgs_4lay.e16
exec_host =
homathko1/7+homathko1/6+homathko1/5+homathko1/4+homathko1/3+ho
mathko1/2+homathko1/1+homathko1/0+stikine1/7+stikine1/6+stikine1/5+sti
kine1/4+stikine1/3+stikine1/2+stikine1/1+stikine1/0
Hold_Types = n
Join_Path = oe
Keep_Files = n
Mail_Points = a
mtime = Tue Feb 12 15:55:01 2008
Output_Path =
homathko.seos.uvic.ca:/home/jand/uni/phd/holland/layer_strip
/evidence/syn_grad/4lay_second_try/fgs_4lay.o16
Priority = 0
qtime = Tue Feb 12 15:55:01 2008
Rerunable = True
Resource_List.nodect = 2
Resource_List.nodes = 2:ppn=8
Resource_List.walltime = 96:00:00
Variable_List = PBS_O_HOME=/home/jand,PBS_O_LANG=en_CA.UTF-8,
PBS_O_LOGNAME=jand,
PBS_O_PATH=/opt/intel/idbe/10.1.012/bin:/opt/intel/fce/10.1.012/bin:/
home/jand/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:
/bin:/usr/games:/home/jand/bin:/home/jand/oases/bin,
PBS_O_SHELL=/bin/bash,PBS_SERVER=homathko.seos.uvic.ca,
PBS_O_HOST=homathko.seos.uvic.ca,
PBS_O_WORKDIR=/home/jand/uni/phd/holland/layer_strip/evidence/syn_gra
d/4lay_second_try,PBS_O_QUEUE=batch
comment = Job started on Tue Feb 12 at 15:55
etime = Tue Feb 12 15:55:01 2008
submit_args = fgs_4lay
homathko1 MOM log:
root at homathko:/var/spool/torque# tail mom_logs/20080212
02/12/2008 15:55:01;0100; pbs_mom;Req;;Type JobScript request received
from PBS_Server at homathko1.cluster.loc, sock=10
02/12/2008 15:55:01;0100; pbs_mom;Req;;Type ReadyToCommit request
received from PBS_Server at homathko1.cluster.loc, sock=10
02/12/2008 15:55:01;0100; pbs_mom;Req;;Type Commit request received
from PBS_Server at homathko1.cluster.loc, sock=10
02/12/2008 15:55:01;0100; pbs_mom;Req;;Type StatusJob request received
from PBS_Server at homathko1.cluster.loc, sock=13
02/12/2008 15:55:01;0001; pbs_mom;Job;TMomFinalizeJob3;job
16.homathko1.cluster.loc started, pid = 2709
02/12/2008 15:55:34;0100; pbs_mom;Req;;Type StatusJob request received
from PBS_Server at homathko1.cluster.loc, sock=10
02/12/2008 15:55:40;0100; pbs_mom;Req;;Type SignalJob request received
from PBS_Server at homathko1.cluster.loc, sock=10
02/12/2008 15:55:40;0008;
pbs_mom;Job;16.homathko1.cluster.loc;kill_task: killing pid 2709 task 1
with sig 15
02/12/2008 15:55:40;0008;
pbs_mom;Job;16.homathko1.cluster.loc;kill_task: killing pid 2710 task 1
with sig 15
02/12/2008 15:55:40;0008;
pbs_mom;Job;16.homathko1.cluster.loc;kill_task: killing pid 2713 task 1
with sig 15
02/12/2008 15:55:40;0008;
pbs_mom;Job;16.homathko1.cluster.loc;kill_task: killing pid 2714 task 1
with sig 15
02/12/2008 15:55:40;0080;
pbs_mom;Job;16.homathko1.cluster.loc;scan_for_terminated: job
16.homathko1.cluster.loc task 1 terminated, sid=2709
02/12/2008 15:55:40;0008; pbs_mom;Job;16.homathko1.cluster.loc;job was
terminated
02/12/2008 15:55:40;0080; pbs_mom;Svr;preobit_reply;top of preobit_reply
02/12/2008 15:55:40;0080;
pbs_mom;Svr;preobit_reply;DIS_reply_read/decode_DIS_replySvr worked, top
of while loop
02/12/2008 15:55:40;0080; pbs_mom;Svr;preobit_reply;in while loop, no
error from job stat
02/12/2008 15:55:40;0008; pbs_mom;Job;scan_for_terminated;checking job
post-processing routine
02/12/2008 15:55:40;0080; pbs_mom;Job;16.homathko1.cluster.loc;obit
sent to server
02/12/2008 15:55:40;0100; pbs_mom;Req;;Type CopyFiles request received
from PBS_Server at homathko1.cluster.loc, sock=10
02/12/2008 15:55:40;0100; pbs_mom;Req;;Type DeleteJob request received
from PBS_Server at homathko1.cluster.loc, sock=10
stikine1 MOM log:
root at stikine1:/var/spool/torque# tail mom_logs/20080212
02/12/2008 15:51:52;0080;
pbs_mom;Svr;preobit_reply;DIS_reply_read/decode_DIS_replySvr worked, top
of while loop
02/12/2008 15:51:52;0080; pbs_mom;Svr;preobit_reply;in while loop, no
error from job stat
02/12/2008 15:51:52;0100; pbs_mom;Req;;Type StatusJob request received
from PBS_Server at homathko1.cluster.loc, sock=13
02/12/2008 15:51:52;0008; pbs_mom;Job;scan_for_terminated;checking job
post-processing routine
02/12/2008 15:51:52;0080; pbs_mom;Job;14.homathko1.cluster.loc;obit
sent to server
02/12/2008 15:51:52;0100; pbs_mom;Req;;Type SignalJob request received
from PBS_Server at homathko1.cluster.loc, sock=14
02/12/2008 15:51:52;0100; pbs_mom;Req;;Type CopyFiles request received
from PBS_Server at homathko1.cluster.loc, sock=11
02/12/2008 15:51:56;0100; pbs_mom;Req;;Type DeleteJob request received
from PBS_Server at homathko1.cluster.loc, sock=11
02/12/2008 15:55:01;0008; pbs_mom;Job;16.homathko1.cluster.loc;JOIN
JOB as node 1
02/12/2008 15:55:39;0100;
pbs_mom;Job;16.homathko1.cluster.loc;kill_job received
--
Jan Dettmer, Postdoctoral Fellow
School of Earth and Ocean Sciences, University of Victoria
Victoria, BC V8W 3P6
office: (250) 472-4342 email: jand at uvic.ca
http://web.uvic.ca/~jand/
More information about the torqueusers
mailing list