[torqueusers] Issues with Parallel Job Submissions in PBS
佳栋 蔚
yjdvsroger at yahoo.com.cn
Thu Jul 26 00:06:41 MDT 2012
I have found the problem, thanks!
The cause was that I had not installed MPI at the same path on every node.
My configuration and the error messages follow.
EMAILS:
PBS JOB ID:76.lancelot-laptop
[lancelot at cfa ~]$ cat job3.pbs
#!/bin/bash
#PBS -N job3
#PBS -o job3.log
#PBS -e job3.err
#PBS -q sai
#PBA -I
#PBS -l nodes=2:ppn=2
#PBS -l walltime=24:00:00
#PBS -l cput=1:00:00
#PBS -V
cd /home/lancelot
echo running on hosts `hostname`
echo time is `date`
echo directory is $PWD
echo job runs on the nodes:
cat $PBS_NODEFILE
NPROCS=`wc -l < $PBS_NODEFILE`
echo this job has allocated $NPROCS nodes
mpiexec -np 4 ./prog
[lancelot at cfa ~]$ cat prog
#!/bin/bash
echo 999999999|./icpi
root at lancelot-laptop:/home/lancelot# pbsnodes
lancelot-laptop
state = free
np = 2
ntype = cluster
jobs = 0/76.lancelot-laptop
status = rectime=1343122703,varattr=,jobs=76.lancelot-laptop,state=free,netload=95261305,gres=,loadave=0.57,ncpus=2,physmem=1542608kb,availmem=2981784kb,totmem=3494344kb,idletime=14158,nusers=2,nsessions=13,sessions=1100 792 1309 1349 1365 1374 1384 1439 1452 1682 1749 1798 2737,uname=Linux lancelot-laptop 2.6.32-41-generic #94-Ubuntu SMP Fri Jul 6 16:51:39 UTC 2012 i686,opsys=linux
mom_service_port = 15002
mom_manager_port = 15003
gpus = 0
cfa
state = free
np = 12
ntype = cluster
jobs = 0/76.lancelot-laptop
status = rectime=1343122703,varattr=,jobs=76.lancelot-laptop,state=free,netload=492745850,gres=,loadave=0.00,ncpus=12,physmem=8015456kb,availmem=22517440kb,totmem=24399448kb,idletime=2992,nusers=5,nsessions=58,sessions=18335 469 27670 752 18344 834 1171 1982 2226 3403 2290 14058 14160 14359 14579 15144 15464 15698 15913 16121 16201 16444 16988 17058 17603 18048 18278 18378 18379 18405 18411 18479 18557 18884 19096 22028 22149 22256 22257 22283 22290 22347 27347 27515 27561 30703 30712 30795 30797 30823 30829 30905 32454 32458 32459 32467 32469 32489,uname=Linux cfa 2.6.32-220.el6.x86_64 #1 SMP Tue Dec 6 19:48:22 GMT 2011 x86_64,opsys=linux
mom_service_port = 15002
mom_manager_port = 15003
gpus = 0
root at lancelot-laptop:/home/lancelot# tracejob 76
Job: 76.lancelot-laptop
07/24/2012 15:01:03 M JOIN JOB as node 1
07/24/2012 15:01:03 S enqueuing into sai, state 1 hop 1
07/24/2012 15:01:03 S Job Queued at request of lancelot at cfa, owner =
lancelot at cfa, job name = job3, queue = sai
07/24/2012 15:01:03 S Job Modified at request of Scheduler at lancelot-laptop
07/24/2012 15:01:03 L Job Run
07/24/2012 15:01:03 S Job Run at request of Scheduler at lancelot-laptop
07/24/2012 15:01:03 A queue=sai
07/24/2012 15:01:03 A user=lancelot group=lancelot jobname=job3 queue=sai
ctime=1343113263 qtime=1343113263 etime=1343113263
start=1343113263 owner=lancelot at cfa
exec_host=cfa/0+lancelot-laptop/0
Resource_List.cput=01:00:00 Resource_List.neednodes=2
Resource_List.nodect=2 Resource_List.nodes=2
Resource_List.walltime=24:00:00
07/24/2012 15:01:57 S Not sending email: User does not want mail of this
type.
root at lancelot-laptop:/home/lancelot# tracejob 77
Job: 77.lancelot-laptop
07/24/2012 15:13:11 S enqueuing into sai, state 1 hop 1
07/24/2012 15:13:11 S Job Queued at request of lancelot at cfa, owner =
lancelot at cfa, job name = job4, queue = sai
07/24/2012 15:13:11 S Job Modified at request of Scheduler at lancelot-laptop
07/24/2012 15:13:11 L Job Run
07/24/2012 15:13:11 S Job Run at request of Scheduler at lancelot-laptop
07/24/2012 15:13:11 S Not sending email: User does not want mail of this
type.
07/24/2012 15:13:11 A queue=sai
07/24/2012 15:13:11 A user=lancelot group=lancelot jobname=job4 queue=sai
ctime=1343113991 qtime=1343113991 etime=1343113991
start=1343113991 owner=lancelot at cfa
exec_host=lancelot-laptop/1
Resource_List.cput=01:00:00
Resource_List.walltime=24:00:00
07/24/2012 15:13:56 S Not sending email: User does not want mail of this
type.
07/24/2012 15:13:56 S Exit_status=0 resources_used.cput=00:00:45
resources_used.mem=5300kb resources_used.vmem=19680kb
resources_used.walltime=00:00:45
07/24/2012 15:13:56 M scan_for_terminated: job 77.lancelot-laptop task 1
terminated, sid=4008
07/24/2012 15:13:56 M job was terminated
07/24/2012 15:13:56 M obit sent to server
07/24/2012 15:13:56 A user=lancelot group=lancelot jobname=job4 queue=sai
ctime=1343113991 qtime=1343113991 etime=1343113991
start=1343113991 owner=lancelot at cfa
exec_host=lancelot-laptop/1
Resource_List.cput=01:00:00
Resource_List.walltime=24:00:00 session=4008
end=1343114036 Exit_status=0
resources_used.cput=00:00:45 resources_used.mem=5300kb
resources_used.vmem=19680kb
resources_used.walltime=00:00:45
07/24/2012 15:13:57 M removed job script
07/24/2012 15:18:57 S dequeuing from sai, state COMPLETE
root at lancelot-laptop:/home/lancelot# tracejob 78
/var/spool/torque/mom_logs/20120724: No matching job records located
Job: 78.lancelot-laptop
07/24/2012 16:25:51 S enqueuing into sai, state 1 hop 1
07/24/2012 16:25:51 S Job Queued at request of lancelot at cfa, owner =
lancelot at cfa, job name = job3, queue = sai
07/24/2012 16:25:51 A queue=sai
07/24/2012 16:25:56 S Job Modified at request of Scheduler at lancelot-laptop
07/24/2012 16:25:56 L Not enough of the right type of nodes available
root at lancelot-laptop:/home/lancelot# qstat -f 76
Job Id: 76.lancelot-laptop
Job_Name = job3
Job_Owner = lancelot at cfa
resources_used.cput = 00:00:00
resources_used.mem = 9304kb
resources_used.vmem = 478176kb
resources_used.walltime = 01:15:51
job_state = R
queue = sai
server = lancelot-laptop
Checkpoint = u
ctime = Tue Jul 24 15:01:03 2012
Error_Path = cfa:/home/lancelot/job3.err
exec_host = cfa/0+lancelot-laptop/0
exec_port = 15003+15003
Hold_Types = n
Join_Path = n
Keep_Files = n
Mail_Points = a
mtime = Tue Jul 24 15:01:57 2012
Output_Path = cfa:/home/lancelot/job3.log
Priority = 0
qtime = Tue Jul 24 15:01:03 2012
Rerunable = True
Resource_List.cput = 01:00:00
Resource_List.neednodes = 2
Resource_List.nodect = 2
Resource_List.nodes = 2
Resource_List.walltime = 24:00:00
session_id = 752
substate = 42
Variable_List = PBS_O_QUEUE=sai,PBS_O_HOST=cfa,PBS_O_HOME=/home/lancelot,
PBS_O_LANG=en_US.UTF-8,PBS_O_LOGNAME=lancelot,
PBS_O_PATH=/usr/lib64/qt-3.3/bin:/usr/local/sbin:/usr/local/bin:/usr/
sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/torque/bin:/usr/local/t
orque/sbin:/usr/local/maui/bin:/usr/local/maui/sbin:/usr/java/jdk1.6.0
_33/bin:/home/shu/software/mpich2-1.4:/root/bin:/usr/local/torque/bin:
/usr/local/maui/bin:/usr/java/jdk1.6.0_33/bin:/home/shu/software/mpich
2-1.4,PBS_O_MAIL=/var/spool/mail/lancelot,PBS_O_SHELL=/bin/bash,
PBS_SERVER=lancelot-laptop,PBS_O_WORKDIR=/home/lancelot,
TOMCAT_HOME=/home/shu/software/apache-tomcat-7.0.29,HOSTNAME=cfa,
SHELL=/bin/bash,TERM=xterm,HISTSIZE=1000,
SSH_CLIENT=192.168.0.46 58198 22,QTDIR=/usr/lib64/qt-3.3,
QTINC=/usr/lib64/qt-3.3/include,SSH_TTY=/dev/pts/6,USER=lancelot,
LS_COLORS=rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=
40;33;01:cd=40;33;01:or=40;31;01:mi=01;05;37;41:su=37;41:sg=30;43:ca=3
0;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arj
=01;31:*.taz=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.
zip=01;31:*.z=01;31:*.Z=01;31:*.dz=01;31:*.gz=01;31:*.lz=01;31:*.xz=01
;31:*.bz2=01;31:*.tbz=01;31:*.tbz2=01;31:*.bz=01;31:*.tz=01;31:*.deb=0
1;31:*.rpm=01;31:*.jar=01;31:*.rar=01;31:*.ace=01;31:*.zoo=01;31:*.cpi
o=01;31:*.7z=01;31:*.rz=01;31:*.jpg=01;35:*.jpeg=01;35:*.gif=01;35:*.b
mp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*
.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;
35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=
01;35:*.mkv=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.v
ob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.r
mvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*
.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:
*.axv=01;35:*.anx=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=01;36:*.au=01;36
:*.flac=01;36:*.mid=01;36:*.midi=01;36:*.mka=01;36:*.mp3=01;36:*.mpc=0
1;36:*.ogg=01;36:*.ra=01;36:*.wav=01;36:*.axa=01;36:*.oga=01;36:*.spx=
01;36:*.xspf=01;36:,
PATH=/usr/lib64/qt-3.3/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/
usr/bin:/sbin:/bin:/usr/games:/usr/local/torque/bin:/usr/local/torque/
sbin:/usr/local/maui/bin:/usr/local/maui/sbin:/usr/java/jdk1.6.0_33/bi
n:/home/shu/software/mpich2-1.4:/root/bin:/usr/local/torque/bin:/usr/l
ocal/maui/bin:/usr/java/jdk1.6.0_33/bin:/home/shu/software/mpich2-1.4,
MAIL=/var/spool/mail/lancelot,PWD=/home/lancelot,
JAVA_HOME=/usr/java/jdk1.6.0_33,LANG=en_US.UTF-8,
HISTCONTROL=ignoredups,
SSH_ASKPASS=/usr/libexec/openssh/gnome-ssh-askpass,
HOME=/home/lancelot,SHLVL=6,LOGNAME=lancelot,CVS_RSH=ssh,
QTLIB=/usr/lib64/qt-3.3/lib,
SSH_CONNECTION=192.168.0.46 58198 192.168.0.111 22,
CLASSPATH=.:/usr/java/jdk1.6.0_33/jre/lib/rt.jar:/usr/java/jdk1.6.0_3
3/lib/dt.jar:/usr/java/jdk1.6.0_33/lib/tools.jar,
LESSOPEN=|/usr/bin/lesspipe.sh %s,TORQUE=/usr/local/torque,
MAUI=/usr/local/maui,G_BROKEN_FILENAMES=1,_=/usr/local/bin/qsub
euser = lancelot
egroup = lancelot
hashname = 76.lancelot-laptop
queue_rank = 19
queue_type = E
comment = Job started on Tue Jul 24 at 15:01
etime = Tue Jul 24 15:01:03 2012
submit_args = job3.pbs
start_time = Tue Jul 24 15:01:03 2012
Walltime.Remaining = 76692
start_count = 1
fault_tolerant = False
submit_host = cfa
init_work_dir = /home/lancelot
root at lancelot-laptop:/home/lancelot# qstat
Job id Name User Time Use S Queue
------------------------- ---------------- --------------- -------- - -----
76.lancelot-laptop job3 lancelot 00:00:00 R sai
78.lancelot-laptop job3 lancelot 0 Q sai
root at lancelot-laptop:/home/lancelot# qsub --version
version: 3.0.3
root at lancelot-laptop:/home/lancelot# qmgr -c 'p s'
#
# Create queues and set their attributes.
#
#
# Create and define queue sai
#
create queue sai
set queue sai queue_type = Execution
set queue sai acl_groups = lancelot-laptop
set queue sai acl_group_sloppy = True
set queue sai route_destinations = lancelot-laptop
set queue sai enabled = True
set queue sai started = True
#
# Set server attributes.
#
set server scheduling = True
set server acl_hosts = lancelot-laptop
set server managers = lancelot at lancelot-laptop
set server operators = lancelot at lancelot-laptop
set server default_queue = sai
set server log_events = 511
set server mail_from = adm
set server scheduler_iteration = 600
set server node_check_rate = 150
set server tcp_timeout = 6
set server mom_job_sync = True
set server keep_completed = 300
set server next_job_number = 79
root at lancelot-laptop:/home/lancelot# qmgr -c "list queue sai"
Queue sai
queue_type = Execution
total_jobs = 2
state_count = Transit:0 Queued:1 Held:0 Waiting:0 Running:1 Exiting:0
acl_groups = lancelot-laptop
acl_group_sloppy = True
mtime = Tue Jul 24 11:31:37 2012
resources_assigned.nodect = 2
route_destinations = lancelot-laptop
enabled = True
started = True
root at lancelot-laptop:/home/lancelot# cat /var/spool/torque/server_priv/nodes
lancelot-laptop np=2
cfa np=12
tom np=2
root at lancelot-laptop:/home/lancelot# cat /var/spool/torque/mom_priv/config
$pbsserver lancelot-laptop
$logevent 255
cat server_name
lancelot-laptop
root at lancelot-laptop:/home/lancelot# qstat -Q
Queue Max Tot Ena Str Que Run Hld Wat Trn Ext T
---------------- --- --- --- --- --- --- --- --- --- --- -
sai 0 2 yes yes 1 1 0 0 0 0 E
root at lancelot-laptop:/home/lancelot# qstat -q
server: lancelot-laptop
Queue Memory CPU Time Walltime Node Run Que Lm State
---------------- ------ -------- -------- ---- --- --- -- -----
sai -- -- -- -- 1 1 -- E R
----- -----
1 1
root at lancelot-laptop:/home/lancelot# qstat -B
Server Max Tot Que Run Hld Wat Trn Ext Status
---------------- --- --- --- --- --- --- --- --- ----------
lancelot-laptop 0 2 1 1 0 0 0 0 Active
mom_logs:
07/24/2012 14:53:43;0002; pbs_mom;Svr;im_eof;End of File from addr 192.168.0.111:1023
07/24/2012 14:53:50;0002; pbs_mom;Svr;pbs_mom;Torque Mom Version = 3.0.3, loglevel = 0
07/24/2012 14:54:44;0001; pbs_mom;Svr;pbs_mom;LOG_DEBUG::mom_checkpoint_job_has_checkpoint, FALSE
07/24/2012 14:54:44;0001; pbs_mom;Job;TMomFinalizeJob3;job 75.lancelot-laptop started, pid = 3807
07/24/2012 14:55:29;0080; pbs_mom;Job;75.lancelot-laptop;scan_for_terminated: job 75.lancelot-laptop task 1 terminated, sid=3807
07/24/2012 14:55:29;0008; pbs_mom;Job;75.lancelot-laptop;job was terminated
07/24/2012 14:55:29;0080; pbs_mom;Svr;preobit_reply;top of preobit_reply
07/24/2012 14:55:29;0080; pbs_mom;Svr;preobit_reply;DIS_reply_read/decode_DIS_replySvr worked, top of while loop
07/24/2012 14:55:29;0080; pbs_mom;Svr;preobit_reply;in while loop, no error from job stat
07/24/2012 14:55:29;0080; pbs_mom;Job;75.lancelot-laptop;obit sent to server
07/24/2012 14:55:29;0080; pbs_mom;Job;75.lancelot-laptop;removed job script
07/24/2012 14:58:50;0002; pbs_mom;Svr;pbs_mom;Torque Mom Version = 3.0.3, loglevel = 0
07/24/2012 15:01:03;0008; pbs_mom;Job;76.lancelot-laptop;JOIN JOB as node 1
07/24/2012 15:03:50;0002; pbs_mom;Svr;pbs_mom;Torque Mom Version = 3.0.3, loglevel = 0
07/24/2012 15:08:50;0002; pbs_mom;Svr;pbs_mom;Torque Mom Version = 3.0.3, loglevel = 0
07/24/2012 15:13:11;0001; pbs_mom;Svr;pbs_mom;LOG_DEBUG::mom_checkpoint_job_has_checkpoint, FALSE
07/24/2012 15:13:11;0001; pbs_mom;Job;TMomFinalizeJob3;job 77.lancelot-laptop started, pid = 4008
07/24/2012 15:13:50;0002; pbs_mom;Svr;pbs_mom;Torque Mom Version = 3.0.3, loglevel = 0
07/24/2012 15:13:56;0080; pbs_mom;Job;77.lancelot-laptop;scan_for_terminated: job 77.lancelot-laptop task 1 terminated, sid=4008
07/24/2012 15:13:56;0008; pbs_mom;Job;77.lancelot-laptop;job was terminated
07/24/2012 15:13:56;0080; pbs_mom;Svr;preobit_reply;top of preobit_reply
07/24/2012 15:13:56;0080; pbs_mom;Svr;preobit_reply;DIS_reply_read/decode_DIS_replySvr worked, top of while loop
07/24/2012 15:13:56;0080; pbs_mom;Svr;preobit_reply;in while loop, no error from job stat
07/24/2012 15:13:56;0080; pbs_mom;Job;77.lancelot-laptop;obit sent to server
07/24/2012 15:13:57;0080; pbs_mom;Job;77.lancelot-laptop;removed job script
07/24/2012 15:18:50;0002; pbs_mom;Svr;pbs_mom;Torque Mom Version = 3.0.3, loglevel = 0
07/24/2012 15:23:50;0002; pbs_mom;Svr;pbs_mom;Torque Mom Version = 3.0.3, loglevel = 0
07/24/2012 15:28:50;0002; pbs_mom;Svr;pbs_mom;Torque Mom Version = 3.0.3, loglevel = 0
07/24/2012 15:33:50;0002; pbs_mom;Svr;pbs_mom;Torque Mom Version = 3.0.3, loglevel = 0
-------------- next part --------------
An HTML attachment was scrubbed...
URL: http://www.supercluster.org/pipermail/torqueusers/attachments/20120726/92677c3d/attachment-0001.html
More information about the torqueusers
mailing list