[torqueusers] free nodes but jobs are not running
Jackie Scoggins
JScoggins at lbl.gov
Wed Mar 21 00:12:44 MDT 2007
I have a question; it may be something very simple, but I can't see what is wrong:
qmgr -c 'p s'
# Create queues and set their attributes.
#
#
# Create and define queue ram16
#
create queue ram16
set queue ram16 queue_type = Execution
set queue ram16 resources_max.mem = 16gb
set queue ram16 resources_min.mem = 8gb
set queue ram16 resources_default.mem = 8gb
set queue ram16 enabled = True
set queue ram16 started = True
#
# Create and define queue ram8
#
create queue ram8
set queue ram8 queue_type = Execution
set queue ram8 resources_max.mem = 8gb
set queue ram8 resources_min.mem = 4gb
set queue ram8 resources_default.mem = 4gb
set queue ram8 enabled = True
set queue ram8 started = True
#
# Create and define queue ram4
#
create queue ram4
set queue ram4 queue_type = Execution
set queue ram4 resources_max.mem = 4gb
set queue ram4 resources_default.mem = 1gb
set queue ram4 enabled = True
set queue ram4 started = True
#
# Set server attributes.
#
set server scheduling = True
set server default_queue = ram4
set server log_events = 511
set server mail_from = adm
set server query_other_jobs = True
set server resources_default.nodes = 1
set server scheduler_iteration = 60
set server node_ping_rate = 300
set server node_check_rate = 600
set server tcp_timeout = 6
set server pbs_version = 2.1.8
qstat -s
Req'd Req'd Elap
Job ID Username Queue Jobname SessID NDS TSK Memory Time S Time
-------------------- -------- -------- ---------- ------ ----- --- ------ ----- - -----
50.coupled-cluster.l user ram4 CuPCuP++.2 13037 1 -- 4gb -- R 05:16
Job started on Tue Mar 20 at 16:51
57.coupled-cluster.l user ram8 CuPCuP.1d. 16589 1 -- 8gb -- R 04:17
Job started on Tue Mar 20 at 17:50
63.coupled-cluster.l user ram16 CuPCuP.2d. 22138 1 -- 16gb -- R 04:10
Job started on Tue Mar 20 at 17:57
79.coupled-cluster.l user ram16 test -- 1 -- 16gb -- Q --
Not Running: Not enough memory available
98.coupled-cluster.l scoggins ram4 mpi-hello. -- 3 -- 2gb -- Q --
Not Running: Not enough memory available
101.coupled-cluster. scoggins ram8 mpi-hello. -- 3 -- 6gb -- Q --
Not Running: Not enough memory available
pbsnodes -a
node0000
state = free
np = 2
properties = mem16gb
ntype = cluster
status = opsys=linux,uname=Linux node0000 2.6.17.11-102.caos.smp #1 SMP Thu Aug 24 23:30:43 EDT 2006 x86_64,sessions=? 0,nsessions=? 0,nusers=0,idletime=53301,totmem=16355264kb,availmem=16307688kb,physmem=16355264kb,ncpus=2,loadave=0.00,netload=190750343,state=free,jobs=? 0,rectime=1174457374
node0001
state = free
np = 2
properties = mem16gb
ntype = cluster
status = opsys=linux,uname=Linux node0001 2.6.17.11-102.caos.smp #1 SMP Thu Aug 24 23:30:43 EDT 2006 x86_64,sessions=? 0,nsessions=? 0,nusers=0,idletime=53264,totmem=18134864kb,availmem=18087688kb,physmem=16174976kb,ncpus=2,loadave=0.00,netload=15128314,state=free,jobs=? 0,rectime=1174457361
node0002
state = free
np = 2
properties = mem8gb
ntype = cluster
jobs = 0/57.coupled-cluster.lbl.gov
status = opsys=linux,uname=Linux node0002 2.6.17.11-102.caos.smp #1 SMP Thu Aug 24 23:30:43 EDT 2006 x86_64,sessions=16589,nsessions=1,nusers=1,idletime=53315,totmem=10133908kb,availmem=9857084kb,physmem=8093664kb,ncpus=2,loadave=1.00,netload=104881749,state=free,jobs=57.coupled-cluster.lbl.gov,rectime=1174457362
node0003
state = free
np = 2
properties = mem8gb
ntype = cluster
status = opsys=linux,uname=Linux node0003 2.6.17.11-102.caos.smp #1 SMP Thu Aug 24 23:30:43 EDT 2006 x86_64,sessions=? 0,nsessions=? 0,nusers=0,idletime=53264,totmem=10133904kb,availmem=10089216kb,physmem=8093660kb,ncpus=2,loadave=0.00,netload=14935077,state=free,jobs=? 0,rectime=1174457360
node0004
state = free
np = 2
properties = mem8gb
ntype = cluster
status = opsys=linux,uname=Linux node0004 2.6.17.11-102.caos.smp #1 SMP Thu Aug 24 23:30:43 EDT 2006 x86_64,sessions=? 0,nsessi
...
And there are many more free nodes.
No jobs are running on the free nodes.
cat /var/spool/torque/sched_priv/sched_config
round_robin: False all
by_queue: True prime
by_queue: True non_prime
strict_fifo: false ALL
fair_share: false ALL
help_starving_jobs false ALL
sort_queues false ALL
load_balancing: true ALL
sort_by: shortest_job_first ALL
log_filter: 256
dedicated_prefix: ded
max_starve: 24:00:00
half_life: 24:00:00
unknown_shares: 10
sync_time: 1:00:00
Why are no other jobs running on the free nodes?
Thanks
Jackie
-------------- next part --------------
An HTML attachment was scrubbed...
URL: http://www.supercluster.org/pipermail/torqueusers/attachments/20070320/01537087/attachment.html
More information about the torqueusers mailing list