[torqueusers] free nodes but jobs are not running

Jackie Scoggins JScoggins at lbl.gov
Wed Mar 21 00:12:44 MDT 2007


I have a question, and the answer could be something very simple, but I don't see it. My configuration and the current job/node status are below:

qmgr -c 'p s'
# Create queues and set their attributes.
#
#
# Create and define queue ram16
#
create queue ram16
set queue ram16 queue_type = Execution
set queue ram16 resources_max.mem = 16gb
set queue ram16 resources_min.mem = 8gb
set queue ram16 resources_default.mem = 8gb
set queue ram16 enabled = True
set queue ram16 started = True
#
# Create and define queue ram8
#
create queue ram8
set queue ram8 queue_type = Execution
set queue ram8 resources_max.mem = 8gb
set queue ram8 resources_min.mem = 4gb
set queue ram8 resources_default.mem = 4gb
set queue ram8 enabled = True
set queue ram8 started = True
#
# Create and define queue ram4
#
create queue ram4
set queue ram4 queue_type = Execution
set queue ram4 resources_max.mem = 4gb
set queue ram4 resources_default.mem = 1gb
set queue ram4 enabled = True
set queue ram4 started = True
#
# Set server attributes.
#
set server scheduling = True
set server default_queue = ram4
set server log_events = 511
set server mail_from = adm
set server query_other_jobs = True
set server resources_default.nodes = 1
set server scheduler_iteration = 60
set server node_ping_rate = 300
set server node_check_rate = 600
set server tcp_timeout = 6
set server pbs_version = 2.1.8


qstat -s
                                                                   Req'd  Req'd   Elap
Job ID               Username Queue    Jobname    SessID NDS   TSK Memory Time  S Time
-------------------- -------- -------- ---------- ------ ----- --- ------ ----- - -----
50.coupled-cluster.l user   ram4     CuPCuP++.2  13037     1  --    4gb   --  R 05:16
   Job started on Tue Mar 20 at 16:51
57.coupled-cluster.l user   ram8     CuPCuP.1d.  16589     1  --    8gb   --  R 04:17
   Job started on Tue Mar 20 at 17:50
63.coupled-cluster.l user   ram16    CuPCuP.2d.  22138     1  --   16gb   --  R 04:10
   Job started on Tue Mar 20 at 17:57
79.coupled-cluster.l user ram16    test          --      1  --   16gb   --  Q   -- 
   Not Running: Not enough memory available
98.coupled-cluster.l scoggins ram4     mpi-hello.    --      3  --    2gb   --  Q   -- 
   Not Running: Not enough memory available
101.coupled-cluster. scoggins ram8     mpi-hello.    --      3  --    6gb   --  Q   -- 
   Not Running: Not enough memory available


pbsnodes -a

node0000
     state = free
     np = 2
     properties = mem16gb
     ntype = cluster
     status = opsys=linux,uname=Linux node0000 2.6.17.11-102.caos.smp #1 SMP Thu Aug 24 23:30:43 EDT 2006 x86_64,sessions=? 0,nsessions=? 0,nusers=
0,idletime=53301,totmem=16355264kb,availmem=16307688kb,physmem=16355264kb,ncpus=2,loadave=0.00,netload=190750343,state=free,jobs=? 0,rectime=117445
7374
 
node0001
     state = free
     np = 2
     properties = mem16gb
     ntype = cluster
     status = opsys=linux,uname=Linux node0001 2.6.17.11-102.caos.smp #1 SMP Thu Aug 24 23:30:43 EDT 2006 x86_64,sessions=? 0,nsessions=? 0,nusers=
0,idletime=53264,totmem=18134864kb,availmem=18087688kb,physmem=16174976kb,ncpus=2,loadave=0.00,netload=15128314,state=free,jobs=? 0,rectime=1174457
361
 
node0002
     state = free
     np = 2
     properties = mem8gb
     ntype = cluster
     jobs = 0/57.coupled-cluster.lbl.gov
     status = opsys=linux,uname=Linux node0002 2.6.17.11-102.caos.smp #1 SMP Thu Aug 24 23:30:43 EDT 2006 x86_64,sessions=16589,nsessions=1,nusers=
1,idletime=53315,totmem=10133908kb,availmem=9857084kb,physmem=8093664kb,ncpus=2,loadave=1.00,netload=104881749,state=free,jobs=57.coupled-cluster.l
bl.gov,rectime=1174457362
 
node0003
     state = free
     np = 2
     properties = mem8gb
     ntype = cluster
     status = opsys=linux,uname=Linux node0003 2.6.17.11-102.caos.smp #1 SMP Thu Aug 24 23:30:43 EDT 2006 x86_64,sessions=? 0,nsessions=? 0,nusers=
0,idletime=53264,totmem=10133904kb,availmem=10089216kb,physmem=8093660kb,ncpus=2,loadave=0.00,netload=14935077,state=free,jobs=? 0,rectime=11744573
60
 
node0004
     state = free
     np = 2
     properties = mem8gb
     ntype = cluster
     status = opsys=linux,uname=Linux node0004 2.6.17.11-102.caos.smp #1 SMP Thu Aug 24 23:30:43 EDT 2006 x86_64,sessions=? 0,nsessi
...

(Output truncated.) There are a lot more free nodes beyond those listed above.

No jobs are running on the free nodes.

cat /var/spool/torque/sched_priv/sched_config

round_robin: False      all
 
 
 
by_queue: True          prime
by_queue: True          non_prime
 
 
 
strict_fifo: false      ALL
 
fair_share: false       ALL
 
 
help_starving_jobs      false   ALL
 
sort_queues     false   ALL
 
load_balancing: true    ALL
 
 
 
sort_by: shortest_job_first     ALL
 
log_filter: 256
 
dedicated_prefix: ded
 
max_starve: 24:00:00
 
 
half_life: 24:00:00
 
unknown_shares: 10
 
sync_time: 1:00:00


Why are no other jobs running on the free nodes?

Thanks

Jackie
-------------- next part --------------
An HTML attachment was scrubbed...
URL: http://www.supercluster.org/pipermail/torqueusers/attachments/20070320/01537087/attachment.html


More information about the torqueusers mailing list