[torqueusers] Using TORQUE in a supercomputer with lots of CPU's - one node - gets job-exclusive

Silas Silva silasdb at gmail.com
Tue Feb 11 10:49:01 MST 2014


On Thu, Feb 06, 2014 at 02:57:35PM -0200, Silas Silva wrote:
> Yes.  This is SGI Altix.
> 
> I see, so I have to compile it with --enable-numa-support?  The TORQUE
> admin guide is explanatory about that...
> 
> Thank you!

Hi there!

After installing TORQUE with NUMA support, the CPUs are recognized as
independent and grouped by NUMA node.  The mom.layout file was generated
by the mom_gencfg script in the contrib/ directory.  After configuring,
the NUMA nodes show up in pbsnodes like a charm.

But there is a problem: some of the nodes are free (I could even
allocate a job on bachianas-8), but the others are down.  Could anybody
help me with this?

Just below is the output of pbsnodes.

Thank you very much.

bachianas-0
     state = free
     np = 4
     ntype = cluster
     status = rectime=1392139586,varattr=,jobs=,state=free,netload=? 0,gres=,loadave=0.00,ncpus=16,physmem=32280928kb,availmem=22578384kb,totmem=32280928kb,idletime=678,nusers=0,nsessions=0,uname=Linux bachianas 2.6.16.60-0.42.10-default #1 SMP Tue Apr 27 05:11:27 UTC 2010 ia64,opsys=linux
     mom_service_port = 15002
     mom_manager_port = 15003

bachianas-1
     state = free
     np = 4
     ntype = cluster
     status = rectime=1392139586,varattr=,jobs=,state=free,netload=? 0,gres=,loadave=0.00,ncpus=16,physmem=32309232kb,availmem=22618976kb,totmem=32309232kb,idletime=678,nusers=0,nsessions=0,uname=Linux bachianas 2.6.16.60-0.42.10-default #1 SMP Tue Apr 27 05:11:27 UTC 2010 ia64,opsys=linux
     mom_service_port = 15002
     mom_manager_port = 15003

bachianas-2
     state = free
     np = 4
     ntype = cluster
     status = rectime=1392139586,varattr=,jobs=,state=free,netload=? 0,gres=,loadave=0.00,ncpus=16,physmem=32309248kb,availmem=22600080kb,totmem=32309248kb,idletime=678,nusers=0,nsessions=0,uname=Linux bachianas 2.6.16.60-0.42.10-default #1 SMP Tue Apr 27 05:11:27 UTC 2010 ia64,opsys=linux
     mom_service_port = 15002
     mom_manager_port = 15003

bachianas-3
     state = free
     np = 4
     ntype = cluster
     status = rectime=1392139586,varattr=,jobs=,state=free,netload=? 0,gres=,loadave=0.00,ncpus=16,physmem=32309232kb,availmem=22485920kb,totmem=32309232kb,idletime=678,nusers=0,nsessions=0,uname=Linux bachianas 2.6.16.60-0.42.10-default #1 SMP Tue Apr 27 05:11:27 UTC 2010 ia64,opsys=linux
     mom_service_port = 15002
     mom_manager_port = 15003

bachianas-4
     state = free
     np = 4
     ntype = cluster
     status = rectime=1392139586,varattr=,jobs=,state=free,netload=? 0,gres=,loadave=0.00,ncpus=16,physmem=32309248kb,availmem=22570320kb,totmem=32309248kb,idletime=678,nusers=0,nsessions=0,uname=Linux bachianas 2.6.16.60-0.42.10-default #1 SMP Tue Apr 27 05:11:27 UTC 2010 ia64,opsys=linux
     mom_service_port = 15002
     mom_manager_port = 15003

bachianas-5
     state = free
     np = 4
     ntype = cluster
     status = rectime=1392139586,varattr=,jobs=,state=free,netload=? 0,gres=,loadave=0.00,ncpus=16,physmem=32309232kb,availmem=22549840kb,totmem=32309232kb,idletime=678,nusers=0,nsessions=0,uname=Linux bachianas 2.6.16.60-0.42.10-default #1 SMP Tue Apr 27 05:11:27 UTC 2010 ia64,opsys=linux
     mom_service_port = 15002
     mom_manager_port = 15003

bachianas-6
     state = free
     np = 4
     ntype = cluster
     status = rectime=1392139586,varattr=,jobs=,state=free,netload=? 0,gres=,loadave=0.00,ncpus=16,physmem=32309248kb,availmem=22286656kb,totmem=32309248kb,idletime=678,nusers=0,nsessions=0,uname=Linux bachianas 2.6.16.60-0.42.10-default #1 SMP Tue Apr 27 05:11:27 UTC 2010 ia64,opsys=linux
     mom_service_port = 15002
     mom_manager_port = 15003

bachianas-7
     state = free
     np = 4
     ntype = cluster
     status = rectime=1392139586,varattr=,jobs=,state=free,netload=? 0,gres=,loadave=0.00,ncpus=16,physmem=28246000kb,availmem=16861728kb,totmem=28246000kb,idletime=678,nusers=0,nsessions=0,uname=Linux bachianas 2.6.16.60-0.42.10-default #1 SMP Tue Apr 27 05:11:27 UTC 2010 ia64,opsys=linux
     mom_service_port = 15002
     mom_manager_port = 15003

bachianas-8
     state = job-exclusive
     np = 4
     ntype = cluster
     jobs = 0/48.bachianas.ufabc.edu.br
     status = rectime=1392139586,varattr=,jobs=48.bachianas.ufabc.edu.br,state=free,netload=? 0,gres=,loadave=0.00,ncpus=8,physmem=16131568kb,availmem=11036912kb,totmem=16131568kb,idletime=678,nusers=1,nsessions=1,sessions=117991,uname=Linux bachianas 2.6.16.60-0.42.10-default #1 SMP Tue Apr 27 05:11:27 UTC 2010 ia64,opsys=linux
     mom_service_port = 15002
     mom_manager_port = 15003

bachianas-9
     state = down
     np = 4
     ntype = cluster
     mom_service_port = 15002
     mom_manager_port = 15003

bachianas-10
     state = down
     np = 4
     ntype = cluster
     mom_service_port = 15002
     mom_manager_port = 15003

bachianas-11
     state = down
     np = 4
     ntype = cluster
     mom_service_port = 15002
     mom_manager_port = 15003

bachianas-12
     state = down
     np = 4
     ntype = cluster
     mom_service_port = 15002
     mom_manager_port = 15003

bachianas-13
     state = down
     np = 4
     ntype = cluster
     mom_service_port = 15002
     mom_manager_port = 15003

bachianas-14
     state = down
     np = 4
     ntype = cluster
     mom_service_port = 15002
     mom_manager_port = 15003

bachianas-15
     state = down
     np = 4
     ntype = cluster
     mom_service_port = 15002
     mom_manager_port = 15003

bachianas-16
     state = down
     np = 4
     ntype = cluster
     mom_service_port = 15002
     mom_manager_port = 15003

bachianas-17
     state = down
     np = 4
     ntype = cluster
     mom_service_port = 15002
     mom_manager_port = 15003

bachianas-18
     state = down
     np = 4
     ntype = cluster
     mom_service_port = 15002
     mom_manager_port = 15003

bachianas-19
     state = down
     np = 4
     ntype = cluster
     mom_service_port = 15002
     mom_manager_port = 15003

bachianas-20
     state = down
     np = 4
     ntype = cluster
     mom_service_port = 15002
     mom_manager_port = 15003

bachianas-21
     state = down
     np = 4
     ntype = cluster
     mom_service_port = 15002
     mom_manager_port = 15003

bachianas-22
     state = down
     np = 4
     ntype = cluster
     mom_service_port = 15002
     mom_manager_port = 15003

bachianas-23
     state = down
     np = 4
     ntype = cluster
     mom_service_port = 15002
     mom_manager_port = 15003

bachianas-24
     state = down
     np = 4
     ntype = cluster
     mom_service_port = 15002
     mom_manager_port = 15003

bachianas-25
     state = down
     np = 4
     ntype = cluster
     mom_service_port = 15002
     mom_manager_port = 15003

bachianas-26
     state = down
     np = 4
     ntype = cluster
     mom_service_port = 15002
     mom_manager_port = 15003

bachianas-27
     state = down
     np = 4
     ntype = cluster
     mom_service_port = 15002
     mom_manager_port = 15003

bachianas-28
     state = down
     np = 4
     ntype = cluster
     mom_service_port = 15002
     mom_manager_port = 15003

bachianas-29
     state = down
     np = 4
     ntype = cluster
     mom_service_port = 15002
     mom_manager_port = 15003

bachianas-30
     state = down
     np = 4
     ntype = cluster
     mom_service_port = 15002
     mom_manager_port = 15003

bachianas-31
     state = down
     np = 4
     ntype = cluster
     mom_service_port = 15002
     mom_manager_port = 15003

bachianas-32
     state = down
     np = 4
     ntype = cluster
     mom_service_port = 15002
     mom_manager_port = 15003

bachianas-33
     state = down
     np = 4
     ntype = cluster
     mom_service_port = 15002
     mom_manager_port = 15003


-- 
Silas Silva


More information about the torqueusers mailing list