[torqueusers] torque/maui disregarding pmem with procs

Lance Westerhoff lance at quantumbioinc.com
Fri Oct 21 13:37:37 MDT 2011


Hello all-

We are trying to use procs and pmem on an 18-node cluster with nodes of various memory sizes. pbsnodes shows the correct memory complement for each node, so apparently PBS is getting the right specs (see the output of pbsnodes below for more information). If we use the following settings in the PBS script, PBS invariably tries to fill up all 8 of the 8 cores on each node, even though there is nowhere near enough memory on any of these nodes for 8*3700mb=29600mb. Considering that the physical memory limit ranges from 8GB to 24GB depending upon the node, this is just taking down nodes left and right.

Below I have provided an example along with the associated output. I also provided the output for pbsnodes in case there is something I am missing here. Thanks for your help!

torque version: tried 2.5.4, 2.5.8, and 3.0.2
maui version: 3.2.6p21 (also tried maui 3.3.1, but it fails completely with the procs option: it only asks for a single CPU)

$ cat tmp.pbs
#!/bin/bash
#PBS -S /bin/bash
#PBS -l procs=24
#PBS -l pmem=3700mb
#PBS -l walltime=6:00:00 
#PBS -j oe

cat $PBS_NODEFILE

$ qsub tmp.pbs
337003.XXXX
$ wc -l tmp.pbs.o337003
24 tmp.pbs.o337003
$ cat tmp.pbs.o337003
compute-0-14
compute-0-14
compute-0-14
compute-0-14
compute-0-14
compute-0-14
compute-0-14
compute-0-14
compute-0-15
compute-0-15
compute-0-15
compute-0-15
compute-0-15
compute-0-15
compute-0-15
compute-0-15
compute-0-16
compute-0-16
compute-0-16
compute-0-16
compute-0-16
compute-0-16
compute-0-16
compute-0-16

$ pbsnodes -a
compute-0-16
     state = free
     np = 8
     ntype = cluster
     status = rectime=1319219085,varattr=,jobs=,state=free,netload=1834011936,gres=,loadave=0.00,ncpus=8,physmem=8177300kb,availmem=10095652kb,totmem=10225576kb,idletime=5582,nusers=0,nsessions=? 0,sessions=? 0,uname=Linux compute-0-16.local 2.6.18-274.7.1.el5 #1 SMP Thu Oct 20 16:21:01 EDT 2011 x86_64,opsys=linux
     gpus = 0

compute-0-15
     state = free
     np = 8
     ntype = cluster
     status = rectime=1319219090,varattr=,jobs=,state=free,netload=700017694,gres=,loadave=0.00,ncpus=8,physmem=8177300kb,availmem=10150996kb,totmem=10225576kb,idletime=5606,nusers=0,nsessions=? 0,sessions=? 0,uname=Linux compute-0-15.local 2.6.18-274.7.1.el5 #1 SMP Thu Oct 20 16:21:01 EDT 2011 x86_64,opsys=linux
     gpus = 0

compute-0-14
     state = free
     np = 8
     ntype = cluster
     status = rectime=1319219090,varattr=,jobs=,state=free,netload=1003164957,gres=,loadave=0.00,ncpus=8,physmem=8177300kb,availmem=10131180kb,totmem=10225576kb,idletime=5615,nusers=0,nsessions=? 0,sessions=? 0,uname=Linux compute-0-14.local 2.6.18-274.7.1.el5 #1 SMP Thu Oct 20 16:21:01 EDT 2011 x86_64,opsys=linux
     gpus = 0

compute-0-13
     state = free
     np = 8
     ntype = cluster
     status = rectime=1319219090,varattr=,jobs=,state=free,netload=1173266470,gres=,loadave=0.00,ncpus=8,physmem=8177300kb,availmem=10132104kb,totmem=10225576kb,idletime=5637,nusers=0,nsessions=? 0,sessions=? 0,uname=Linux compute-0-13.local 2.6.18-274.7.1.el5 #1 SMP Thu Oct 20 16:21:01 EDT 2011 x86_64,opsys=linux
     gpus = 0

compute-0-12
     state = free
     np = 8
     ntype = cluster
     status = rectime=1319219090,varattr=,jobs=,state=free,netload=3991477,gres=,loadave=0.00,ncpus=8,physmem=12301956kb,availmem=14276448kb,totmem=14350232kb,idletime=5604,nusers=0,nsessions=? 0,sessions=? 0,uname=Linux compute-0-12.local 2.6.18-274.7.1.el5 #1 SMP Thu Oct 20 16:21:01 EDT 2011 x86_64,opsys=linux
     gpus = 0

compute-0-11
     state = free
     np = 8
     ntype = cluster
     status = rectime=1319219090,varattr=,jobs=,state=free,netload=2947879,gres=,loadave=0.00,ncpus=8,physmem=12301956kb,availmem=14274604kb,totmem=14350232kb,idletime=5588,nusers=0,nsessions=? 0,sessions=? 0,uname=Linux compute-0-11.local 2.6.18-274.7.1.el5 #1 SMP Thu Oct 20 16:21:01 EDT 2011 x86_64,opsys=linux
     gpus = 0

compute-0-9
     state = free
     np = 8
     ntype = cluster
     status = rectime=1319219090,varattr=,jobs=,state=free,netload=3721396,gres=,loadave=0.05,ncpus=8,physmem=12301956kb,availmem=14253816kb,totmem=14350232kb,idletime=5660,nusers=0,nsessions=? 0,sessions=? 0,uname=Linux compute-0-9.local 2.6.18-274.7.1.el5 #1 SMP Thu Oct 20 16:21:01 EDT 2011 x86_64,opsys=linux
     gpus = 0

compute-0-8
     state = free
     np = 8
     ntype = cluster
     status = rectime=1319219090,varattr=,jobs=,state=free,netload=2934478,gres=,loadave=0.00,ncpus=8,physmem=12301956kb,availmem=14254796kb,totmem=14350232kb,idletime=5675,nusers=0,nsessions=? 0,sessions=? 0,uname=Linux compute-0-8.local 2.6.18-274.7.1.el5 #1 SMP Thu Oct 20 16:21:01 EDT 2011 x86_64,opsys=linux
     gpus = 0

compute-0-7
     state = free
     np = 8
     ntype = cluster
     status = rectime=1319219090,varattr=,jobs=,state=free,netload=2909406,gres=,loadave=0.00,ncpus=8,physmem=12301956kb,availmem=14254812kb,totmem=14350232kb,idletime=5489,nusers=0,nsessions=? 0,sessions=? 0,uname=Linux compute-0-7.local 2.6.18-274.7.1.el5 #1 SMP Thu Oct 20 16:21:01 EDT 2011 x86_64,opsys=linux
     gpus = 0

compute-0-6
     state = free
     np = 8
     ntype = cluster
     status = rectime=1319219090,varattr=,jobs=,state=free,netload=2936791,gres=,loadave=0.00,ncpus=8,physmem=12301956kb,availmem=14275644kb,totmem=14350232kb,idletime=5748,nusers=0,nsessions=? 0,sessions=? 0,uname=Linux compute-0-6.local 2.6.18-274.7.1.el5 #1 SMP Thu Oct 20 16:21:01 EDT 2011 x86_64,opsys=linux
     gpus = 0

compute-0-5
     state = free
     np = 8
     ntype = cluster
     status = rectime=1319219090,varattr=,jobs=,state=free,netload=2966183,gres=,loadave=0.00,ncpus=8,physmem=12301956kb,availmem=14276260kb,totmem=14350232kb,idletime=5695,nusers=0,nsessions=? 0,sessions=? 0,uname=Linux compute-0-5.local 2.6.18-274.7.1.el5 #1 SMP Thu Oct 20 16:21:01 EDT 2011 x86_64,opsys=linux
     gpus = 0

compute-0-4
     state = free
     np = 8
     ntype = cluster
     status = rectime=1319219090,varattr=,jobs=,state=free,netload=2886627,gres=,loadave=0.00,ncpus=8,physmem=16438900kb,availmem=18412332kb,totmem=18487176kb,idletime=5634,nusers=0,nsessions=? 0,sessions=? 0,uname=Linux compute-0-4.local 2.6.18-274.7.1.el5 #1 SMP Thu Oct 20 16:21:01 EDT 2011 x86_64,opsys=linux
     gpus = 0

compute-0-3
     state = free
     np = 8
     properties = lustre
     ntype = cluster
     status = rectime=1319219108,varattr=,jobs=,state=free,netload=436527254,gres=,loadave=0.00,ncpus=8,physmem=24688212kb,availmem=26636656kb,totmem=26736488kb,idletime=2224,nusers=0,nsessions=? 0,sessions=? 0,uname=Linux compute-0-3.local 2.6.18-274.7.1.el5 #1 SMP Thu Oct 20 16:21:01 EDT 2011 x86_64,opsys=linux
     gpus = 0

compute-0-2
     state = free
     np = 8
     properties = lustre
     ntype = cluster
     status = rectime=1319219106,varattr=,jobs=,state=free,netload=1184385,gres=,loadave=0.00,ncpus=8,physmem=24688212kb,availmem=26659668kb,totmem=26736488kb,idletime=2223,nusers=0,nsessions=? 0,sessions=? 0,uname=Linux compute-0-2.local 2.6.18-274.7.1.el5 #1 SMP Thu Oct 20 16:21:01 EDT 2011 x86_64,opsys=linux
     gpus = 0

compute-0-1
     state = free
     np = 8
     properties = lustre
     ntype = cluster
     status = rectime=1319219102,varattr=,jobs=,state=free,netload=1258074,gres=,loadave=0.00,ncpus=8,physmem=24688212kb,availmem=26657304kb,totmem=26736488kb,idletime=2228,nusers=0,nsessions=? 0,sessions=? 0,uname=Linux compute-0-1.local 2.6.18-274.7.1.el5 #1 SMP Thu Oct 20 16:21:01 EDT 2011 x86_64,opsys=linux
     gpus = 0

compute-0-0
     state = free
     np = 8
     ntype = cluster
     status = rectime=1319219090,varattr=,jobs=,state=free,netload=3416356,gres=,loadave=0.00,ncpus=8,physmem=24688212kb,availmem=26635624kb,totmem=26736488kb,idletime=5603,nusers=0,nsessions=? 0,sessions=? 0,uname=Linux compute-0-0.local 2.6.18-274.7.1.el5 #1 SMP Thu Oct 20 16:21:01 EDT 2011 x86_64,opsys=linux
     gpus = 0

compute-0-10
     state = free
     np = 2
     ntype = cluster
     status = rectime=1319219090,varattr=,jobs=,state=free,netload=283846193,gres=,loadave=0.23,ncpus=8,physmem=12301956kb,availmem=13762696kb,totmem=14350232kb,idletime=5622,nusers=1,nsessions=1,sessions=3410,uname=Linux compute-0-10.local 2.6.18-274.7.1.el5 #1 SMP Thu Oct 20 16:21:01 EDT 2011 x86_64,opsys=linux
     gpus = 0

compute-0-17
     state = free
     np = 8
     properties = testbox
     ntype = cluster
     status = rectime=1319219090,varattr=,jobs=,state=free,netload=2948331,gres=,loadave=0.00,ncpus=8,physmem=8177300kb,availmem=10144432kb,totmem=10225576kb,idletime=5558,nusers=0,nsessions=? 0,sessions=? 0,uname=Linux compute-0-17.local 2.6.18-274.7.1.el5 #1 SMP Thu Oct 20 16:21:01 EDT 2011 x86_64,opsys=linux
     gpus = 0





More information about the torqueusers mailing list