[torqueusers] torque/maui disregarding pmem with procs

Lance Westerhoff lance at quantumbioinc.com
Sun Oct 23 14:50:01 MDT 2011


Hello all-

We are trying to use procs and pmem on an 18-node cluster whose nodes have varying amounts of memory. pbsnodes shows the correct memory complement for each node, so apparently PBS is getting the right specs (see the output of pbsnodes below for more information). If we use the following settings in the PBS script, PBS invariably tries to fill all 8 of the 8 cores on each node, even though none of these nodes has anywhere near enough memory for 8*3700mb=29600mb. Considering that the physical memory ranges from 8GB to 24GB depending upon the node, this is taking down nodes left and right.

Below I have provided an example along with the associated output. I also provided the output for pbsnodes in case there is something I am missing here. Thanks for your help!  -Lance

torque version: tried 2.5.4, 2.5.8, and 3.0.2
maui version: 3.2.6p21 (also tried maui 3.3.1, but it fails completely with the procs option: it only asks for a single CPU)

$ cat tmp.pbs
#!/bin/bash
#PBS -S /bin/bash
#PBS -l procs=24
#PBS -l pmem=3700mb
#PBS -l walltime=6:00:00 
#PBS -j oe

cat $PBS_NODEFILE

$ qsub tmp.pbs
337003.XXXX
$ wc -l tmp.pbs.o337003
24 tmp.pbs.o337003
$ cat tmp.pbs.o337003
compute-0-14
compute-0-14
compute-0-14
compute-0-14
compute-0-14
compute-0-14
compute-0-14
compute-0-14
compute-0-15
compute-0-15
compute-0-15
compute-0-15
compute-0-15
compute-0-15
compute-0-15
compute-0-15
compute-0-16
compute-0-16
compute-0-16
compute-0-16
compute-0-16
compute-0-16
compute-0-16
compute-0-16

$ pbsnodes -a
compute-0-16
    state = free
    np = 8
    ntype = cluster
    status = rectime=1319219085,varattr=,jobs=,state=free,netload=1834011936,gres=,loadave=0.00,ncpus=8,physmem=8177300kb,availmem=10095652kb,totmem=10225576kb,idletime=5582,nusers=0,nsessions=? 0,sessions=? 0,uname=Linux compute-0-16.local 2.6.18-274.7.1.el5 #1 SMP Thu Oct 20 16:21:01 EDT 2011 x86_64,opsys=linux
    gpus = 0

compute-0-15
    state = free
    np = 8
    ntype = cluster
    status = rectime=1319219090,varattr=,jobs=,state=free,netload=700017694,gres=,loadave=0.00,ncpus=8,physmem=8177300kb,availmem=10150996kb,totmem=10225576kb,idletime=5606,nusers=0,nsessions=? 0,sessions=? 0,uname=Linux compute-0-15.local 2.6.18-274.7.1.el5 #1 SMP Thu Oct 20 16:21:01 EDT 2011 x86_64,opsys=linux
    gpus = 0

compute-0-14
    state = free
    np = 8
    ntype = cluster
    status = rectime=1319219090,varattr=,jobs=,state=free,netload=1003164957,gres=,loadave=0.00,ncpus=8,physmem=8177300kb,availmem=10131180kb,totmem=10225576kb,idletime=5615,nusers=0,nsessions=? 0,sessions=? 0,uname=Linux compute-0-14.local 2.6.18-274.7.1.el5 #1 SMP Thu Oct 20 16:21:01 EDT 2011 x86_64,opsys=linux
    gpus = 0

compute-0-13
    state = free
    np = 8
    ntype = cluster
    status = rectime=1319219090,varattr=,jobs=,state=free,netload=1173266470,gres=,loadave=0.00,ncpus=8,physmem=8177300kb,availmem=10132104kb,totmem=10225576kb,idletime=5637,nusers=0,nsessions=? 0,sessions=? 0,uname=Linux compute-0-13.local 2.6.18-274.7.1.el5 #1 SMP Thu Oct 20 16:21:01 EDT 2011 x86_64,opsys=linux
    gpus = 0

compute-0-12
    state = free
    np = 8
    ntype = cluster
    status = rectime=1319219090,varattr=,jobs=,state=free,netload=3991477,gres=,loadave=0.00,ncpus=8,physmem=12301956kb,availmem=14276448kb,totmem=14350232kb,idletime=5604,nusers=0,nsessions=? 0,sessions=? 0,uname=Linux compute-0-12.local 2.6.18-274.7.1.el5 #1 SMP Thu Oct 20 16:21:01 EDT 2011 x86_64,opsys=linux
    gpus = 0

compute-0-11
    state = free
    np = 8
    ntype = cluster
    status = rectime=1319219090,varattr=,jobs=,state=free,netload=2947879,gres=,loadave=0.00,ncpus=8,physmem=12301956kb,availmem=14274604kb,totmem=14350232kb,idletime=5588,nusers=0,nsessions=? 0,sessions=? 0,uname=Linux compute-0-11.local 2.6.18-274.7.1.el5 #1 SMP Thu Oct 20 16:21:01 EDT 2011 x86_64,opsys=linux
    gpus = 0

compute-0-9
    state = free
    np = 8
    ntype = cluster
    status = rectime=1319219090,varattr=,jobs=,state=free,netload=3721396,gres=,loadave=0.05,ncpus=8,physmem=12301956kb,availmem=14253816kb,totmem=14350232kb,idletime=5660,nusers=0,nsessions=? 0,sessions=? 0,uname=Linux compute-0-9.local 2.6.18-274.7.1.el5 #1 SMP Thu Oct 20 16:21:01 EDT 2011 x86_64,opsys=linux
    gpus = 0

compute-0-8
    state = free
    np = 8
    ntype = cluster
    status = rectime=1319219090,varattr=,jobs=,state=free,netload=2934478,gres=,loadave=0.00,ncpus=8,physmem=12301956kb,availmem=14254796kb,totmem=14350232kb,idletime=5675,nusers=0,nsessions=? 0,sessions=? 0,uname=Linux compute-0-8.local 2.6.18-274.7.1.el5 #1 SMP Thu Oct 20 16:21:01 EDT 2011 x86_64,opsys=linux
    gpus = 0

compute-0-7
    state = free
    np = 8
    ntype = cluster
    status = rectime=1319219090,varattr=,jobs=,state=free,netload=2909406,gres=,loadave=0.00,ncpus=8,physmem=12301956kb,availmem=14254812kb,totmem=14350232kb,idletime=5489,nusers=0,nsessions=? 0,sessions=? 0,uname=Linux compute-0-7.local 2.6.18-274.7.1.el5 #1 SMP Thu Oct 20 16:21:01 EDT 2011 x86_64,opsys=linux
    gpus = 0

compute-0-6
    state = free
    np = 8
    ntype = cluster
    status = rectime=1319219090,varattr=,jobs=,state=free,netload=2936791,gres=,loadave=0.00,ncpus=8,physmem=12301956kb,availmem=14275644kb,totmem=14350232kb,idletime=5748,nusers=0,nsessions=? 0,sessions=? 0,uname=Linux compute-0-6.local 2.6.18-274.7.1.el5 #1 SMP Thu Oct 20 16:21:01 EDT 2011 x86_64,opsys=linux
    gpus = 0

compute-0-5
    state = free
    np = 8
    ntype = cluster
    status = rectime=1319219090,varattr=,jobs=,state=free,netload=2966183,gres=,loadave=0.00,ncpus=8,physmem=12301956kb,availmem=14276260kb,totmem=14350232kb,idletime=5695,nusers=0,nsessions=? 0,sessions=? 0,uname=Linux compute-0-5.local 2.6.18-274.7.1.el5 #1 SMP Thu Oct 20 16:21:01 EDT 2011 x86_64,opsys=linux
    gpus = 0

compute-0-4
    state = free
    np = 8
    ntype = cluster
    status = rectime=1319219090,varattr=,jobs=,state=free,netload=2886627,gres=,loadave=0.00,ncpus=8,physmem=16438900kb,availmem=18412332kb,totmem=18487176kb,idletime=5634,nusers=0,nsessions=? 0,sessions=? 0,uname=Linux compute-0-4.local 2.6.18-274.7.1.el5 #1 SMP Thu Oct 20 16:21:01 EDT 2011 x86_64,opsys=linux
    gpus = 0

compute-0-3
    state = free
    np = 8
    properties = lustre
    ntype = cluster
    status = rectime=1319219108,varattr=,jobs=,state=free,netload=436527254,gres=,loadave=0.00,ncpus=8,physmem=24688212kb,availmem=26636656kb,totmem=26736488kb,idletime=2224,nusers=0,nsessions=? 0,sessions=? 0,uname=Linux compute-0-3.local 2.6.18-274.7.1.el5 #1 SMP Thu Oct 20 16:21:01 EDT 2011 x86_64,opsys=linux
    gpus = 0

compute-0-2
    state = free
    np = 8
    properties = lustre
    ntype = cluster
    status = rectime=1319219106,varattr=,jobs=,state=free,netload=1184385,gres=,loadave=0.00,ncpus=8,physmem=24688212kb,availmem=26659668kb,totmem=26736488kb,idletime=2223,nusers=0,nsessions=? 0,sessions=? 0,uname=Linux compute-0-2.local 2.6.18-274.7.1.el5 #1 SMP Thu Oct 20 16:21:01 EDT 2011 x86_64,opsys=linux
    gpus = 0

compute-0-1
    state = free
    np = 8
    properties = lustre
    ntype = cluster
    status = rectime=1319219102,varattr=,jobs=,state=free,netload=1258074,gres=,loadave=0.00,ncpus=8,physmem=24688212kb,availmem=26657304kb,totmem=26736488kb,idletime=2228,nusers=0,nsessions=? 0,sessions=? 0,uname=Linux compute-0-1.local 2.6.18-274.7.1.el5 #1 SMP Thu Oct 20 16:21:01 EDT 2011 x86_64,opsys=linux
    gpus = 0

compute-0-0
    state = free
    np = 8
    ntype = cluster
    status = rectime=1319219090,varattr=,jobs=,state=free,netload=3416356,gres=,loadave=0.00,ncpus=8,physmem=24688212kb,availmem=26635624kb,totmem=26736488kb,idletime=5603,nusers=0,nsessions=? 0,sessions=? 0,uname=Linux compute-0-0.local 2.6.18-274.7.1.el5 #1 SMP Thu Oct 20 16:21:01 EDT 2011 x86_64,opsys=linux
    gpus = 0

compute-0-10
    state = free
    np = 2
    ntype = cluster
    status = rectime=1319219090,varattr=,jobs=,state=free,netload=283846193,gres=,loadave=0.23,ncpus=8,physmem=12301956kb,availmem=13762696kb,totmem=14350232kb,idletime=5622,nusers=1,nsessions=1,sessions=3410,uname=Linux compute-0-10.local 2.6.18-274.7.1.el5 #1 SMP Thu Oct 20 16:21:01 EDT 2011 x86_64,opsys=linux
    gpus = 0

compute-0-17
    state = free
    np = 8
    properties = testbox
    ntype = cluster
    status = rectime=1319219090,varattr=,jobs=,state=free,netload=2948331,gres=,loadave=0.00,ncpus=8,physmem=8177300kb,availmem=10144432kb,totmem=10225576kb,idletime=5558,nusers=0,nsessions=? 0,sessions=? 0,uname=Linux compute-0-17.local 2.6.18-274.7.1.el5 #1 SMP Thu Oct 20 16:21:01 EDT 2011 x86_64,opsys=linux
    gpus = 0


(Please note: sorry if you received this email twice; for some reason, the first time I sent it, the email did not appear in the archive.)


More information about the torqueusers mailing list