[Mauiusers] Can't get busy nodes

Fernando Caba fcaba at uns.edu.ar
Thu Sep 29 06:27:15 MDT 2011


Hi Gus, here are the results of all the commands you mentioned:

[root at fe ~]# qmgr -c 'p s'
#
# Create queues and set their attributes.
#
#
# Create and define queue batch
#
create queue batch
set queue batch queue_type = Execution
set queue batch resources_default.nodes = 1
set queue batch resources_default.walltime = 2400:00:00
set queue batch enabled = True
set queue batch started = True
#
# Set server attributes.
#
set server scheduling = True
set server acl_hosts = fe
set server managers = root at fe
set server operators = root at fe
set server default_queue = batch
set server log_events = 511
set server mail_from = adm
set server scheduler_iteration = 600
set server node_check_rate = 150
set server tcp_timeout = 6
set server mom_job_sync = True
set server keep_completed = 300
set server auto_node_np = True
set server next_job_number = 182
set server record_job_info = True
[root at fe ~]#


${TORQUE}/bin/pbsnodes

[root at fe ~]# pbsnodes
n10
      state = free
      np = 12
      ntype = cluster
      jobs = 0/121.fe
      status = 
rectime=1317298640,varattr=,jobs=121.fe,state=free,netload=261129374581,gres=,loadave=4.00,ncpus=12,physmem=16360208kb,availmem=62484756kb,totmem=83471736kb,idletime=63369,nusers=2,nsessions=2,sessions=4394 
8087,uname=Linux n10 2.6.18-194.el5 #1 SMP Fri Apr 2 14:58:14 EDT 2010 
x86_64,opsys=linux
      mom_service_port = 15002
      mom_manager_port = 15003
      gpus = 0

n11
      state = free
      np = 12
      ntype = cluster
      jobs = 0/143.fe
      status = 
rectime=1317298637,varattr=,jobs=143.fe,state=free,netload=12864227236,gres=,loadave=8.00,ncpus=12,physmem=16360208kb,availmem=78708424kb,totmem=83469060kb,idletime=1354314,nusers=2,nsessions=2,sessions=4583 
20253,uname=Linux n11 2.6.18-194.el5 #1 SMP Fri Apr 2 14:58:14 EDT 2010 
x86_64,opsys=linux
      mom_service_port = 15002
      mom_manager_port = 15003
      gpus = 0

n12
      state = free
      np = 12
      ntype = cluster
      jobs = 0/144.fe
      status = 
rectime=1317298647,varattr=,jobs=144.fe,state=free,netload=953102292987,gres=,loadave=8.01,ncpus=12,physmem=16360208kb,availmem=78740696kb,totmem=83469060kb,idletime=1168354,nusers=2,nsessions=2,sessions=4635 
20289,uname=Linux n12 2.6.18-194.el5 #1 SMP Fri Apr 2 14:58:14 EDT 2010 
x86_64,opsys=linux
      mom_service_port = 15002
      mom_manager_port = 15003
      gpus = 0

n13
      state = free
      np = 12
      ntype = cluster
      jobs = 0/181.fe
      status = 
rectime=1317298672,varattr=,jobs=181.fe,state=free,netload=1010169147229,gres=,loadave=4.00,ncpus=12,physmem=15955108kb,availmem=81150100kb,totmem=83066636kb,idletime=138726,nusers=2,nsessions=2,sessions=4407 
29186,uname=Linux n13 2.6.18-194.el5xen #1 SMP Fri Apr 2 15:34:40 EDT 
2010 x86_64,opsys=linux
      mom_service_port = 15002
      mom_manager_port = 15003
      gpus = 0

[root at fe ~]#

${MAUI}/bin/showconfig

[root at fe ~]# which showconfig
/usr/local/maui/bin/showconfig
[root at fe ~]# showconfig
# Maui version 3.3.1 (PID: 18407)
# global policies

REJECTNEGPRIOJOBS[0]              FALSE
ENABLENEGJOBPRIORITY[0]           FALSE
ENABLEMULTINODEJOBS[0]            TRUE
ENABLEMULTIREQJOBS[0]             FALSE
BFPRIORITYPOLICY[0]               [NONE]
JOBPRIOACCRUALPOLICY            QUEUEPOLICY
NODELOADPOLICY                  ADJUSTSTATE
USEMACHINESPEEDFORFS            FALSE
USEMACHINESPEED                 FALSE
USESYSTEMQUEUETIME              TRUE
USELOCALMACHINEPRIORITY         FALSE
NODEUNTRACKEDLOADFACTOR         1.2
JOBNODEMATCHPOLICY[0]             EXACTNODE

JOBMAXSTARTTIME[0]                  INFINITY

METAMAXTASKS[0]                   0
NODESETPOLICY[0]                  [NONE]
NODESETATTRIBUTE[0]               [NONE]
NODESETLIST[0]
NODESETDELAY[0]                   00:00:00
NODESETPRIORITYTYPE[0]            MINLOSS
NODESETTOLERANCE[0]                 0.00

BACKFILLPOLICY[0]                 FIRSTFIT
BACKFILLDEPTH[0]                  0
BACKFILLPROCFACTOR[0]             0
BACKFILLMAXSCHEDULES[0]           10000
BACKFILLMETRIC[0]                 PROCS

BFCHUNKDURATION[0]                00:00:00
BFCHUNKSIZE[0]                    0
PREEMPTPOLICY[0]                  REQUEUE
MINADMINSTIME[0]                  00:00:00
RESOURCELIMITPOLICY[0]
NODEAVAILABILITYPOLICY[0]         COMBINED:[DEFAULT]
NODEALLOCATIONPOLICY[0]           MINRESOURCE
TASKDISTRIBUTIONPOLICY[0]         DEFAULT
RESERVATIONPOLICY[0]              CURRENTHIGHEST
RESERVATIONRETRYTIME[0]           00:00:00
RESERVATIONTHRESHOLDTYPE[0]       NONE
RESERVATIONTHRESHOLDVALUE[0]      0

FSPOLICY                        [NONE]
FSPOLICY                        [NONE]
FSINTERVAL                      12:00:00
FSDEPTH                         8
FSDECAY                         1.00



# Priority Weights

SERVICEWEIGHT[0]                  1
TARGETWEIGHT[0]                   1
CREDWEIGHT[0]                     1
ATTRWEIGHT[0]                     1
FSWEIGHT[0]                       1
RESWEIGHT[0]                      1
USAGEWEIGHT[0]                    1
QUEUETIMEWEIGHT[0]                1
XFACTORWEIGHT[0]                  0
SPVIOLATIONWEIGHT[0]              0
BYPASSWEIGHT[0]                   0
TARGETQUEUETIMEWEIGHT[0]          0
TARGETXFACTORWEIGHT[0]            0
USERWEIGHT[0]                     0
GROUPWEIGHT[0]                    0
ACCOUNTWEIGHT[0]                  0
QOSWEIGHT[0]                      0
CLASSWEIGHT[0]                    0
FSUSERWEIGHT[0]                   0
FSGROUPWEIGHT[0]                  0
FSACCOUNTWEIGHT[0]                0
FSQOSWEIGHT[0]                    0
FSCLASSWEIGHT[0]                  0
ATTRATTRWEIGHT[0]                 0
ATTRSTATEWEIGHT[0]                0
NODEWEIGHT[0]                     0
PROCWEIGHT[0]                     0
MEMWEIGHT[0]                      0
SWAPWEIGHT[0]                     0
DISKWEIGHT[0]                     0
PSWEIGHT[0]                       0
PEWEIGHT[0]                       0
WALLTIMEWEIGHT[0]                 0
UPROCWEIGHT[0]                    0
UJOBWEIGHT[0]                     0
CONSUMEDWEIGHT[0]                 0
USAGEEXECUTIONTIMEWEIGHT[0]       0
REMAININGWEIGHT[0]                0
PERCENTWEIGHT[0]                  0
XFMINWCLIMIT[0]                   00:02:00


# partition DEFAULT policies

REJECTNEGPRIOJOBS[1]              FALSE
ENABLENEGJOBPRIORITY[1]           FALSE
ENABLEMULTINODEJOBS[1]            TRUE
ENABLEMULTIREQJOBS[1]             FALSE
BFPRIORITYPOLICY[1]               [NONE]
JOBPRIOACCRUALPOLICY            QUEUEPOLICY
NODELOADPOLICY                  ADJUSTSTATE
JOBNODEMATCHPOLICY[1]

JOBMAXSTARTTIME[1]                  INFINITY

METAMAXTASKS[1]                   0
NODESETPOLICY[1]                  [NONE]
NODESETATTRIBUTE[1]               [NONE]
NODESETLIST[1]
NODESETDELAY[1]                   00:00:00
NODESETPRIORITYTYPE[1]            MINLOSS
NODESETTOLERANCE[1]                 0.00

# Priority Weights

XFMINWCLIMIT[1]                   00:00:00

RMAUTHTYPE[0]                     CHECKSUM

CLASSCFG[[NONE]]  DEFAULT.FEATURES=[NONE]
CLASSCFG[[ALL]]  DEFAULT.FEATURES=[NONE]
CLASSCFG[batch]  DEFAULT.FEATURES=[NONE]
QOSPRIORITY[0]                    0
QOSQTWEIGHT[0]                    0
QOSXFWEIGHT[0]                    0
QOSTARGETXF[0]                      0.00
QOSTARGETQT[0]                    00:00:00
QOSFLAGS[0]
QOSPRIORITY[1]                    0
QOSQTWEIGHT[1]                    0
QOSXFWEIGHT[1]                    0
QOSTARGETXF[1]                      0.00
QOSTARGETQT[1]                    00:00:00
QOSFLAGS[1]
# SERVER MODULES:  MX
SERVERMODE                      NORMAL
SERVERNAME
SERVERHOST                      fe
SERVERPORT                      42559
LOGFILE                         maui.log
LOGFILEMAXSIZE                  10000000
LOGFILEROLLDEPTH                1
LOGLEVEL                        3
LOGFACILITY                     fALL
SERVERHOMEDIR                   /usr/local/maui/
TOOLSDIR                        /usr/local/maui/tools/
LOGDIR                          /usr/local/maui/log/
STATDIR                         /usr/local/maui/stats/
LOCKFILE                        /usr/local/maui/maui.pid
SERVERCONFIGFILE                /usr/local/maui/maui.cfg
CHECKPOINTFILE                  /usr/local/maui/maui.ck
CHECKPOINTINTERVAL              00:05:00
CHECKPOINTEXPIRATIONTIME        3:11:20:00
TRAPJOB
TRAPNODE
TRAPFUNCTION
RESDEPTH                        24

RMPOLLINTERVAL                  00:00:30
NODEACCESSPOLICY                SHARED
ALLOCLOCALITYPOLICY             [NONE]
SIMTIMEPOLICY                   [NONE]
ADMIN1                          root
ADMINHOSTS                      ALL
NODEPOLLFREQUENCY               0
DISPLAYFLAGS
DEFAULTDOMAIN
DEFAULTCLASSLIST                [DEFAULT:1]
FEATURENODETYPEHEADER
FEATUREPROCSPEEDHEADER
FEATUREPARTITIONHEADER
DEFERTIME                       1:00:00
DEFERCOUNT                      24
DEFERSTARTCOUNT                 1
JOBPURGETIME                    0
NODEPURGETIME                   2140000000
APIFAILURETHRESHHOLD            6
NODESYNCTIME                    600
JOBSYNCTIME                     600
JOBMAXOVERRUN                   00:10:00
NODEMAXLOAD                     0.0

PLOTMINTIME                     120
PLOTMAXTIME                     245760
PLOTTIMESCALE                   11
PLOTMINPROC                     1
PLOTMAXPROC                     512
PLOTPROCSCALE                   9
SCHEDCFG[]                        MODE=NORMAL SERVER=fe:42559
# RM MODULES: PBS SSS WIKI NATIVE
RMCFG[FE] AUTHTYPE=CHECKSUM EPORT=15004 TIMEOUT=00:00:09 TYPE=PBS
SIMWORKLOADTRACEFILE            workload
SIMRESOURCETRACEFILE            resource
SIMAUTOSHUTDOWN                 OFF
SIMSTARTTIME                    0
SIMSCALEJOBRUNTIME              FALSE
SIMFLAGS
SIMJOBSUBMISSIONPOLICY          CONSTANTJOBDEPTH
SIMINITIALQUEUEDEPTH            16
SIMWCACCURACY                   0.00
SIMWCACCURACYCHANGE             0.00
SIMNODECOUNT                    0
SIMNODECONFIGURATION            NORMAL
SIMWCSCALINGPERCENT             100
SIMCOMRATE                      0.10
SIMCOMTYPE                      ROUNDROBIN
COMINTRAFRAMECOST               0.30
COMINTERFRAMECOST               0.30
SIMSTOPITERATION                -1
SIMEXITITERATION                -1



[root at fe ~]# ps -ef |grep  maui
root     18407     1  0 Sep28 ?        00:00:04 /usr/local/maui/sbin/maui
root     22527 22463  0 09:19 pts/2    00:00:00 grep maui
[root at fe ~]# service maui status
maui (pid 18407) is running...
[root at fe ~]# service pbs_server status
pbs_server (pid 4147) is running...
[root at fe ~]#

service pbs_sched status [just in case it is also running ...]
service pbs_mom status
service pbs status

None of those three services is installed.

Thank you very much

----------------------------------------------------
Ing. Fernando Caba
Director General de Telecomunicaciones
Universidad Nacional del Sur
http://www.dgt.uns.edu.ar
Tel/Fax: (54)-291-4595166
Tel: (54)-291-4595101 int. 2050
Avda. Alem 1253, (B8000CPB) Bahía Blanca - Argentina
----------------------------------------------------


On 28/09/2011 04:07 PM, Gus Correa wrote:
> Hi Fernando
>
> Did you restart maui after you changed maui.cfg? [service maui restart]
>
> Any chances that what you see is still residual from old jobs,
> submitted before you changed the maui configuration and job scripts
> [#PBS -l nodes=1:ppn=12]?
>
> For more help from everybody in the list,
> it may be useful if you send the output of:
>
> qmgr -c 'p s'
>
> ${TORQUE}/bin/pbsnodes
>
> ${MAUI}/bin/showconfig
>
> ps -ef |grep  maui
>
> service maui status
> service pbs_server status
> service pbs_sched status [just in case it is also running ...]
> service pbs_mom status
> service pbs status
>
> I hope this helps,
> Gus Correa
>
>
> Fernando Caba wrote:
>> Hi everybody, thanks for all the answers.
>> I tried everything you pointed out:
>>
>> including
>> #PBS -l nodes=1:ppn=12
>>
>> adding
>>
>> JOBNODEMATCHPOLICY EXACTNODE
>>
>> to maui.cfg
>>
>> but none of this worked. I'm thinking that the problem is in another
>> config parameter (Maui or Torque).
>>
>> I will keep reading about all of it.
>>
>> Thanks!!
>>
>> ----------------------------------------------------
>> Ing. Fernando Caba
>> Director General de Telecomunicaciones
>> Universidad Nacional del Sur
>> http://www.dgt.uns.edu.ar
>> Tel/Fax: (54)-291-4595166
>> Tel: (54)-291-4595101 int. 2050
>> Avda. Alem 1253, (B8000CPB) Bahía Blanca - Argentina
>> ----------------------------------------------------
>>
>>
>> On 28/09/2011 12:33 PM, Gus Correa wrote:
>>> Hi Fernando
>>>
>>> Dennis already pointed out the first/main problem.
>>> Your Torque/PBS script is not requesting a specific number of nodes
>>> and cores/processors.
>>> You can ask for 12 processors, even if your MPI command doesn't
>>> use all of them:
>>>
>>> #PBS -l nodes=1:ppn=12
>>>
>>> [You can still do mpirun -np 8 if you want.]
>>>
>>> This will prevent two jobs from running on the same node [which seems
>>> to be your goal, if I understood it right].
>>>
>>> I also like to add the queue name [even if it is the default]
>>> and the job name [for documentation and stdout/stderr
>>> naming consistency].
>>>
>>> #PBS -q myqueue [whatever you called your queue]
>>> #PBS -N myjob [15 characters at most, the rest gets truncated]
>>>
>>> The #PBS clauses must be together and right after the #! /bin/sh line.
>>>
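>>> For example, a complete job script putting those directives together might
>>> look like this [job name and walltime are placeholders; 'batch' is the
>>> queue from your qmgr output]:
>>>
>>> #!/bin/sh
>>> #PBS -q batch
>>> #PBS -N myjob
>>> #PBS -l nodes=1:ppn=12
>>> #PBS -l walltime=24:00:00
>>>
>>> cd $PBS_O_WORKDIR
>>> mpirun -np 8 /usr/local/vasp/vasp
>>>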
>>> Ask your users to always add these lines to their jobs.
>>> There is a feature of Torque that lets you write a wrapper
>>> that will add whatever you want to the job script,
>>> but if your pool of users is small
>>> you can just ask them to cooperate.
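>>>
>>> If you do go the wrapper route, the usual mechanism [as far as I know] is
>>> Torque's submit filter: qsub feeds the job script to a script of yours on
>>> stdin and submits whatever that script prints on stdout. I believe it is
>>> hooked in with the SUBMITFILTER parameter in torque.cfg, but check the
>>> Torque Admin Guide before relying on that. A rough sketch, path and details
>>> for illustration only:
>>>
>>> #!/bin/sh
>>> # Submit filter sketch: qsub pipes the user's job script in on stdin;
>>> # whatever we print on stdout is what actually gets submitted.
>>> tmp=$(mktemp /tmp/submitfilter.XXXXXX) || exit 1
>>> trap 'rm -f "$tmp"' EXIT
>>> cat > "$tmp"
>>> if grep -q '^#PBS .*nodes=' "$tmp"; then
>>>     cat "$tmp"        # user already requested nodes/ppn, pass it through
>>> else
>>>     # inject a whole-node request right after the interpreter line
>>>     awk 'NR==1 {print; print "#PBS -l nodes=1:ppn=12"; next} {print}' "$tmp"
>>> fi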
>>>
>>> Of course there is much more that you can add.
>>> 'man qsub' and 'man pbs_resources' are good sources of information,
>>> highly recommended reading.
>>>
>>>
>>> Then there is what Antonio Messina mentioned, the cpuset feature
>>> of Torque.
>>> I don't know if you installed Torque with this feature enabled.
>>> However, if you did, it will allow the specific cores to be
>>> assigned to each process, which could allow node-sharing without
>>> jobs stepping on each other's toes.
>>> However:
>>> A) this requires a bit more setup [not a lot, check the
>>> list archives and the Torque Admin Guide]
>>> B) if your users are cooperative and request 12 processors for each job,
>>> and you're using the Maui 'JOBNODEMATCHPOLICY EXACTNODE' setting, each job
>>> will get a single node anyway.
>>>
>>> BTW, did you restart Maui after you added 'JOBNODEMATCHPOLICY EXACTNODE'
>>> to the maui.cfg file?
>>>
>>> I hope this helps,
>>> Gus Correa
>>>
>>>
>>> Fernando Caba wrote:
>>>> Hi Gus, my node file /var/spool/torque/server_priv/nodes looks like:
>>>>
>>>> [root at fe server_priv]# more nodes
>>>> n10 np=12
>>>> n11 np=12
>>>> n12 np=12
>>>> n13 np=12
>>>> [root at fe server_priv]#
>>>>
>>>> it is exact as your comment.
>>>>
>>>> My script:
>>>>
>>>> #!/bin/bash
>>>>
>>>> cd $PBS_O_WORKDIR
>>>>
>>>> mpirun -np 8 /usr/local/vasp/vasp
>>>>
>>>> This launches 8 vasp processes on one node. If I start one more job
>>>> (with -np 8), it will run on the same node (n13).
>>>> So if I start another job with -np 8
>>>> (or -np 4), it will also run on the same node, n13.
>>>>
>>>> I configured JOBNODEMATCHPOLICY EXACTNODE in maui.cfg,
>>>> but unfortunately the jobs still ran on node n13.
>>>> This is an example of the output of top
>>>>
>>>> top - 00:05:53 up 14 days,  6:47,  1 user,  load average: 4.18, 4.06, 4.09
>>>> Mem:  15955108k total, 13287888k used,  2667220k free,   142168k buffers
>>>> Swap: 67111528k total,    16672k used, 67094856k free, 11360332k cached
>>>>
>>>>      PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  COMMAND
>>>> 21796 patricia  25   0  463m 291m  12m R 100.5  1.9 517:29.59 vasp
>>>> 21797 patricia  25   0  448m 276m  11m R 100.2  1.8 518:51.49 vasp
>>>> 21798 patricia  25   0  458m 287m  11m R 100.2  1.8 522:01.79 vasp
>>>> 21799 patricia  25   0  448m 276m  11m R 99.9  1.8 519:04.25 vasp
>>>>        1 root      15   0 10348  672  568 S  0.0  0.0   0:00.53 init
>>>>        2 root      RT  -5     0    0    0 S  0.0  0.0   0:00.06 migration/0
>>>>        3 root      34  19     0    0    0 S  0.0  0.0   0:00.00 ksoftirqd/0
>>>>        4 root      RT  -5     0    0    0 S  0.0  0.0   0:00.00 watchdog/0
>>>>        5 root      RT  -5     0    0    0 S  0.0  0.0   0:00.04 migration/1
>>>>
>>>> The job that generates those 4 vasp processes is:
>>>>
>>>> #!/bin/bash
>>>>
>>>> cd $PBS_O_WORKDIR
>>>>
>>>> mpirun -np 4 /usr/local/vasp/vasp
>>>>
>>>> Thanks
>>>>
>>>> ----------------------------------------------------
>>>> Ing. Fernando Caba
>>>> Director General de Telecomunicaciones
>>>> Universidad Nacional del Sur
>>>> http://www.dgt.uns.edu.ar
>>>> Tel/Fax: (54)-291-4595166
>>>> Tel: (54)-291-4595101 int. 2050
>>>> Avda. Alem 1253, (B8000CPB) Bahía Blanca - Argentina
>>>> ----------------------------------------------------
>>>>
>>>>
>>>> On 27/09/2011 08:07 PM, Gus Correa wrote:
>>>>> Hi Fernando
>>>>>
>>>>> Did you try something like this in your
>>>>> ${TORQUE}/server_priv/nodes file?
>>>>>
>>>>> frontend np=12 [skip this line if the frontend is not to do job work]
>>>>> node1 np=12
>>>>> node2 np=12
>>>>> node3 np=12
>>>>> node4 np=12
>>>>>
>>>>> This is probably the first thing to do.
>>>>> It is not Maui, just plain Torque [actually pbs_server configuration].
>>>>>
>>>>> The lines above assume your nodes are called node1, ...
>>>>> and the head node is called frontend,
>>>>> in some name-resolvable manner [most likely
>>>>> in your /etc/hosts file, most likely pointing to the nodes'
>>>>> IP addresses in your cluster's private subnet, 192.168.X.X,
>>>>> 10.X.X.X or equivalent].
>>>>>
>>>>> The 'np=12' clause will allow at most 12 *processes* per node.
>>>>>
>>>>>
>>>>> [However, if VASP is *threaded*, say via OpenMP, then this won't
>>>>> prevent several threads from being launched by each process.
>>>>> To handle threaded jobs you can use some tricks, such as requesting
>>>>> more cores than processes.
>>>>> Sorry, I am not familiar enough with VASP to say more than this.]
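>>>>>
>>>>> [A sketch of that trick, assuming the threading is plain OpenMP so that
>>>>> OMP_NUM_THREADS caps the threads each process spawns: still request the
>>>>> whole node, but keep processes x threads within the 12 cores, e.g.
>>>>>
>>>>> #PBS -l nodes=1:ppn=12
>>>>> export OMP_NUM_THREADS=3   # 4 MPI processes x 3 threads = 12 cores
>>>>> mpirun -np 4 /usr/local/vasp/vasp
>>>>>
>>>>> so the job still owns the whole node.]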
>>>>>
>>>>> I would suggest that you take a look at the Torque Admin Manual
>>>>> for more details:
>>>>> http://www.adaptivecomputing.com/resources/docs/torque/
>>>>>
>>>>> There are further controls in Maui, such as
>>>>> 'JOBNODEMATCHPOLICY EXACTNODE' in maui.cfg,
>>>>> if you want full nodes allocated to each job,
>>>>> as opposed to jobs sharing cores on a single node.
>>>>> However, these choices may come later.
>>>>> [You can change maui.cfg and restart the maui scheduler to
>>>>> test various changes.]
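>>>>>
>>>>> [For instance, the maui.cfg lines might look like this; NODEACCESSPOLICY
>>>>> SINGLEJOB is my recollection of the parameter that keeps a second job off
>>>>> a node, so check the Admin Guide entry before relying on it:
>>>>>
>>>>> JOBNODEMATCHPOLICY   EXACTNODE
>>>>> NODEACCESSPOLICY     SINGLEJOB
>>>>>
>>>>> then 'service maui restart' to pick up the change.]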
>>>>>
>>>>> For Maui details see the Maui Admin Guide:
>>>>> http://www.adaptivecomputing.com/resources/docs/maui/index.php
>>>>>
>>>>> I hope this helps,
>>>>> Gus Correa
>>>>>
>>>>> Fernando Caba wrote:
>>>>>> Hi everybody, I am using Torque 3.0.1 and Maui 3.3.1 in a configuration
>>>>>> composed of a front end and 4 nodes (2 processors, 6 cores each),
>>>>>> totaling 48 cores.
>>>>>> I need to configure it so that no node runs more than 12 processes
>>>>>> (in particular we are using VASP), so we want no more than 12 VASP
>>>>>> processes per node.
>>>>>> How can I configure this? I'm quite confused after reading a lot of
>>>>>> Torque and Maui configuration documentation.
>>>>>>
>>>>>> Thanks in advance.
>>>>>>

