[torqueusers] Assigning Nodes to specific nodes

Jonathan Smale JRS221 at bham.ac.uk
Thu Oct 13 05:08:42 MDT 2011


Dear Torque & Maui users,

I'm crossposting this to maui & torque users as I'm not sure which of these is posing a problem.  I'm trying to make three separate queues for the three different types of nodes on our cluster using the combination of qmgr commands:


set queue firstgen resources_default.neednodes = 1stgennodes
set nodes compute-0-0 properties = 1stgennodes

I've found a fair few previous emails about this and have followed their solution without success.  I can submit the jobs to the queues but they remain in a queued state, qstat -f gives:-

[root at che-hydra /]# qstat -f 48691
Job Id: 48691.che-hydra.bham.ac.uk
    Job_Name = allenega_p2000_g300_r2.cmd
    Job_Owner = jsmale at che-hydra.bham.ac.uk
    resources_used.cput = 529:06:32
    resources_used.mem = 16872kb
    resources_used.vmem = 245908kb
    resources_used.walltime = 529:11:31
    job_state = R
    queue = default
    server = che-hydra.bham.ac.uk
    Checkpoint = u
    ctime = Wed Sep 21 10:45:00 2011
    Error_Path = che-hydra.bham.ac.uk:/home/jsmale/allenega/allenega_p2000_g30
        0_r2.cmd.e48691
    exec_host = compute-0-16/1
    Hold_Types = n
    Join_Path = n
    Keep_Files = n
    Mail_Points = a
    mtime = Wed Sep 21 10:45:01 2011
    Output_Path = che-hydra.bham.ac.uk:/home/jsmale/allenega/allenega_p2000_g3
        00_r2.cmd.o48691
    Priority = 0
    qtime = Wed Sep 21 10:45:00 2011
    Rerunable = True
    Resource_List.neednodes = 1
    Resource_List.nodect = 1
    Resource_List.nodes = 1
    session_id = 20687
    substate = 42
    Variable_List = PBS_O_HOME=/home/jsmale,PBS_O_LANG=en_US.iso885915,
        PBS_O_LOGNAME=jsmale,
        PBS_O_PATH=/home/jsmale/mctdh90.svn/bin/x86_64:/home/jsmale/mctdh90.s
        vn/bin:/usr/lib64/openmpi/1.3.2-gcc/bin:/usr/kerberos/bin:/usr/java/la
        test/bin:/usr/local/bin:/bin:/usr/bin:/opt/maui/bin:/opt/torque/bin:/o
        pt/torque/sbin:/usr/share/pvm3/pvm3//bin/LINUX64:/opt/rocks/bin:/opt/r
        ocks/sbin:/global64/pgi/linux86-64/10.4/bin:/user/worth/gaussian/bin:/
        user/jsmale/bin:/home/gaussian/bin:~/mctdh90.svn/bin:/home/jsmale/bin,
        PBS_O_MAIL=/var/spool/mail/jsmale,PBS_O_SHELL=/bin/bash,
        PBS_SERVER=che-hydra.bham.ac.uk,PBS_O_HOST=che-hydra.bham.ac.uk,
        PBS_O_WORKDIR=/home/jsmale/allenega,PBS_O_QUEUE=default
    euser = jsmale
    egroup = worth
    hashname = 48691.che-hydra.bham.ac.uk
    queue_rank = 35491
    queue_type = E
    etime = Wed Sep 21 10:45:00 2011
    submit_args = allenega_p2000_g300_r2.cmd
    start_time = Wed Sep 21 10:45:01 2011
    start_count = 1

and checkjob gives the following:
[root at che-hydra /]# checkjob 48691


checking job 48691

State: Running
Creds:  user:jsmale  group:worth  class:default  qos:DEFAULT
WallTime: 22:01:12:16 of 99:23:59:59
SubmitTime: Wed Sep 21 10:45:00
  (Time Queued  Total: 00:00:01  Eligible: 00:00:00)

StartTime: Wed Sep 21 10:45:01
Total Tasks: 1

Req[0]  TaskCount: 1  Partition: DEFAULT
Network: [NONE]  Memory >= 0  Disk >= 0  Swap >= 0
Opsys: [NONE]  Arch: [NONE]  Features: [NONE]
NodeCount: 1
Allocated Nodes:
[compute-0-16:1]


IWD: [NONE]  Executable:  [NONE]
Bypass: 0  StartCount: 1
PartitionMask: [ALL]
Flags:       RESTARTABLE

Reservation '48691' ( -INFINITY -> 77:22:47:56  Duration: 99:23:59:59)
PE:  1.00  StartPriority:  20

I'm not sure why the job isn't running, there doesn't seem to be any reason given in either the maui or torque (server&mom) logs.  Could anyone help me decipher the cause?  Configuration of the server follows.


Top of maui.cfg file:

# maui.cfg 3.2.6p20

SERVERHOST            che-hydra.bham.ac.uk
# primary admin must be first in list
ADMIN1                root

# Resource Manager Definition

RMCFG[che-hydra.bham.ac.uk] TYPE=PBS

# Allocation Manager Definition

AMCFG[bank]  TYPE=NONE

# full parameter docs at http://supercluster.org/mauidocs/a.fparameters.html
# use the 'schedctl -l' command to display current configuration

RMPOLLINTERVAL        00:00:30

SERVERPORT            42559
SERVERMODE            NORMAL

# Admin: http://supercluster.org/mauidocs/a.esecurity.html


LOGFILE               maui.log
LOGFILEMAXSIZE        100000000
LOGLEVEL              3

# Setting up node information for throttling policies

#
NODECFG[compute-0-0] SPEED=1 MAXJOB=4 nodetype=firstgennodes
NODECFG[compute-0-1] SPEED=1 MAXJOB=4 nodetype=firstgennodes
NODECFG[compute-0-2] SPEED=1 MAXJOB=4 nodetype=firstgennodes
NODECFG[compute-0-3] SPEED=1 MAXJOB=4 nodetype=firstgennodes
NODECFG[compute-0-4] SPEED=1.2 MAXJOB=8 nodetype=secondgennodes
NODECFG[compute-0-5] SPEED=1.2 MAXJOB=8 nodetype=secondgennodes
NODECFG[compute-0-6] SPEED=1.2 MAXJOB=8 nodetype=secondgennodes
NODECFG[compute-0-7] SPEED=1.2 MAXJOB=8 nodetype=secondgennodes
NODECFG[compute-0-8] SPEED=1.4 MAXJOB=8 nodetype=secondgennodes
NODECFG[compute-0-9] SPEED=1.4 MAXJOB=8 nodetype=secondgennodes
NODECFG[compute-0-10] SPEED=1.4 MAXJOB=8 nodetype=secondgennodes
NODECFG[compute-0-11] SPEED=1.5 MAXJOB=8 nodetype=secondgennodes
NODECFG[compute-0-12] SPEED=1.5 MAXJOB=8 nodetype=secondgennodes
NODECFG[compute-0-13] SPEED=1.5 MAXJOB=8 nodetype=secondgennodes
NODECFG[compute-0-14] SPEED=1.5 MAXJOB=8 nodetype=secondgennodes
NODECFG[compute-0-15] SPEED=1.5 MAXJOB=8 nodetype=secondgennodes
NODECFG[compute-0-16] SPEED=1.7 MAXJOB=16 nodetype=thirdgennodes
NODECFG[compute-0-17] SPEED=1.7 MAXJOB=16 nodetype=thirdgennodes
NODECFG[compute-0-18] SPEED=1.7 MAXJOB=16 nodetype=thirdgennodes
NODECFG[compute-0-19] SPEED=1.7 MAXJOB=16 nodetype=thirdgennodes


# Setting up queue information to allow allocation to specific types of nodes via queues

CLASSCFG[firstgen] hostlist = compute-0-0,compute-0-1,compute-0-2,compute-0-3
CLASSCFG[secondgen] hostlist = compute-0-4,compute-0-5,compute-0-6,compute-0-7,compute-0-8,compute-0-9,compute-0-10,compute-0-11,compute-0-12,compute-0-13,compute-0-14,compute-0-15
CLASSCFG[thirdgen] hostlist = compute-0-16,compute-0-17,compute-0-18,compute-0-19

# Backfill: http://supercluster.org/mauidocs/8.2backfill.html

BACKFILLPOLICY        FIRSTFIT
RESERVATIONPOLICY     CURRENTHIGHEST

# Node Allocation: http://supercluster.org/mauidocs/5.2nodeallocation.html

NODEALLOCATIONPOLICY CPULOAD

Some torque setting that might be of use:


[root at che-hydra]# pbsnodes (truncated, one example of each type of node)
compute-0-0
     state = free
     np = 4
     properties = firstgennodes
     ntype = cluster
     status = opsys=linux,uname=Linux compute-0-0.local 2.6.18-164.6.1.el5 #1 SMP Tue Nov 3 16:12:36 EST 2009 x86_64,sessions=? 15201,nsessions=? 15201,nusers=0,idletime=13287377,totmem=9195716kb,availmem=8926420kb,physmem=8175600kb,ncpus=4,loadave=0.00,netload=54589282244,state=free,jobs=,varattr=,rectime=1318502240

compute-0-4
     state = free
     np = 8
     properties = secondgennodes
     ntype = cluster
     status = opsys=linux,uname=Linux compute-0-4.local 2.6.18-164.6.1.el5 #1 SMP Tue Nov 3 16:12:36 EST 2009 x86_64,sessions=? 15201,nsessions=? 15201,nusers=0,idletime=21840103,totmem=17464156kb,availmem=17170748kb,physmem=16444040kb,ncpus=8,loadave=0.00,netload=494140575539,state=free,jobs=,varattr=,rectime=1318502242

compute-0-16
     state = free
     np = 16
     properties = thirdgennodes
     ntype = cluster
     jobs = 0/48738.che-hydra.bham.ac.uk, 1/48691.che-hydra.bham.ac.uk, 3/48693.che-hydra.bham.ac.uk
     status = opsys=linux,uname=Linux compute-0-16.local 2.6.18-164.6.1.el5 #1 SMP Tue Nov 3 16:12:36 EST 2009 x86_64,sessions=6691 20687 20764,nsessions=3,nusers=2,idletime=7342084,totmem=17461096kb,availmem=8761384kb,physmem=16440980kb,ncpus=16,loadave=3.08,netload=647310098799,state=free,jobs=48691.che-hydra.bham.ac.uk 48693.che-hydra.bham.ac.uk 48738.che-hydra.bham.ac.uk,varattr=,rectime=1318502257

[root at che-hydra]# qmgr -c "p s"
#
# Create queues and set their attributes.
#
#
# Create and define queue default
#
create queue default
set queue default queue_type = Execution
set queue default Priority = 100
set queue default resources_default.nodes = 1
set queue default enabled = True
set queue default started = True
#
# Create and define queue secondgen
#
create queue secondgen
set queue secondgen queue_type = Execution
set queue secondgen Priority = 100
set queue secondgen acl_host_enable = False
set queue secondgen acl_hosts = che-hydra+localhost
set queue secondgen resources_default.neednodes = secondgennodes
set queue secondgen resources_default.nodes = 1
set queue secondgen enabled = True
set queue secondgen started = True
#
# Create and define queue thirdgen
#
create queue thirdgen
set queue thirdgen queue_type = Execution
set queue thirdgen Priority = 100
set queue thirdgen acl_host_enable = False
set queue thirdgen acl_hosts = che-hydra+localhost
set queue thirdgen resources_default.neednodes = thirdgennodes
set queue thirdgen resources_default.nodes = 1
set queue thirdgen enabled = True
set queue thirdgen started = True
#
# Create and define queue firstgen
#
create queue firstgen
set queue firstgen queue_type = Execution
set queue firstgen Priority = 100
set queue firstgen acl_host_enable = False
set queue firstgen acl_hosts = che-hydra+localhost
set queue firstgen resources_default.neednodes = firstgennodes
set queue firstgen resources_default.nodes = 1
set queue firstgen enabled = True
set queue firstgen started = True
#
# Set server attributes.
#
set server scheduling = True
set server acl_host_enable = False
set server acl_hosts = che-hydra.bham.ac.uk
set server default_queue = default
set server log_events = 511
set server mail_from = adm
set server scheduler_iteration = 600
set server node_check_rate = 150
set server tcp_timeout = 6
set server auto_node_np = True
set server next_job_number = 49702



Jonathan Smale


More information about the torqueusers mailing list