[Mauiusers] Preemption not working : job is deferred. Reason: RMFailure (cannot start job - RM failure, rc: 15044, msg: 'Resource temporarily unavailable MSG=job allocation request exceeds currently available cluster nodes, 1 requested, 0 available'

Andre Gauthier andre.gauthier at gmail.com
Tue Apr 20 09:51:26 MDT 2010


Hi, I'm trying to get preemption to work with Maui and Torque.  I
have a dozen queues; one is defined as a preemptee (general queue &
QOS) and another as a preemptor (admins queue & QOS).  I submit a job
to the preemptee queue, then a job to the preemptor queue.  The
preemptor job does not run.  Maui version 3.2.6p21, Torque version
2.3.6-1.

qstat:

Job id                    Name             User            Time Use S Queue
------------------------- ---------------- --------------- -------- - -----
459.hpc-test              sleep.sh         user2           00:00:00 R general
460.hpc-test              sleep.sh         user1                  0 Q admins


checkjob 460:

checking job 460

State: Idle  EState: Deferred
Creds:  user:user1  group:admins  class:admins  qos:admins
WallTime: 00:00:00 of 1:00:00
SubmitTime: Tue Apr 20 11:41:28
  (Time Queued  Total: 00:00:02  Eligible: 00:00:01)

StartDate: 00:00:00  Tue Apr 20 11:41:30
Total Tasks: 8

Req[0]  TaskCount: 8  Partition: ALL
Network: [NONE]  Memory >= 0  Disk >= 0  Swap >= 0
Opsys: [NONE]  Arch: [NONE]  Features: [NONE]
Dedicated Resources Per Task: PROCS: 1  MEM: 32M


IWD: [NONE]  Executable:  [NONE]
Bypass: 0  StartCount: 1
PartitionMask: [ALL]
Flags:       RESTARTABLE PREEMPTOR

job is deferred.  Reason:  RMFailure  (cannot start job - RM failure,
rc: 15044, msg: 'Resource temporarily unavailable MSG=job allocation
request exceeds currently available cluster nodes, 1 requested, 0
available')
Holds:    Defer  (hold reason:  RMFailure)
PE:  8.00  StartPriority:  3001
cannot select job 460 for partition DEFAULT (job hold active)


checkjob 459:

checking job 459

State: Running
Creds:  user:user2  group:user2  class:general  qos:general
WallTime: 00:03:05 of 1:00:00
SubmitTime: Tue Apr 20 11:41:11
  (Time Queued  Total: 00:00:19  Eligible: 00:00:01)

StartTime: Tue Apr 20 11:41:30
Total Tasks: 96

Req[0]  TaskCount: 96  Partition: DEFAULT
Network: [NONE]  Memory >= 0  Disk >= 0  Swap >= 0
Opsys: [NONE]  Arch: [NONE]  Features: [NONE]
Dedicated Resources Per Task: PROCS: 1  MEM: 2M
Allocated Nodes:
[compute-0-15:8][compute-0-13:8][compute-0-12:8][compute-0-11:8]
[compute-0-10:8][compute-0-9:8][compute-0-8:8][compute-0-7:8]
[compute-0-6:8][compute-0-5:8][compute-0-4:8][compute-0-3:8]



IWD: [NONE]  Executable:  [NONE]
Bypass: 0  StartCount: 2
PartitionMask: [ALL]
Flags:       RESTARTABLE PREEMPTEE
Attr:        PREEMPTEE

Reservation '459' (-00:03:06 -> 00:56:54  Duration: 1:00:00)
PE:  96.00  StartPriority:  200





showconfig:



IWD: [NONE]  Executable:  [NONE]
Bypass: 0  StartCount: 2
PartitionMask: [ALL]
Flags:       RESTARTABLE PREEMPTEE
Attr:        PREEMPTEE

Reservation '459' (-00:03:06 -> 00:56:54  Duration: 1:00:00)
PE:  96.00  StartPriority:  200

[root at hpc-test maui]# showconfig
# Maui version 3.2.6p21 (PID: 16046)
# global policies

REJECTNEGPRIOJOBS[0]              FALSE
ENABLENEGJOBPRIORITY[0]           FALSE
ENABLEMULTINODEJOBS[0]            TRUE
ENABLEMULTIREQJOBS[0]             FALSE
BFPRIORITYPOLICY[0]               [NONE]
JOBPRIOACCRUALPOLICY            QUEUEPOLICY
NODELOADPOLICY                  ADJUSTSTATE
USEMACHINESPEED                 FALSE
USESYSTEMQUEUETIME              TRUE
USELOCALMACHINEPRIORITY         FALSE
NODEUNTRACKEDLOADFACTOR         1.2
JOBNODEMATCHPOLICY[0]

JOBMAXSTARTTIME[0]                  INFINITY

METAMAXTASKS[0]                   0
NODESETPOLICY[0]                  [NONE]
NODESETATTRIBUTE[0]               [NONE]
NODESETLIST[0]
NODESETDELAY[0]                   00:00:00
NODESETPRIORITYTYPE[0]            MINLOSS
NODESETTOLERANCE[0]                 0.00

BACKFILLPOLICY[0]                 FIRSTFIT
BACKFILLDEPTH[0]                  0
BACKFILLPROCFACTOR[0]             0
BACKFILLMAXSCHEDULES[0]           10000
BACKFILLMETRIC[0]                 PROCS

BFCHUNKDURATION[0]                00:00:00
BFCHUNKSIZE[0]                    0
PREEMPTPOLICY[0]                  REQUEUE
MINADMINSTIME[0]                  00:00:00
RESOURCELIMITPOLICY[0]
NODEAVAILABILITYPOLICY[0]         COMBINED:[DEFAULT]
NODEALLOCATIONPOLICY[0]           MINRESOURCE
TASKDISTRIBUTIONPOLICY[0]         DEFAULT
RESERVATIONPOLICY[0]              NEVER
RESERVATIONRETRYTIME[0]           00:00:00
RESERVATIONTHRESHOLDTYPE[0]       NONE
RESERVATIONTHRESHOLDVALUE[0]      0

FSPOLICY                        [NONE]
FSPOLICY                        [NONE]
FSINTERVAL                      12:00:00
FSDEPTH                         8
FSDECAY                         1.00



# Priority Weights

SERVICEWEIGHT[0]                  1
TARGETWEIGHT[0]                   1
CREDWEIGHT[0]                     1
ATTRWEIGHT[0]                     1
FSWEIGHT[0]                       1
RESWEIGHT[0]                      1
USAGEWEIGHT[0]                    1
QUEUETIMEWEIGHT[0]                1
XFACTORWEIGHT[0]                  0
SPVIOLATIONWEIGHT[0]              0
BYPASSWEIGHT[0]                   0
TARGETQUEUETIMEWEIGHT[0]          0
TARGETXFACTORWEIGHT[0]            0
USERWEIGHT[0]                     1
GROUPWEIGHT[0]                    1
ACCOUNTWEIGHT[0]                  0
QOSWEIGHT[0]                      1
CLASSWEIGHT[0]                    1
FSUSERWEIGHT[0]                   0
FSGROUPWEIGHT[0]                  0
FSACCOUNTWEIGHT[0]                0
FSQOSWEIGHT[0]                    0
FSCLASSWEIGHT[0]                  0
ATTRATTRWEIGHT[0]                 0
ATTRSTATEWEIGHT[0]                0
NODEWEIGHT[0]                     0
PROCWEIGHT[0]                     0
MEMWEIGHT[0]                      0
SWAPWEIGHT[0]                     0
DISKWEIGHT[0]                     0
PSWEIGHT[0]                       0
PEWEIGHT[0]                       0
WALLTIMEWEIGHT[0]                 0
UPROCWEIGHT[0]                    0
UJOBWEIGHT[0]                     0
CONSUMEDWEIGHT[0]                 0
USAGEEXECUTIONTIMEWEIGHT[0]       0
REMAININGWEIGHT[0]                0
PERCENTWEIGHT[0]                  0
XFMINWCLIMIT[0]                   00:02:00


# partition DEFAULT policies

REJECTNEGPRIOJOBS[1]              FALSE
ENABLENEGJOBPRIORITY[1]           FALSE
ENABLEMULTINODEJOBS[1]            TRUE
ENABLEMULTIREQJOBS[1]             FALSE
BFPRIORITYPOLICY[1]               [NONE]
JOBPRIOACCRUALPOLICY            QUEUEPOLICY
NODELOADPOLICY                  ADJUSTSTATE
JOBNODEMATCHPOLICY[1]

JOBMAXSTARTTIME[1]                  INFINITY

METAMAXTASKS[1]                   0
NODESETPOLICY[1]                  [NONE]
NODESETATTRIBUTE[1]               [NONE]
NODESETLIST[1]
NODESETDELAY[1]                   00:00:00
NODESETPRIORITYTYPE[1]            MINLOSS
NODESETTOLERANCE[1]                 0.00

# Priority Weights

XFMINWCLIMIT[1]                   00:00:00

RMAUTHTYPE[0]                     CHECKSUM

CLASSCFG[[NONE]]  DEFAULT.FEATURES=[NONE]
CLASSCFG[[ALL]]  DEFAULT.FEATURES=[NONE]
CLASSCFG[DEFAULT]  DEFAULT.FEATURES=[NONE]
CLASSCFG[batch]  DEFAULT.FEATURES=[NONE]
CLASSCFG[interactive]  DEFAULT.FEATURES=[NONE]
CLASSCFG[general]  DEFAULT.FEATURES=[NONE]
CLASSCFG[priya]  DEFAULT.FEATURES=[NONE]
CLASSCFG[admins]  DEFAULT.FEATURES=[NONE]
CLASSCFG[sohrab]  DEFAULT.FEATURES=[NONE]
CLASSCFG[micro]  DEFAULT.FEATURES=[NONE]
CLASSCFG[altonji]  DEFAULT.FEATURES=[NONE]
CLASSCFG[easther]  DEFAULT.FEATURES=[NONE]
CLASSCFG[berry]  DEFAULT.FEATURES=[NONE]
CLASSCFG[hpcprog]  DEFAULT.FEATURES=[NONE]
CLASSCFG[macro]  DEFAULT.FEATURES=[NONE]
QOSPRIORITY[0]                    0
QOSQTWEIGHT[0]                    0
QOSXFWEIGHT[0]                    0
QOSTARGETXF[0]                      0.00
QOSTARGETQT[0]                    00:00:00
QOSFLAGS[0]
QOSPRIORITY[1]                    0
QOSQTWEIGHT[1]                    0
QOSXFWEIGHT[1]                    0
QOSTARGETXF[1]                      0.00
QOSTARGETQT[1]                    00:00:00
QOSFLAGS[1]
QOSPRIORITY[2]                    100
QOSQTWEIGHT[2]                    0
QOSXFWEIGHT[2]                    0
QOSTARGETXF[2]                    100.00
QOSTARGETQT[2]                    00:00:00
QOSFLAGS[2]
QOSPRIORITY[3]                    -1000
QOSQTWEIGHT[3]                    0
QOSXFWEIGHT[3]                    0
QOSTARGETXF[3]                      0.00
QOSTARGETQT[3]                    00:00:00
QOSFLAGS[3]
QOSPRIORITY[4]                    1000
QOSQTWEIGHT[4]                    0
QOSXFWEIGHT[4]                    0
QOSTARGETXF[4]                      0.00
QOSTARGETQT[4]                    00:00:00
QOSFLAGS[4]                       PREEMPTOR
QOSPRIORITY[5]                    100
QOSQTWEIGHT[5]                    0
QOSXFWEIGHT[5]                    0
QOSTARGETXF[5]                      0.00
QOSTARGETQT[5]                    00:00:00
QOSFLAGS[5]                       PREEMPTEE
# SERVER MODULES:  MX
SERVERMODE                      NORMAL
SERVERNAME
SERVERHOST                      hpc-test.wss.yale.edu
SERVERPORT                      42559
LOGFILE                         maui.log
LOGFILEMAXSIZE                  10000000
LOGFILEROLLDEPTH                1
LOGLEVEL                        3
LOGFACILITY                     fALL
SERVERHOMEDIR                   /opt/maui/
TOOLSDIR                        /opt/maui/tools/
LOGDIR                          /opt/maui/log/
STATDIR                         /opt/maui/stats/
LOCKFILE                        /opt/maui/maui.pid
SERVERCONFIGFILE                /opt/maui/maui.cfg
CHECKPOINTFILE                  /opt/maui/maui.ck
CHECKPOINTINTERVAL              00:05:00
CHECKPOINTEXPIRATIONTIME        3:11:20:00
TRAPJOB
TRAPNODE
TRAPFUNCTION
RESDEPTH                        24

RMPOLLINTERVAL                  00:00:30
NODEACCESSPOLICY                SHARED
ALLOCLOCALITYPOLICY             [NONE]
SIMTIMEPOLICY                   [NONE]
ADMIN1                          maui root
ADMINHOSTS                      ALL
NODEPOLLFREQUENCY               0
DISPLAYFLAGS
DEFAULTDOMAIN
DEFAULTCLASSLIST                [DEFAULT:1]
FEATURENODETYPEHEADER
FEATUREPROCSPEEDHEADER
FEATUREPARTITIONHEADER
DEFERTIME                       1:00:00
DEFERCOUNT                      24
DEFERSTARTCOUNT                 1
JOBPURGETIME                    0
NODEPURGETIME                   2140000000
APIFAILURETHRESHHOLD            6
NODESYNCTIME                    600
JOBSYNCTIME                     600
JOBMAXOVERRUN                   00:10:00
NODEMAXLOAD                     0.0

PLOTMINTIME                     120
PLOTMAXTIME                     245760
PLOTTIMESCALE                   11
PLOTMINPROC                     1
PLOTMAXPROC                     512
PLOTPROCSCALE                   9
SCHEDCFG[]                        MODE=NORMAL
SERVER=hpc-test.wss.yale.edu:42559
# RM MODULES: PBS SSS WIKI NATIVE
RMCFG[base] AUTHTYPE=CHECKSUM EPORT=15004 TIMEOUT=00:01:30 TYPE=PBS
SIMWORKLOADTRACEFILE            workload
SIMRESOURCETRACEFILE            resource
SIMAUTOSHUTDOWN                 OFF
SIMSTARTTIME                    0
SIMSCALEJOBRUNTIME              FALSE
SIMFLAGS
SIMJOBSUBMISSIONPOLICY          CONSTANTJOBDEPTH
SIMINITIALQUEUEDEPTH            16
SIMWCACCURACY                   0.00
SIMWCACCURACYCHANGE             0.00
SIMNODECOUNT                    0
SIMNODECONFIGURATION            NORMAL
SIMWCSCALINGPERCENT             100
SIMCOMRATE                      0.10
SIMCOMTYPE                      ROUNDROBIN
COMINTRAFRAMECOST               0.30
COMINTERFRAMECOST               0.30
SIMSTOPITERATION                -1
SIMEXITITERATION                -1



cat maui.cfg:


# maui.cfg.tmpl for Maui v3.2.5

# full parameter docs at http://supercluster.org/mauidocs/a.fparameters.html
# use the 'schedctl -l' command to display current configuration

RMPOLLINTERVAL		00:00:30

SERVERHOST		hpc-test.wss.yale.edu
SERVERPORT		42559
SERVERMODE		NORMAL

RMCFG[base]		TYPE=PBS TIMEOUT=90

# Admin: http://supercluster.org/mauidocs/a.esecurity.html
# ADMIN1 users have full scheduler control

ADMIN1                maui root

LOGFILE               maui.log
LOGFILEMAXSIZE        10000000
LOGLEVEL              3

# Job Priority: http://supercluster.org/mauidocs/5.1jobprioritization.html

QUEUETIMEWEIGHT       1

# FairShare: http://supercluster.org/mauidocs/6.3fairshare.html

#FSPOLICY              PSDEDICATED
#FSDEPTH               7
#FSINTERVAL            86400
#FSDECAY               0.80

# Throttling Policies: http://supercluster.org/mauidocs/6.2throttlingpolicies.html

# NONE SPECIFIED

# Backfill: http://supercluster.org/mauidocs/8.2backfill.html

BACKFILLPOLICY        FIRSTFIT
RESERVATIONPOLICY     NEVER # set to never for preemption.

# Node Allocation: http://supercluster.org/mauidocs/5.2nodeallocation.html

NODEALLOCATIONPOLICY  MINRESOURCE

# QOS: http://supercluster.org/mauidocs/7.3qos.html

 QOSCFG[hi]  PRIORITY=100 XFTARGET=100 FLAGS=PREEMPTOR:IGNMAXJOB
 QOSCFG[low] PRIORITY=-1000 FLAGS=PREEMPTEE

# Standing Reservations: http://supercluster.org/mauidocs/7.1.3standingreservations.html

# SRSTARTTIME[test] 8:00:00
# SRENDTIME[test]   17:00:00
# SRDAYS[test]      MON TUE WED THU FRI
# SRTASKCOUNT[test] 20
# SRMAXTIME[test]   0:30:00

#PREEMPTPOLICY set by  AG
PREEMPTIONPOLICY REQUEUE

# Creds: http://supercluster.org/mauidocs/6.1fairnessoverview.html

 USERCFG[DEFAULT]      FSTARGET=25.0
 USERCFG[john]         PRIORITY=100  FSTARGET=10.0-
 GROUPCFG[staff]       PRIORITY=1000 QLIST=hi:low QDEF=hi
 CLASSCFG[batch]       FLAGS=PREEMPTEE
 CLASSCFG[interactive] FLAGS=PREEMPTOR

### set QOS needed for preemption
QOSWEIGHT 1
QOSCFG[admins]		QFLAGS=PREEMPTOR  PRIORITY=1000
QOSCFG[general]	  	QFLAGS=PREEMPTEE PRIORITY=100

GROUPWEIGHT 1
CLASSWEIGHT 1
CREDWEIGHT 1
USERWEIGHT 1


CLASSCFG[general] QDEF=general PRIORITY=100

GROUPWEIGHT 1
CLASSCFG[DEFAULT]	MAXPROC=280 QDEF=general  PRIORITY=200
CLASSCFG[admins]	MAXPROC=280 QDEF=admins   PRIORITY=2001


More information about the mauiusers mailing list