[torqueusers] Walltime elapsed

chris job.fr chrisjob.fr at gmail.com
Fri Dec 18 01:49:37 MST 2009


These are MPI jobs.
      The problem is that when the max walltime of the queue is
reached, the job doesn't stop immediately. TORQUE considers the job
finished, but its processes stay on the nodes. New jobs are then sent
to those nodes, and sooner or later a node goes down.
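
One common workaround for the lingering processes (a rough sketch,
assuming each node runs only one user's job at a time; not something
from this thread) is a pbs_mom epilogue script that kills whatever the
job owner left behind. TORQUE runs $PBS_HOME/mom_priv/epilogue as root
on the mother superior node after each job (sister nodes may need a
similar cleanup mechanism):

#!/bin/sh
# $PBS_HOME/mom_priv/epilogue -- run as root by pbs_mom after each job.
# Arguments (per the TORQUE docs): $1 = job id, $2 = job owner, ...
JOBUSER=$2
# Kill any processes the job owner left behind on this node.
# CAUTION: only safe if nodes are dedicated to a single user's job;
# on shared nodes this would also kill that user's other jobs.
pkill -9 -u "$JOBUSER"
exit 0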

Here are my TORQUE and Maui configurations. Is there something wrong?

1) TORQUE
#
# Create queues and set their attributes.
#
#
# Create and define queue xeon32G
#
create queue xeon32G
set queue xeon32G queue_type = Execution
set queue xeon32G resources_max.mem = 32000000kb
set queue xeon32G resources_max.walltime = 72:00:00
set queue xeon32G resources_default.neednodes = xeon32G
set queue xeon32G resources_default.nodes = 1:xeon32G
set queue xeon32G enabled = True
set queue xeon32G started = True
#
# Create and define queue long
#
create queue long
set queue long queue_type = Execution
set queue long resources_max.nodes = 1:ppn=8
set queue long resources_max.walltime = 168:00:00
set queue long resources_default.neednodes = long
set queue long resources_default.nodes = 1:long
set queue long enabled = True
set queue long started = True
#
# Create and define queue xeon
#
create queue xeon
set queue xeon queue_type = Execution
set queue xeon resources_max.mem = 16000000kb
set queue xeon resources_max.walltime = 72:00:00
set queue xeon resources_default.neednodes = xeon
set queue xeon resources_default.nodes = 1:xeon
set queue xeon enabled = True
set queue xeon started = True
#
# Create and define queue opteron
#
create queue opteron
set queue opteron queue_type = Execution
set queue opteron resources_max.mem = 8000000kb
set queue opteron resources_max.walltime = 24:00:00
set queue opteron resources_default.neednodes = opteron
set queue opteron resources_default.nodes = 1:opteron
set queue opteron enabled = True
set queue opteron started = True
#
# Create and define queue opteron32G
#
create queue opteron32G
set queue opteron32G queue_type = Execution
set queue opteron32G resources_max.mem = 32000000kb
set queue opteron32G resources_max.walltime = 48:00:00
set queue opteron32G resources_default.neednodes = opteron32G
set queue opteron32G resources_default.nodes = 1:opteron32G
set queue opteron32G enabled = True
set queue opteron32G started = True
#
# Create and define queue def
#
create queue def
set queue def queue_type = Route
set queue def route_destinations = opteron
set queue def route_destinations += xeon
set queue def route_destinations += opteron32G
set queue def route_destinations += xeon32G
set queue def enabled = True
set queue def started = True
#
# Create and define queue short
#
create queue short
set queue short queue_type = Execution
set queue short resources_max.nodes = 1:ppn=4
set queue short resources_max.walltime = 01:00:00
set queue short resources_default.neednodes = short
set queue short resources_default.nodes = 1:short
set queue short enabled = True
set queue short started = True
#
# Set server attributes.
#
set server scheduling = True
set server default_queue = def
set server log_events = 511
set server mail_from = adm
set server query_other_jobs = True
set server resources_default.mem = 4000000kb
set server resources_default.neednodes = 1
set server resources_default.nodect = 1
set server resources_default.nodes = 1
set server resources_default.walltime = 72:00:00
set server scheduler_iteration = 600
set server node_check_rate = 150
set server tcp_timeout = 6
set server pbs_version = 2.1.6
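
Note that the server-wide resources_default.walltime (72:00:00) is
still longer than resources_max.walltime on the opteron (24:00:00) and
opteron32G (48:00:00) queues. A quick way to check what limit a job
actually ends up with (<jobid> is a placeholder):

qstat -f <jobid> | grep -i walltime    # effective Resource_List.walltime
qmgr -c 'print queue opteron'          # the queue's max/default limits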

2) MAUI

# maui.cfg 3.2.6p16

SERVERHOST            XXXX
# primary admin must be first in list
ADMIN1                root

# Resource Manager Definition

RMCFG[TWELVE] TYPE=PBS

# Allocation Manager Definition

AMCFG[bank]  TYPE=NONE

# full parameter docs at http://supercluster.org/mauidocs/a.fparameters.html
# use the 'schedctl -l' command to display current configuration

RMPOLLINTERVAL        00:00:30

SERVERPORT            42559
SERVERMODE            NORMAL

# Admin: http://supercluster.org/mauidocs/a.esecurity.html


LOGFILE               maui.log
LOGFILEMAXSIZE        10000000
LOGLEVEL              3

# Job Priority: http://supercluster.org/mauidocs/5.1jobprioritization.html

QUEUETIMEWEIGHT       1

# FairShare: http://supercluster.org/mauidocs/6.3fairshare.html

FSPOLICY              DEDICATEDPS
FSDEPTH               4
FSINTERVAL            604800
FSDECAY               0.50
FSWEIGHT	      1
FSUSERWEIGHT	      10
FSGROUPWEIGHT	      10

# Throttling Policies: http://supercluster.org/mauidocs/6.2throttlingpolicies.html

# NONE SPECIFIED

# Backfill: http://supercluster.org/mauidocs/8.2backfill.html

BACKFILLPOLICY        FIRSTFIT
RESERVATIONPOLICY     CURRENTHIGHEST
JOBNODEMATCHPOLICY      EXACTNODE
ENABLEMULTIREQJOBS	TRUE
ENABLEMULTINODEJOBS	TRUE

# Node Allocation: http://supercluster.org/mauidocs/5.2nodeallocation.html

NODEALLOCATIONPOLICY  MINRESOURCE

# QOS: http://supercluster.org/mauidocs/7.3qos.html

# QOSCFG[hi]  PRIORITY=100 XFTARGET=100 FLAGS=PREEMPTOR:IGNMAXJOB
# QOSCFG[low] PRIORITY=-1000 FLAGS=PREEMPTEE

# Standing Reservations: http://supercluster.org/mauidocs/7.1.3standingreservations.html

# SRSTARTTIME[test] 8:00:00
# SRENDTIME[test]   17:00:00
# SRDAYS[test]      MON TUE WED THU FRI
# SRTASKCOUNT[test] 20
# SRMAXTIME[test]   0:30:00

# Creds: http://supercluster.org/mauidocs/6.1fairnessoverview.html

# USERCFG[DEFAULT]      FSTARGET=25.0
# USERCFG[john]         PRIORITY=100  FSTARGET=10.0-
# GROUPCFG[staff]       PRIORITY=1000 QLIST=hi:low QDEF=hi
# CLASSCFG[batch]       FLAGS=PREEMPTEE
# CLASSCFG[interactive] FLAGS=PREEMPTOR
GROUPCFG[alouani]       PRIORITY=1000	FSTARGET=30
GROUPCFG[gemme]       PRIORITY=100	FSTARGET=30
GROUPCFG[gsi]       PRIORITY=100	FSTARGET=30
GROUPCFG[DEFAULT]       PRIORITY=10 	FSTARGET=10
CLASSCFG[short] MAXPROC=4 PRIORITY=1000
CLASSCFG[long] MAXPROC=8 PRIORITY=1000
CLASSCFG[opteron] PRIORITY=100
CLASSCFG[xeon] PRIORITY=100
CLASSCFG[xeon32G] PRIORITY=1000
CLASSCFG[opteron32G] PRIORITY=1000
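
To see how Maui actually applies these priorities and fairshare
targets, the standard diagnostics are (shown for reference; <jobid> is
a placeholder):

showq              # jobs as Maui sees them
checkjob <jobid>   # why a given job is or is not running
diagnose -f        # fairshare usage against the FS* targets above
diagnose -p        # per-job priority breakdown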

Thank you for your help
Chris

2009/12/17 Joshua Bernstein <jbernstein at penguincomputing.com>:
>
>
> chris job.fr wrote:
>>
>>  I am setting the walltime limits inside TORQUE. I realize that
>> something is inconsistent: the default walltime is longer than the max
>> walltime of some queues. Perhaps this is the problem. I have corrected
>> this mistake.
>>
>> here is my pbs environment :
>> #
>> # Create and define queue opteron32G
>> #
>> create queue opteron32G
>> set queue opteron32G queue_type = Execution
>> set queue opteron32G resources_max.walltime = 48:00:00
>> ...
>> #
>>
>> ...
>> # Set server attributes.
>> #
>> set server resources_default.walltime = 72:00:00
>
> This shouldn't be a problem, but if you want jobs to be killed at the
> walltime limit of queue opteron32G, you need to make sure the job
> actually runs in that queue.
>
> Are these MPI jobs or serial?
>
> -Josh
>
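
To make Josh's point concrete, a job meant to be bounded by the
opteron32G limit should be submitted to that queue explicitly
(job.sh is a hypothetical script name):

qsub -q opteron32G -l nodes=1:opteron32G,walltime=48:00:00 job.sh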

