[torqueusers] Walltime elapsed
chris job.fr
chrisjob.fr at gmail.com
Fri Dec 18 01:49:37 MST 2009
These are MPI jobs.
The problem is that when the maximum walltime of the queue is
reached, the job doesn't stop immediately. TORQUE considers the job
finished, but it stays on the nodes, and a new job is then sent to those same nodes.
Usually one of the nodes goes down after a while.
Here are my TORQUE and Maui configurations. Is there something wrong?
1) TORQUE
#
# Create queues and set their attributes.
#
#
# Create and define queue xeon32G
#
create queue xeon32G
set queue xeon32G queue_type = Execution
set queue xeon32G resources_max.mem = 32000000kb
set queue xeon32G resources_max.walltime = 72:00:00
set queue xeon32G resources_default.neednodes = xeon32G
set queue xeon32G resources_default.nodes = 1:xeon32G
set queue xeon32G enabled = True
set queue xeon32G started = True
#
# Create and define queue long
#
create queue long
set queue long queue_type = Execution
set queue long resources_max.nodes = 1:ppn=8
set queue long resources_max.walltime = 168:00:00
set queue long resources_default.neednodes = long
set queue long resources_default.nodes = 1:long
set queue long enabled = True
set queue long started = True
#
# Create and define queue xeon
#
create queue xeon
set queue xeon queue_type = Execution
set queue xeon resources_max.mem = 16000000kb
set queue xeon resources_max.walltime = 72:00:00
set queue xeon resources_default.neednodes = xeon
set queue xeon resources_default.nodes = 1:xeon
set queue xeon enabled = True
set queue xeon started = True
#
# Create and define queue opteron
#
create queue opteron
set queue opteron queue_type = Execution
set queue opteron resources_max.mem = 8000000kb
set queue opteron resources_max.walltime = 24:00:00
set queue opteron resources_default.neednodes = opteron
set queue opteron resources_default.nodes = 1:opteron
set queue opteron enabled = True
set queue opteron started = True
#
# Create and define queue opteron32G
#
create queue opteron32G
set queue opteron32G queue_type = Execution
set queue opteron32G resources_max.mem = 32000000kb
set queue opteron32G resources_max.walltime = 48:00:00
set queue opteron32G resources_default.neednodes = opteron32G
set queue opteron32G resources_default.nodes = 1:opteron32G
set queue opteron32G enabled = True
set queue opteron32G started = True
#
# Create and define queue def
#
create queue def
set queue def queue_type = Route
set queue def route_destinations = opteron
set queue def route_destinations += xeon
set queue def route_destinations += opteron32G
set queue def route_destinations += xeon32G
set queue def enabled = True
set queue def started = True
#
# Create and define queue short
#
create queue short
set queue short queue_type = Execution
set queue short resources_max.nodes = 1:ppn=4
set queue short resources_max.walltime = 01:00:00
set queue short resources_default.neednodes = short
set queue short resources_default.nodes = 1:short
set queue short enabled = True
set queue short started = True
#
# Set server attributes.
#
set server scheduling = True
set server default_queue = def
set server log_events = 511
set server mail_from = adm
set server query_other_jobs = True
set server resources_default.mem = 4000000kb
set server resources_default.neednodes = 1
set server resources_default.nodect = 1
set server resources_default.nodes = 1
set server resources_default.walltime = 72:00:00
set server scheduler_iteration = 600
set server node_check_rate = 150
set server tcp_timeout = 6
set server pbs_version = 2.1.6
2) MAUI
# maui.cfg 3.2.6p16
SERVERHOST XXXX
# primary admin must be first in list
ADMIN1 root
# Resource Manager Definition
RMCFG[TWELVE] TYPE=PBS
# Allocation Manager Definition
AMCFG[bank] TYPE=NONE
# full parameter docs at http://supercluster.org/mauidocs/a.fparameters.html
# use the 'schedctl -l' command to display current configuration
RMPOLLINTERVAL 00:00:30
SERVERPORT 42559
SERVERMODE NORMAL
# Admin: http://supercluster.org/mauidocs/a.esecurity.html
LOGFILE maui.log
LOGFILEMAXSIZE 10000000
LOGLEVEL 3
# Job Priority: http://supercluster.org/mauidocs/5.1jobprioritization.html
QUEUETIMEWEIGHT 1
# FairShare: http://supercluster.org/mauidocs/6.3fairshare.html
FSPOLICY DEDICATEDPS
FSDEPTH 4
FSINTERVAL 604800
FSDECAY 0.50
FSWEIGHT 1
FSUSERWEIGHT 10
FSGROUPWEIGHT 10
# Throttling Policies: http://supercluster.org/mauidocs/6.2throttlingpolicies.html
# NONE SPECIFIED
# Backfill: http://supercluster.org/mauidocs/8.2backfill.html
BACKFILLPOLICY FIRSTFIT
RESERVATIONPOLICY CURRENTHIGHEST
JOBNODEMATCHPOLICY EXACTNODE
ENABLEMULTIREQJOBS TRUE
ENABLEMULTINODEJOBS TRUE
# Node Allocation: http://supercluster.org/mauidocs/5.2nodeallocation.html
NODEALLOCATIONPOLICY MINRESOURCE
# QOS: http://supercluster.org/mauidocs/7.3qos.html
# QOSCFG[hi] PRIORITY=100 XFTARGET=100 FLAGS=PREEMPTOR:IGNMAXJOB
# QOSCFG[low] PRIORITY=-1000 FLAGS=PREEMPTEE
# Standing Reservations: http://supercluster.org/mauidocs/7.1.3standingreservations.html
# SRSTARTTIME[test] 8:00:00
# SRENDTIME[test] 17:00:00
# SRDAYS[test] MON TUE WED THU FRI
# SRTASKCOUNT[test] 20
# SRMAXTIME[test] 0:30:00
# Creds: http://supercluster.org/mauidocs/6.1fairnessoverview.html
# USERCFG[DEFAULT] FSTARGET=25.0
# USERCFG[john] PRIORITY=100 FSTARGET=10.0-
# GROUPCFG[staff] PRIORITY=1000 QLIST=hi:low QDEF=hi
# CLASSCFG[batch] FLAGS=PREEMPTEE
# CLASSCFG[interactive] FLAGS=PREEMPTOR
GROUPCFG[alouani] PRIORITY=1000 FSTARGET=30
GROUPCFG[gemme] PRIORITY=100 FSTARGET=30
groUPCFG[gsi] PRIORITY=100 FSTARGET=30
GROUPCFG[DEFAULT] PRIORITY=10 FSTARGET=10
CLASSCFG[short] MAXPROC=4 PRIORITY=1000
CLASSCFG[long] MAXPROC=8 PRIORITY=1000
CLASSCFG[opteron] PRIORITY=100
CLASSCFG[xeon] PRIORITY=100
CLASSCFG[xeon32G] PRIORITY=1000
CLASSCFG[opteron32G] PRIORITY=1000
Thank you for your help
Chris
2009/12/17 Joshua Bernstein <jbernstein at penguincomputing.com>:
>
>
> chris job.fr wrote:
>>
>> I am setting the walltime limits inside TORQUE. I realise that
>> something is inconsistent: the default walltime is longer than the max
>> walltime of some queues. Perhaps this is the problem. I have corrected
>> this mistake.
>>
>> Here is my PBS environment:
>> #
>> # Create and define queue opteron32G
>> #
>> create queue opteron32G
>> set queue opteron32G queue_type = Execution
>> set queue opteron32G resources_max.walltime = 48:00:00
>> ...
>> #
>>
>> ...
>> # Set server attributes.
>> #
>> set server resources_default.walltime = 72:00:00
>
> This shouldn't be a problem, but if you want jobs to be killed when they
> reach the walltime limit of queue opteron32G, you need to make sure they
> actually run inside that queue.
>
> Are these MPI jobs or serial?
>
> -Josh
>
More information about the torqueusers
mailing list