[torqueusers] Serial jobs only running on one node
Jacqueline Scoggins
jscoggins at lbl.gov
Tue Apr 4 14:46:02 MDT 2006
I am running torque-1.1.0p2. I just recently reconfigured my queuing
system to allow more than one queue; depending on the type of job you
want to run, certain limits will be set.
The problem I am having, for only the one queue called "serial", is that
all jobs in this queue are going to the first node in the nodes file and
to no other machines. If that node is down, they stay in the Q state;
otherwise, all jobs from this queue go there and run on node0, even if
the system load average is above 2.0.
Here are my configurations:
qmgr -c 'p s '
#
# Create queues and set their attributes.
#
#
# Create and define queue batch
#
create queue batch
set queue batch queue_type = Route
set queue batch route_destinations = sh_16
set queue batch route_destinations += sh_32
set queue batch route_destinations += dc_16
set queue batch route_destinations += parallel
set queue batch route_destinations += serial
set queue batch enabled = True
set queue batch started = True
#
# Create and define queue dc_16
#
create queue dc_16
set queue dc_16 queue_type = Execution
set queue dc_16 resources_max.ncpus = 32
set queue dc_16 resources_max.nodect = 16
set queue dc_16 resources_max.walltime = 12:00:00
set queue dc_16 resources_min.ncpus = 17
set queue dc_16 resources_min.nodect = 4
set queue dc_16 resources_min.nodes = 4:dualcore
set queue dc_16 resources_default.walltime = 12:00:00
set queue dc_16 enabled = True
set queue dc_16 started = True
#
# Create and define queue serial
#
create queue serial
set queue serial queue_type = Execution
set queue serial enabled = True
set queue serial started = True
#
# Create and define queue sh_16
#
create queue sh_16
set queue sh_16 queue_type = Execution
set queue sh_16 resources_max.ncpus = 31
set queue sh_16 resources_max.nodect = 16
set queue sh_16 resources_max.walltime = 12:00:00
set queue sh_16 resources_min.ncpus = 17
set queue sh_16 resources_min.nodect = 8
set queue sh_16 resources_min.nodes = 8:shared
set queue sh_16 resources_default.walltime = 12:00:00
set queue sh_16 enabled = True
set queue sh_16 started = True
#
# Create and define queue parallel
#
create queue parallel
set queue parallel queue_type = Execution
set queue parallel resources_min.ncpus = 2
set queue parallel resources_min.nodes = 2:ppn=2
set queue parallel enabled = True
set queue parallel started = True
#
# Create and define queue sh_32
#
create queue sh_32
set queue sh_32 queue_type = Execution
set queue sh_32 resources_max.walltime = 06:00:00
set queue sh_32 resources_min.ncpus = 34
set queue sh_32 resources_min.nodect = 17
set queue sh_32 resources_min.nodes = 17:shared
set queue sh_32 resources_default.walltime = 06:00:00
set queue sh_32 enabled = True
set queue sh_32 started = True
#
# Set server attributes.
#
set server scheduling = True
set server max_running = 1000
set server max_user_run = 200
set server default_queue = batch
set server log_events = 511
set server mail_from = adm
set server query_other_jobs = True
set server scheduler_iteration = 60
set server node_ping_rate = 300
set server node_check_rate = 600
set server tcp_timeout = 6
qstat -qR
server: hbar.lbl.gov
Queue Memory CPU Time Walltime Node Run Que Lm State
---------------- ------ -------- -------- ---- --- --- -- -----
batch -- -- -- -- 0 0 -- E R
dc_16 -- -- 12:00:00 16 0 0 -- E R
serial -- -- -- -- 2 2 -- E R
sh_16 -- -- 12:00:00 16 0 0 -- E R
parallel -- -- -- -- 0 1 -- E R
sh_32 -- -- 06:00:00 -- 0 0 -- E R
--- ---
2 3
# more config.nodes
$clienthost 192.168.2.10
$clienthost 192.168.1.200
$restricted 192.168.2.10
$logevent 255
$ideal_load 1.75
$max_load 1.99
# cat sched_priv/config
round_robin: false all
by_queue: false prime
by_queue: false non_prime
strict_fifo: false ALL
fair_share: false ALL
help_starving_jobs false ALL
sort_queues false ALL
load_balancing: true ALL
sort_by: shortest_job_first ALL
log_filter: 256
dedicated_prefix: ded
max_starve: 24:00:00
half_life: 24:00:00
unknown_shares: 10
sync_time: 1:00:00
smp_cluster_dist: round_robin prime
smp_cluster_dist: pack non_prime
This has been modified several times and new stuff added. Any advice
would be appreciated at this time.
Thanks
Jackie
More information about the torqueusers
mailing list