[torqueusers] Serial jobs only running on one node

Jacqueline Scoggins jscoggins at lbl.gov
Tue Apr 4 14:46:02 MDT 2006


I am running torque-1.1.0p2.  I recently reconfigured my queuing system
to provide more than one queue; depending on the type of job you want to
run, certain limits will be set.

The problem I am having, with only the one queue called "serial", is that
all jobs in this queue go to the first node in the nodes file and to
no other machines.  If that node is down, they stay in the Q state;
otherwise all jobs from this queue go there and run on node0, even if the
system load average is above 2.0.

Here are my configurations:

qmgr -c 'p s '
#
# Create queues and set their attributes.
#
#
# Create and define queue batch
#
create queue batch
set queue batch queue_type = Route
set queue batch route_destinations = sh_16
set queue batch route_destinations += sh_32
set queue batch route_destinations += dc_16
set queue batch route_destinations += parallel
set queue batch route_destinations += serial
set queue batch enabled = True
set queue batch started = True
#
# Create and define queue dc_16
#
create queue dc_16
set queue dc_16 queue_type = Execution
set queue dc_16 resources_max.ncpus = 32
set queue dc_16 resources_max.nodect = 16
set queue dc_16 resources_max.walltime = 12:00:00
set queue dc_16 resources_min.ncpus = 17
set queue dc_16 resources_min.nodect = 4
set queue dc_16 resources_min.nodes = 4:dualcore
set queue dc_16 resources_default.walltime = 12:00:00
set queue dc_16 enabled = True
set queue dc_16 started = True
#
# Create and define queue serial
#
create queue serial
set queue serial queue_type = Execution
set queue serial enabled = True
set queue serial started = True
#
# Create and define queue sh_16
#
create queue sh_16
set queue sh_16 queue_type = Execution
set queue sh_16 resources_max.ncpus = 31
set queue sh_16 resources_max.nodect = 16
set queue sh_16 resources_max.walltime = 12:00:00
set queue sh_16 resources_min.ncpus = 17
set queue sh_16 resources_min.nodect = 8
set queue sh_16 resources_min.nodes = 8:shared
set queue sh_16 resources_default.walltime = 12:00:00
set queue sh_16 enabled = True
set queue sh_16 started = True
#
# Create and define queue parallel
#
create queue parallel
set queue parallel queue_type = Execution
set queue parallel resources_min.ncpus = 2
set queue parallel resources_min.nodes = 2:ppn=2
set queue parallel enabled = True
set queue parallel started = True
#
# Create and define queue sh_32
#
create queue sh_32
set queue sh_32 queue_type = Execution
set queue sh_32 resources_max.walltime = 06:00:00
set queue sh_32 resources_min.ncpus = 34
set queue sh_32 resources_min.nodect = 17
set queue sh_32 resources_min.nodes = 17:shared
set queue sh_32 resources_default.walltime = 06:00:00
set queue sh_32 enabled = True
set queue sh_32 started = True
#
# Set server attributes.
#
set server scheduling = True
set server max_running = 1000
set server max_user_run = 200
set server default_queue = batch
set server log_events = 511
set server mail_from = adm
set server query_other_jobs = True
set server scheduler_iteration = 60
set server node_ping_rate = 300
set server node_check_rate = 600
set server tcp_timeout = 6


qstat -qR

server: hbar.lbl.gov

Queue            Memory CPU Time Walltime Node Run Que Lm  State
---------------- ------ -------- -------- ---- --- --- --  -----
batch              --      --       	--     --    0   0   --   E R
dc_16              --      --      12:00:00   16    0   0   --   E R
serial             --      --            --     --    2   2   --   E R
sh_16              --      --      12:00:00   16    0   0   --   E R
parallel           --      --           --     --    0   1   --   E R
sh_32              --      --       06:00:00  --    0   0   --   E R
                                                           --- ---
                                                            2   3

# more config.nodes
$clienthost 192.168.2.10
$clienthost 192.168.1.200
$restricted 192.168.2.10
$logevent 255
$ideal_load 1.75
$max_load 1.99

# cat sched_priv/config


round_robin: false      all
by_queue: false         prime
by_queue: false         non_prime

strict_fifo: false      ALL
fair_share: false       ALL

help_starving_jobs      false   ALL

sort_queues     false   ALL

load_balancing: true    ALL



sort_by: shortest_job_first     ALL

log_filter: 256

dedicated_prefix: ded

max_starve: 24:00:00


half_life: 24:00:00

unknown_shares: 10

sync_time: 1:00:00

smp_cluster_dist: round_robin  prime
smp_cluster_dist: pack  non_prime


This configuration has been modified several times and new settings have
been added.  Any advice would be appreciated.

Thanks

Jackie




More information about the torqueusers mailing list