[torqueusers] Serial jobs only running on one node

Jacqueline Scoggins jscoggins at lbl.gov
Thu Apr 6 14:53:24 MDT 2006


Ok. I figured it out.

It is very simple:

If no location was specified on the Run Job request, but the job
requests nodes, then cluster nodes which match the request are allocated
if possible.

In your job script or on the command line, for serial jobs you have to state:

-l ncpus=1,nodes

Simple, isn't it?  It just took me some time to see it.


Jackie

On Tue, 2006-04-04 at 13:46, Jacqueline Scoggins wrote:
> I am running torque-1.1.0p2.  I just recently reconfigured my queuing
> system to allow more than one queue; based on the type of job you want to
> run, certain limits will be set.
> 
> The problem I am having, for one queue only (called "serial"), is that all
> jobs in this queue are going to the first node in the nodes file and to
> no other machines.  If that node is down they stay in the Q state;
> otherwise all jobs from this queue go there and run on node0, even if the
> system load average is above 2.0.
> 
> Here are my configurations:
> 
> qmgr -c 'p s '
> #
> # Create queues and set their attributes.
> #
> #
> # Create and define queue batch
> #
> create queue batch
> set queue batch queue_type = Route
> set queue batch route_destinations = sh_16
> set queue batch route_destinations += sh_32
> set queue batch route_destinations += dc_16
> set queue batch route_destinations += parallel
> set queue batch route_destinations += serial
> set queue batch enabled = True
> set queue batch started = True
> #
> # Create and define queue dc_16
> #
> create queue dc_16
> set queue dc_16 queue_type = Execution
> set queue dc_16 resources_max.ncpus = 32
> set queue dc_16 resources_max.nodect = 16
> set queue dc_16 resources_max.walltime = 12:00:00
> set queue dc_16 resources_min.ncpus = 17
> set queue dc_16 resources_min.nodect = 4
> set queue dc_16 resources_min.nodes = 4:dualcore
> set queue dc_16 resources_default.walltime = 12:00:00
> set queue dc_16 enabled = True
> set queue dc_16 started = True
> #
> # Create and define queue serial
> #
> create queue serial
> set queue serial queue_type = Execution
> set queue serial enabled = True
> set queue serial started = True
> #
> # Create and define queue sh_16
> #
> create queue sh_16
> set queue sh_16 queue_type = Execution
> set queue sh_16 resources_max.ncpus = 31
> set queue sh_16 resources_max.nodect = 16
> set queue sh_16 resources_max.walltime = 12:00:00
> set queue sh_16 resources_min.ncpus = 17
> set queue sh_16 resources_min.nodect = 8
> set queue sh_16 resources_min.nodes = 8:shared
> set queue sh_16 resources_default.walltime = 12:00:00
> set queue sh_16 enabled = True
> set queue sh_16 started = True
> #
> # Create and define queue parallel
> #
> create queue parallel
> set queue parallel queue_type = Execution
> set queue parallel resources_min.ncpus = 2
> set queue parallel resources_min.nodes = 2:ppn=2
> set queue parallel enabled = True
> set queue parallel started = True
> #
> # Create and define queue sh_32
> #
> create queue sh_32
> set queue sh_32 queue_type = Execution
> set queue sh_32 resources_max.walltime = 06:00:00
> set queue sh_32 resources_min.ncpus = 34
> set queue sh_32 resources_min.nodect = 17
> set queue sh_32 resources_min.nodes = 17:shared
> set queue sh_32 resources_default.walltime = 06:00:00
> set queue sh_32 enabled = True
> set queue sh_32 started = True
> #
> # Set server attributes.
> #
> set server scheduling = True
> set server max_running = 1000
> set server max_user_run = 200
> set server default_queue = batch
> set server log_events = 511
> set server mail_from = adm
> set server query_other_jobs = True
> set server scheduler_iteration = 60
> set server node_ping_rate = 300
> set server node_check_rate = 600
> set server tcp_timeout = 6
> 
> 
> qstat -qR
> 
> server: hbar.lbl.gov
> 
> Queue            Memory CPU Time Walltime Node Run Que Lm  State
> ---------------- ------ -------- -------- ---- --- --- --  -----
> batch              --      --       	--     --    0   0   --   E R
> dc_16              --      --      12:00:00   16    0   0   --   E R
> serial             --      --            --     --    2   2   --   E R
> sh_16              --      --      12:00:00   16    0   0   --   E R
> parallel           --      --           --     --    0   1   --   E R
> sh_32              --      --       06:00:00  --    0   0   --   E R
>                                                            --- ---
>                                                             2   3
> 
> # more config.nodes
> $clienthost 192.168.2.10
> $clienthost 192.168.1.200
> $restricted 192.168.2.10
> $logevent 255
> $ideal_load 1.75
> $max_load 1.99
> 
> # cat sched_priv/config
> 
> 
> round_robin: false      all
> by_queue: false         prime
> by_queue: false         non_prime
> 
> strict_fifo: false      ALL
> fair_share: false       ALL
> 
> help_starving_jobs      false   ALL
> 
> sort_queues     false   ALL
> 
> load_balancing: true    ALL
> 
> 
> 
> sort_by: shortest_job_first     ALL
> 
> log_filter: 256
> 
> dedicated_prefix: ded
> 
> max_starve: 24:00:00
> 
> 
> half_life: 24:00:00
> 
> unknown_shares: 10
> 
> sync_time: 1:00:00
> 
> smp_cluster_dist: round_robin  prime
> smp_cluster_dist: pack  non_prime
> 
> 
> This has been modified several times and new stuff added.  Any advice
> would be appreciated at this time.
> 
> Thanks
> 
> Jackie
> 
> 
> _______________________________________________
> torqueusers mailing list
> torqueusers at supercluster.org
> http://www.supercluster.org/mailman/listinfo/torqueusers



More information about the torqueusers mailing list