[torqueusers] Serial jobs only running on one node
Jacqueline Scoggins
jscoggins at lbl.gov
Thu Apr 6 14:53:24 MDT 2006
Ok. I figured it out.
It is very simple:
If no location was specified on the Run Job request, but the job
requests nodes, then cluster nodes which match the request are allocated
if possible.
In your job script or on the command line for serial jobs, you have to state:
-l ncpus=1,nodes
Simple, isn't it? It just took me some time to see it.
Jackie
On Tue, 2006-04-04 at 13:46, Jacqueline Scoggins wrote:
> I am running torque-1.1.0p2. I just recently reconfigured my queuing
> system to allow more than one queue; based on the type of job you want
> to run, certain limits will be set.
>
> The problem I am having, for one queue only (called "serial"), is that
> all jobs in this queue are going to the first node in the nodes file and
> to no other machines. If that node is down, they stay in the Q (queued)
> state; otherwise, all jobs from this queue go there and run on node0,
> even if the system load average is above 2.0.
>
> Here are my configurations:
>
> qmgr -c 'p s '
> #
> # Create queues and set their attributes.
> #
> #
> # Create and define queue batch
> #
> create queue batch
> set queue batch queue_type = Route
> set queue batch route_destinations = sh_16
> set queue batch route_destinations += sh_32
> set queue batch route_destinations += dc_16
> set queue batch route_destinations += parallel
> set queue batch route_destinations += serial
> set queue batch enabled = True
> set queue batch started = True
> #
> # Create and define queue dc_16
> #
> create queue dc_16
> set queue dc_16 queue_type = Execution
> set queue dc_16 resources_max.ncpus = 32
> set queue dc_16 resources_max.nodect = 16
> set queue dc_16 resources_max.walltime = 12:00:00
> set queue dc_16 resources_min.ncpus = 17
> set queue dc_16 resources_min.nodect = 4
> set queue dc_16 resources_min.nodes = 4:dualcore
> set queue dc_16 resources_default.walltime = 12:00:00
> set queue dc_16 enabled = True
> set queue dc_16 started = True
> #
> # Create and define queue serial
> #
> create queue serial
> set queue serial queue_type = Execution
> set queue serial enabled = True
> set queue serial started = True
> #
> # Create and define queue sh_16
> #
> create queue sh_16
> set queue sh_16 queue_type = Execution
> set queue sh_16 resources_max.ncpus = 31
> set queue sh_16 resources_max.nodect = 16
> set queue sh_16 resources_max.walltime = 12:00:00
> set queue sh_16 resources_min.ncpus = 17
> set queue sh_16 resources_min.nodect = 8
> set queue sh_16 resources_min.nodes = 8:shared
> set queue sh_16 resources_default.walltime = 12:00:00
> set queue sh_16 enabled = True
> set queue sh_16 started = True
> #
> # Create and define queue parallel
> #
> create queue parallel
> set queue parallel queue_type = Execution
> set queue parallel resources_min.ncpus = 2
> set queue parallel resources_min.nodes = 2:ppn=2
> set queue parallel enabled = True
> set queue parallel started = True
> #
> # Create and define queue sh_32
> #
> create queue sh_32
> set queue sh_32 queue_type = Execution
> set queue sh_32 resources_max.walltime = 06:00:00
> set queue sh_32 resources_min.ncpus = 34
> set queue sh_32 resources_min.nodect = 17
> set queue sh_32 resources_min.nodes = 17:shared
> set queue sh_32 resources_default.walltime = 06:00:00
> set queue sh_32 enabled = True
> set queue sh_32 started = True
> #
> # Set server attributes.
> #
> set server scheduling = True
> set server max_running = 1000
> set server max_user_run = 200
> set server default_queue = batch
> set server log_events = 511
> set server mail_from = adm
> set server query_other_jobs = True
> set server scheduler_iteration = 60
> set server node_ping_rate = 300
> set server node_check_rate = 600
> set server tcp_timeout = 6
>
>
> qstat -qR
>
> server: hbar.lbl.gov
>
> Queue Memory CPU Time Walltime Node Run Que Lm State
> ---------------- ------ -------- -------- ---- --- --- -- -----
> batch -- -- -- -- 0 0 -- E R
> dc_16 -- -- 12:00:00 16 0 0 -- E R
> serial -- -- -- -- 2 2 -- E R
> sh_16 -- -- 12:00:00 16 0 0 -- E R
> parallel -- -- -- -- 0 1 -- E R
> sh_32 -- -- 06:00:00 -- 0 0 -- E R
> --- ---
> 2 3
>
> # more config.nodes
> $clienthost 192.168.2.10
> $clienthost 192.168.1.200
> $restricted 192.168.2.10
> $logevent 255
> $ideal_load 1.75
> $max_load 1.99
>
> # cat sched_priv/config
>
>
> round_robin: false all
> by_queue: false prime
> by_queue: false non_prime
>
> strict_fifo: false ALL
> fair_share: false ALL
>
> help_starving_jobs false ALL
>
> sort_queues false ALL
>
> load_balancing: true ALL
>
>
>
> sort_by: shortest_job_first ALL
>
> log_filter: 256
>
> dedicated_prefix: ded
>
> max_starve: 24:00:00
>
>
> half_life: 24:00:00
>
> unknown_shares: 10
>
> sync_time: 1:00:00
>
> smp_cluster_dist: round_robin prime
> smp_cluster_dist: pack non_prime
>
>
> This has been modified several times and new stuff added. Any advice
> would be appreciated at this time.
>
> Thanks
>
> Jackie
>
>
> _______________________________________________
> torqueusers mailing list
> torqueusers at supercluster.org
> http://www.supercluster.org/mailman/listinfo/torqueusers
More information about the torqueusers
mailing list