[torqueusers] Job Allocation problem
Mark Meenan
mjm-www at dcs.gla.ac.uk
Tue Mar 21 03:26:01 MST 2006
I have come across an interesting problem. A user of the cluster
submitted a couple of hundred jobs to the queue - by default they went
into the feed queue and the resources requested were such that they were
moved to the long queue, which then filled up to the max_queuable limit.
The remaining jobs were then moved to the parallel queue (which is
logical even if I did not anticipate it). However the jobs when they
went into the parallel queue were allocated 8 nodes - which is not the
behaviour I would have expected. Details of the queue structure are
copied below.
I am currently running torque-1.2.0p2, although I intend to upgrade to the
latest version in the near future.
The jobs are fed into the short, long, verylong and parallel queues (in that order):
> create queue short
> set queue short queue_type = Execution
> set queue short max_queuable = 500
> set queue short max_running = 120
> set queue short resources_max.cput = 02:00:00
> set queue short resources_max.nodect = 1
> set queue short resources_max.nodes = 1
> set queue short resources_max.walltime = 04:00:00
> set queue short resources_default.cput = 01:00:00
> set queue short resources_default.nodes = 1
> set queue short resources_default.walltime = 04:00:00
> set queue short enabled = True
> set queue short started = True
> #
> # Create and define queue feed
> #
> create queue feed
> set queue feed queue_type = Route
> set queue feed max_queuable = 3000
> set queue feed route_destinations = short
> set queue feed route_destinations += long
> set queue feed route_destinations += verylong
> set queue feed route_destinations += parallel
> set queue feed enabled = True
> set queue feed started = True
> #
> # Create and define queue long
> #
> create queue long
> set queue long queue_type = Execution
> set queue long max_queuable = 160
> set queue long max_running = 40
> set queue long resources_max.cput = 30:00:00
> set queue long resources_max.nodect = 1
> set queue long resources_max.nodes = 1
> set queue long resources_max.walltime = 50:00:00
> set queue long resources_default.cput = 24:00:00
> set queue long resources_default.nodes = 1
> set queue long resources_default.walltime = 50:00:00
> set queue long enabled = True
> set queue long started = True
> #
> # Create and define queue verylong
> #
> create queue verylong
> set queue verylong queue_type = Execution
> set queue verylong max_queuable = 20
> set queue verylong max_running = 10
> set queue verylong resources_max.nodect = 1
> set queue verylong resources_max.nodes = 1
> set queue verylong resources_min.cput = 24:00:01
> set queue verylong resources_min.walltime = 24:00:01
> set queue verylong resources_default.nodes = 1
> set queue verylong resources_default.walltime = 288:00:00
> set queue verylong enabled = True
> set queue verylong started = True
> #
> # Create and define queue parallel
> #
> create queue parallel
> set queue parallel queue_type = Execution
> set queue parallel max_queuable = 10
> set queue parallel max_running = 5
> set queue parallel resources_max.nodect = 8
> set queue parallel resources_max.nodes = 8
> set queue parallel resources_default.nodes = 1
> set queue parallel resources_default.walltime = 288:00:00
> set queue parallel enabled = True
> set queue parallel started = True
>
I have since added the following lines of configuration and expect that
this will solve the particular problem, but I would still like to know the
reason why 8 nodes were allocated:
> set queue parallel resources_min.nodect = 2
> set queue parallel resources_min.nodes = 2
> set queue parallel resources_default.nodect = 2
> set queue parallel resources_default.nodes = 2
Thanks
Mark Meenan
University of Glasgow
Computing Service
More information about the torqueusers
mailing list