[torqueusers] parallel jobs - not so much

Gus Correa gus at ldeo.columbia.edu
Fri Nov 1 13:42:44 MDT 2013


Hi Jack

1. You need to enable job scheduling:

qmgr -c 'set server scheduling = True'
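
You can check that it took effect with something like this
(the exact output formatting may vary a bit between Torque versions):

qmgr -c 'print server' | grep scheduling

which should show "set server scheduling = True".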


2. I am not sure I understood right, but it looks to me like
your $TMPDIR is on a local disk on each compute node, right?

Staging executables and data in and out of local disk
is possible (but can be painful).
If you want to do this for parallel jobs, you need to copy
the executable and data to the $TMPDIR on *every* node
participating in that particular job.
This is probably why your 2-node job fails,
and it is also why staging in is painful (copying to all nodes).
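
If you really do want to stage to the local $TMPDIR, a rough sketch of
the stage-in part could look like the one below.  This assumes
passwordless ssh between the compute nodes, that $TMPDIR resolves to
the same path on every node, and the file and directory names are just
placeholders:

# copy the executable and input to $TMPDIR on every node of the job
for host in $(sort -u $PBS_NODEFILE); do
   ssh $host "mkdir -p $TMPDIR"
   scp $HOME/job_dir/my_exe $HOME/job_dir/data_file $host:$TMPDIR/
done

You would then need a matching loop at the end of the job script to
copy the results back and clean up each node's $TMPDIR.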


3. Instead of item 2 above, it is much easier to have an NFS-shared
directory (or directories) across the whole cluster where
the data and executables live.  In that case you don't need to
copy anything to local disks.
It is enough to add a line like this to the job script:

cd $PBS_O_WORKDIR

which will cd to the directory from which the job was launched with qsub
(and is presumably the work directory where the executable and data are).

For a modest-sized cluster (which yours seems to be, 30 nodes or so), NFS
should work OK, unless the jobs are very I/O intensive.
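
For what it's worth, a minimal job script along those lines might look
like this (the queue name, node and walltime requests, process count,
and executable name are placeholders, adjust them to your site):

#!/bin/bash
#PBS -q batch
#PBS -l nodes=2:ppn=8
#PBS -l walltime=01:00:00
#PBS -N my_mpi_job

cd $PBS_O_WORKDIR
mpirun -np 16 ./my_mpi_program > my_mpi_program.log 2>&1

If your OpenMPI was built with Torque (tm) support, mpirun picks up the
node list from the job automatically; otherwise you may need to add
something like "-hostfile $PBS_NODEFILE".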

I hope this helps,
Gus Correa

On 11/01/2013 03:11 PM, Jack Vant wrote:
> First, pardon me for my obvious ignorance.  I'm new to HPC
> administration and I've managed to get Torque and Maui working with some
> queues, but I now have eager grad students and professors who want to
> use their cluster.  We're having a problem when we try to run parallel
> jobs with OpenMPI.
>
> More specifically, we're trying to run parallel jobs with OpenMPI while
> making use of the $TMPDIR variable as a staging area of sorts.  The
> theory is that users with large jobs would copy their executables,
> configs, etc. into $TMPDIR and, when done, copy the results back out.
> We can run a job against one node, but when we try a couple of nodes
> the $TMPDIR is only created on one node and the job fails to run.  We're
> trying to emulate something we've seen explained well by the folks at
> Wayne State.  Something like this to do all the copying:
>
> cd $TMPDIR
> cp /wsu/home/at/at80/at8036/pbs/job_name/script_file $TMPDIR
> cp /wsu/home/at/at80/at8036/pbs/job_name/data_file $TMPDIR
>
> $TMPDIR/$myexefile
>
> mv $TMPDIR/* /wsu/home/at/at80/at8036/pbs/job_name/completed/.
>
> I'm leaving out the PBS directives stanza.
>
> We are using Torque version 4.2.2 and Maui 3.3.1.  We haven't tweaked
> Maui at all.  It's the default install right now, and the Torque setup
> is pretty minimal, with queues and a server config that look like this:
>
> So after many experiments we are wondering why Torque perseverates on
> node-32 or node-22 (which is where the $TMPDIR gets created).  Why
> doesn't it set up the environment on the other nodes even though they
> are allocated?
>
> I'm including the results of qmgr -c 'p s'
>
> #
> # Create queues and set their attributes.
> #
> #
> # Create and define queue cpu.q
> #
> create queue cpu.q
> set queue cpu.q queue_type = Execution
> set queue cpu.q acl_host_enable = False
> set queue cpu.q acl_hosts = node-29.cm.cluster
> set queue cpu.q acl_hosts += node-28.cm.cluster
> set queue cpu.q acl_hosts += node-27.cm.cluster
> set queue cpu.q acl_hosts += node-26.cm.cluster
> set queue cpu.q acl_hosts += node-25.cm.cluster
> set queue cpu.q acl_hosts += node-24.cm.cluster
> set queue cpu.q acl_hosts += node-23.cm.cluster
> set queue cpu.q acl_hosts += node-32.cm.cluster
> set queue cpu.q acl_hosts += node-31.cm.cluster
> set queue cpu.q acl_hosts += node-30.cm.cluster
> set queue cpu.q resources_max.walltime = 240:00:00
> set queue cpu.q resources_min.walltime = 00:00:00
> set queue cpu.q resources_default.neednodes = cpu
> set queue cpu.q enabled = True
> set queue cpu.q started = True
> #
> # Create and define queue gpu.q
> #
> create queue gpu.q
> set queue gpu.q queue_type = Execution
> set queue gpu.q acl_host_enable = False
> set queue gpu.q acl_hosts = node-19.cm.cluster
> set queue gpu.q acl_hosts += node-09.cm.cluster
> set queue gpu.q acl_hosts += node-18.cm.cluster
> set queue gpu.q acl_hosts += node-08.cm.cluster
> set queue gpu.q acl_hosts += node-17.cm.cluster
> set queue gpu.q acl_hosts += node-07.cm.cluster
> set queue gpu.q acl_hosts += node-16.cm.cluster
> set queue gpu.q acl_hosts += node-06.cm.cluster
> set queue gpu.q acl_hosts += node-15.cm.cluster
> set queue gpu.q acl_hosts += node-05.cm.cluster
> set queue gpu.q acl_hosts += node-14.cm.cluster
> set queue gpu.q acl_hosts += node-04.cm.cluster
> set queue gpu.q acl_hosts += node-13.cm.cluster
> set queue gpu.q acl_hosts += node-03.cm.cluster
> set queue gpu.q acl_hosts += node-22.cm.cluster
> set queue gpu.q acl_hosts += node-12.cm.cluster
> set queue gpu.q acl_hosts += node-02.cm.cluster
> set queue gpu.q acl_hosts += node-21.cm.cluster
> set queue gpu.q acl_hosts += node-11.cm.cluster
> set queue gpu.q acl_hosts += node-01.cm.cluster
> set queue gpu.q acl_hosts += node-20.cm.cluster
> set queue gpu.q acl_hosts += node-10.cm.cluster
> set queue gpu.q resources_max.walltime = 240:00:00
> set queue gpu.q resources_min.walltime = 00:00:00
> set queue gpu.q resources_default.neednodes = gpu
> set queue gpu.q enabled = True
> set queue gpu.q started = True
> #
> # Create and define queue batch
> #
> create queue batch
> set queue batch queue_type = Execution
> set queue batch acl_host_enable = False
> set queue batch acl_hosts = node-29.cm.cluster
> set queue batch acl_hosts += node-19.cm.cluster
> set queue batch acl_hosts += node-09.cm.cluster
> set queue batch acl_hosts += node-28.cm.cluster
> set queue batch acl_hosts += node-18.cm.cluster
> set queue batch acl_hosts += node-08.cm.cluster
> set queue batch acl_hosts += node-27.cm.cluster
> set queue batch acl_hosts += node-17.cm.cluster
> set queue batch acl_hosts += node-07.cm.cluster
> set queue batch acl_hosts += node-26.cm.cluster
> set queue batch acl_hosts += node-16.cm.cluster
> set queue batch acl_hosts += node-06.cm.cluster
> set queue batch acl_hosts += node-25.cm.cluster
> set queue batch acl_hosts += node-15.cm.cluster
> set queue batch acl_hosts += node-05.cm.cluster
> set queue batch acl_hosts += node-24.cm.cluster
> set queue batch acl_hosts += node-14.cm.cluster
> set queue batch acl_hosts += node-04.cm.cluster
> set queue batch acl_hosts += node-23.cm.cluster
> set queue batch acl_hosts += node-13.cm.cluster
> set queue batch acl_hosts += node-03.cm.cluster
> set queue batch acl_hosts += node-32.cm.cluster
> set queue batch acl_hosts += node-22.cm.cluster
> set queue batch acl_hosts += node-12.cm.cluster
> set queue batch acl_hosts += node-02.cm.cluster
> set queue batch acl_hosts += node-31.cm.cluster
> set queue batch acl_hosts += node-21.cm.cluster
> set queue batch acl_hosts += node-11.cm.cluster
> set queue batch acl_hosts += node-01.cm.cluster
> set queue batch acl_hosts += node-30.cm.cluster
> set queue batch acl_hosts += node-20.cm.cluster
> set queue batch acl_hosts += node-10.cm.cluster
> set queue batch resources_max.walltime = 240:00:00
> set queue batch resources_min.walltime = 00:00:00
> set queue batch resources_default.walltime = 01:00:00
> set queue batch enabled = True
> set queue batch started = True
> #
> # Create and define queue all.q
> #
> create queue all.q
> set queue all.q queue_type = Execution
> set queue all.q acl_host_enable = False
> set queue all.q acl_hosts = node-29.cm.cluster
> set queue all.q acl_hosts += node-19.cm.cluster
> set queue all.q acl_hosts += node-09.cm.cluster
> set queue all.q acl_hosts += node-28.cm.cluster
> set queue all.q acl_hosts += node-18.cm.cluster
> set queue all.q acl_hosts += node-08.cm.cluster
> set queue all.q acl_hosts += node-27.cm.cluster
> set queue all.q acl_hosts += node-17.cm.cluster
> set queue all.q acl_hosts += node-07.cm.cluster
> set queue all.q acl_hosts += node-26.cm.cluster
> set queue all.q acl_hosts += node-16.cm.cluster
> set queue all.q acl_hosts += node-06.cm.cluster
> set queue all.q acl_hosts += node-25.cm.cluster
> set queue all.q acl_hosts += node-15.cm.cluster
> set queue all.q acl_hosts += node-05.cm.cluster
> set queue all.q acl_hosts += node-24.cm.cluster
> set queue all.q acl_hosts += node-14.cm.cluster
> set queue all.q acl_hosts += node-04.cm.cluster
> set queue all.q acl_hosts += node-23.cm.cluster
> set queue all.q acl_hosts += node-13.cm.cluster
> set queue all.q acl_hosts += node-03.cm.cluster
> set queue all.q acl_hosts += node-32.cm.cluster
> set queue all.q acl_hosts += node-22.cm.cluster
> set queue all.q acl_hosts += node-12.cm.cluster
> set queue all.q acl_hosts += node-02.cm.cluster
> set queue all.q acl_hosts += node-31.cm.cluster
> set queue all.q acl_hosts += node-21.cm.cluster
> set queue all.q acl_hosts += node-11.cm.cluster
> set queue all.q acl_hosts += node-01.cm.cluster
> set queue all.q acl_hosts += node-30.cm.cluster
> set queue all.q acl_hosts += node-20.cm.cluster
> set queue all.q acl_hosts += node-10.cm.cluster
> set queue all.q resources_max.walltime = 240:00:00
> set queue all.q resources_min.walltime = 00:00:00
> set queue all.q resources_default.neednodes = all
> set queue all.q enabled = True
> set queue all.q started = True
> #
> # Set server attributes.
> #
> set server acl_hosts = master.cm.cluster
> set server default_queue = batch
> set server log_events = 511
> set server mail_from = adm
> set server scheduler_iteration = 600
> set server node_check_rate = 150
> set server tcp_timeout = 300
> set server job_stat_rate = 45
> set server poll_jobs = True
> set server mom_job_sync = True
> set server next_job_number = 372
> set server moab_array_compatible = True
>
> [roo
>
>
>
>
> --
> Jack Vant
> System Engineer - HPC
> Office of Information Technology
> Boise State University
> 208-426-4446
> 208-863-0031
>
>
> _______________________________________________
> torqueusers mailing list
> torqueusers at supercluster.org
> http://www.supercluster.org/mailman/listinfo/torqueusers


