[torqueusers] parallel jobs - not so much

Jack Vant jvant at boisestate.edu
Fri Nov 1 13:11:19 MDT 2013


First, pardon me for my obvious ignorance.  I'm new to hpc administration
and I've managed to get torque and maui working with some queues, but I now
have eager grad students and professors who want to use their cluster.
We're having a problem when we try to run parallel jobs with openmpi.

More specifically, we're trying to run parallel jobs with openmpi while
making use of the $tmpdir variable as a staging area of sorts.  The theory
is that users with large jobs would copy their executables, configs, etc.
into the $tmpdir and when done copy the results back out.  We can run a job
against one node, but when we try a couple the $tmpdir is only created on
one node and the job fails to run.  We're trying to emulate something we've
seen explained well by the folks at Wayne State.  Something like this to do
all the copying:

cd $TMPDIR
cp /wsu/home/at/at80/at8036/pbs/job_name/script_file $TMPDIR
cp /wsu/home/at/at80/at8036/pbs/job_name/data_file $TMPDIR

$TMPDIR/$myexefile

mv $TMPDIR/* /wsu/home/at/at80/at8036/pbs/job_name/completed/.

I'm leaving out the PBS directives stanza.

We are using Torque version 4.2.2 and Maui 3.3.1.  We haven't tweaked Maui
at all.  It's the default install right now, and the Torque setup is pretty
minimal with queues and server that look like this:

So after many experiments we are wondering why torque perseverates on
node-32 or node-22 (which is where the $tmpdir gets created).  Why doesn't
it set up the environment on the other nodes even though they are allocated?

I'm including the results of qmgr -c 'p s'

#

# Create queues and set their attributes.

#

#

# Create and define queue cpu.q

#

create queue cpu.q

set queue cpu.q queue_type = Execution

set queue cpu.q acl_host_enable = False

set queue cpu.q acl_hosts = node-29.cm.cluster

set queue cpu.q acl_hosts += node-28.cm.cluster

set queue cpu.q acl_hosts += node-27.cm.cluster

set queue cpu.q acl_hosts += node-26.cm.cluster

set queue cpu.q acl_hosts += node-25.cm.cluster

set queue cpu.q acl_hosts += node-24.cm.cluster

set queue cpu.q acl_hosts += node-23.cm.cluster

set queue cpu.q acl_hosts += node-32.cm.cluster

set queue cpu.q acl_hosts += node-31.cm.cluster

set queue cpu.q acl_hosts += node-30.cm.cluster

set queue cpu.q resources_max.walltime = 240:00:00

set queue cpu.q resources_min.walltime = 00:00:00

set queue cpu.q resources_default.neednodes = cpu

set queue cpu.q enabled = True

set queue cpu.q started = True

#

# Create and define queue gpu.q

#

create queue gpu.q

set queue gpu.q queue_type = Execution

set queue gpu.q acl_host_enable = False

set queue gpu.q acl_hosts = node-19.cm.cluster

set queue gpu.q acl_hosts += node-09.cm.cluster

set queue gpu.q acl_hosts += node-18.cm.cluster

set queue gpu.q acl_hosts += node-08.cm.cluster

set queue gpu.q acl_hosts += node-17.cm.cluster

set queue gpu.q acl_hosts += node-07.cm.cluster

set queue gpu.q acl_hosts += node-16.cm.cluster

set queue gpu.q acl_hosts += node-06.cm.cluster

set queue gpu.q acl_hosts += node-15.cm.cluster

set queue gpu.q acl_hosts += node-05.cm.cluster

set queue gpu.q acl_hosts += node-14.cm.cluster

set queue gpu.q acl_hosts += node-04.cm.cluster

set queue gpu.q acl_hosts += node-13.cm.cluster

set queue gpu.q acl_hosts += node-03.cm.cluster

set queue gpu.q acl_hosts += node-22.cm.cluster

set queue gpu.q acl_hosts += node-12.cm.cluster

set queue gpu.q acl_hosts += node-02.cm.cluster

set queue gpu.q acl_hosts += node-21.cm.cluster

set queue gpu.q acl_hosts += node-11.cm.cluster

set queue gpu.q acl_hosts += node-01.cm.cluster

set queue gpu.q acl_hosts += node-20.cm.cluster

set queue gpu.q acl_hosts += node-10.cm.cluster

set queue gpu.q resources_max.walltime = 240:00:00

set queue gpu.q resources_min.walltime = 00:00:00

set queue gpu.q resources_default.neednodes = gpu

set queue gpu.q enabled = True

set queue gpu.q started = True

#

# Create and define queue batch

#

create queue batch

set queue batch queue_type = Execution

set queue batch acl_host_enable = False

set queue batch acl_hosts = node-29.cm.cluster

set queue batch acl_hosts += node-19.cm.cluster

set queue batch acl_hosts += node-09.cm.cluster

set queue batch acl_hosts += node-28.cm.cluster

set queue batch acl_hosts += node-18.cm.cluster

set queue batch acl_hosts += node-08.cm.cluster

set queue batch acl_hosts += node-27.cm.cluster

set queue batch acl_hosts += node-17.cm.cluster

set queue batch acl_hosts += node-07.cm.cluster

set queue batch acl_hosts += node-26.cm.cluster

set queue batch acl_hosts += node-16.cm.cluster

set queue batch acl_hosts += node-06.cm.cluster

set queue batch acl_hosts += node-25.cm.cluster

set queue batch acl_hosts += node-15.cm.cluster

set queue batch acl_hosts += node-05.cm.cluster

set queue batch acl_hosts += node-24.cm.cluster

set queue batch acl_hosts += node-14.cm.cluster

set queue batch acl_hosts += node-04.cm.cluster

set queue batch acl_hosts += node-23.cm.cluster

set queue batch acl_hosts += node-13.cm.cluster

set queue batch acl_hosts += node-03.cm.cluster

set queue batch acl_hosts += node-32.cm.cluster

set queue batch acl_hosts += node-22.cm.cluster

set queue batch acl_hosts += node-12.cm.cluster

set queue batch acl_hosts += node-02.cm.cluster

set queue batch acl_hosts += node-31.cm.cluster

set queue batch acl_hosts += node-21.cm.cluster

set queue batch acl_hosts += node-11.cm.cluster

set queue batch acl_hosts += node-01.cm.cluster

set queue batch acl_hosts += node-30.cm.cluster

set queue batch acl_hosts += node-20.cm.cluster

set queue batch acl_hosts += node-10.cm.cluster

set queue batch resources_max.walltime = 240:00:00

set queue batch resources_min.walltime = 00:00:00

set queue batch resources_default.walltime = 01:00:00

set queue batch enabled = True

set queue batch started = True

#

# Create and define queue all.q

#

create queue all.q

set queue all.q queue_type = Execution

set queue all.q acl_host_enable = False

set queue all.q acl_hosts = node-29.cm.cluster

set queue all.q acl_hosts += node-19.cm.cluster

set queue all.q acl_hosts += node-09.cm.cluster

set queue all.q acl_hosts += node-28.cm.cluster

set queue all.q acl_hosts += node-18.cm.cluster

set queue all.q acl_hosts += node-08.cm.cluster

set queue all.q acl_hosts += node-27.cm.cluster

set queue all.q acl_hosts += node-17.cm.cluster

set queue all.q acl_hosts += node-07.cm.cluster

set queue all.q acl_hosts += node-26.cm.cluster

set queue all.q acl_hosts += node-16.cm.cluster

set queue all.q acl_hosts += node-06.cm.cluster

set queue all.q acl_hosts += node-25.cm.cluster

set queue all.q acl_hosts += node-15.cm.cluster

set queue all.q acl_hosts += node-05.cm.cluster

set queue all.q acl_hosts += node-24.cm.cluster

set queue all.q acl_hosts += node-14.cm.cluster

set queue all.q acl_hosts += node-04.cm.cluster

set queue all.q acl_hosts += node-23.cm.cluster

set queue all.q acl_hosts += node-13.cm.cluster

set queue all.q acl_hosts += node-03.cm.cluster

set queue all.q acl_hosts += node-32.cm.cluster

set queue all.q acl_hosts += node-22.cm.cluster

set queue all.q acl_hosts += node-12.cm.cluster

set queue all.q acl_hosts += node-02.cm.cluster

set queue all.q acl_hosts += node-31.cm.cluster

set queue all.q acl_hosts += node-21.cm.cluster

set queue all.q acl_hosts += node-11.cm.cluster

set queue all.q acl_hosts += node-01.cm.cluster

set queue all.q acl_hosts += node-30.cm.cluster

set queue all.q acl_hosts += node-20.cm.cluster

set queue all.q acl_hosts += node-10.cm.cluster

set queue all.q resources_max.walltime = 240:00:00

set queue all.q resources_min.walltime = 00:00:00

set queue all.q resources_default.neednodes = all

set queue all.q enabled = True

set queue all.q started = True

#

# Set server attributes.

#

set server acl_hosts = master.cm.cluster

set server default_queue = batch

set server log_events = 511

set server mail_from = adm

set server scheduler_iteration = 600

set server node_check_rate = 150

set server tcp_timeout = 300

set server job_stat_rate = 45

set server poll_jobs = True

set server mom_job_sync = True

set server next_job_number = 372

set server moab_array_compatible = True

[roo



-- 
Jack Vant
System Engineer - HPC
Office of Information Technology
Boise State University
208-426-4446
208-863-0031
-------------- next part --------------
An HTML attachment was scrubbed...
URL: http://www.supercluster.org/pipermail/torqueusers/attachments/20131101/0dcb9cb6/attachment.html 


More information about the torqueusers mailing list