[torqueusers] Incorrect $PBS_NODEFILE using TORQUE/Maui

Matthias Schoepfer mschoepf at techfak.uni-bielefeld.de
Tue Mar 2 05:01:18 MST 2010


Hi!

I am using TORQUE with Maui. When I submit a job with:

qsub -l nodes=4:ppn=8 trympi.sh

the job is executed on only one node, although showq correctly
shows 4 nodes and 8 cores per node reserved for the job. The
PBS_NODEFILE is also incorrect, as it lists only one node.

Here are the examples (thanks in advance):

trympi.sh:

#PBS -N mpitest
#PBS -o mpitest.out
#PBS -e mpitest.err

echo $PBS_JOBID
echo "Start time :"
date

echo "PBS_NODEFILE = $PBS_NODEFILE"
cat $PBS_NODEFILE

cd mpitest
mpirun --mca btl self,tcp mpitest

echo "End Time :"
date

mpitest.out:

1193.macabeo
Start time :
Tue Mar  2 12:51:15 CET 2010
PBS_NODEFILE = /var/spool/torque/aux//1193.macabeo
node04
node04
node04
node04
node04
node04
node04
node04
End Time :
Tue Mar  2 12:51:24 CET 2010

cat /var/spool/torque/server_priv/nodes
node00 np=8
node01 np=8
node02 np=8
node03 np=8
node04 np=8
node05 np=8
node06 np=8
node07 np=8
node08 np=8
node09 np=8
node10 np=8
node11 np=8
node12 np=8
node13 np=8
node14 np=8
node15 np=8

qmgr -c 'p s'

#
# Create queues and set their attributes.
#
#
# Create and define queue debug
#
create queue debug
set queue debug queue_type = Execution
set queue debug Priority = 100
set queue debug max_running = 16
set queue debug resources_max.nodes = 16
set queue debug resources_max.walltime = 00:05:00
set queue debug resources_default.mem = 1024mb
set queue debug resources_default.nodes = 1
set queue debug resources_default.walltime = 00:05:00
set queue debug enabled = True
set queue debug started = True
#
# Create and define queue default
#
create queue default
set queue default queue_type = Route
set queue default max_running = 128
set queue default route_destinations = short
set queue default route_destinations += medium
set queue default route_destinations += long
set queue default enabled = True
set queue default started = True
#
# Create and define queue long
#
create queue long
set queue long queue_type = Execution
set queue long Priority = 40
set queue long max_running = 16
set queue long resources_max.walltime = 720:00:00
set queue long resources_min.cput = 72:00:01
set queue long resources_min.walltime = 72:00:01
set queue long resources_default.mem = 2048mb
set queue long resources_default.nodes = 1
set queue long resources_default.walltime = 240:00:00
set queue long enabled = True
set queue long started = True
#
# Create and define queue medium
#
create queue medium
set queue medium queue_type = Execution
set queue medium Priority = 70
set queue medium max_running = 128
set queue medium resources_max.walltime = 72:00:00
set queue medium resources_min.cput = 02:00:01
set queue medium resources_min.walltime = 02:00:01
set queue medium resources_default.mem = 2048mb
set queue medium resources_default.nodes = 1
set queue medium resources_default.walltime = 08:00:00
set queue medium enabled = True
set queue medium started = True
#
# Create and define queue short
#
create queue short
set queue short queue_type = Execution
set queue short Priority = 90
set queue short max_running = 128
set queue short resources_max.walltime = 02:00:00
set queue short resources_default.mem = 2048mb
set queue short resources_default.nodes = 1
set queue short resources_default.walltime = 00:20:00
set queue short enabled = True
set queue short started = True
#
# Set server attributes.
#
set server scheduling = True
set server max_user_run = 16
set server acl_hosts = macabeo
set server operators = mschoepf
set server operators += root
set server default_queue = default
set server log_events = 511
set server mail_from = c3-admins at lists.cit-ec.uni-bielefeld.de
set server resources_default.nodes = 1
set server resources_default.walltime = 02:00:00
set server scheduler_iteration = 600
set server node_check_rate = 150
set server tcp_timeout = 6
set server next_job_number = 1194

cat /etc/maui.cfg

SERVERHOST macabeo
SERVERPORT 42559
ADMIN1 root mschoepf
RMCFG[0] TYPE=PBS
NODEMAXLOAD 7.75
SERVERMODE NORMAL
LOGLEVEL 2
ENABLEMULTIREQJOBS TRUE
NODEAVAILABILITYPOLICY COMBINED
RESOURCELIMITPOLICY  MEM:EXTENDEDVIOLATION:CANCEL
# reserve 16 processors during primetime for jobs requiring less than 2
# hours to complete
#SRNAME[0]        fast
#SRTASKCOUNT[0]   16
#SRDAYS[0]        MON TUE WED THU FRI
#SRSTARTTIME[0]   9:00:00
#SRENDTIME[0]     18:00:00
#SRMAXTIME[0]     2:00:00

# prioritize jobs for Fairshare, XFactor, and Resources

#RESOURCEWEIGHT   20
#XFACTORWEIGHT    100
#FAIRSHAREWEIGHT  100

# disable SMP node sharing

#NODEACCESSPOLICY  SINGLEJOB

#ALLOCATIONPOLICY  CPULOAD
JOBNODEMATCHPOLICY EXACTNODE
NODEACCESSPOLICY SHARED



-- 

MfG  Matthias Schoepfer

email:mschoepf at techfak.uni-bielefeld.de, PGP-Key auf Anfrage

	      		       --- Werbung ---
				Math Problems?
                                     Call
                   0190-((10x)(13i)²)-(sin(xy)(log(y)))³


More information about the torqueusers mailing list