[torqueusers] Incorrect $PBS_NODEFILE using TORQUE/Maui
Matthias Schoepfer
mschoepf at techfak.uni-bielefeld.de
Tue Mar 2 05:01:18 MST 2010
Hi!
I am using TORQUE with Maui. When I submit a job with
qsub -l nodes=4:ppn=8 trympi.sh
the job is executed on only one node, although showq shows that 4 nodes
with 8 cores per node are correctly reserved for the job. The
$PBS_NODEFILE is also incorrect, as it lists only one node.
Here are the job script, its output, and the relevant configuration. Thanks in advance!
trympi.sh:
#PBS -N mpitest
#PBS -o mpitest.out
#PBS -e mpitest.err
echo $PBS_JOBID
echo "Start time :"
date
echo "PBS_NODEFILE = $PBS_NODEFILE"
cat $PBS_NODEFILE
cd mpitest
mpirun --mca btl self,tcp mpitest
echo "End Time :"
date
mpitest.out:
1193.macabeo
Start time :
Tue Mar 2 12:51:15 CET 2010
PBS_NODEFILE = /var/spool/torque/aux//1193.macabeo
node04
node04
node04
node04
node04
node04
node04
node04
End Time :
Tue Mar 2 12:51:24 CET 2010
cat /var/spool/torque/server_priv/nodes
node00 np=8
node01 np=8
node02 np=8
node03 np=8
node04 np=8
node05 np=8
node06 np=8
node07 np=8
node08 np=8
node09 np=8
node10 np=8
node11 np=8
node12 np=8
node13 np=8
node14 np=8
node15 np=8
qmgr -c 'p s'
#
# Create queues and set their attributes.
#
#
# Create and define queue debug
#
create queue debug
set queue debug queue_type = Execution
set queue debug Priority = 100
set queue debug max_running = 16
set queue debug resources_max.nodes = 16
set queue debug resources_max.walltime = 00:05:00
set queue debug resources_default.mem = 1024mb
set queue debug resources_default.nodes = 1
set queue debug resources_default.walltime = 00:05:00
set queue debug enabled = True
set queue debug started = True
#
# Create and define queue default
#
create queue default
set queue default queue_type = Route
set queue default max_running = 128
set queue default route_destinations = short
set queue default route_destinations += medium
set queue default route_destinations += long
set queue default enabled = True
set queue default started = True
#
# Create and define queue long
#
create queue long
set queue long queue_type = Execution
set queue long Priority = 40
set queue long max_running = 16
set queue long resources_max.walltime = 720:00:00
set queue long resources_min.cput = 72:00:01
set queue long resources_min.walltime = 72:00:01
set queue long resources_default.mem = 2048mb
set queue long resources_default.nodes = 1
set queue long resources_default.walltime = 240:00:00
set queue long enabled = True
set queue long started = True
#
# Create and define queue medium
#
create queue medium
set queue medium queue_type = Execution
set queue medium Priority = 70
set queue medium max_running = 128
set queue medium resources_max.walltime = 72:00:00
set queue medium resources_min.cput = 02:00:01
set queue medium resources_min.walltime = 02:00:01
set queue medium resources_default.mem = 2048mb
set queue medium resources_default.nodes = 1
set queue medium resources_default.walltime = 08:00:00
set queue medium enabled = True
set queue medium started = True
#
# Create and define queue short
#
create queue short
set queue short queue_type = Execution
set queue short Priority = 90
set queue short max_running = 128
set queue short resources_max.walltime = 02:00:00
set queue short resources_default.mem = 2048mb
set queue short resources_default.nodes = 1
set queue short resources_default.walltime = 00:20:00
set queue short enabled = True
set queue short started = True
#
# Set server attributes.
#
set server scheduling = True
set server max_user_run = 16
set server acl_hosts = macabeo
set server operators = mschoepf
set server operators += root
set server default_queue = default
set server log_events = 511
set server mail_from = c3-admins at lists.cit-ec.uni-bielefeld.de
set server resources_default.nodes = 1
set server resources_default.walltime = 02:00:00
set server scheduler_iteration = 600
set server node_check_rate = 150
set server tcp_timeout = 6
set server next_job_number = 1194
cat /etc/maui.cfg
SERVERHOST macabeo
SERVERPORT 42559
ADMIN1 root mschoepf
RMCFG[0] TYPE=PBS
NODEMAXLOAD 7.75
SERVERMODE NORMAL
LOGLEVEL 2
ENABLEMULTIREQJOBS TRUE
NODEAVAILABILITYPOLICY COMBINED
RESOURCELIMITPOLICY MEM:EXTENDEDVIOLATION:CANCEL
# reserve 16 processors during primetime for jobs requiring less than 2 hours to complete
#SRNAME[0] fast
#SRTASKCOUNT[0] 16
#SRDAYS[0] MON TUE WED THU FRI
#SRSTARTTIME[0] 9:00:00
#SRENDTIME[0] 18:00:00
#SRMAXTIME[0] 2:00:00
# prioritize jobs for Fairshare, XFactor, and Resources
#RESOURCEWEIGHT 20
#XFACTORWEIGHT 100
#FAIRSHAREWEIGHT 100
# disable SMP node sharing
#NODEACCESSPOLICY SINGLEJOB
#ALLOCATIONPOLICY CPULOAD
JOBNODEMATCHPOLICY EXACTNODE
NODEACCESSPOLICY SHARED
--
MfG Matthias Schoepfer
email:mschoepf at techfak.uni-bielefeld.de, PGP-Key auf Anfrage
--- Werbung ---
Math Problems?
Call
0190-((10x)(13i)²)-(sin(xy)(log(y)))³
More information about the torqueusers
mailing list