[torqueusers] problems with parallel jobs

Albino Aveleda bino at coc.ufrj.br
Wed Mar 14 12:49:24 MDT 2007


Hi,

I have a cluster with 32 nodes, where each node has 2 CPUs.
I made a parallel test job called "ptest.job" (see below).
In this job I want to use 8 CPUs, so I specified
"-l nodes=4:ppn=2" in my job. Unfortunately, this doesn't work. When I submitted
this job, Torque selected only 2 CPUs on the first node. You can see
this in the "qstat -f" output and in the output of
the job below. The output of the command "pbsnodes -a" shows all 32
nodes with 2 CPUs per node.
My Torque configuration is shown below too.

What is wrong in my configuration?

Best regards,
Bibo

+++ file ptest.job +++
#PBS -l nodes=4:ppn=2
#PBS -l walltime=00:05:00
#PBS -j oe
#PBS -N ptest

# change directory
cd ${PBS_O_WORKDIR}
/bin/hostname
# get the number of nodes
NUM_NODES=`cat $PBS_NODEFILE | wc -l`
echo "Num. nodes = $NUM_NODES"
cat $PBS_NODEFILE

sleep 60
+++ end file +++

+++ qstat -f +++
Job Id: 29.adm
    Job_Name = ptest
    Job_Owner = bino at adm
    job_state = R
    queue = b_8cpus
    server = adm
    Checkpoint = u
    ctime = Wed Mar 14 14:22:18 2007
    exec_host = node-1-01/1+node-1-01/0
    Hold_Types = n
    Join_Path = oe
    Keep_Files = n
    Mail_Points = a
    mtime = Wed Mar 14 14:22:19 2007
    Priority = 0
    qtime = Wed Mar 14 14:22:18 2007
    Rerunable = True
    Resource_List.ncpus = 8
    Resource_List.nodect = 4
    Resource_List.nodes = 4:ppn=2
    Resource_List.walltime = 00:05:00
    comment = Job started on Wed Mar 14 at 14:22
    etime = Wed Mar 14 14:22:18 2007
+++

+++ output file ptest.o29 +++
node-1-01
Num. nodes = 2
node-1-01
node-1-01
+++ end file +++

+++ file /var/spool/torque/server_priv/nodes +++
node-1-01 np=2
node-1-02 np=2
...
node-1-32 np=2
+++ end file +++

+++ torque configuration +++
#
# Create queues and set their attributes.
#
#
# Create and define queue default
#
create queue default
set queue default queue_type = Route
set queue default max_running = 64
set queue default route_destinations = b_8cpus
set queue default route_destinations += b_4cpus
set queue default route_destinations += b_2cpus
set queue default route_destinations += b_1cpu
set queue default enabled = True
set queue default started = True
#
# Create and define queue b_2cpus
#
create queue b_2cpus
set queue b_2cpus queue_type = Execution
set queue b_2cpus Priority = 180
set queue b_2cpus max_running = 32
set queue b_2cpus resources_min.ncpus = 2
set queue b_2cpus resources_min.nodect = 1
set queue b_2cpus resources_default.ncpus = 2
set queue b_2cpus resources_default.nodect = 1
set queue b_2cpus resources_default.nodes = 1
set queue b_2cpus enabled = True
set queue b_2cpus started = True
#
# Create and define queue b_1cpu
#
create queue b_1cpu
set queue b_1cpu queue_type = Execution
set queue b_1cpu Priority = 200
set queue b_1cpu max_running = 64
set queue b_1cpu resources_default.ncpus = 1
set queue b_1cpu resources_default.nodect = 1
set queue b_1cpu resources_default.nodes = 1
set queue b_1cpu max_user_run = 4
set queue b_1cpu enabled = True
set queue b_1cpu started = True
#
# Create and define queue b_4cpus
#
create queue b_4cpus
set queue b_4cpus queue_type = Execution
set queue b_4cpus Priority = 160
set queue b_4cpus max_running = 16
set queue b_4cpus resources_min.ncpus = 3
set queue b_4cpus resources_min.nodect = 2
set queue b_4cpus resources_default.ncpus = 4
set queue b_4cpus resources_default.nodect = 2
set queue b_4cpus resources_default.nodes = 2
set queue b_4cpus enabled = True
set queue b_4cpus started = True
#
# Create and define queue b_8cpus
#
create queue b_8cpus
set queue b_8cpus queue_type = Execution
set queue b_8cpus Priority = 140
set queue b_8cpus max_running = 8
set queue b_8cpus resources_min.ncpus = 5
set queue b_8cpus resources_min.nodect = 3
set queue b_8cpus resources_default.ncpus = 8
set queue b_8cpus resources_default.nodect = 4
set queue b_8cpus resources_default.nodes = 4
set queue b_8cpus enabled = True
set queue b_8cpus started = True
#
# Set server attributes.
#
set server scheduling = True
set server acl_host_enable = False
set server managers = root at adm
set server operators = root at adm
set server default_queue = default
set server log_events = 511
set server mail_from = pbs
set server query_other_jobs = True
set server resources_available.nodect = 32
set server resources_default.ncpus = 1
set server resources_default.nodect = 1
set server resources_default.nodes = 1
set server scheduler_iteration = 600
set server node_check_rate = 150
set server tcp_timeout = 6
set server node_pack = False
set server pbs_version = 2.1.7
+++ end torque configuration +++

[]'s,
Bibo



More information about the torqueusers mailing list