[torqueusers] problems with parallel jobs

Albino Aveleda bino at coc.ufrj.br
Thu Mar 15 09:58:49 MDT 2007


Hi Pat,

The problem is that Torque doesn't reserve the nodes to run the job. Please
look at the output of the command "qstat -f":
exec_host = node-1-01/1+node-1-01/0

In my job I requested 4x2 = 8 CPUs (-l nodes=4:ppn=2), but Torque reserved
only two CPUs on node-1-01.
If I submit another job, Torque will reserve two CPUs on another node,
for example node-1-02.

I have another script with MPI that runs on another cluster. The
script below is only a test of node allocation.

I think the problem is something in the configuration, but I don't
know what.

Best Regards,
Bibo

Quoting pat.o'bryant at exxonmobil.com:

> Bibo,
>    I believe the parallel code is responsible for distributing your job
> across multiple nodes, not Torque. Here is part of one of my jobs on a
> cluster that uses "lam", a parallel job execution process. Note that some
> mention is made of the method("ssh") that is used to communicate with the
> parallel nodes. Also, "mpirun" is a standard binary for distributing
> parallel work. Do a "google" search on "mpi" and that may also help.
>
> export LAMRSH="/usr/bin/ssh"
> export TMPDIR=/dev/shm
> lamboot -v /tmp/lam_boot.$PBS_JOBID
> mpirun C /users/jwobrya/c.code/mpi/hello++
> lamhalt
>
> Good luck,
>
> J.W. (Pat) O'Bryant,Jr.
> Business Line Infrastructure
> Technical Systems, HPC
> Office: 713-431-7022
>
>
>
>
>             Albino Aveleda
>             <bino at coc.ufrj
>             .br>                                                       To
>             Sent by:                 torqueusers at supercluster.org
>             torqueusers-bo                                             cc
>             unces at superclu
>             ster.org                                              Subject
>                                      [torqueusers] problems with parallel
>                                      jobs
>             03/14/07 03:49
>             PM
>
>
>
>
>
>
>
>
> Hi,
>
> I have a cluster with 32 nodes where each node has 2 cpus.
> I made a parallel test job called "ptest.job" (see below).
> In this job I want to use 8 CPUs, so I specified
> "-l nodes=4:ppn=2" in my job. But unfortunately this doesn't work. When I
> submitted this job, Torque selected only 2 CPUs on the first node. You can
> see this output and the output of
> the job below. The output of the command "pbsnodes -a" shows all 32
> nodes with 2 CPUs per node.
> I also show my Torque configuration below.
>
> What is wrong in my configuration?
>
> Best regards,
> Bibo
>
> +++ file ptest.job +++
> #PBS -l nodes=4:ppn=2
> #PBS -l walltime=00:05:00
> #PBS -j oe
> #PBS -N ptest
>
> # change directory
> cd ${PBS_O_WORKDIR}
> /bin/hostname
> # get the number of nodes
> NUM_NODES=`cat $PBS_NODEFILE | wc -l`
> echo "Num. nodes = $NUM_NODES"
> cat $PBS_NODEFILE
>
> sleep 60
> +++ end file +++
>
> +++ qstat -f +++
> Job Id: 29.adm
>    Job_Name = ptest
>    Job_Owner = bino at adm
>    job_state = R
>    queue = b_8cpus
>    server = adm
>    Checkpoint = u
>    ctime = Wed Mar 14 14:22:18 2007
>    exec_host = node-1-01/1+node-1-01/0
>    Hold_Types = n
>    Join_Path = oe
>    Keep_Files = n
>    Mail_Points = a
>    mtime = Wed Mar 14 14:22:19 2007
>    Priority = 0
>    qtime = Wed Mar 14 14:22:18 2007
>    Rerunable = True
>    Resource_List.ncpus = 8
>    Resource_List.nodect = 4
>    Resource_List.nodes = 4:ppn=2
>    Resource_List.walltime = 00:05:00
>    comment = Job started on Wed Mar 14 at 14:22
>    etime = Wed Mar 14 14:22:18 2007
> +++
>
> +++ output file ptest.o29 +++
> node-1-01
> Num. nodes = 2
> node-1-01
> node-1-01
> +++ end file +++
>
> +++ file /var/spool/torque/server_priv/nodes +++
> node-1-01 np=2
> node-1-02 np=2
> ...
> node-1-32 np=2
> +++ end file +++
>
> +++ torque configuration +++
> #
> # Create queues and set their attributes.
> #
> #
> # Create and define queue default
> #
> create queue default
> set queue default queue_type = Route
> set queue default max_running = 64
> set queue default route_destinations = b_8cpus
> set queue default route_destinations += b_4cpus
> set queue default route_destinations += b_2cpus
> set queue default route_destinations += b_1cpu
> set queue default enabled = True
> set queue default started = True
> #
> # Create and define queue b_2cpus
> #
> create queue b_2cpus
> set queue b_2cpus queue_type = Execution
> set queue b_2cpus Priority = 180
> set queue b_2cpus max_running = 32
> set queue b_2cpus resources_min.ncpus = 2
> set queue b_2cpus resources_min.nodect = 1
> set queue b_2cpus resources_default.ncpus = 2
> set queue b_2cpus resources_default.nodect = 1
> set queue b_2cpus resources_default.nodes = 1
> set queue b_2cpus enabled = True
> set queue b_2cpus started = True
> #
> # Create and define queue b_1cpu
> #
> create queue b_1cpu
> set queue b_1cpu queue_type = Execution
> set queue b_1cpu Priority = 200
> set queue b_1cpu max_running = 64
> set queue b_1cpu resources_default.ncpus = 1
> set queue b_1cpu resources_default.nodect = 1
> set queue b_1cpu resources_default.nodes = 1
> set queue b_1cpu max_user_run = 4
> set queue b_1cpu enabled = True
> set queue b_1cpu started = True
> #
> # Create and define queue b_4cpus
> #
> create queue b_4cpus
> set queue b_4cpus queue_type = Execution
> set queue b_4cpus Priority = 160
> set queue b_4cpus max_running = 16
> set queue b_4cpus resources_min.ncpus = 3
> set queue b_4cpus resources_min.nodect = 2
> set queue b_4cpus resources_default.ncpus = 4
> set queue b_4cpus resources_default.nodect = 2
> set queue b_4cpus resources_default.nodes = 2
> set queue b_4cpus enabled = True
> set queue b_4cpus started = True
> #
> # Create and define queue b_8cpus
> #
> create queue b_8cpus
> set queue b_8cpus queue_type = Execution
> set queue b_8cpus Priority = 140
> set queue b_8cpus max_running = 8
> set queue b_8cpus resources_min.ncpus = 5
> set queue b_8cpus resources_min.nodect = 3
> set queue b_8cpus resources_default.ncpus = 8
> set queue b_8cpus resources_default.nodect = 4
> set queue b_8cpus resources_default.nodes = 4
> set queue b_8cpus enabled = True
> set queue b_8cpus started = True
> #
> # Set server attributes.
> #
> set server scheduling = True
> set server acl_host_enable = False
> set server managers = root at adm
> set server operators = root at adm
> set server default_queue = default
> set server log_events = 511
> set server mail_from = pbs
> set server query_other_jobs = True
> set server resources_available.nodect = 32
> set server resources_default.ncpus = 1
> set server resources_default.nodect = 1
> set server resources_default.nodes = 1
> set server scheduler_iteration = 600
> set server node_check_rate = 150
> set server tcp_timeout = 6
> set server node_pack = False
> set server pbs_version = 2.1.7
> +++ end torque configuration +++
>
> []'s,
> Bibo
>



More information about the torqueusers mailing list