[torqueusers] pbsnodes reports the same job running many times

Leonardo Gregory Brunnet leon at if.ufrgs.br
Wed Apr 18 16:26:55 MDT 2012


Dear All,

In a freshly installed torque/maui cluster, the server reports the same
job as running several times on a given node. (No job is running
MPI!)

The pbsnodes output for one such node is:

node131
      state = job-exclusive
      np = 4
      properties = quadcore
      ntype = cluster
      jobs = 0/78898.master.cluster.XX.XX.XX, 
1/78898.master.cluster.XX.XX.XX, 2/78898.master.cluster.XX.XX.XX, 
3/78898.master.XX.XX.XX
      status = 
rectime=1334786811,varattr=,jobs=78898.master.cluster.if.ufrgs.br,state=free,netload=2914588064,gres=,loadave=1.00,ncpus=4,physmem=3985876kb,availmem=4649240kb,totmem=5062188kb,idletime=535832,nusers=2,nsessions=2,sessions=2804 
8224,uname=Linux node131 2.6.23-1-amd64 #1 SMP Fri Oct 12 23:45:48 UTC 
2007 x86_64,opsys=linux
      gpus = 0

But if we log in to that node, we see what was expected: a single job.
Since the torque server (or maui) "believes" all CPUs of that node are
busy, no other jobs are sent to it.  Any clues?

Thanks for the help!

Leonardo

Below is the output of
# qmgr -c "p s"

#
# Create queues and set their attributes.
#
#
# Create and define queue padrao
#
create queue padrao
set queue padrao queue_type = Execution
set queue padrao resources_default.nodes = 7
set queue padrao resources_default.walltime = 01:00:00
set queue padrao max_user_run = 5
set queue padrao enabled = True
set queue padrao started = True
#
# Create and define queue um_mes
#
create queue um_mes
set queue um_mes queue_type = Execution
set queue um_mes resources_max.nodes = 7
set queue um_mes resources_default.nodes = 7
set queue um_mes resources_default.walltime = 720:00:00
set queue um_mes max_user_run = 5
set queue um_mes enabled = True
set queue um_mes started = True
#
# Create and define queue batch
#
create queue batch
set queue batch queue_type = Execution
set queue batch resources_default.nodes = 1
set queue batch resources_default.walltime = 01:00:00
set queue batch enabled = True
set queue batch started = True
#
# Create and define queue um_dia
#
create queue um_dia
set queue um_dia queue_type = Execution
set queue um_dia resources_max.nodes = 7
set queue um_dia resources_default.nodes = 7
set queue um_dia resources_default.walltime = 24:00:00
set queue um_dia max_user_run = 7
set queue um_dia enabled = True
set queue um_dia started = True
#
# Create and define queue uma_semana
#
create queue uma_semana
set queue uma_semana queue_type = Execution
set queue uma_semana resources_max.nodes = 7
set queue uma_semana resources_default.nodes = 7
set queue uma_semana resources_default.walltime = 168:00:00
set queue uma_semana max_user_run = 5
set queue uma_semana enabled = True
set queue uma_semana started = True
#
# Create and define queue route
#
create queue route
set queue route queue_type = Route
set queue route route_destinations = padrao
set queue route route_destinations += padrao2
set queue route enabled = True
set queue route started = True
#
# Set server attributes.
#
set server scheduling = True
set server acl_hosts = master.cluster.XX.XX.XX
set server acl_hosts += clusterapg
set server managers = root at master.cluster.XX.XX.XX
set server operators = root at master.cluster.XX.XX.XX
set server default_queue = padrao
set server log_events = 511
set server mail_from = adm
set server scheduler_iteration = 600
set server node_check_rate = 150
set server tcp_timeout = 6
set server mom_job_sync = True
set server keep_completed = 300
set server next_job_number = 79033

-- 
Leonardo Gregory Brunnet                  E-mail: leon at if.ufrgs.br
Instituto de Fisica - UFRGS               http://pcleon.if.ufrgs.br
91501-970 Porto Alegre, RS, BRASIL        Phone: (51) 33 08 72 51
FAX +55 51 33 08 72 86                     C.P. 15051
Linux User: 39314



More information about the torqueusers mailing list