[torqueusers] -lnodes=X ignored??

Jeremy Enos jenos at ncsa.uiuc.edu
Tue Nov 13 16:18:32 MST 2007


I've got a 16-node cluster, and I'm using pbs_sched as the scheduler.
When I submit a job requesting multiple nodes, I only ever get one.  I
used the contributed torque.setup script to initialize my server.  I
don't think it's merely a misconfigured queue, though, since the job
actually launches.  As I understand it, what user jenos does below
shouldn't be possible: how does a job requiring 8 nodes actually get
into the R (running) state with just one?

________________________________________________________________
[jenos at qp test]$ echo "sleep 30" |qsub -l nodes=8
44.qp
[jenos at qp test]$ qstat -n

qp:
                                                                   Req'd  Req'd   Elap
Job ID               Username Queue    Jobname    SessID NDS   TSK Memory Time  S Time
-------------------- -------- -------- ---------- ------ ----- --- ------ ----- - -----
44.qp                jenos    batch    STDIN       23650     8  --    --  24:00 R   --
   qp01/0
[jenos at qp test]$
________________________________________________________________


Other info possibly relevant below.
thx-

    Jeremy




[root at qp ~]# qmgr -c "p s"
#
# Create queues and set their attributes.
#
#
# Create and define queue batch
#
create queue batch
set queue batch queue_type = Execution
set queue batch resources_default.nodes = 1:ppn=4
set queue batch resources_default.walltime = 24:00:00
set queue batch resources_available.nodect = 999999
set queue batch enabled = True
set queue batch started = True
#
# Set server attributes.
#
set server scheduling = True
set server managers = root@qp
set server operators = root@qp
set server default_queue = batch
set server log_events = 511
set server mail_from = adm
set server resources_available.nodect = 999999
set server scheduler_iteration = 600
set server node_check_rate = 150
set server tcp_timeout = 6
set server mom_job_sync = True
set server pbs_version = 2.2.1
set server keep_completed = 300
[root at qp ~]# qmgr -c "l s"
Server qp
        server_state = Active
        scheduling = True
        total_jobs = 1
        state_count = Transit:0 Queued:0 Held:0 Waiting:0 Running:0 Exiting:0
        managers = root@qp
        operators = root@qp
        default_queue = batch
        log_events = 511
        mail_from = adm
        resources_available.nodect = 999999
        resources_assigned.nodect = 0
        scheduler_iteration = 600
        node_check_rate = 150
        tcp_timeout = 6
        mom_job_sync = True
        pbs_version = 2.2.1
        keep_completed = 300
        net_counter = 5 0 0

[root at qp ~]# cat /var/torque/server_priv/nodes
qp01 np=4 qp ib
qp02 np=4 qp ib
qp03 np=4 qp ib
qp04 np=4 qp ib
qp05 np=4 qp ib
qp06 np=4 qp ib
qp07 np=4 qp ib
qp08 np=4 qp ib
qp09 np=4 qp ib
qp10 np=4 qp ib
qp11 np=4 qp ib
qp12 np=4 qp ib
qp13 np=4 qp ib
qp14 np=4 qp ib
qp15 np=4 qp ib
qp16 np=4 qp ib
[root at qp ~]#



More information about the torqueusers mailing list