[torqueusers] Some jobs don't run automatically

Albino Aveleda bino at coc.ufrj.br
Tue May 5 08:59:27 MDT 2009


Hi all,

I have an Altix 450 with 72 cores.
set server resources_available.ncpus = 72
set server resources_max.ncpus = 70

I disabled help for starving jobs in sched_config:
help_starving_jobs    false   ALL

Sometimes my users submit a job and it stays in the queue, like jobs 400 and 401
shown below. If I check why the job doesn't run, I see this message:
"Not Running: Not enough cpus available". But I do have free CPUs to run it. If I
force the job to run with the qrun command, it runs.
Why doesn't the job run automatically? How do I fix this?

Best regards,
Bibo

user at mycomputer:~> qstat -a
mycomputer.br: Altix 450
                                                                        
                                                                         Req'd  Req'd   Elap
Job ID               Username Queue    Jobname          SessID NDS   TSK Memory Time  S Time
-------------------- -------- -------- ---------------- ------ ----- --- ------ ----- - -----
260.mycomputer.br    user1    b_32core batch1           102907   --   32    --  200:0 R 63:16
265.mycomputer.br    user1    b_32core batch1              --    --   32    --  200:0 Q   --
266.mycomputer.br    user1    b_32core batch1              --    --   32    --  200:0 Q   --
377.mycomputer.br    user2    b_1core_ batch2           113028   --    1    --  800:0 R 36:56
388.mycomputer.br    user2    b_2cores batch2           109115   --    2    --  800:0 R 45:36
389.mycomputer.br    user2    b_4cores batch2           109194   --    4    --  800:0 R 45:35
393.mycomputer.br    user2    b_32core batch2              --    --   32    --  800:0 Q   --
396.mycomputer.br    user1    b_32core batch1              --    --   32    --  200:0 Q   --
397.mycomputer.br    user1    b_32core batch1              --    --   32    --  200:0 Q   --
398.mycomputer.br    user1    b_32core batch1              --    --   32    --  200:0 Q   --
400.mycomputer.br    user3    b_1core_ batch3              --    --    1    --  15:00 Q   --
401.mycomputer.br    user3    b_1core_ batch4              --    --    1    --  1500: Q   --
user at mycomputer:~> qstat -f 400
Job Id: 400.mycomputer.br
    Job_Name = batch3
    Job_Owner = user3 at mycomputer.br
    job_state = Q
    queue = b_1core_unlim
    server = mycomputer.br
    Checkpoint = u
    ctime = Mon May  4 18:18:40 2009
    Error_Path = mycomputer.br:/home/users/user3/batch3.e400
    Hold_Types = n
    Join_Path = oe
    Keep_Files = n
    Mail_Points = ae
    Mail_Users = user3 at gmail.com
    mtime = Mon May  4 18:18:40 2009
    Output_Path = mycomputer.br:/home/users/user3/batch3.o400
    Priority = 0
    qtime = Mon May  4 18:18:40 2009
    Rerunable = True
    Resource_List.ncpus = 1
    Resource_List.walltime = 15:00:00
    Shell_Path_List = /bin/csh
    comment = Not Running: Not enough cpus available
    etime = Mon May  4 18:18:40 2009
    submit_args = batch3.pbs

user at mycomputer:~> qstat -q

server: mycomputer

Queue            Memory CPU Time Walltime Node  Run Que Lm  State
---------------- ------ -------- -------- ----  --- --- --  -----
b_32cores_12hs     --      --    12:00:00   --    0   0  1   E R
b_1core_12hs       --      --    12:00:00   --    0   0  6   E R
b_2cores_1h        --      --    01:00:00   --    0   0  6   E R
b_16cores_12hs     --      --    12:00:00   --    0   0  2   E R
b_8cores_1h        --      --    01:00:00   --    0   0  3   E R
b_32cores_unlim    --      --       --      --    1   6  1   E R
b_4cores_1h        --      --    01:00:00   --    0   0  3   E R
b_1core_6hs        --      --    06:00:00   --    0   0 10   E R
b_2cores_unlim     --      --       --      --    1   0  2   E R
b_1core_1h         --      --    01:00:00   --    0   0 10   E R
b_4cores_6hs       --      --    06:00:00   --    0   0  3   E R
b_16cores_1h       --      --    01:00:00   --    0   0  3   E R
b_4cores_12hs      --      --    12:00:00   --    0   0  3   E R
b_2cores_6hs       --      --    06:00:00   --    0   0  6   E R
b_4cores_unlim     --      --       --      --    1   0  2   E R
b_32cores_6hs      --      --    06:00:00   --    0   0  1   E R
b_16cores_unlim    --      --       --      --    0   0  2   E R
b_2cores_12hs      --      --    12:00:00   --    0   0  6   E R
b_32cores_1h       --      --    01:00:00   --    0   0  1   E R
b_16cores_6hs      --      --    06:00:00   --    0   0  2   E R
b_1core_unlim      --      --       --      --    1   2  3   E R
default            --      --       --      --    0   0 64   E R
b_8cores_unlim     --      --       --      --    0   0  3   E R
b_8cores_6hs       --      --    06:00:00   --    0   0  3   E R
b_8cores_12hs      --      --    12:00:00   --    0   0  3   E R
                                               ----- -----
                                                   4     8
user at mycomputer:~> pbsnodes -a
mycomputer
     state = free
     np = 70
     ntype = cluster
     jobs = 0/377.mycomputer.br, 0/389.mycomputer.br, 0/388.mycomputer.br, 0/260.mycomputer.br
     status = opsys=linux,uname=Linux mycomputer 2.6.16.60-0.27-default #1 SMP Mon Jul 28 12:20:03 UTC 2008 ia64,sessions=15417 102907 109115 109194 113028,nsessions=5,nusers=3,idletime=60787,totmem=144828944kb,availmem=135536368kb,physmem=144828944kb,ncpus=72,loadave=39.00,netload=24366636537,state=free,jobs=260.mycomputer.br 388.mycomputer.br 389.mycomputer.br 377.mycomputer.br,varattr=,rectime=1241508607

venus:~ # qrun 400
venus:~ # qrun 401
venus:~ # qstat -a

mycomputer.brufrj.br: Altix 450
                                                                        
                                                                         Req'd  Req'd   Elap
Job ID               Username Queue    Jobname          SessID NDS   TSK Memory Time  S Time
-------------------- -------- -------- ---------------- ------ ----- --- ------ ----- - -----
260.mycomputer.br    user1    b_32core batch1           102907   --   32    --  200:0 R 63:22
265.mycomputer.br    user1    b_32core batch1              --    --   32    --  200:0 Q   --
266.mycomputer.br    user1    b_32core batch1              --    --   32    --  200:0 Q   --
377.mycomputer.br    user2    b_1core_ batch2           113028   --    1    --  800:0 R 37:02
388.mycomputer.br    user2    b_2cores batch2           109115   --    2    --  800:0 R 45:42
389.mycomputer.br    user2    b_4cores batch2           109194   --    4    --  800:0 R 45:41
393.mycomputer.br    user2    b_32core batch2              --    --   32    --  800:0 Q   --
396.mycomputer.br    user1    b_32core batch1              --    --   32    --  200:0 Q   --
397.mycomputer.br    user1    b_32core batch1              --    --   32    --  200:0 Q   --
398.mycomputer.br    user1    b_32core batch1              --    --   32    --  200:0 Q   --
400.mycomputer.br    user3    b_1core_ batch3           134844   --    1    --  15:00 R   --
401.mycomputer.br    user3    b_1core_ batch4           134898   --    1    --  1500: R   --




More information about the torqueusers mailing list