[torqueusers] jobs won't start with torque scheduler

etienne gondet etienne.gondet at mercator-ocean.fr
Fri Feb 3 09:22:00 MST 2006


If you only use pbs_sched, and not (yet) Maui or Moab for better
scheduling, there is a default option called "help starving jobs" that is
set to true with a threshold of 24 hours. It means that no new job can
start while at least one job has been waiting for longer than 24 hours:
the scheduler drains the system to make room for the starving job, which
is exactly what the comment on your queued job says.

It's in /var/spool/PBS/sched_priv/sched_config

Just have a look:
grep -in starv sched_config

help_starving_jobs  true    ALL
max_starve: 24:00:00

Just set it to false (and make pbs_sched re-read its configuration), or switch to Maui.
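
For example, a rough sketch, assuming the sched_config path above and that
your pbs_sched re-reads its configuration on SIGHUP (if unsure, simply
restart the scheduler instead):

    # edit /var/spool/PBS/sched_priv/sched_config so the line reads:
    #   help_starving_jobs  false   ALL
    sed -i 's/^help_starving_jobs.*/help_starving_jobs  false   ALL/' \
        /var/spool/PBS/sched_priv/sched_config
    # tell the running scheduler to pick up the change
    kill -HUP $(pidof pbs_sched)

Once the change is active, the "Draining system to allow starving job to
run" comment should disappear and queued jobs should start on the free
nodes again.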

        Sincerely yours,
                    Etienne Gondet
                    GIP MERCATOR-Ocean

Tomas Pevny wrote:

>Hi,
>I have a peculiar problem with the Torque scheduler. I have several queues on 
>the server -- High, MatlabQueue and medium.
>MatlabQueue has its limit on the number of running jobs set to 50, since I have
>only 50 Matlab licenses. The other two queues have the limit set to 80, which
>is the number of processors / cores in the cluster.
>My problem is that jobs from queues other than MatlabQueue do not start.
>Jobs from MatlabQueue are scheduled to start immediately when another job
>finishes. The result is that there are 30 free processors and the scheduler 
>does not start any job from the other queues on these free processors. When I run
>qstat -f on one of the queued jobs, it shows this:
>
>Job Id: 1750.master.bw01.binghamton.edu
>    Job_Name = gapsOutguess-91-100-1
>    Job_Owner = tomik at master.bw01.binghamton.edu
>    job_state = Q
>    queue = medium
>    server = master.bw01.binghamton.edu
>    Checkpoint = u
>    ctime = Tue Jan 31 13:06:45 2006
>    Error_Path = master.bw01.binghamton.edu:/home/tomik/submitScripts/gapsOutguess-91-100-1.e1750
>    Hold_Types = n
>    Join_Path = oe
>    Keep_Files = n
>    Mail_Points = a
>    mtime = Tue Jan 31 13:06:45 2006
>    Output_Path = master.bw01.binghamton.edu:/home/tomik/submitScripts/output/gapsOutguess-91-100-1.out
>    Priority = 0
>    qtime = Tue Jan 31 13:06:45 2006
>    Rerunable = False
>    Resource_List.neednodes = 1:ppn=1
>    Resource_List.nodect = 1
>    Resource_List.nodes = 1:ppn=1
>    substate = 10
>    Variable_List = PBS_O_HOME=/home/tomik,PBS_O_LANG=en_US.UTF-8,
>        PBS_O_LOGNAME=tomik,
>        PBS_O_PATH=/opt/maui/bin:/opt/torque/bin:/opt/bin:/opt/hdfview/bin:
>            /opt/hdf/bin:/opt/ncarg/bin:/opt/mpich/p4-pathscale/bin:
>            /opt/mpiexec/x86_64/bin:/usr/kerberos/bin:/opt/java/jdk1.5.0/bin:
>            /opt/gm/sbin:/opt/gm/bin:/usr/lib64/ccache/bin:/usr/local/bin:
>            /bin:/usr/bin:/usr/X11R6/bin:/opt/java/jdk1.5.0/jre/bin:
>            /opt/pathscale/bin:/home/tomik/bin,
>        PBS_O_MAIL=/var/spool/mail/tomik,PBS_O_SHELL=/bin/bash,
>        PBS_O_HOST=master.bw01.binghamton.edu,
>        PBS_O_WORKDIR=/home/tomik/submitScripts,QF=91,PERCENT=100,
>        START_INDEX=1,STEP=1,PBS_O_QUEUE=medium
>    euser = tomik
>    egroup = users
>    queue_rank = 1657
>    queue_type = E
>    comment = Not Running: Draining system to allow starving job to run
>    etime = Tue Jan 31 13:06:45 2006
>
>I have also run pbsnodes -a, and I see that all nodes are marked as free, 
>so they should be able to accept new jobs.
>
>This is not the first time this has happened. Before, I rebooted the server, 
>deleted all jobs from the queues, etc. Since this time it happened immediately 
>after the reboot, I would like to find the cause of this behaviour. I have 
>searched the internet but did not find anything. Does anybody have 
>similar experience? 
>
>This is how the PBS scheduler is set up:
>qmgr -c "print server"
># Create queues and set their attributes.
>#
>#
># Create and define queue MatlabQueue
>#
>create queue MatlabQueue
>set queue MatlabQueue queue_type = Execution
>set queue MatlabQueue Priority = 75
>set queue MatlabQueue max_running = 50
>set queue MatlabQueue max_user_run = 200
>set queue MatlabQueue enabled = True
>set queue MatlabQueue started = True
>#
># Create and define queue default
>#
>create queue default
>set queue default queue_type = Execution
>set queue default Priority = 75
>set queue default max_running = 100
>set queue default max_user_run = 200
>set queue default enabled = True
>set queue default started = True
>#
># Create and define queue high
>#
>create queue high
>set queue high queue_type = Execution
>set queue high Priority = 100
>set queue high max_running = 100
>set queue high max_user_run = 80
>set queue high enabled = True
>set queue high started = True
>#
># Create and define queue medium
>#
>create queue medium
>set queue medium queue_type = Execution
>set queue medium Priority = 75
>set queue medium max_running = 100
>set queue medium max_user_run = 200
>set queue medium enabled = True
>set queue medium started = True
>#
># Set server attributes.
>#
>set server scheduling = True
>set server max_running = 80
>set server max_user_run = 80
>set server acl_host_enable = True
>set server acl_hosts = master.bw01.binghamton.edu
>set server acl_hosts += *.bw01.binghamton.edu
>set server acl_hosts += localhost.localdomain
>set server managers = mpiadmin at master.bw01.binghamton.edu
>set server managers += mpiadmin at localhost.localdomain
>set server managers += root at master.bw01.binghamton.edu
>set server managers += root at localhost.localdomain
>set server default_queue = MatlabQueue
>set server log_events = 127
>set server mail_from = pbsadmin
>set server query_other_jobs = True
>set server resources_default.neednodes = 1
>set server resources_default.nodect = 1
>set server resources_default.nodes = 1
>set server scheduler_iteration = 150
>set server node_ping_rate = 150
>set server node_check_rate = 300
>set server tcp_timeout = 6
>set server comment = Torque Server @ master.bw01.binghamton.edu
>set server node_pack = False
>set server job_stat_rate = 30
>
>Thanks for any help and suggestions.
>Tomas Pevny
>
>-------------------------------------------------------
>_______________________________________________
>torqueusers mailing list
>torqueusers at supercluster.org
>http://www.supercluster.org/mailman/listinfo/torqueusers



