[torqueusers] Not Running: Not enough memory available

Derek Gottlieb derek at asc.edu
Thu Aug 28 13:05:44 MDT 2008


Carlos,

It doesn't look like you specified how much memory or how many cpus you
wanted, but only specified that you wanted the "long" queue.  It looks
like you only specified the following resource constraints on that queue:

set queue long resources_max.cput = 720:00:00
set queue long resources_max.mem = 20000mb
set queue long resources_default.cput = 720:00:00
set queue long resources_default.walltime = 720:00:00

Did you check what resources (cpus/memory) torque thought your job
requested (e.g. with "qstat -f")?  This is just a guess, but since you
didn't specify a resources_default.mem, it may have assumed you wanted
resources_max.mem (20000mb), which I think is all the memory in your
cluster (20GB).  Perhaps someone more familiar with the nuances of torque
queue configs would like to chime in?

Derek

Carlos López wrote:
> Hi! :-) I can't seem to find how to solve the following problem...
> 
> I have a 10-node cluster with 2 GB of RAM and 2 processors in each node,
> and I configured the following queues:
> 
> ------
> 
> create queue medium
> set queue medium queue_type = Execution
> set queue medium max_user_queuable = 20
> set queue medium max_running = 20
> set queue medium acl_hosts =
> ollin+ollin-1+ollin-2+ollin-3+ollin-4+ollin-5+ollin-6+ollin-7+ollin-8+ollin-9+ollin-10
> 
> set queue medium resources_max.cput = 240:00:00
> set queue medium resources_max.mem = 20000mb
> set queue medium resources_default.cput = 240:00:00
> set queue medium resources_default.walltime = 240:00:00
> set queue medium max_user_run = 20
> set queue medium enabled = True
> set queue medium started = True
> 
> 
> create queue prueba
> set queue prueba queue_type = Execution
> set queue prueba max_user_queuable = 20
> set queue prueba max_running = 20
> set queue prueba acl_hosts =
> ollin+ollin-1+ollin-2+ollin-3+ollin-4+ollin-5+ollin-6+ollin-7+ollin-8+ollin-9+ollin-10
> 
> set queue prueba resources_max.cput = 02:00:00
> set queue prueba resources_max.mem = 20000mb
> set queue prueba resources_default.cput = 02:00:00
> set queue prueba resources_default.walltime = 02:00:00
> set queue prueba max_user_run = 20
> set queue prueba enabled = True
> set queue prueba started = True
> 
> 
> create queue long
> set queue long queue_type = Execution
> set queue long max_user_queuable = 20
> set queue long max_running = 20
> set queue long acl_hosts =
> ollin+ollin-1+ollin-2+ollin-3+ollin-4+ollin-5+ollin-6+ollin-7+ollin-8+ollin-9+ollin-10
> 
> set queue long resources_max.cput = 720:00:00
> set queue long resources_max.mem = 20000mb
> set queue long resources_default.cput = 720:00:00
> set queue long resources_default.walltime = 720:00:00
> set queue long max_user_run = 20
> set queue long enabled = True
> set queue long started = True
> 
> create queue short
> set queue short queue_type = Execution
> set queue short max_user_queuable = 20
> set queue short max_running = 20
> set queue short acl_hosts =
> ollin+ollin-1+ollin-2+ollin-3+ollin-4+ollin-5+ollin-6+ollin-7+ollin-8+ollin-9+ollin-10
> 
> set queue short resources_max.cput = 48:00:00
> set queue short resources_max.mem = 20000mb
> set queue short resources_default.cput = 48:00:00
> set queue short resources_default.walltime = 48:00:00
> set queue short max_user_run = 20
> set queue short enabled = True
> set queue short started = True
> 
> 
> set server scheduling = True
> set server max_running = 40
> set server operators = root@ollin.fisica.unam.mx
> set server default_queue = medium
> set server log_events = 511
> set server mail_from = adm
> set server query_other_jobs = True
> set server resources_available.ncpus = 22
> set server resources_available.nodect = 11
> set server resources_available.nodes = 11
> set server resources_default.nodes = 1
> set server scheduler_iteration = 60
> set server node_check_rate = 150
> set server tcp_timeout = 6
> set server node_pack = False
> set server log_level = 1
> set server pbs_version = 2.2.1
> 
> ------------
> 
> I submitted a job script with qsub, which was:
> --------
> #!/bin/tcsh
> #PBS -q long
> 
> 
> cd /home/abl/2009-1/mp1
> w2web
> runsp_lapw  -ec 0.0001 -i 100 -NI
> --------
> 
> Now I am trying to submit one more, which is:
> 
> -------
> #!/bin/tcsh
> 
> #PBS -q long
> 
> 
> 
> cd /home/abl/2009-1/delta_012/mp1
> w2web
> runsp_lapw  -ec 0.0001 -i 100 -NI
> ---------
> 
> and when I check it with qstat -f it says:
> 
>    comment = Not Running: Not enough memory available
> 
> 
> As far as I know, this program uses at most 400 MB of RAM, so why is it
> not working? :-( Can anybody give me a hand with this?
> 
> Thank you very much
> Carlos
> 



More information about the torqueusers mailing list