[torqueusers] Jobs not executed by torque

Samatha Kottha samatha.kottha at tu-dresden.de
Fri Jun 29 03:16:29 MDT 2007


Hi,

We have torque 2.1.8 and maui 3.2.6.p18, and they worked fine for some
time. But a couple of days ago some files in PBS_HOME were deleted by
accident and some were corrupted. We did not have a backup yet, as this
is a new system. So I cleaned all the directories, reinstalled, and
configured a default queue. But when I submit jobs, they are not
executed. The output is below; at the end you can find the queue
configuration. I guess it has something to do with
resources_assigned.ncpus and resources_assigned.nodect. I tried to set
them to 1 as root, but qmgr complains about insufficient permissions.
There are no error messages in the server log, but there is one line in
the mom log. I did not have any of these problems before and do not have
a clue where the problem is coming from.

Thank you,
Samatha

zimd0022 at hector:~> echo "sleep 10000" | qsub
27.hector.zih.tu-dresden.de
zimd0022 at hector:~> checkjob 27


checking job 27

State: Running
Creds:  user:zimd0022  group:medi  class:gridbatch  qos:DEFAULT
WallTime: 00:00:00 of 12:00:00
SubmitTime: Fri Jun 29 11:00:47
  (Time Queued  Total: 00:00:01  Eligible: 00:00:01)

StartTime: Fri Jun 29 11:00:48
Total Tasks: 1

Req[0]  TaskCount: 1  Partition: DEFAULT
Network: [NONE]  Memory >= 0  Disk >= 0  Swap >= 0
Opsys: [NONE]  Arch: [NONE]  Features: [NONE]
NodeCount: 1
Allocated Nodes:
[hector:1]


IWD: [NONE]  Executable:  [NONE]
Bypass: 0  StartCount: 1
PartitionMask: [ALL]
Flags:       RESTARTABLE

Reservation '27' (00:00:00 -> 12:00:00  Duration: 12:00:00)
PE:  1.00  StartPriority:  1

zimd0022 at hector:~> qstat
Job id              Name             User            Time Use S Queue
------------------- ---------------- --------------- -------- - -----
27.hector           STDIN            zimd0022        00:00:00 E gridbatch
zimd0022 at hector:~> qstat
zimd0022 at hector:~> checkjob 27
ERROR:    'checkjob' failed
ERROR:  cannot locate job '27'

zimd0022 at hector:~>

System mails reveal
***************************
Aborted by PBS Server
Job cannot be executed
See Administrator for help
Unable to copy file /var/spool/torque/spool/26.hector.z.OU to
/home/zimd0022/STDIN.o26
Unable to copy file /var/spool/torque/spool/26.hector.z.ER to
/home/zimd0022/STDIN.e26

mom_log
********************************
06/29/2007 10:43:58;0100;   pbs_mom;Req;;Type StatusJob request received
from PBS_Server at hector.zih.tu-dresden.de, sock=12
06/29/2007 10:43:58;0100;   pbs_mom;Req;;Type ModifyJob request received
from PBS_Server at hector.zih.tu-dresden.de, sock=10
06/29/2007 10:43:58;0008;   pbs_mom;Job;24.hector.zih.tu-dresden.de;Job
Modified at request of PBS_Server at hector.zih.tu-dresden.de
06/29/2007 10:43:58;0100;   pbs_mom;Req;;Type CopyFiles request received
from PBS_Server at hector.zih.tu-dresden.de, sock=13
06/29/2007 10:44:06;0100;   pbs_mom;Req;;Type DeleteJob request received
from PBS_Server at hector.zih.tu-dresden.de, sock=13
06/29/2007 10:46:26;0100;   pbs_mom;Req;;Type QueueJob request received
from PBS_Server at hector.zih.tu-dresden.de, sock=10
06/29/2007 10:46:26;0100;   pbs_mom;Req;;Type JobScript request received
from PBS_Server at hector.zih.tu-dresden.de, sock=10
06/29/2007 10:46:26;0100;   pbs_mom;Req;;Type ReadyToCommit request
received from PBS_Server at hector.zih.tu-dresden.de, sock=10
06/29/2007 10:46:26;0100;   pbs_mom;Req;;Type Commit request received
from PBS_Server at hector.zih.tu-dresden.de, sock=10
06/29/2007 10:46:26;0001;   pbs_mom;Job;TMomFinalizeJob3;job not
started, Failure job exec failure, before files staged, no retry
06/29/2007 10:46:26;0008;   pbs_mom;Req;send_sisters;sending ABORT to
sisters


********************
hector:~ # qmgr
Max open servers: 4
Qmgr: list queue gridbatch
Queue gridbatch
        queue_type = Execution
        Priority = 100
        max_queuable = 200
        total_jobs = 0
        state_count = Transit:0 Queued:0 Held:0 Waiting:0 Running:0
Exiting:0
        max_running = 12
        resources_max.ncpus = 1
        resources_max.nodect = 12
        resources_max.walltime = 72:00:00
        resources_default.nodes = nodes=1:ppn=1
        resources_default.walltime = 12:00:00
        acl_group_enable = False
        acl_groups = astro,c3,dgtest,hep,ingrid,kern,medi,text,wesent
        mtime = Tue Jun 26 15:18:23 2007
        resources_available.nodect = 12
        resources_assigned.ncpus = 0
        resources_assigned.nodect = 0
        enabled = True
        started = True

Qmgr: set queue gridbatch resources_assigned.ncpus=1
qmgr obj=gridbatch svr=default: Cannot set attribute, read only or
insufficient permission  resources_assigned.ncpus
Qmgr: set queue gridbatch resources_assigned.nodect = 1
qmgr obj=gridbatch svr=default: Cannot set attribute, read only or
insufficient permission  resources_assigned.nodect
Qmgr: set queue gridbatch total_jobs=1
qmgr obj=gridbatch svr=default: Cannot set attribute, read only or
insufficient permission  total_jobs
Qmgr: p s
#
# Create queues and set their attributes.
#
#
# Create and define queue gridbatch
#
create queue gridbatch
set queue gridbatch queue_type = Execution
set queue gridbatch Priority = 100
set queue gridbatch max_queuable = 200
set queue gridbatch max_running = 12
set queue gridbatch resources_max.ncpus = 1
set queue gridbatch resources_max.nodect = 12
set queue gridbatch resources_max.walltime = 72:00:00
set queue gridbatch resources_default.nodes = nodes=1:ppn=1
set queue gridbatch resources_default.walltime = 12:00:00
set queue gridbatch acl_group_enable = False
set queue gridbatch acl_groups = astro
set queue gridbatch acl_groups += c3
set queue gridbatch acl_groups += dgtest
set queue gridbatch acl_groups += hep
set queue gridbatch acl_groups += ingrid
set queue gridbatch acl_groups += kern
set queue gridbatch acl_groups += medi
set queue gridbatch acl_groups += text
set queue gridbatch acl_groups += wesent
set queue gridbatch resources_available.nodect = 12
set queue gridbatch enabled = True
set queue gridbatch started = True
#
# Set server attributes.
#
set server scheduling = True
set server managers = root@hector.zih.tu-dresden.de
set server operators = root@hector.zih.tu-dresden.de
set server default_queue = gridbatch
set server log_events = 511
set server mail_from = adm
set server resources_default.nodes = 1
set server scheduler_iteration = 600
set server node_check_rate = 150
set server tcp_timeout = 6
set server pbs_version = 2.1.8
Qmgr: q
hector:~ #

-- 
Samatha Kottha
Zentrum für Informationsdienste und Hochleistungsrechnen (ZIH)
Technische Universität Dresden			Tel: (+49) 351 463-38776
Room 1019					Fax: (+49) 351 463-38245
Noethnitzer Straße 46 
01187 Dresden
Germany 



More information about the torqueusers mailing list