[torquedev] preemption and dedicated nodes

Garrick Staples garrick at usc.edu
Wed Jun 16 16:49:58 MDT 2010


(Moving this from mauiuser to torquedev)

On Thu, Jun 10, 2010 at 03:05:27PM -0700, Garrick Staples alleged:
> Has anyone noticed that maui starts up preemptor jobs on top of preemptee jobs
> on dedicated nodes?
> 
> With dedicated nodes, I expect only one job to be running at a time. But
> Maui is running the new jobs immediately after calling pbs_rerunjob(). The old
> job doesn't have a chance to exit yet, resulting in two jobs running at the
> same time (briefly).

Would anyone like my fix (so far)? I also fixed a second problem in torque,
which is that preemptee jobs are simply SIGKILL'd instead of being given a
reasonable SIGTERM followed by a kill_delay'd SIGKILL.

This patch is against torque 2.1-fixes. The fix has two parts. The first
replaces the immediate SIGKILL with a SIGTERM plus a job_delete_nanny that
obeys kill_delay. This gives jobs a chance to checkpoint themselves before
they are killed off.
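
In plain POSIX terms, the intended kill sequence looks like the sketch below.
This is illustrative only: inside pbs_server the SIGTERM goes through
issue_signal(), the delayed SIGKILL is driven by the job delete nanny work
task rather than a blocking sleep(), and kill_delay here just stands in for
the queue/server attribute of the same name.

#include <sys/types.h>
#include <signal.h>
#include <unistd.h>

/* Sketch: SIGTERM first, then SIGKILL after a grace period. */
void preempt_job(pid_t pid, unsigned int kill_delay)
  {
  kill(pid, SIGTERM);       /* polite: gives the job a chance to checkpoint */

  sleep(kill_delay);        /* grace period */

  if (kill(pid, 0) == 0)    /* still alive after kill_delay seconds? */
    kill(pid, SIGKILL);     /* then force it */
  }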

The second part is a hook near the top of req_runjob() that checks the new
job's nodes for existing jobs still in the RERUN substate. If one is found, a
work task is created to retry the job start a few seconds later. The caller
(maui) is left waiting on its connection while this happens.
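
To make the retry behaviour concrete, here is a tiny standalone analogue of
the delay (again just a sketch: the real code re-queues a WORK_Timed task
with set_task() every 5 seconds so pbs_server never blocks, and only the
scheduler's connection is left waiting).

#include <stdio.h>
#include <unistd.h>

/* Stand-in for the RERUN-substate scan in chk_job_forpreemptees(); here
 * we simply pretend the preemptee exits on the third poll. */
static int preemptee_still_running(void)
  {
  static int polls = 0;

  return(++polls < 3);
  }

int main(void)
  {
  while (preemptee_still_running())
    {
    printf("job start delayed by preemptee; retrying in 5s\n");

    sleep(5);
    }

  printf("preemptee gone; starting preemptor job\n");

  return(0);
  }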

-- 
Garrick Staples, GNU/Linux HPCC SysAdmin
University of Southern California

Life is Good!
-------------- next part --------------
Index: src/server/req_jobobit.c
===================================================================
--- src/server/req_jobobit.c	(revision 3692)
+++ src/server/req_jobobit.c	(working copy)
@@ -1056,6 +1056,7 @@
   struct batch_request *preq;
 
   int                 IsFaked;
+  extern void remove_job_delete_nanny(struct job *);
 
   if (ptask->wt_type != WORK_Deferred_Reply) 
     {
@@ -1073,6 +1074,8 @@
     return;
     }
 
+  remove_job_delete_nanny(pjob);
+
   switch (pjob->ji_qs.ji_substate) 
     {
     case JOB_SUBSTATE_RERUN:
Index: src/server/req_runjob.c
===================================================================
--- src/server/req_runjob.c	(revision 3692)
+++ src/server/req_runjob.c	(working copy)
@@ -107,6 +107,7 @@
 #include "svrfunc.h"
 #include "net_connect.h"
 #include "pbs_proto.h"
+#include "pbs_nodes.h"
 
 #ifdef HAVE_NETINET_IN_H
 #include <netinet/in.h>
@@ -133,6 +134,7 @@
 static int  svr_stagein A_((job *,struct batch_request *,int,int)); 
 static int  svr_strtjob2 A_((job *,struct batch_request *));
 static job *chk_job_torun A_((struct batch_request *,int));
+static int chk_job_forpreemptees A_((job *,struct batch_request *));
 static int  assign_hosts A_((job *,char *,int,char *,char *));
 
 /* Global Data Items: */
@@ -160,6 +162,7 @@
 job  *DispatchJob[20];
 char *DispatchNode[20];
 
+static void resume_runjob(struct work_task *ptask);
 
 
 /*
@@ -196,6 +199,15 @@
     return;
     }
 
+  if (chk_job_forpreemptees(pjob,preq) == 0)
+    {
+    /* delay - come back to here in a little bit */
+
+    free_nodes(pjob);
+
+    return;
+    }
+
   if (preq->rq_conn == scheduler_sock)
     ++scheduler_jobct;	/* see scheduler_close() */
 
@@ -1248,9 +1260,89 @@
   }  /* END chk_job_torun() */
 
 
+  /* FIXME:
+    foreach node
+       foreach job
+         if job substate is rerun
+             set_task(resume_runjob)
+             return
+  */
 
+int chk_job_forpreemptees(
 
+  job                  *pjob,  /* I */
+  struct batch_request *preq)  /* I */
+  {
+  char   *nodestr, *hostlist;
+  struct pbsnode *pnode;
+  struct work_task *pwt;
+  struct pbssubn *np;
+  struct jobinfo *jp;
 
+  if (pjob->ji_wattr[(int)JOB_ATR_exec_host].at_flags & ATR_VFLAG_SET)
+    {
+    hostlist=strdup(pjob->ji_wattr[(int)JOB_ATR_exec_host].at_val.at_str);
+  
+    nodestr = strtok(hostlist,"+");
+
+    while (nodestr != NULL)
+      {  /* foreach node ... */
+      if ((pnode=find_nodebyname(nodestr)) != NULL)
+        {
+        for (np = pnode->nd_psn;np != NULL;np = np->next)
+          {  /* foreach subnode ... */
+          for (jp = np->jobs;jp != NULL;jp = jp->next)
+            {  /* foreach job ... */
+            if ((jp->job != NULL) && (jp->job != pjob))
+              {
+              if (jp->job->ji_qs.ji_substate == JOB_SUBSTATE_RERUN)
+                {
+
+                if (!(pwt = set_task(WORK_Timed,time_now + 5,resume_runjob,(void *)preq)))
+                  {
+                  /* could not queue the retry task - reject the run request */
+
+                  req_reject(PBSE_SYSTEM,0,preq,NULL,NULL);
+
+                  free(hostlist);
+
+                  return(0);
+                  }
+
+                sprintf(log_buffer,"job start delayed by preemptee job %s on node %s",
+                  jp->job->ji_qs.ji_jobid,
+                  nodestr);
+
+                log_event(
+                  PBSEVENT_JOB,
+                  PBS_EVENTCLASS_JOB,
+                  pjob->ji_qs.ji_jobid,
+                  log_buffer);
+
+                free(hostlist);
+
+                return(0);
+                } /* END if job is rerun */
+              } /* END if job */
+
+            } /* END for each job */
+          } /* END foreach subnode */
+        } /* END if find node */
+
+      nodestr = strtok(NULL,"+");
+      } /* END while(nodestr) */
+
+    free(hostlist);
+    }
+
+  return(1);
+  }  /* END chk_job_forpreemptees() */
+
+
+
+
+
+
 /*
  * assign_hosts - assign hosts (nodes) to job by the following rules:
  *	1. use nodes that are "given"; from exec_host when required by
@@ -1438,6 +1530,17 @@
   return(rc);
   }  /* END assign_hosts() */
 
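+/* resume_runjob - work task handler that re-issues a run request
+ * delayed by chk_job_forpreemptees() */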
+static void resume_runjob(
 
+  struct work_task *ptask)
+
+  {
+  req_runjob((struct batch_request *)ptask->wt_parm1);
+
+  return;
+  }
+
 /* END req_runjob.c */
 
Index: src/server/req_delete.c
===================================================================
--- src/server/req_delete.c	(revision 3692)
+++ src/server/req_delete.c	(working copy)
@@ -955,19 +955,35 @@
     }
   else if (rc == PBSE_UNKJOBID)
     {
-    sprintf(log_buffer,"job delete nanny returned, but does not exist on mom");
+    if (pjob->ji_qs.ji_state == JOB_STATE_QUEUED)
+      {
+      /* great, rerun worked. Nothing to do here */
+      sprintf(log_buffer,"job delete nanny returned, job is queued now");
+
+      LOG_EVENT(
+        PBSEVENT_ERROR, 
+        PBS_EVENTCLASS_JOB,
+        preq_sig->rq_ind.rq_signal.rq_jid, 
+        log_buffer);
+      }
+
+    else
+      {
+      /* great, delete worked. */
+      sprintf(log_buffer,"job delete nanny returned, but does not exist on mom");
  
-    LOG_EVENT(
-      PBSEVENT_ERROR, 
-      PBS_EVENTCLASS_JOB,
-      preq_sig->rq_ind.rq_signal.rq_jid, 
-      log_buffer);
+      LOG_EVENT(
+        PBSEVENT_ERROR, 
+        PBS_EVENTCLASS_JOB,
+        preq_sig->rq_ind.rq_signal.rq_jid, 
+        log_buffer);
 
-    free_nodes(pjob);
+      free_nodes(pjob);
  
-    set_resc_assigned(pjob,DECR);
+      set_resc_assigned(pjob,DECR);
  
-    job_purge(pjob);
+      job_purge(pjob);
+      }
     }
 
   /* free task */
Index: src/server/req_rerun.c
===================================================================
--- src/server/req_rerun.c	(revision 3692)
+++ src/server/req_rerun.c	(working copy)
@@ -105,6 +105,7 @@
 
 extern char *msg_manager;
 extern char *msg_jobrerun;
+extern time_t time_now;
 
 
 /*
@@ -165,6 +166,7 @@
 	struct batch_request *preq;
 {
 	job		 *pjob;
+	extern struct work_task *apply_job_delete_nanny A_((struct job *,int));
 
 	if ((pjob = chk_job_request(preq->rq_ind.rq_rerun, preq)) == 0)
 		return;
@@ -188,9 +190,11 @@
 		return;
 	}
 
+	apply_job_delete_nanny(pjob,time_now + 60);
+
 	/* ask MOM to kill off the job */
 
-	if (issue_signal(pjob, "SIGKILL", post_rerun, 0) != 0) {
+	if (issue_signal(pjob, "SIGTERM", post_rerun, 0) != 0) {
 		req_reject(PBSE_MOMREJECT, 0, preq,NULL,NULL);
 		return;
 	}