[torquedev] preemption and dedicated nodes

Garrick Staples garrick at usc.edu
Wed Jun 16 17:45:33 MDT 2010


On Wed, Jun 16, 2010 at 05:34:46PM -0600, Ken Nielson alleged:
> 
> 
> ----- Original Message -----
> From: "Garrick Staples" <garrick at usc.edu>
> To: torquedev at supercluster.org
> Sent: Wednesday, June 16, 2010 4:49:58 PM
> Subject: Re: [torquedev] preemption and dedicated nodes
> 
> (Moving this from mauiuser to torquedev)
> 
> On Thu, Jun 10, 2010 at 03:05:27PM -0700, Garrick Staples alleged:
> >> Has anyone noticed that maui starts up preemptor jobs on top of preemptee jobs
> >> on dedicated nodes?
> >> 
> >> With dedicated nodes, I expect that only one job will be running at a time. But
> >> Maui is running the new jobs immediately after calling pbs_rerunjob(). The old
> >> job doesn't have a chance to exit yet, resulting in 2 jobs running at the same
> >> time (for a short time).
> 
> >Would anyone like my fix (so far)? I also fixed a second problem in torque
> >which is that preemptee jobs are just SIGKILL'd instead of given a reasonable
> >SIGTERM, followed by kill_delay'd SIGKILL.
> 
> >This patch is against torque 2.1-fixes.  This fix has 2 parts. The first was to
> >replace the SIGKILL with SIGTERM+job_delete_nanny that obeys kill_delay. This
> >gives jobs a chance to checkpoint themselves.
> 
> >The second part is a hook near the top of req_runjob() that looks for existing
> >jobs in the RERUN substate. If any are found, a work task is created to delay job
> >start. The caller (maui) is left hanging while this happens.
> 
> We would certainly be interested.

Good thing it was attached :)
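
In a nutshell, the first part is the usual soft-kill sequence: send the job
SIGTERM, give it kill_delay seconds to checkpoint and exit, and only then fall
back to SIGKILL. A stand-alone sketch of that sequence (illustrative only,
made-up names; the actual change goes through issue_signal() and a
job_delete_nanny work task on the server) looks roughly like this:

  /* illustrative soft kill that obeys a kill_delay grace period */
  #include <signal.h>
  #include <sys/types.h>
  #include <unistd.h>

  static void soft_kill(pid_t session, int kill_delay)
    {
    int i;

    kill(-session, SIGTERM);            /* ask the job to checkpoint/exit */

    for (i = 0; i < kill_delay; i++)
      {
      if (kill(-session, 0) == -1)      /* process group already gone? */
        return;

      sleep(1);
      }

    kill(-session, SIGKILL);            /* grace period expired */
    }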

The only real downside that I can think of is that the caller of pbs_runjob()
(the scheduler) is made to wait, possibly indefinitely. I'm not sure what to do
about that yet.

A second version is attached that lets pbs_asynrunjob() work without waiting.
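
To make the difference concrete, here is roughly what a scheduler sees through
the IFL (made-up server and job names, not taken from maui; links against
libtorque):

  #include <stdio.h>
  #include "pbs_ifl.h"

  int main(void)
    {
    int con = pbs_connect("pbsserver");   /* made-up server name */

    if (con < 0)
      return 1;

    /* pbs_runjob() does not return until the server replies, so with the
       delay hook the scheduler can sit here for the whole kill_delay window */
    if (pbs_runjob(con, "123.pbsserver", NULL, NULL) != 0)
      fprintf(stderr, "runjob: %s\n", pbs_geterrmsg(con));

    pbs_disconnect(con);

    return 0;
    }

With the second version, sending the run as PBS_BATCH_AsyrunJob instead gets
the ack back right away and the preemptee check happens afterwards, so the
scheduler is never left hanging.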

-- 
Garrick Staples, GNU/Linux HPCC SysAdmin
University of Southern California

Life is Good!
-------------- next part --------------
Index: src/include/batch_request.h
===================================================================
--- src/include/batch_request.h	(revision 3692)
+++ src/include/batch_request.h	(working copy)
@@ -285,7 +285,7 @@
 	long	  rq_time;	/* time batch request created		*/
 	char	  rq_user[PBS_MAXUSER+1];     /* user name request is from    */
 	char	  rq_host[PBS_MAXHOSTNAME+1]; /* name of host sending request */
-int rq_XXXX;
+        int       rq_replied;   /* has replied to client */
 	void  	 *rq_extra;	/* optional ptr to extra info		*/
 int rq_XXXY;
 	char	 *rq_extend;	/* request "extension" data		*/
Index: src/server/req_jobobit.c
===================================================================
--- src/server/req_jobobit.c	(revision 3692)
+++ src/server/req_jobobit.c	(working copy)
@@ -1056,6 +1056,7 @@
   struct batch_request *preq;
 
   int                 IsFaked;
+  extern void remove_job_delete_nanny(struct job *);
 
   if (ptask->wt_type != WORK_Deferred_Reply) 
     {
@@ -1073,6 +1074,8 @@
     return;
     }
 
+  remove_job_delete_nanny(pjob);
+
   switch (pjob->ji_qs.ji_substate) 
     {
     case JOB_SUBSTATE_RERUN:
Index: src/server/reply_send.c
===================================================================
--- src/server/reply_send.c	(revision 3692)
+++ src/server/reply_send.c	(working copy)
@@ -298,6 +298,8 @@
 	struct batch_request *preq;
 {
 
+        if (preq->rq_replied) return;
+        preq->rq_replied           = 1;
 	preq->rq_reply.brp_code    = PBSE_NONE;
 	preq->rq_reply.brp_auxcode = 0;
 	preq->rq_reply.brp_choice  = BATCH_REPLY_CHOICE_NULL;
@@ -376,6 +378,9 @@
   char msgbuf[ERR_MSG_SIZE + 256 + 1];
   char msgbuf2[ERR_MSG_SIZE + 256 + 1];
 
+  if (preq->rq_replied) return;
+  preq->rq_replied = 1;
+
   set_err_msg(code,msgbuf);
 
   snprintf(msgbuf2,sizeof(msgbuf2),"%s",msgbuf);
Index: src/server/req_runjob.c
===================================================================
--- src/server/req_runjob.c	(revision 3692)
+++ src/server/req_runjob.c	(working copy)
@@ -107,6 +107,7 @@
 #include "svrfunc.h"
 #include "net_connect.h"
 #include "pbs_proto.h"
+#include "pbs_nodes.h"
 
 #ifdef HAVE_NETINET_IN_H
 #include <netinet/in.h>
@@ -133,6 +134,7 @@
 static int  svr_stagein A_((job *,struct batch_request *,int,int)); 
 static int  svr_strtjob2 A_((job *,struct batch_request *));
 static job *chk_job_torun A_((struct batch_request *,int));
+static int chk_job_forpreemptees A_((job *,struct batch_request *));
 static int  assign_hosts A_((job *,char *,int,char *,char *));
 
 /* Global Data Items: */
@@ -160,6 +162,7 @@
 job  *DispatchJob[20];
 char *DispatchNode[20];
 
+static void resume_runjob(struct work_task *ptask);
 
 
 /*
@@ -213,14 +216,19 @@
   /* If async run, reply now; otherwise reply is handled in */
   /* post_sendmom or post_stagein */
 
-  /* perhaps node assignment should be handled immediately in async run? */
-
   if ((preq != NULL) && 
       (preq->rq_type == PBS_BATCH_AsyrunJob)) 
     {
     reply_ack(preq);
+    }
 
-    preq = NULL;  /* cleared so we don't try to reuse */
+  if (chk_job_forpreemptees(pjob,preq) == 0)
+    {
+    /* delay - a work task was created to come back to here in a little bit */
+
+    free_nodes(pjob);
+
+    return;
     }
 
   /* NOTE:  nodes assigned to job in svr_startjob() */
@@ -1248,9 +1256,83 @@
   }  /* END chk_job_torun() */
 
 
+  /* FIXME:
+    foreach node
+       foreach job
+         if job substate is rerun
+             set_task(resume_runjob)
+             return
+  */
 
+static int chk_job_forpreemptees(
 
+  job                  *pjob,  /* I */
+  struct batch_request *preq)  /* I */
+  {
+  char   *nodestr, *hostlist;
+  struct pbsnode *pnode;
+  struct work_task *pwt;
+  struct pbssubn *np;
+  struct jobinfo *jp;
 
+  if (pjob->ji_wattr[(int)JOB_ATR_exec_host].at_flags & ATR_VFLAG_SET)
+    {
+    hostlist=strdup(pjob->ji_wattr[(int)JOB_ATR_exec_host].at_val.at_str);
+  
+    nodestr = strtok(hostlist,"+");
+
+    while (nodestr != NULL)
+      {  /* foreach node ... */
+      if ((pnode=find_nodebyname(nodestr)) != NULL)
+        {
+        for (np = pnode->nd_psn;np != NULL;np = np->next)
+          {  /* foreach subnode ... */
+          for (jp = np->jobs;jp != NULL;jp = jp->next)
+            {  /* foreach job ... */
+            if ((jp->job != NULL) && (jp->job != pjob))
+              {
+              if (jp->job->ji_qs.ji_substate == JOB_SUBSTATE_RERUN)
+                {
+
+                if (!(pwt = set_task(WORK_Timed,time_now + 5,resume_runjob,(void *)preq)))
+                  {
+                  req_reject(PBSE_SYSTEM,0,preq,NULL,NULL);
+                  }
+
+                sprintf(log_buffer,"job start delayed by preemptee job %s on node %s",
+                  jp->job->ji_qs.ji_jobid,
+                  nodestr);
+
+                log_event(
+                  PBSEVENT_JOB,
+                  PBS_EVENTCLASS_JOB,
+                  pjob->ji_qs.ji_jobid,
+                  log_buffer);
+
+                free(hostlist);
+
+                return(0);
+                } /* END if job is rerun */
+              } /* END if job */
+
+            } /* END for each job */
+          } /* END foreach subnode */
+        } /* END if find node */
+
+      nodestr = strtok(NULL,"+");
+      } /* END while(nodestr) */
+
+    free(hostlist);
+    }
+
+  return(1);
+  }  /* END chk_job_forpreemptees() */
+
+
+
+
+
+
 /*
  * assign_hosts - assign hosts (nodes) to job by the following rules:
  *	1. use nodes that are "given"; from exec_host when required by
@@ -1438,6 +1520,15 @@
   return(rc);
   }  /* END assign_hosts() */
 
+static void resume_runjob(
 
+  struct work_task *ptask)
+
+  {
+  req_runjob((struct batch_request *)ptask->wt_parm1);
+
+  return;
+  }
+
 /* END req_runjob.c */
 
Index: src/server/req_delete.c
===================================================================
--- src/server/req_delete.c	(revision 3692)
+++ src/server/req_delete.c	(working copy)
@@ -955,19 +955,37 @@
     }
   else if (rc == PBSE_UNKJOBID)
     {
-    sprintf(log_buffer,"job delete nanny returned, but does not exist on mom");
+    if (pjob->ji_qs.ji_state == JOB_STATE_QUEUED)
+      {
+      /* great, rerun worked. Nothing to do here */
+      sprintf(log_buffer,"job delete nanny returned, job is queued now");
+
+      LOG_EVENT(
+        PBSEVENT_ERROR, 
+        PBS_EVENTCLASS_JOB,
+        preq_sig->rq_ind.rq_signal.rq_jid, 
+        log_buffer);
+fprintf(stderr,"%s",log_buffer);
+      }
+
+    else
+      {
+      /* great, delete worked. */
+      sprintf(log_buffer,"job delete nanny returned, but does not exist on mom");
  
-    LOG_EVENT(
-      PBSEVENT_ERROR, 
-      PBS_EVENTCLASS_JOB,
-      preq_sig->rq_ind.rq_signal.rq_jid, 
-      log_buffer);
+      LOG_EVENT(
+        PBSEVENT_ERROR, 
+        PBS_EVENTCLASS_JOB,
+        preq_sig->rq_ind.rq_signal.rq_jid, 
+        log_buffer);
 
-    free_nodes(pjob);
+fprintf(stderr,"%s",log_buffer);
+      free_nodes(pjob);
  
-    set_resc_assigned(pjob,DECR);
+      set_resc_assigned(pjob,DECR);
  
-    job_purge(pjob);
+      job_purge(pjob);
+      }
     }
 
   /* free task */
Index: src/server/req_rerun.c
===================================================================
--- src/server/req_rerun.c	(revision 3692)
+++ src/server/req_rerun.c	(working copy)
@@ -105,6 +105,7 @@
 
 extern char *msg_manager;
 extern char *msg_jobrerun;
+extern time_t time_now;
 
 
 /*
@@ -165,6 +166,7 @@
 	struct batch_request *preq;
 {
 	job		 *pjob;
+        extern struct work_task *apply_job_delete_nanny A_((struct job *,int));
 
 	if ((pjob = chk_job_request(preq->rq_ind.rq_rerun, preq)) == 0)
 		return;
@@ -188,9 +190,11 @@
 		return;
 	}
 
+        apply_job_delete_nanny(pjob,time_now + 60);
+
 	/* ask MOM to kill off the job */
 
-	if (issue_signal(pjob, "SIGKILL", post_rerun, 0) != 0) {
+	if (issue_signal(pjob, "SIGTERM", post_rerun, 0) != 0) {
 		req_reject(PBSE_MOMREJECT, 0, preq,NULL,NULL);
 		return;
 	}

