[torquedev] preemption and dedicated nodes
Garrick Staples
garrick at usc.edu
Wed Jun 16 17:45:33 MDT 2010
On Wed, Jun 16, 2010 at 05:34:46PM -0600, Ken Nielson alleged:
>
>
> ----- Original Message -----
> From: "Garrick Staples" <garrick at usc.edu>
> To: torquedev at supercluster.org
> Sent: Wednesday, June 16, 2010 4:49:58 PM
> Subject: Re: [torquedev] preemption and dedicated nodes
>
> (Moving this from mauiuser to torquedev)
>
> On Thu, Jun 10, 2010 at 03:05:27PM -0700, Garrick Staples alleged:
> >> Has anyone noticed that maui starts up preemptor jobs on top of preemptee jobs
> >> on dedicated nodes?
> >>
> >> With dedicated nodes, I expect that only one job will be running at a time. But
> >> Maui is running the new jobs immediately after calling pbs_rerunjob(). The old
> >> job doesn't have a chance to exit yet resulting in 2 jobs running at the same
> >> time (for a short time).
>
> >Would anyone like my fix (so far)? I also fixed a second problem in torque,
> >which is that preemptee jobs are just SIGKILL'd instead of being given a
> >reasonable SIGTERM followed by a kill_delay'd SIGKILL.
>
> >This patch is against torque 2.1-fixes. This fix has 2 parts. The first was to
> >replace the SIGKILL with SIGTERM+job_delete_nanny that obeys kill_delay. This
> >gives jobs a chance to checkpoint themselves.
>
> >The second part is a hook near the top of req_runjob() that looks for existing
> >jobs in the RERUN substate. If so, then a work task is created to delay job
> >start. The caller (maui) is left hanging while this happens.
>
> We would certainly be interested.
Good thing it was attached :)
The only real downside that I can think of is that the caller of pbs_runjob()
(the scheduler) is made to wait possibly indefinitely. I'm not sure what to do
about that yet.
A second version is attached that lets pbs_asyrunjob() work without waiting.
--
Garrick Staples, GNU/Linux HPCC SysAdmin
University of Southern California
Life is Good!
-------------- next part --------------
Index: src/include/batch_request.h
===================================================================
--- src/include/batch_request.h (revision 3692)
+++ src/include/batch_request.h (working copy)
@@ -285,7 +285,7 @@
long rq_time; /* time batch request created */
char rq_user[PBS_MAXUSER+1]; /* user name request is from */
char rq_host[PBS_MAXHOSTNAME+1]; /* name of host sending request */
-int rq_XXXX;
+ int rq_replied; /* has replied to client */
void *rq_extra; /* optional ptr to extra info */
int rq_XXXY;
char *rq_extend; /* request "extension" data */
Index: src/server/req_jobobit.c
===================================================================
--- src/server/req_jobobit.c (revision 3692)
+++ src/server/req_jobobit.c (working copy)
@@ -1056,6 +1056,7 @@
struct batch_request *preq;
int IsFaked;
+ extern void remove_job_delete_nanny(struct job *);
if (ptask->wt_type != WORK_Deferred_Reply)
{
@@ -1073,6 +1074,8 @@
return;
}
+ remove_job_delete_nanny(pjob);
+
switch (pjob->ji_qs.ji_substate)
{
case JOB_SUBSTATE_RERUN:
Index: src/server/reply_send.c
===================================================================
--- src/server/reply_send.c (revision 3692)
+++ src/server/reply_send.c (working copy)
@@ -298,6 +298,8 @@
struct batch_request *preq;
{
+ if (preq->rq_replied) return;
+ preq->rq_replied = 1;
preq->rq_reply.brp_code = PBSE_NONE;
preq->rq_reply.brp_auxcode = 0;
preq->rq_reply.brp_choice = BATCH_REPLY_CHOICE_NULL;
@@ -376,6 +378,9 @@
char msgbuf[ERR_MSG_SIZE + 256 + 1];
char msgbuf2[ERR_MSG_SIZE + 256 + 1];
+ if (preq->rq_replied) return;
+ preq->rq_replied = 1;
+
set_err_msg(code,msgbuf);
snprintf(msgbuf2,sizeof(msgbuf2),"%s",msgbuf);
Index: src/server/req_runjob.c
===================================================================
--- src/server/req_runjob.c (revision 3692)
+++ src/server/req_runjob.c (working copy)
@@ -107,6 +107,7 @@
#include "svrfunc.h"
#include "net_connect.h"
#include "pbs_proto.h"
+#include "pbs_nodes.h"
#ifdef HAVE_NETINET_IN_H
#include <netinet/in.h>
@@ -133,6 +134,7 @@
static int svr_stagein A_((job *,struct batch_request *,int,int));
static int svr_strtjob2 A_((job *,struct batch_request *));
static job *chk_job_torun A_((struct batch_request *,int));
+static int chk_job_forpreemptees A_((job *,struct batch_request *));
static int assign_hosts A_((job *,char *,int,char *,char *));
/* Global Data Items: */
@@ -160,6 +162,7 @@
job *DispatchJob[20];
char *DispatchNode[20];
+static void resume_runjob(struct work_task *ptask);
/*
@@ -213,14 +216,19 @@
/* If async run, reply now; otherwise reply is handled in */
/* post_sendmom or post_stagein */
- /* perhaps node assignment should be handled immediately in async run? */
-
if ((preq != NULL) &&
(preq->rq_type == PBS_BATCH_AsyrunJob))
{
reply_ack(preq);
+ }
- preq = NULL; /* cleared so we don't try to reuse */
+ if (chk_job_forpreemptees(pjob,preq) == 0)
+ {
+ /* delay - a work task was created to come back to here in a little bit */
+
+ free_nodes(pjob);
+
+ return;
}
/* NOTE: nodes assigned to job in svr_startjob() */
@@ -1248,9 +1256,83 @@
} /* END chk_job_torun() */
+ /* chk_job_forpreemptees - walk each node named in pjob's exec_host and each
+ job on those nodes' subnodes; if any other job is in JOB_SUBSTATE_RERUN,
+ queue a WORK_Timed task (resume_runjob, time_now + 5) carrying preq.
+ Returns 0 when such a job was found (caller should not start pjob now),
+ 1 when none was found or exec_host is not set.
+ NOTE(review): original comment was tagged FIXME - confirm what remains.
+ */
+int chk_job_forpreemptees(
+ job *pjob, /* I - job whose exec_host nodes are scanned */
+ struct batch_request *preq) /* I - request handed to the delayed task */
+ {
+ char *nodestr, *hostlist;
+ struct pbsnode *pnode;
+ struct work_task *pwt;
+ struct pbssubn *np;
+ struct jobinfo *jp;
+ if (pjob->ji_wattr[(int)JOB_ATR_exec_host].at_flags & ATR_VFLAG_SET)
+ {
+ hostlist=strdup(pjob->ji_wattr[(int)JOB_ATR_exec_host].at_val.at_str);
+ /* NOTE(review): strdup() result is not checked before strtok(); the copy exists because strtok() writes into its input */
+ nodestr = strtok(hostlist,"+");
+
+ while (nodestr != NULL)
+ { /* foreach node ... */
+ if ((pnode=find_nodebyname(nodestr)) != NULL)
+ {
+ for (np = pnode->nd_psn;np != NULL;np = np->next)
+ { /* foreach subnode ... */
+ for (jp = np->jobs;jp != NULL;jp = jp->next)
+ { /* foreach job ... */
+ if ((jp->job != NULL) && (jp->job != pjob))
+ {
+ if (jp->job->ji_qs.ji_substate == JOB_SUBSTATE_RERUN)
+ {
+ /* NOTE(review): preq is kept by the task; in the async path the caller has already run reply_ack(preq) - confirm preq is still valid when the task fires */
+ if (!(pwt = set_task(WORK_Timed,time_now + 5,resume_runjob,(void *)preq)))
+ {
+ req_reject(PBSE_SYSTEM,0,preq,NULL,NULL);
+ }
+ /* NOTE(review): execution falls through after req_reject(); the "delayed" message is still logged and 0 returned although no task was queued */
+ sprintf(log_buffer,"job start delayed by preemptee job %s on node %s",
+ jp->job->ji_qs.ji_jobid,
+ nodestr);
+
+ log_event(
+ PBSEVENT_JOB,
+ PBS_EVENTCLASS_JOB,
+ pjob->ji_qs.ji_jobid,
+ log_buffer);
+
+ free(hostlist);
+
+ return(0);
+ } /* END if job is rerun */
+ } /* END if job */
+
+ } /* END for each job */
+ } /* END foreach subnode */
+ } /* END if find node */
+
+ nodestr = strtok(NULL,"+");
+ } /* END while(nodestr) */
+
+ free(hostlist);
+ }
+
+ return(1);
+}
+
+
+
+
+
+
/*
* assign_hosts - assign hosts (nodes) to job by the following rules:
* 1. use nodes that are "given"; from exec_host when required by
@@ -1438,6 +1520,15 @@
return(rc);
} /* END assign_hosts() */
+static void resume_runjob(
+ struct work_task *ptask) /* I - task whose wt_parm1 holds the batch request */
+ /* work-task handler: casts wt_parm1 back to a batch_request and passes it to req_runjob() */
+ {
+ req_runjob((struct batch_request *)ptask->wt_parm1);
+
+ return;
+ }
+
/* END req_runjob.c */
Index: src/server/req_delete.c
===================================================================
--- src/server/req_delete.c (revision 3692)
+++ src/server/req_delete.c (working copy)
@@ -955,19 +955,37 @@
}
else if (rc == PBSE_UNKJOBID)
{
- sprintf(log_buffer,"job delete nanny returned, but does not exist on mom");
+ if (pjob->ji_qs.ji_state == JOB_STATE_QUEUED)
+ {
+ /* great, rerun worked. Nothing to do here */
+ sprintf(log_buffer,"job delete nanny returned, job is queued now");
+
+ LOG_EVENT(
+ PBSEVENT_ERROR,
+ PBS_EVENTCLASS_JOB,
+ preq_sig->rq_ind.rq_signal.rq_jid,
+ log_buffer);
+fprintf(stderr,"%s",log_buffer);
+ }
+
+ else
+ {
+ /* great, delete worked. */
+ sprintf(log_buffer,"job delete nanny returned, but does not exist on mom");
- LOG_EVENT(
- PBSEVENT_ERROR,
- PBS_EVENTCLASS_JOB,
- preq_sig->rq_ind.rq_signal.rq_jid,
- log_buffer);
+ LOG_EVENT(
+ PBSEVENT_ERROR,
+ PBS_EVENTCLASS_JOB,
+ preq_sig->rq_ind.rq_signal.rq_jid,
+ log_buffer);
- free_nodes(pjob);
+fprintf(stderr,"%s",log_buffer);
+ free_nodes(pjob);
- set_resc_assigned(pjob,DECR);
+ set_resc_assigned(pjob,DECR);
- job_purge(pjob);
+ job_purge(pjob);
+ }
}
/* free task */
Index: src/server/req_rerun.c
===================================================================
--- src/server/req_rerun.c (revision 3692)
+++ src/server/req_rerun.c (working copy)
@@ -105,6 +105,7 @@
extern char *msg_manager;
extern char *msg_jobrerun;
+extern time_t time_now;
/*
@@ -165,6 +166,7 @@
struct batch_request *preq;
{
job *pjob;
+ extern struct work_task *apply_job_delete_nanny A_((struct job *,int));
if ((pjob = chk_job_request(preq->rq_ind.rq_rerun, preq)) == 0)
return;
@@ -188,9 +190,11 @@
return;
}
+ apply_job_delete_nanny(pjob,time_now + 60);
+
/* ask MOM to kill off the job */
- if (issue_signal(pjob, "SIGKILL", post_rerun, 0) != 0) {
+ if (issue_signal(pjob, "SIGTERM", post_rerun, 0) != 0) {
req_reject(PBSE_MOMREJECT, 0, preq,NULL,NULL);
return;
}
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 189 bytes
Desc: not available
Url : http://www.supercluster.org/pipermail/torquedev/attachments/20100616/af847a05/attachment.bin
More information about the torquedev
mailing list