[torqueusers] Torque/maui node failure policy
Marcus R. Epperson
mrepper at sandia.gov
Wed Jun 20 17:11:55 MDT 2007
On 06/18/2007 07:21 PM, Marcus R. Epperson wrote:
> On 06/18/2007 06:30 PM, Garrick Staples wrote:
>> On Mon, Jun 18, 2007 at 05:28:27PM -0700, Peter Wyckoff alleged:
>>> [...]
>>> Is there a way to configure torque to do nothing other than the head
>>> node?
>>> Or do nothing no matter what ? (since head node failures should be
>>> rare as
>>> opposed to other nodes).
>>
>> TORQUE doesn't cancel jobs when sister nodes go down. You might be
>> seeing Maui do that, it has a 5 minute job delete hardwired in there.
>
> Doesn't the MS kill the job if one or more IM polls fails? We see these
> in our MS logs fairly often:
> [...]
>
> I assumed you'd have to comment out the "send_sisters(pjob,IM_POLL_JOB)"
> portion of the main mom loop to avoid that. Maybe I'm missing something
> though.
Replying to myself here, but disabling this behavior is something I've wanted before too. Maybe we can make it a configurable option. The attached patch (for 2.1.8) should allow you to disable the killing of the job when polling a sister node fails. Just set a line in mom_priv/config like so:
$fatal_job_poll_failure false
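You'll still see the same "POLL failed from node ___" messages in the mom logs and syslog; the failure just won't terminate the job. If the line is omitted, the option defaults to true, which keeps the existing behavior of killing the job.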
-Marcus
-------------- next part --------------
--- torque-2.1.8.orig/src/resmom/mom_comm.c 2007-02-14 09:23:09.000000000 -0700
+++ torque-2.1.8/src/resmom/mom_comm.c 2007-06-20 14:12:56.000000000 -0600
@@ -165,6 +165,7 @@ char noglobid[] = "none";
extern int LOGLEVEL;
extern long TJobStartBlockTime;
+extern int FatalJobPollFailure;
/* external functions */
@@ -1150,7 +1151,12 @@ void node_bailout(
log_err(-1,id,log_buffer);
- pjob->ji_nodekill = np->hn_node;
+ /*
+ ** Setting ji_nodekill will result in the job being killed.
+ ** Do it unless we're told not to via config parameter.
+ */
+ if (FatalJobPollFailure)
+ pjob->ji_nodekill = np->hn_node;
break;
--- torque-2.1.8.orig/src/resmom/mom_main.c 2007-02-25 18:35:12.000000000 -0700
+++ torque-2.1.8/src/resmom/mom_main.c 2007-06-20 15:51:42.000000000 -0600
@@ -159,6 +159,7 @@
int ServerStatUpdateInterval = DEFAULT_SERVER_STAT_UPDATES;
int CheckPollTime = CHECK_POLL_TIME;
+int FatalJobPollFailure = 1;
double cputfactor = 1.00;
unsigned int default_server_port = 0;
@@ -311,6 +312,7 @@ static unsigned long setcheckpointscript
static unsigned long setdownonerror(char *);
static unsigned long setstatusupdatetime(char *);
static unsigned long setcheckpolltime(char *);
+static unsigned long setfataljobpollfailure(char *);
static unsigned long settmpdir(char *);
static unsigned long setlogfilemaxsize(char *);
static unsigned long setlogfilerolldepth(char *);
@@ -348,6 +350,7 @@ static struct specials {
{ "tmpdir", settmpdir },
{ "log_file_max_size", setlogfilemaxsize},
{ "log_file_roll_depth", setlogfilerolldepth},
+ { "fatal_job_poll_failure", setfataljobpollfailure},
{ NULL, NULL } };
@@ -1692,6 +1695,58 @@ static u_long setdownonerror(
} /* END setdownonerror() */
+/* Config handler for "fatal_job_poll_failure": looks only at Value[0].
+ * t/T/y/Y/1 sets FatalJobPollFailure to 1, f/F/n/N/0 sets it to 0, and any
+ * other value leaves it unchanged.  Returns 0 if Value is NULL, else 1. */
+static u_long setfataljobpollfailure(
+
+  char *Value)  /* I */
+
+  {
+  static char id[] = "setfataljobpollfailure";
+  int enable = -1;  /* -1: Value not recognized */
+
+  /* test for NULL first so log_record() is never handed a NULL string */
+  if (Value == NULL)
+    {
+    /* FAILURE */
+    return(0);
+    }
+
+  log_record(PBSEVENT_SYSTEM,PBS_EVENTCLASS_SERVER,id,Value);
+
+  switch (Value[0])
+    {
+    /* accept various forms of "true", "yes", and "1" */
+    case 't':
+    case 'T':
+    case 'y':
+    case 'Y':
+    case '1':
+      enable = 1;
+      break;
+
+    /* accept various forms of "false", "no", and "0" */
+    case 'f':
+    case 'F':
+    case 'n':
+    case 'N':
+    case '0':
+      enable = 0;
+      break;
+
+    default:  /* unrecognized: keep the current setting */
+      break;
+    }
+
+  if (enable != -1)
+    {
+    FatalJobPollFailure = enable;
+    }
+  return(1);
+  }  /* END setfataljobpollfailure() */
+
+
static u_long setenablemomrestart(
char *Value) /* I */
@@ -3925,6 +3980,18 @@ int rm_request(
sprintf(output,"check_poll_time=%d",
CheckPollTime);
}
+ else if (!strncasecmp(name,"fatal_job_poll_failure",strlen("fatal_job_poll_failure")))
+ {
+ /* set or report fatal_job_poll_failure */
+ /* set only if a value follows '='; "(*curr)+1" was '=' + 1, so always true */
+ if ( (*curr == '=') && (*(curr+1) != '\0' ))
+ {
+ setfataljobpollfailure(curr+1);
+ }
+
+ sprintf(output,"fatal_job_poll_failure=%d",
+ FatalJobPollFailure);
+ }
else if (!strncasecmp(name,"jobstartblocktime",strlen("jobstartblocktime")))
{
/* set or report jobstartblocktime */
More information about the torqueusers mailing list