[torqueusers] Torque/maui node failure policy

Marcus R. Epperson mrepper at sandia.gov
Wed Jun 20 17:11:55 MDT 2007


On 06/18/2007 07:21 PM, Marcus R. Epperson wrote:
> On 06/18/2007 06:30 PM, Garrick Staples wrote:
>> On Mon, Jun 18, 2007 at 05:28:27PM -0700, Peter Wyckoff alleged:
>>> [...]
>>> Is there a way to configure torque to do nothing other than the head 
>>> node?
>>> Or do nothing no matter what ? (since head node failures should be 
>>> rare as
>>> opposed to other nodes).
>>
>> TORQUE doesn't cancel jobs when sister nodes go down.  You might be
>> seeing Maui do that, it has a 5 minute job delete hardwired in there.
> 
> Doesn't the MS kill the job if one or more IM polls fails?  We see these 
> in our MS logs fairly often:
> [...]
> 
> I assumed you'd have to comment out the "send_sisters(pjob,IM_POLL_JOB)" 
> portion of the main mom loop to avoid that.  Maybe I'm missing something 
> though.

Replying to myself here, but disabling this behavior is something I've wanted before too.  Maybe we can make it a configurable option.  The attached patch (for 2.1.8) should allow you to disable the killing of the job when polling a sister node fails.  Just set a line in mom_priv/config like so:

$fatal_job_poll_failure false

You'll still see the same "POLL failed from node ___" messages in the mom logs and syslog, it just won't terminate the job.

-Marcus
-------------- next part --------------
--- torque-2.1.8.orig/src/resmom/mom_comm.c	2007-02-14 09:23:09.000000000 -0700
+++ torque-2.1.8/src/resmom/mom_comm.c	2007-06-20 14:12:56.000000000 -0600
@@ -165,6 +165,7 @@ char noglobid[] = "none";
 
 extern int LOGLEVEL;
 extern long TJobStartBlockTime;
+extern int FatalJobPollFailure;
 
 /* external functions */
 
@@ -1150,7 +1151,12 @@ void node_bailout(
 
         log_err(-1,id,log_buffer);
 
-        pjob->ji_nodekill = np->hn_node;
+        /*
+        ** Setting ji_nodekill will result in the job being killed.
+        ** Do it unless we're told not to via config parameter.
+        */
+        if (FatalJobPollFailure)
+          pjob->ji_nodekill = np->hn_node;
 
         break;
 
--- torque-2.1.8.orig/src/resmom/mom_main.c	2007-02-25 18:35:12.000000000 -0700
+++ torque-2.1.8/src/resmom/mom_main.c	2007-06-20 15:51:42.000000000 -0600
@@ -159,6 +159,7 @@
 
 int             ServerStatUpdateInterval = DEFAULT_SERVER_STAT_UPDATES;
 int             CheckPollTime            = CHECK_POLL_TIME;
+int             FatalJobPollFailure      = 1;
 
 double		cputfactor = 1.00;
 unsigned int	default_server_port = 0;
@@ -311,6 +312,7 @@ static unsigned long setcheckpointscript
 static unsigned long setdownonerror(char *);
 static unsigned long setstatusupdatetime(char *);
 static unsigned long setcheckpolltime(char *);
+static unsigned long setfataljobpollfailure(char *);
 static unsigned long settmpdir(char *);
 static unsigned long setlogfilemaxsize(char *);
 static unsigned long setlogfilerolldepth(char *);
@@ -348,6 +350,7 @@ static struct specials {
     { "tmpdir",       settmpdir },
     { "log_file_max_size", setlogfilemaxsize},
     { "log_file_roll_depth", setlogfilerolldepth},
+    { "fatal_job_poll_failure", setfataljobpollfailure},
     { NULL,           NULL } };
 
 
@@ -1692,6 +1695,58 @@ static u_long setdownonerror(
   }  /* END setdownonerror() */
 
 
+static u_long setfataljobpollfailure(
+
+  char *Value)  /* I */
+
+  {
+  static char   id[] = "setfataljobpollfailure";
+  int           enable = -1;
+
+  log_record(PBSEVENT_SYSTEM,PBS_EVENTCLASS_SERVER,id,Value);
+
+  if (Value == NULL)
+    {
+    /* FAILURE */
+
+    return(0);
+    }
+
+  /* accept various forms of "true", "yes", and "1" */
+  switch (Value[0])
+    {
+    case 't':
+    case 'T':
+    case 'y':
+    case 'Y':
+    case '1':
+
+      enable = 1;
+    
+      break;
+
+  /* accept various forms of "false", "no", and "0" */
+    case 'f':
+    case 'F':
+    case 'n':
+    case 'N':
+    case '0':
+
+      enable = 0;
+    
+      break;
+
+    }
+
+  if (enable != -1)
+    {
+    FatalJobPollFailure = enable;
+    }
+
+  return(1);
+  }  /* END setfataljobpollfailure() */
+
+
 static u_long setenablemomrestart(
 
   char *Value)  /* I */
@@ -3925,6 +3980,18 @@ int rm_request(
             sprintf(output,"check_poll_time=%d",
               CheckPollTime);
             }
+          else if (!strncasecmp(name,"fatal_job_poll_failure",strlen("fatal_job_poll_failure")))
+            {
+            /* set or report fatal_job_poll_failure */
+
+            if ( (*curr == '=') && ((*curr)+1 != '\0' ))
+              {
+              setfataljobpollfailure(curr+1);
+              }
+
+            sprintf(output,"fatal_job_poll_failure=%d",
+              FatalJobPollFailure);
+            }
           else if (!strncasecmp(name,"jobstartblocktime",strlen("jobstartblocktime")))
             {
             /* set or report jobstartblocktime */


More information about the torqueusers mailing list