[Mauiusers] Patch for the default partition handling

Eygene Ryabinkin rea+maui at grid.kiae.ru
Wed Mar 21 06:02:12 MDT 2007


Good day.

I discovered that the PDEF statement is not honored in my Maui
installation: I have statements like
-----
GROUPCFG[mygroup] PLIST=prod:testing PDEF=prod
-----
but the jobs are scheduled to the 'testing' partition first, which
goes against the 'PDEF' semantics, if I understand things correctly.
Reading the code showed that there are no checks for the default
partition at all. The only function that does such a check is
MJobGetPAL, but it is called with 'NULL' as the fourth argument, so
no default partition checking is done.

Attached are three patches that enable 'PDEF' processing.
The one thing I am not 100% sure about is that the first scheduling
pass, which selects jobs on their default partitions, is not
followed by the backfill procedure. I feel that it is right to do
the backfilling only after the queue has been filled, but I may be wrong.

The patches were written and tested on Maui 3.2.6p16, since
3.2.6p18 does not work on our production cluster, where these patches
were tested. They do, however, apply to the p18 sources without errors
and with only minimal offsets.

Patches are documented, so I am shutting up.
-- 
Eygene Ryabinkin, RRC KI
-------------- next part --------------
Subject: [PATCH] Prepare the MQueueSelectJobs() for the two-pass scheduling.

In order to get the 'PDEF' statement to work correctly we need
two-pass scheduling over the partitions: on the first pass, only the
jobs for which the given partition is the default are considered; on
the second pass, the rest of the jobs are considered for scheduling.
Note that the first pass must walk over all partitions before the
second pass (over all partitions and the remaining jobs) begins: we
should first select ALL jobs that can fit into their default
partitions.

The current patch is a no-op from the functional point of view.
It just moves the single-job check into the local function
MQueueCheckSingleJob(), whose body was taken from the original body
of MQueueSelectJobs().

The patch was tested on the RRC-KI Grid cluster and has so far shown
no regressions in its daily operation.

Signed-off-by: Eygene Ryabinkin <rea+maui at grid.kiae.ru>
---
 src/moab/MPolicy.c |  578 ++++++++++++++++++++++++++++------------------------
 1 files changed, 308 insertions(+), 270 deletions(-)

diff --git a/src/moab/MPolicy.c b/src/moab/MPolicy.c
index a0ae474..18e6b15 100644
--- a/src/moab/MPolicy.c
+++ b/src/moab/MPolicy.c
@@ -147,10 +147,19 @@ extern mres_t     *MRes[];
 
 */
 
-/* NYI:  must handle effqduration */
-
+static int MQueueCheckSingleJob(
+  mjob_t	*J,
+  int		*Reason,
+  mpar_t	*P,
+  mpar_t	*GP,
+  int            PLevel,
+  int            MaxNC,
+  int            MaxPC,
+  unsigned long  MaxWCLimit,
+  int            OrigPIndex,
+  mbool_t        UpdateStats);
 
-   
+/* NYI:  must handle effqduration */
 
 int MQueueSelectJobs(
 
@@ -171,27 +180,14 @@ int MQueueSelectJobs(
 
   mjob_t  *J;
 
-  char     DValue[MAX_MNAME];
-  enum MJobDependEnum DType;
-
   mpar_t  *P;
   mpar_t  *GP;
 
-  long     PS;
-
   int      LReason[MAX_MREJREASON];
-  int      PReason;
 
   int     *Reason;
 
   int      PIndex;
-  int      PReq;
-
-  mreq_t  *RQ;
-
-  double   PE;
-
-  char     tmpLine[MAX_MLINE];
 
   const char *FName = "MQueueSelectJobs";
 
@@ -267,368 +263,410 @@ int MQueueSelectJobs(
       continue;
       }
 
-    RQ = J->Req[0]; /* FIXME */
+    if (MQueueCheckSingleJob(J, Reason, P, GP, PLevel,
+	MaxNC, MaxPC, MaxWCLimit, OrigPIndex, UpdateStats) == FAILURE)
+      continue;
 
-    /* if job removed */
+    /* NOTE:  effective queue duration not yet properly supported */
 
-    if (J->Name[0] == '\0')
-      {
-      Reason[marCorruption]++;
+    J->EffQueueDuration = (MSched.Time > J->SystemQueueTime) ? 
+      MSched.Time - J->SystemQueueTime : 0;
+ 
+    /* add job to destination queue */
 
-      continue;
-      }
+    DBG(5,fSCHED) DPrint("INFO:     job '%s' added to queue at slot %d\n",
+      J->Name,
+      sindex);
 
-    if (UpdateStats == TRUE)
-      {
-      J->BlockReason = 0;
+    DstQ[sindex++] = SrcQ[jindex];
+    }  /* END for (jindex) */
 
-      if (J->State == mjsIdle)
-        MStat.IdleJobs++;
-      }
+  /* terminate list */
 
-    PReq = MJobGetProcCount(J);
-    MJobGetPE(J,P,&PE);
-    PS   = (long)PReq * J->SpecWCLimit[0];
+  DstQ[sindex] = -1;
 
-    /* check partition */
+  DBG(1,fSCHED)
+    {
+    DBG(1,fSCHED) DPrint("INFO:     total jobs selected in partition %s: %d/%-d ",
+      MAList[ePartition][PIndex],
+      sindex,
+      jindex);
 
-    if (OrigPIndex != -1)
+    for (index = 0;index < MAX_MREJREASON;index++)
       {
-      if ((P->Index == 0) && !(J->Flags & (1 << mjfSpan)))
+      if (Reason[index] != 0)
         {
-        /* why?  what does partition '0' mean in partition mode? */
+        fprintf(mlog.logfp,"[%s: %d]",
+          MAllocRejType[index],
+          Reason[index]);
+        }
+      }    /* END for (index) */
 
-        DBG(3,fSCHED) DPrint("INFO:     job %s not considered for spanning\n",
-          J->Name);
+    fprintf(mlog.logfp,"\n");
+    }
 
-        Reason[marPartitionAccess]++;
+  if (sindex == 0)
+    return(FAILURE);
 
-        continue;
-        }
-      else if ((P->Index != 0) && (J->Flags & (1 << mjfSpan)))
-        {
-        DBG(3,fSCHED) DPrint("INFO:     spanning job %s not considered for partition scheduling\n",
-          J->Name);
+  return(SUCCESS);
+  }  /* END MQueueSelectJobs() */
 
-        Reason[marPartitionAccess]++;
+/*
+ * Helper for MQueueSelectJobs: performs the single job evaluation.
+ * Returns SUCCESS if job can be queued and FAILURE otherwise.
+ */
+static int MQueueCheckSingleJob(
+  mjob_t	*J,
+  int		*Reason,
+  mpar_t	*P,
+  mpar_t	*GP,
+  int            PLevel,
+  int            MaxNC,
+  int            MaxPC,
+  unsigned long  MaxWCLimit,
+  int            OrigPIndex,
+  mbool_t        UpdateStats)
 
-        continue;
-        }
+  {
+  char     DValue[MAX_MNAME];
+  enum MJobDependEnum DType;
 
-      if ((P->Index > 0) && (MUBMCheck(P->Index,J->PAL) == FAILURE))
-        {
-        DBG(7,fSCHED) DPrint("INFO:     job %s not considered for partition %s (allowed %s)\n",
-          J->Name,
-          P->Name,
-          MUListAttrs(ePartition,J->PAL[0]));
+  long     PS;
 
-        Reason[marPartitionAccess]++;
+  int      PReason;
 
-        continue;
-        }
-      }   /* END if (OrigPIndex != -1) */
+  int      PReq;
 
-    /* check job state */
+  mreq_t  *RQ;
 
-    if ((J->State != mjsIdle) && (J->State != mjsSuspended))
-      {
-      DBG(6,fSCHED) DPrint("INFO:     job %s rejected (job in non-idle state '%s')\n",
-        J->Name,
-        MJobState[J->State]);
+  double   PE;
 
-      Reason[marState]++;
+  char     tmpLine[MAX_MLINE];
 
-      if ((MaxNC == MAX_MNODE) && 
-          (MaxWCLimit == MAX_MTIME) && 
-          (J->R != NULL))
-        {
-        if ((J->State != mjsStarting) && (J->State != mjsRunning))
-          MResDestroy(&J->R);
-        }
+  const char *FName = "MQueueCheckSingleJob";
 
-      continue;
-      }
+  RQ = J->Req[0]; /* FIXME */
 
-    /* check if job has been previously scheduled or deferred */
+  /* if job removed */
 
-    if ((J->EState != mjsIdle) && (J->EState != mjsSuspended))
+  if (J->Name[0] == '\0')
+    {
+    Reason[marCorruption]++;
+
+    return(FAILURE);
+    }
+
+  if (UpdateStats == TRUE)
+    {
+    J->BlockReason = 0;
+
+    if (J->State == mjsIdle)
+      MStat.IdleJobs++;
+    }
+
+  PReq = MJobGetProcCount(J);
+  /* XXX: PE is unused? */
+  MJobGetPE(J,P,&PE);
+  PS   = (long)PReq * J->SpecWCLimit[0];
+
+  /* check partition */
+
+  if (OrigPIndex != -1)
+    {
+    if ((P->Index == 0) && !(J->Flags & (1 << mjfSpan)))
       {
-      DBG(6,fSCHED) DPrint("INFO:     job %s rejected (job in non-idle expected state: '%s')\n",
-        J->Name,
-        MJobState[J->EState]);
+      /* why?  what does partition '0' mean in partition mode? */
 
-      Reason[marEState]++;
+      DBG(3,fSCHED) DPrint("INFO:     job %s not considered for spanning\n",
+        J->Name);
 
-      if ((MaxNC == MAX_MNODE) && (MaxWCLimit == MAX_MTIME) && (J->R != NULL))
-        {
-        if ((J->EState != mjsStarting) && (J->EState != mjsRunning))
-          MResDestroy(&J->R);
-        }
+      Reason[marPartitionAccess]++;
 
-      continue;
+      return(FAILURE);
       }
+    else if ((P->Index != 0) && (J->Flags & (1 << mjfSpan)))
+      {
+      DBG(3,fSCHED) DPrint("INFO:     spanning job %s not considered for partition scheduling\n",
+        J->Name);
 
-    /* check available procs */
+      Reason[marPartitionAccess]++;
+
+      return(FAILURE);
+      }
 
-    if (PReq > P->CRes.Procs)
+    if ((P->Index > 0) && (MUBMCheck(P->Index,J->PAL) == FAILURE))
       {
-      DBG(6,fSCHED) DPrint("INFO:     job %s rejected in partition %s (exceeds configured procs: %d > %d)\n",
+      DBG(7,fSCHED) DPrint("INFO:     job %s not considered for partition %s (allowed %s)\n",
         J->Name,
         P->Name,
-        PReq,
-        P->CRes.Procs);
+        MUListAttrs(ePartition,J->PAL[0]));
 
-      Reason[marNodeCount]++;
+      Reason[marPartitionAccess]++;
 
-      if (P->Index <= 0)
-        {
-        if (J->R != NULL)
-          MResDestroy(&J->R);
+      return(FAILURE);
+      }
+    }   /* END if (OrigPIndex != -1) */
 
-        if (J->Hold == 0)
-          {
-          MJobSetHold(
-            J,
-            (1 << mhDefer),
-            MSched.DeferTime,
-            mhrNoResources,
-            "exceeds partition configured procs");
-          }
-        }
+  /* check job state */
 
-      continue;
+  if ((J->State != mjsIdle) && (J->State != mjsSuspended))
+    {
+    DBG(6,fSCHED) DPrint("INFO:     job %s rejected (job in non-idle state '%s')\n",
+      J->Name,
+      MJobState[J->State]);
+
+    Reason[marState]++;
+
+    if ((MaxNC == MAX_MNODE) && 
+        (MaxWCLimit == MAX_MTIME) && 
+        (J->R != NULL))
+      {
+      if ((J->State != mjsStarting) && (J->State != mjsRunning))
+        MResDestroy(&J->R);
       }
 
-    /* check partition specific limits */
+    return(FAILURE);
+    }
 
-    if (MJobCheckLimits(
-          J,
-          PLevel,
-          P,
-          (1 << mlSystem),
-          tmpLine) == FAILURE)
+  /* check if job has been previously scheduled or deferred */
+
+  if ((J->EState != mjsIdle) && (J->EState != mjsSuspended))
+    {
+    DBG(6,fSCHED) DPrint("INFO:     job %s rejected (job in non-idle expected state: '%s')\n",
+      J->Name,
+      MJobState[J->EState]);
+
+    Reason[marEState]++;
+
+    if ((MaxNC == MAX_MNODE) && (MaxWCLimit == MAX_MTIME) && (J->R != NULL))
       {
-      DBG(6,fSCHED) DPrint("INFO:     job %s rejected, partition %s (%s)\n",
-        J->Name,
-        P->Name,
-        tmpLine);
+      if ((J->EState != mjsStarting) && (J->EState != mjsRunning))
+        MResDestroy(&J->R);
+      }
 
-      Reason[marSystemLimits]++;
+    return(FAILURE);
+    }
 
-      if (P->Index <= 0)
-        {
-        if (J->R != NULL)
-          MResDestroy(&J->R);
+  /* check available procs */
+
+  if (PReq > P->CRes.Procs)
+    {
+    DBG(6,fSCHED) DPrint("INFO:     job %s rejected in partition %s (exceeds configured procs: %d > %d)\n",
+      J->Name,
+      P->Name,
+      PReq,
+      P->CRes.Procs);
 
+    Reason[marNodeCount]++;
+
+    if (P->Index <= 0)
+      {
+      if (J->R != NULL)
+        MResDestroy(&J->R);
+
+      if (J->Hold == 0)
+        {
         MJobSetHold(
           J,
           (1 << mhDefer),
           MSched.DeferTime,
-          mhrSystemLimits,
-          "exceeds system proc/job limit");
+          mhrNoResources,
+          "exceeds partition configured procs");
         }
+      }
 
-      continue;
-      }  /* END if (MJobCheckLimits() == FAILURE) */
-
-    /* check job size */
-
-    if (PReq > MaxPC)
-      {
-      DBG(6,fSCHED) DPrint("INFO:     job %s rejected in partition %s (exceeds window size: %d > %d)\n",
-        J->Name,
-        P->Name,
-        PReq,
-        MaxPC);
+    return(FAILURE);
+    }
 
-      Reason[marNodeCount]++;
+  /* check partition specific limits */
 
-      continue;
-      }
+  if (MJobCheckLimits(
+        J,
+        PLevel,
+        P,
+        (1 << mlSystem),
+        tmpLine) == FAILURE)
+    {
+    DBG(6,fSCHED) DPrint("INFO:     job %s rejected, partition %s (%s)\n",
+      J->Name,
+      P->Name,
+      tmpLine);
 
-    /* check job duration */
+    Reason[marSystemLimits]++;
 
-    if (J->SpecWCLimit[0] > MaxWCLimit)
+    if (P->Index <= 0)
       {
-      DBG(6,fSCHED) DPrint("INFO:     job %s rejected in partition %s (exceeds window time: %ld > %ld)\n",
-        J->Name,
-        P->Name,
-        J->SpecWCLimit[0],
-        MaxWCLimit);
-
-      Reason[marTime]++;
+      if (J->R != NULL)
+        MResDestroy(&J->R);
 
-      continue;
+      MJobSetHold(
+        J,
+        (1 << mhDefer),
+        MSched.DeferTime,
+        mhrSystemLimits,
+        "exceeds system proc/job limit");
       }
 
-    /* check partition class support */
-
-    if (P->Index > 0)
-      {
-      if (MUNumListGetCount(J->StartPriority,RQ->DRes.PSlot,P->CRes.PSlot,0,NULL) == FAILURE)
-        {
-        DBG(6,fSCHED) DPrint("INFO:     job %s rejected, partition %s (classes not supported '%s')\n",
-          J->Name,
-          P->Name,
-          MUCAListToString(RQ->DRes.PSlot,P->CRes.PSlot,NULL));
+    return(FAILURE);
+    }  /* END if (MJobCheckLimits() == FAILURE) */
 
-        Reason[marClass]++;
+  /* check job size */
 
-        if (J->R != NULL)
-          MResDestroy(&J->R);
+  if (PReq > MaxPC)
+    {
+    DBG(6,fSCHED) DPrint("INFO:     job %s rejected in partition %s (exceeds window size: %d > %d)\n",
+      J->Name,
+      P->Name,
+      PReq,
+      MaxPC);
 
-        continue;
-        }
-      }      /* END if (PIndex) */
+    Reason[marNodeCount]++;
 
-    if (MJobCheckDependency(J,&DType,DValue) == FAILURE)
-      {
-      DBG(6,fSCHED) DPrint("INFO:     job %s rejected (dependent on job '%s' %s)\n",
-        J->Name,
-        DValue,
-        MJobDependType[DType]);
+    return(FAILURE);
+    }
 
-      if (GP->JobPrioAccrualPolicy == jpapFullPolicy)
-        {
-        J->SystemQueueTime = MSched.Time;
-        }
+  /* check job duration */
 
-      Reason[marDepend]++;
+  if (J->SpecWCLimit[0] > MaxWCLimit)
+    {
+    DBG(6,fSCHED) DPrint("INFO:     job %s rejected in partition %s (exceeds window time: %ld > %ld)\n",
+      J->Name,
+      P->Name,
+      J->SpecWCLimit[0],
+      MaxWCLimit);
 
-      if ((MaxNC == MAX_MNODE) &&
-          (MaxWCLimit == MAX_MTIME) &&
-          (J->R != NULL))
-        {
-        MResDestroy(&J->R);
-        }
+    Reason[marTime]++;
 
-      continue;
-      }  /* END if (MJobCheckDependency(J,&JDepend) == FAILURE) */
+    return(FAILURE);
+    }
 
-    /* check partition active job policies */
+  /* check partition class support */
 
-    if (MJobCheckPolicies(
-          J,
-          PLevel,
-          (1 << mlActive),
-          P,   /* NOTE:  may set to &MPar[0] */
-          &PReason,
-          NULL,
-          MAX_MTIME) == FAILURE)
+  if (P->Index > 0)
+    {
+    if (MUNumListGetCount(J->StartPriority,RQ->DRes.PSlot,P->CRes.PSlot,0,NULL) == FAILURE)
       {
-      DBG(6,fSCHED) DPrint("INFO:     job %s rejected, partition %s (policy failure: '%s')\n",
+      DBG(6,fSCHED) DPrint("INFO:     job %s rejected, partition %s (classes not supported '%s')\n",
         J->Name,
         P->Name,
-        MPolicyRejection[PReason]);
-
-      if (PLevel == ptHARD)
-        {
-        if (GP->JobPrioAccrualPolicy == jpapFullPolicy)
-          {
-          J->SystemQueueTime = MSched.Time;
-          }
-        }
+        MUCAListToString(RQ->DRes.PSlot,P->CRes.PSlot,NULL));
 
-      Reason[marPolicy]++;
+      Reason[marClass]++;
 
-      if ((MaxNC == MAX_MNODE) && 
-          (MaxWCLimit == MAX_MTIME) && 
-          (J->R != NULL))
-        {
+      if (J->R != NULL)
         MResDestroy(&J->R);
-        }
 
-      continue;
+      return(FAILURE);
       }
+    }      /* END if (PIndex) */
 
-    J->Cred.U->MTime = MSched.Time;
-    J->Cred.G->MTime = MSched.Time;
+  if (MJobCheckDependency(J,&DType,DValue) == FAILURE)
+    {
+    DBG(6,fSCHED) DPrint("INFO:     job %s rejected (dependent on job '%s' %s)\n",
+      J->Name,
+      DValue,
+      MJobDependType[DType]);
+
+    if (GP->JobPrioAccrualPolicy == jpapFullPolicy)
+      {
+      J->SystemQueueTime = MSched.Time;
+      }
 
-    if (J->Cred.A != NULL)
-      J->Cred.A->MTime = MSched.Time;
+    Reason[marDepend]++;
 
-    if (MPar[0].FSC.FSPolicy != fspNONE)
+    if ((MaxNC == MAX_MNODE) &&
+        (MaxWCLimit == MAX_MTIME) &&
+        (J->R != NULL))
       {
-      int OIndex;
+      MResDestroy(&J->R);
+      }
 
-      if (MFSCheckCap(NULL,J,P,&OIndex) == FAILURE)
-        {
-        DBG(5,fSCHED) DPrint("INFO:     job '%s' exceeds %s FS cap\n",
-          J->Name,
-          (OIndex > 0) ? MXO[OIndex] : "NONE");
- 
-        if (GP->JobPrioAccrualPolicy == jpapFullPolicy)
-          {
-          J->SystemQueueTime = MSched.Time;
-          }
- 
-        Reason[marFairShare]++;
+    return(FAILURE);
+    }  /* END if (MJobCheckDependency(J,&JDepend) == FAILURE) */
 
-        continue;
-        }
-      }    /* END if (FS[0].FSPolicy != fspNONE) */
+  /* check partition active job policies */
 
-    /* NOTE:  idle queue policies handled in MQueueSelectAllJobs() */
+  if (MJobCheckPolicies(
+        J,
+        PLevel,
+        (1 << mlActive),
+        P,   /* NOTE:  may set to &MPar[0] */
+        &PReason,
+        NULL,
+        MAX_MTIME) == FAILURE)
+    {
+    DBG(6,fSCHED) DPrint("INFO:     job %s rejected, partition %s (policy failure: '%s')\n",
+      J->Name,
+      P->Name,
+      MPolicyRejection[PReason]);
 
-    if (MLocalCheckFairnessPolicy(J,MSched.Time,NULL) == FAILURE)
+    if (PLevel == ptHARD)
       {
-      DBG(6,fSCHED) DPrint("INFO:     job %s rejected, partition %s (violates local fairness policy)\n",
-        J->Name,
-        P->Name);
-
-      if (GP->JobPrioAccrualPolicy == jpapFullPolicy) 
+      if (GP->JobPrioAccrualPolicy == jpapFullPolicy)
         {
         J->SystemQueueTime = MSched.Time;
         }
+      }
 
-      Reason[marPolicy]++;
+    Reason[marPolicy]++;
 
-      continue;
+    if ((MaxNC == MAX_MNODE) && 
+        (MaxWCLimit == MAX_MTIME) && 
+        (J->R != NULL))
+      {
+      MResDestroy(&J->R);
       }
 
-    /* NOTE:  effective queue duration not yet properly supported */
+    return(FAILURE);
+    }
 
-    J->EffQueueDuration = (MSched.Time > J->SystemQueueTime) ? 
-      MSched.Time - J->SystemQueueTime : 0;
- 
-    /* add job to destination queue */
+  J->Cred.U->MTime = MSched.Time;
+  J->Cred.G->MTime = MSched.Time;
 
-    DBG(5,fSCHED) DPrint("INFO:     job '%s' added to queue at slot %d\n",
-      J->Name,
-      sindex);
+  if (J->Cred.A != NULL)
+    J->Cred.A->MTime = MSched.Time;
 
-    DstQ[sindex++] = SrcQ[jindex];
-    }  /* END for (jindex) */
+  if (MPar[0].FSC.FSPolicy != fspNONE)
+    {
+    int OIndex;
 
-  /* terminate list */
+    if (MFSCheckCap(NULL,J,P,&OIndex) == FAILURE)
+      {
+      DBG(5,fSCHED) DPrint("INFO:     job '%s' exceeds %s FS cap\n",
+        J->Name,
+        (OIndex > 0) ? MXO[OIndex] : "NONE");
+ 
+      if (GP->JobPrioAccrualPolicy == jpapFullPolicy)
+        {
+        J->SystemQueueTime = MSched.Time;
+        }
+ 
+      Reason[marFairShare]++;
 
-  DstQ[sindex] = -1;
+      return(FAILURE);
+      }
+    }    /* END if (FS[0].FSPolicy != fspNONE) */
 
-  DBG(1,fSCHED)
+  /* NOTE:  idle queue policies handled in MQueueSelectAllJobs() */
+
+  if (MLocalCheckFairnessPolicy(J,MSched.Time,NULL) == FAILURE)
     {
-    DBG(1,fSCHED) DPrint("INFO:     total jobs selected in partition %s: %d/%-d ",
-      MAList[ePartition][PIndex],
-      sindex,
-      jindex);
+    DBG(6,fSCHED) DPrint("INFO:     job %s rejected, partition %s (violates local fairness policy)\n",
+      J->Name,
+      P->Name);
 
-    for (index = 0;index < MAX_MREJREASON;index++)
+    if (GP->JobPrioAccrualPolicy == jpapFullPolicy) 
       {
-      if (Reason[index] != 0)
-        {
-        fprintf(mlog.logfp,"[%s: %d]",
-          MAllocRejType[index],
-          Reason[index]);
-        }
-      }    /* END for (index) */
+      J->SystemQueueTime = MSched.Time;
+      }
 
-    fprintf(mlog.logfp,"\n");
-    }
+    Reason[marPolicy]++;
 
-  if (sindex == 0)
     return(FAILURE);
+    }
 
   return(SUCCESS);
-  }  /* END MQueueSelectJobs() */
+  }  /* END MQueueCheckSingleJob() */
 
 
 
-- 
1.5.0.3-dirty

-------------- next part --------------
Subject: [PATCH] Prepare the MSchedProcessJobs() for the two-pass scheduling.

Moved part of the original MJobGetPAL() function into the new
public function MJobFindDefPart(), which determines the default
partition for a job.

The MQueueSelectJobs() prototype was modified: the OnlyDefPart flag
was added. When it is set to TRUE, only jobs whose default partition
is the passed partition are examined; all other jobs are skipped in
the selection process. When OnlyDefPart is set to FALSE the original
behaviour is retained: all jobs are examined.

The patch is a no-op from the functional point of view: the
OnlyDefPart argument to MQueueSelectJobs() is set to FALSE everywhere.

The patch was tested on the RRC-KI Grid cluster and has so far shown
no regressions in its daily operation.

Signed-off-by: Eygene Ryabinkin <rea+maui at grid.kiae.ru>
---
 include/moab-proto.h |    3 +-
 src/moab/MPar.c      |  107 ++++++++++++++++++++++++++++++--------------------
 src/moab/MPolicy.c   |   13 ++++++-
 src/moab/MQueue.c    |    2 +
 src/moab/MSched.c    |   16 +++++--
 src/server/UserI.c   |    1 +
 6 files changed, 92 insertions(+), 50 deletions(-)

diff --git a/include/moab-proto.h b/include/moab-proto.h
index 468bab0..7f983dc 100644
--- a/include/moab-proto.h
+++ b/include/moab-proto.h
@@ -396,6 +396,7 @@ int MJobSetState(mjob_t *,enum MJobStateEnum);
 int MJobPreempt(mjob_t *,mjob_t **,enum MPreemptPolicyEnum,char *,int *);
 int MJobResume(mjob_t *,char *,int *);
 int MJobGetPAL(mjob_t *,int *,int *,mpar_t **);
+mpar_t *MJobFindDefPart(mjob_t *, mclass_t *, int *);
 int MJobRemove(mjob_t *);
 int MJobGetAccount(mjob_t *,mgcred_t **);
 int MJobSetCreds(mjob_t *,char *,char *,char *);
@@ -491,7 +492,7 @@ int MQueueDiagnose(mjob_t **,int *,int,mpar_t *,char *,int);
 int MQueueCheckStatus(void);
 int MQueueGetRequeueValue(int *,long,long,double *);
 int MQueueSelectAllJobs(mjob_t **,int,mpar_t *,int *,int,int,int,char *);
-int MQueueSelectJobs(int *,int *,int,int,int,unsigned long,int,int *,mbool_t);
+int MQueueSelectJobs(int *,int *,int,int,int,unsigned long,int,int *,mbool_t,mbool_t);
 int MQueueAddAJob(mjob_t *);
 int MQueueRemoveAJob(mjob_t *,int);
 int MQueueBackFill(int *,int,mpar_t *);
diff --git a/src/moab/MPar.c b/src/moab/MPar.c
index 0b63285..9f9d037 100644
--- a/src/moab/MPar.c
+++ b/src/moab/MPar.c
@@ -347,52 +347,11 @@ int MJobGetPAL(
   if (PAL != NULL)
     MUBMCopy(PAL,tmpPAL,MAX_MPAR);
  
-  /* determine allowed partition default (precedence: U,G,A,C,S,0) */
+  /* determine allowed partition default */
  
   if (PDef != NULL)
     {
-    if ((J->Cred.U->F.PDef != NULL) &&
-        (J->Cred.U->F.PDef != &MPar[0]) &&
-         MUBMCheck(((mpar_t *)J->Cred.U->F.PDef)->Index,tmpPAL))
-      {
-      *PDef = (mpar_t  *)J->Cred.U->F.PDef;
-      }
-    else if ((J->Cred.G->F.PDef != NULL) &&
-             (J->Cred.G->F.PDef != &MPar[0]) &&
-              MUBMCheck(((mpar_t *)J->Cred.G->F.PDef)->Index,tmpPAL))
-      {
-      *PDef = (mpar_t  *)J->Cred.G->F.PDef;
-      }
-    else if ((J->Cred.A != NULL) &&
-             (J->Cred.A->F.PDef != NULL) &&
-             (J->Cred.A->F.PDef != &MPar[0]) &&
-              MUBMCheck(((mpar_t *)J->Cred.A->F.PDef)->Index,tmpPAL))
-      {
-      *PDef = (mpar_t  *)J->Cred.A->F.PDef;
-      }
-    else if ((C != NULL) &&
-             (C->F.PDef != NULL) &&
-             (C->F.PDef != &MPar[0]) &&
-              MUBMCheck(((mpar_t *)C->F.PDef)->Index,tmpPAL)) 
-      {
-      *PDef = (mpar_t  *)C->F.PDef;
-      }
-    else if ((J->Cred.Q != NULL) &&
-             (J->Cred.Q->F.PDef != NULL) &&
-             (J->Cred.Q->F.PDef != &MPar[0]) &&
-              MUBMCheck(((mpar_t *)J->Cred.Q->F.PDef)->Index,tmpPAL))
-      {
-      *PDef = (mpar_t  *)J->Cred.Q->F.PDef;
-      }
-    else if ((MPar[0].F.PDef != NULL) &&
-             (MPar[0].F.PDef != &MPar[0]))
-      {
-      *PDef = (mpar_t  *)MPar[0].F.PDef;
-      }
-    else
-      {
-      *PDef = &MPar[MDEF_SYSPDEF];
-      }
+    *PDef = MJobFindDefPart(J, C, tmpPAL);
  
     /* verify access to default partition */
  
@@ -439,7 +398,69 @@ int MJobGetPAL(
   return(SUCCESS);
   }  /* END MJobGetPAL() */
 
+/*
+ * Determines default partition for a job (precedence: U,G,A,C,S,0)
+ * 'PAL' is consulted to determine partition access if it is not NULL.
+ * 'C' is consulted for the default partition if it is not NULL.
+ */
+mpar_t *MJobFindDefPart(
+  mjob_t   *J,     /* I:  job                                */
+  mclass_t *C,     /* I:  job class                          */
+  int      *PAL)   /* I:  partition access list              */
+
+  {
+  mpar_t   *PDef;
+
+  if ((J->Cred.U->F.PDef != NULL) &&
+      (J->Cred.U->F.PDef != &MPar[0]) &&
+      (PAL == NULL ||
+       MUBMCheck(((mpar_t *)J->Cred.U->F.PDef)->Index,PAL)))
+    {
+    PDef = (mpar_t  *)J->Cred.U->F.PDef;
+    }
+  else if ((J->Cred.G->F.PDef != NULL) &&
+           (J->Cred.G->F.PDef != &MPar[0]) &&
+           (PAL == NULL ||
+            MUBMCheck(((mpar_t *)J->Cred.G->F.PDef)->Index,PAL)))
+    {
+    PDef = (mpar_t  *)J->Cred.G->F.PDef;
+    }
+  else if ((J->Cred.A != NULL) &&
+           (J->Cred.A->F.PDef != NULL) &&
+           (J->Cred.A->F.PDef != &MPar[0]) &&
+           (PAL == NULL ||
+            MUBMCheck(((mpar_t *)J->Cred.A->F.PDef)->Index,PAL)))
+    {
+    PDef = (mpar_t  *)J->Cred.A->F.PDef;
+    }
+  else if ((C != NULL) &&
+           (C->F.PDef != NULL) &&
+           (C->F.PDef != &MPar[0]) &&
+           (PAL == NULL ||
+            MUBMCheck(((mpar_t *)C->F.PDef)->Index,PAL)))
+    {
+    PDef = (mpar_t  *)C->F.PDef;
+    }
+  else if ((J->Cred.Q != NULL) &&
+           (J->Cred.Q->F.PDef != NULL) &&
+           (J->Cred.Q->F.PDef != &MPar[0]) &&
+           (PAL == NULL ||
+	    MUBMCheck(((mpar_t *)J->Cred.Q->F.PDef)->Index,PAL)))
+    {
+    PDef = (mpar_t  *)J->Cred.Q->F.PDef;
+    }
+  else if ((MPar[0].F.PDef != NULL) &&
+           (MPar[0].F.PDef != &MPar[0]))
+    {
+    PDef = (mpar_t  *)MPar[0].F.PDef;
+    }
+  else
+    {
+    PDef = &MPar[MDEF_SYSPDEF];
+    }
 
+  return PDef;
+  }  /* END MJobFindDefPart() */
 
 
 int MParFind(
diff --git a/src/moab/MPolicy.c b/src/moab/MPolicy.c
index 18e6b15..a6e7b72 100644
--- a/src/moab/MPolicy.c
+++ b/src/moab/MPolicy.c
@@ -171,7 +171,8 @@ int MQueueSelectJobs(
   unsigned long  MaxWCLimit,    /* I */
   int            OrigPIndex,    /* I */
   int           *FReason,       /* O */
-  mbool_t        UpdateStats)   /* I:  (boolean) */
+  mbool_t        UpdateStats,   /* I:  (boolean) */
+  mbool_t        OnlyDefPart)   /* I:  (boolean) */
 
   {
   int      index;
@@ -263,6 +264,16 @@ int MQueueSelectJobs(
       continue;
       }
 
+    if (OnlyDefPart == TRUE && MJobFindDefPart(J, NULL, NULL) != P)
+      {
+      DBG(7,fSCHED) DPrint("INFO:     skipping job[%d] '%s', only default partition check requested (and current partition is %s)\n",
+        jindex,
+        J->Name,
+	P->Name);
+
+      continue;
+      }
+
     if (MQueueCheckSingleJob(J, Reason, P, GP, PLevel,
 	MaxNC, MaxPC, MaxWCLimit, OrigPIndex, UpdateStats) == FAILURE)
       continue;
diff --git a/src/moab/MQueue.c b/src/moab/MQueue.c
index 137edbe..b2afc15 100644
--- a/src/moab/MQueue.c
+++ b/src/moab/MQueue.c
@@ -446,6 +446,7 @@ int MQueueBackFill(
           AdjBFTime,
           P->Index,
           NULL,
+          FALSE,
           FALSE) == FAILURE)
       {
       DBG(5,fSCHED) DPrint("INFO:     no jobs meet BF window criteria in partition %s\n",
@@ -1516,6 +1517,7 @@ int MQueueCheckStatus()
                 MAX_MTIME,
                 -1,
                 ReasonList,
+                FALSE,
                 FALSE) == FAILURE)
             {
             strcpy(DeferMessage,"SCHED_INFO:  job cannot run.  Reason: cannot select job\n");
diff --git a/src/moab/MSched.c b/src/moab/MSched.c
index 7747e4a..b483db3 100644
--- a/src/moab/MSched.c
+++ b/src/moab/MSched.c
@@ -6943,6 +6943,7 @@ int MSchedProcessJobs(
             MAX_MTIME,
             -1,
             NULL,
+            FALSE,
             FALSE) == SUCCESS)
         {
         memcpy(MFQ,tmpQ,sizeof(MFQ));
@@ -6965,7 +6966,8 @@ int MSchedProcessJobs(
         MAX_MTIME,
         -1,
         NULL,
-        TRUE);
+        TRUE,
+        FALSE);
 
       /* schedule priority jobs */
 
@@ -6990,7 +6992,8 @@ int MSchedProcessJobs(
                 MAX_MTIME,
                 PIndex,
                 NULL,
-                TRUE) == SUCCESS)
+                TRUE,
+                FALSE) == SUCCESS)
             {
             MQueueScheduleIJobs(tmpQ,&MPar[PIndex]);
 
@@ -7017,7 +7020,8 @@ int MSchedProcessJobs(
         MAX_MTIME,
         -1,
         NULL,
-        TRUE);
+        TRUE,
+        FALSE);
 
       if (CurrentQ[0] != -1)
         {
@@ -7049,7 +7053,8 @@ int MSchedProcessJobs(
                 MAX_MTIME,
                 PIndex,
                 NULL,
-                TRUE) == SUCCESS)
+                TRUE,
+                FALSE) == SUCCESS)
             {
             MQueueBackFill(tmpQ,ptHARD,&MPar[PIndex]);
             }
@@ -7091,7 +7096,8 @@ int MSchedProcessJobs(
     MAX_MTIME,
     -1,
     NULL,
-    TRUE);
+    TRUE,
+    FALSE);
 
   /* must sort/order MUIQ */
 
diff --git a/src/server/UserI.c b/src/server/UserI.c
index 3b34ae5..0cb350d 100644
--- a/src/server/UserI.c
+++ b/src/server/UserI.c
@@ -1790,6 +1790,7 @@ int UIJobShow(
           MAX_MTIME,
           P->Index,
           Reason,
+          FALSE,
           FALSE) == FAILURE) || (DstQ[0] == -1))
       {
       for (index = 0;index < MAX_MREJREASON;index++)
-- 
1.5.0.3-dirty

-------------- next part --------------
Subject: [PATCH] Fixed default partition handling by the two-pass scheduling.

MSchedProcessJobs() now uses two-pass scheduling: the first pass over
all partitions schedules jobs that can be placed on their default
partitions, and the second pass schedules the rest of the jobs.
Backfilling is disabled on the first pass: we should first load the
queue with the eligible jobs and only then do the backfilling.

The patch was tested on the RRC-KI Grid cluster and has so far shown
no regressions in its daily operation. The default partition ('PDEF')
statement works as expected: jobs are first scheduled to the
default partition, and only once the default partition's nodes are
busy do they go to the rest of the partitions.

Signed-off-by: Eygene Ryabinkin <rea+maui at grid.kiae.ru>
---
 src/moab/MSched.c |   81 ++++++++++++++++++++++++++++++----------------------
 1 files changed, 47 insertions(+), 34 deletions(-)

diff --git a/src/moab/MSched.c b/src/moab/MSched.c
index b483db3..7b9a890 100644
--- a/src/moab/MSched.c
+++ b/src/moab/MSched.c
@@ -6971,44 +6971,57 @@ int MSchedProcessJobs(
 
       /* schedule priority jobs */
 
+#ifdef M_SCHEDULE_ON_PARTITIONS
+#error Symbol M_SCHEDULE_ON_PARTITIONS is already defined. Fix me, please.
+#endif
+#define M_SCHEDULE_ON_PARTITONS(_OnlyDefPart, _DoBackfill) \
+	do {								\
+        for (PIndex = 0;PIndex < MAX_MPAR;PIndex++)			\
+          {								\
+          if (((PIndex == 0) && (MPar[2].ConfigNodes == 0)) ||		\
+              (MPar[PIndex].ConfigNodes == 0))				\
+            {								\
+            continue;							\
+            }								\
+									\
+          MOQueueInitialize(tmpQ);					\
+									\
+          if (MQueueSelectJobs(						\
+                CurrentQ,						\
+                tmpQ,							\
+                ptSOFT,							\
+                MAX_MNODE,						\
+                MAX_MTASK,						\
+                MAX_MTIME,						\
+                PIndex,							\
+                NULL,							\
+                TRUE,							\
+                _OnlyDefPart) == SUCCESS)				\
+            {								\
+            MQueueScheduleIJobs(tmpQ,&MPar[PIndex]);			\
+									\
+            if (_DoBackfill == TRUE && MPar[PIndex].BFPolicy != ptOFF)	\
+              {								\
+              /* backfill jobs using 'soft' policy constraints */	\
+									\
+              MQueueBackFill(tmpQ,ptSOFT,&MPar[PIndex]);		\
+              }								\
+            }								\
+									\
+          MOQueueDestroy(tmpQ,FALSE);					\
+          }    /* END for (PIndex) */					\
+	  } while (0)
+
       if (CurrentQ[0] != -1)
         {
-        for (PIndex = 0;PIndex < MAX_MPAR;PIndex++)
-          {
-          if (((PIndex == 0) && (MPar[2].ConfigNodes == 0)) ||
-              (MPar[PIndex].ConfigNodes == 0))
-            {
-            continue;
-            }
-
-          MOQueueInitialize(tmpQ);
-
-          if (MQueueSelectJobs(
-                CurrentQ,
-                tmpQ,
-                ptSOFT,
-                MAX_MNODE,
-                MAX_MTASK,
-                MAX_MTIME,
-                PIndex,
-                NULL,
-                TRUE,
-                FALSE) == SUCCESS)
-            {
-            MQueueScheduleIJobs(tmpQ,&MPar[PIndex]);
-
-            if (MPar[PIndex].BFPolicy != ptOFF)
-              {
-              /* backfill jobs using 'soft' policy constraints */
-
-              MQueueBackFill(tmpQ,ptSOFT,&MPar[PIndex]);
-              }
-            }
-
-          MOQueueDestroy(tmpQ,FALSE);
-          }    /* END for (PIndex) */
+	/* schedule jobs on their default partitions; skip backfilling  */
+	M_SCHEDULE_ON_PARTITONS(TRUE, FALSE);
+	/* schedule jobs on all partitions; do backfilling  */
+	M_SCHEDULE_ON_PARTITONS(FALSE, TRUE);
         }      /* END if (GlobalSQ[0] != -1) */
 
+#undef M_SCHEDULE_ON_PARTITONS
+
       MOQueueDestroy(CurrentQ,TRUE);
 
       MQueueSelectJobs(
-- 
1.5.0.3-dirty



More information about the mauiusers mailing list