[torquedev] Patches to add network license integration to C fifo scheduler

Joshua Weage joshua.weage at valartechnologies.com
Tue Mar 24 13:23:41 MDT 2009


Please find attached three patches for the C fifo scheduler.

The first patch raises the upper limit for a token count from 1000 to
1000000 via a new #define, PBS_MAXTOKEN.

The second patch adds the ability to obtain the number of available
licenses/tokens via a script or program executed by the scheduler.  It
also adds a start delay for jobs requesting "dynamic tokens", so that a
job which has just been started has enough time to obtain its licenses
before a subsequent job requesting the same token is run.

The third patch adds documentation for the tokens attribute/resource
request to the appropriate man pages.

I have not tested these patches extensively (as in extensive
operational use), but basic testing was successful.

Regards,

Joshua Weage, PE
Valar Technologies
http://www.valartechnologies.com/

<------------------------ PATCH 1 ------------------------->
diff -cr torque-2.3.6-p0/src/include/pbs_ifl.h
torque-2.3.6-p1/src/include/pbs_ifl.h
*** torque-2.3.6-p0/src/include/pbs_ifl.h    2008-09-22 18:01:19.000000000 -0400
--- torque-2.3.6-p1/src/include/pbs_ifl.h    2009-03-21 17:46:02.000000000 -0400
***************
*** 372,377 ****
--- 372,378 ----
  #define PBS_INTERACTIVE  1 /* Support of Interactive jobs */
  #define PBS_TERM_BUF_SZ  80 /* Interactive term buffer size */
  #define PBS_TERM_CCA  6 /* Interactive term cntl char array */
+ #define PBS_MAXTOKEN 1000000 /* maximum token value */

  #define PBS_QS_VERSION  0x00020300 /* magic number used to determine
version of pbs job quick save struct */
  /* the magic number is split into 4 8-bit chunks.  the first 8 bits are
diff -cr torque-2.3.6-p0/src/lib/Libattr/attr_fn_tokens.c
torque-2.3.6-p1/src/lib/Libattr/attr_fn_tokens.c
*** torque-2.3.6-p0/src/lib/Libattr/attr_fn_tokens.c    2008-09-08
11:43:08.000000000 -0400
--- torque-2.3.6-p1/src/lib/Libattr/attr_fn_tokens.c    2009-03-21
17:47:14.000000000 -0400
***************
*** 40,46 ****
        {
        count = atof(++colon);

!       if (count > 0.0 && count < 1000.0)
          {
          ret = 0;
          }
--- 40,46 ----
        {
        count = atof(++colon);

!       if (count > 0.0 && count < PBS_MAXTOKEN)
          {
          ret = 0;
          }
diff -cr torque-2.3.6-p0/src/server/req_tokens.c
torque-2.3.6-p1/src/server/req_tokens.c
*** torque-2.3.6-p0/src/server/req_tokens.c    2008-09-08
11:43:06.000000000 -0400
--- torque-2.3.6-p1/src/server/req_tokens.c    2009-03-21
17:46:02.000000000 -0400
***************
*** 63,69 ****

      count = atof(entry);

!     if (count <= 0 || count > 1000)
        {
        err = PBSE_IVALREQ;
        }
--- 63,69 ----

      count = atof(entry);

!     if (count <= 0 || count > PBS_MAXTOKEN)
        {
        err = PBSE_IVALREQ;
        }
diff -cr torque-2.3.6-p0/src/server/resc_def_all.c
torque-2.3.6-p1/src/server/resc_def_all.c
*** torque-2.3.6-p0/src/server/resc_def_all.c    2008-12-23
15:43:08.000000000 -0500
--- torque-2.3.6-p1/src/server/resc_def_all.c    2009-03-21
17:46:02.000000000 -0400
***************
*** 1044,1050 ****
        colon++;
        count = atof(colon);

!       if ((count <= 0) || (count > 1000))
          {
          ret = PBSE_BADATVAL;
          }
--- 1044,1050 ----
        colon++;
        count = atof(colon);

!       if ((count <= 0) || (count > PBS_MAXTOKEN))
          {
          ret = PBSE_BADATVAL;
          }

<----------------------- PATCH 2 ---------------------------->
diff -cr torque-2.3.6-p1/src/scheduler.cc/samples/fifo/check.c
torque-2.3.6-p3/src/scheduler.cc/samples/fifo/check.c
*** torque-2.3.6-p1/src/scheduler.cc/samples/fifo/check.c	2009-03-21
18:16:22.000000000 -0400
--- torque-2.3.6-p3/src/scheduler.cc/samples/fifo/check.c	2009-03-22
15:49:52.000000000 -0400
***************
*** 79,84 ****
--- 79,85 ----

  #include <stdio.h>
  #include <stdlib.h>
+ #include <time.h>
  #include "pbs_ifl.h"
  #include "log.h"
  #include <string.h>
***************
*** 92,97 ****
--- 93,99 ----
  #include "globals.h"
  #include "dedtime.h"
  #include "token_acct.h"
+ #include "pbs_error.h"

  /* Internal functions */
  int check_server_max_run(server_info *sinfo);
***************
*** 773,778 ****
--- 775,804 ----
    }

  /*
+  * is_dynamic_token - Determine if token is in the dynamic_token list
+  *
+  *   identifier - token identifier
+  *
+  * Returns the index or -1 on no match
+  */
+ int is_dynamic_token(char * identifier)
+   {
+   int slot = 0;
+
+   /* scanning stops at the first NULL identifier or at the end of the table */
+   while ((slot < MAX_DYNTOKEN_SIZE) &&
+          (conf.dynamic_tokens[slot].identifier != NULL))
+     {
+     if (!strcmp(conf.dynamic_tokens[slot].identifier, identifier))
+       return slot;
+
+     slot++;
+     }
+
+   return -1;  /* no configured dynamic token carries this identifier */
+   }
+
+ /*
   *
   *    check_token_utiliztion
   *
***************
*** 802,807 ****
--- 828,834 ----
    char token_log_message[TOKEN_ACCT_MAX_RCD];

    int ret = SUCCESS;
+   int dyn_token = 0;

    tokens_req = find_resource_req(jinfo -> resreq, ATTR_tokens);

***************
*** 860,869 ****
          }
        }


!     /* Finally check if the requested plus the used exceeds the allowed */

!     if (count_used + count_requested > max_count)
        {
        ret = SERVER_TOKEN_UTILIZATION;
        /* Have to call these functions just for cleanup */
--- 887,939 ----
          }
        }

+     /* Dynamic token check added here to override previous token check. */
+     if ((i = is_dynamic_token(job_tokens->identifier)) >= 0)
+       {
+       FILE *script;
+       char line[256];
+       char *c;

!       dyn_token = 1;  /* dynamic token flag */
!
!       /* only query the script once the configured delay since the last use has passed */
!       if (time(NULL) - conf.dynamic_tokens[i].time >= conf.dynamic_token_delay)
!         {
!         script = popen(conf.dynamic_tokens[i].script,"r");
!
!         /* any script failure leaves the available count at 0 */
!         max_count = 0.0;
!         if (NULL != script)
!           {
!           c = fgets(line,255,script);
!           if (NULL != c)
!             {
!             max_count = atof(line);
!             }
!           else
!             {
!             snprintf(line,255,"%s returned nothing or does not exist",conf.dynamic_tokens[i].script);
!             log_err(PBSE_SYSTEM,"check_token_utilization",line);
!             }
!           /* close only a pipe that popen() actually opened; never pclose(NULL) */
!           pclose(script);
!           }
!         else
!           {
!           log_err(PBSE_SYSTEM,"check_token_utilization","popen() error");
!           }
!         }
!       else
!         {
!         /* delay time not sufficient, do not block scheduler */
!         snprintf(line,255,"token delay triggered for %s",job_tokens->identifier);
!         log_err(PBSE_NONE,"check_token_utilization",line);
!         max_count = 0.0;
!         }
!       }
!
!     /* Finally check if the requested plus the used exceeds the allowed */
!     if ((!dyn_token && (count_used + count_requested > max_count)) || (dyn_token && (count_requested > max_count)))
        {
        ret = SERVER_TOKEN_UTILIZATION;
        /* Have to call these functions just for cleanup */
***************
*** 872,877 ****
--- 942,949 ----
        }
      else
        {
+       /* track last successful use of a dynamic token */
+       if (dyn_token) conf.dynamic_tokens[i].time = time(NULL);
        /* Write accounting record for the event of allocating tokens
to this job */
        token_account_record(TOKEN_ACCT_RUN, jinfo->name, tokens_req->res_str);
        /* Write accounting record of what is currently being used */
diff -cr torque-2.3.6-p1/src/scheduler.cc/samples/fifo/config.h
torque-2.3.6-p3/src/scheduler.cc/samples/fifo/config.h
*** torque-2.3.6-p1/src/scheduler.cc/samples/fifo/config.h	2009-03-21
18:16:22.000000000 -0400
--- torque-2.3.6-p3/src/scheduler.cc/samples/fifo/config.h	2009-03-22
14:55:18.000000000 -0400
***************
*** 119,124 ****
--- 119,126 ----
  #define PARSE_HELP_STARVING_JOBS "help_starving_jobs"
  #define PARSE_MAX_STARVE "max_starve"
  #define PARSE_SORT_QUEUES "sort_queues"
+ #define PARSE_TOKEN "tokens"
+ #define PARSE_TOKEN_DELAY "dynamic_token_delay"

  /* max sizes */
  #define MAX_HOLIDAY_SIZE 50
***************
*** 127,133 ****
  #define MAX_LOG_SIZE 100
  #define MAX_RES_NAME_SIZE 256
  #define MAX_RES_RET_SIZE 256
!

  /* messages -
   *  INFO - messages printed via info_msg
--- 129,135 ----
  #define MAX_LOG_SIZE 100
  #define MAX_RES_NAME_SIZE 256
  #define MAX_RES_RET_SIZE 256
! #define MAX_DYNTOKEN_SIZE 25

  /* messages -
   *  INFO - messages printed via info_msg
diff -cr torque-2.3.6-p1/src/scheduler.cc/samples/fifo/data_types.h
torque-2.3.6-p3/src/scheduler.cc/samples/fifo/data_types.h
*** torque-2.3.6-p1/src/scheduler.cc/samples/fifo/data_types.h	2009-03-21
18:16:22.000000000 -0400
--- torque-2.3.6-p3/src/scheduler.cc/samples/fifo/data_types.h	2009-03-22
15:00:21.000000000 -0400
***************
*** 109,114 ****
--- 109,116 ----

  struct token;

+ struct dynamic_token;
+
  typedef struct state_count state_count;

  typedef struct server_info server_info;
***************
*** 131,136 ****
--- 133,140 ----

  typedef struct token token;

+ typedef struct dynamic_token dynamic_token;
+
  typedef RESOURCE_TYPE sch_resource_t;
  /* since resource values and usage values are linked */
  typedef sch_resource_t usage_t;
***************
*** 141,146 ****
--- 145,156 ----
    float count;                  /* The number of tokens available of
type identifier */
    };

+ struct dynamic_token
+   {
+   char* identifier;             /* Token identifier; compared with the identifier of the job's token */
+   char* script;                 /* Command run via popen(); its first output line is the available count */
+   time_t time;			/* Last time a job requesting this token passed the token check */
+   };

  struct state_count
    {
***************
*** 480,485 ****
--- 490,499 ----
    int log_filter;   /* what events to filter out */
    char ded_prefix[PBS_MAXQUEUENAME +1]; /* prefix to dedicated queues */
    time_t max_starve;   /* starving threshold */
+
+   struct dynamic_token dynamic_tokens[MAX_DYNTOKEN_SIZE]; /* dynamic tokens */
+
+   time_t dynamic_token_delay; /* job start delay for dynamic tokens */
    };

  /* for description of these bits, check the PBS admin guide or
scheduler IDS */
diff -cr torque-2.3.6-p1/src/scheduler.cc/samples/fifo/parse.c
torque-2.3.6-p3/src/scheduler.cc/samples/fifo/parse.c
*** torque-2.3.6-p1/src/scheduler.cc/samples/fifo/parse.c	2009-03-21
18:16:22.000000000 -0400
--- torque-2.3.6-p3/src/scheduler.cc/samples/fifo/parse.c	2009-03-22
15:48:48.000000000 -0400
***************
*** 92,98 ****
  #include "prime.h"
  #include "node_info.h"

-
  /*
   *
   * parse_config - parse the config file and set a struct config
--- 92,97 ----
***************
*** 121,126 ****
--- 120,126 ----
    int linenum = 0;  /* the current line number in the file */
    int pkey_num = 1;  /* number of prime time keys for multisort */
    int npkey_num = 1;  /* number of nonprime time keys for multisort */
+   int tok_count = 0;  /* number of dynamic tokens */
    int i;

    if ((fp = fopen(fname, "r")) == NULL)
***************
*** 317,322 ****
--- 317,339 ----
                }
              }
            }
+         else if (!strcmp(config_name, PARSE_TOKEN))
+           {
+           if (tok_count < MAX_DYNTOKEN_SIZE)
+             {
+             error = 0;
+
+             conf.dynamic_tokens[tok_count].identifier =
string_dup(config_value);
+             conf.dynamic_tokens[tok_count].script = string_dup(prime_value);
+
+             tok_count++;
+             }
+           else error = 1;
+           }
+         else if (!strcmp(config_name, PARSE_TOKEN_DELAY))
+           {
+           conf.dynamic_token_delay = num;
+           }
          }
        else
          error = 1;
***************
*** 341,346 ****
--- 358,365 ----
  int
  init_config(void)
    {
+   int i;
+
    memset(&conf, 0, sizeof(struct config));
    memset(&cstat, 0, sizeof(struct status));

***************
*** 369,374 ****
--- 388,401 ----
    else
      init_non_prime_time();

+   for (i=0;i<MAX_DYNTOKEN_SIZE;i++) {
+     conf.dynamic_tokens[i].identifier = NULL;
+     conf.dynamic_tokens[i].script = NULL;
+     conf.dynamic_tokens[i].time = 0;
+   }
+
+   conf.dynamic_token_delay = 0;
+
    return 1;
    }

***************
*** 383,390 ****
--- 410,425 ----
  int
  reinit_config(void)
    {
+   int i;
+
    free(conf.prime_sort);
    free(conf.non_prime_sort);
    free_group_tree(conf.group_root);
+
+   for (i=0;i<MAX_DYNTOKEN_SIZE;i++) {
+     free(conf.dynamic_tokens[i].identifier);
+     free(conf.dynamic_tokens[i].script);
+   }
+
    return init_config();
    }
diff -cr torque-2.3.6-p1/src/scheduler.cc/samples/fifo/sched_config
torque-2.3.6-p3/src/scheduler.cc/samples/fifo/sched_config
*** torque-2.3.6-p1/src/scheduler.cc/samples/fifo/sched_config	2009-03-21
18:16:22.000000000 -0400
--- torque-2.3.6-p3/src/scheduler.cc/samples/fifo/sched_config	2009-03-22
15:05:10.000000000 -0400
***************
*** 183,185 ****
--- 183,202 ----
  # sync_time - the amount of time between syncing the usage information to disk
  #	NO PRIME OPTION
  sync_time: 1:00:00
+
+ # tokens: defines a dynamic token, where the number of available tokens is
+ # obtained by running a script rather than a maximum count based on the server
+ # tokens attribute.  This can be used to check for available licenses from
+ # a network license server.
+ # The identifier must be the same string as the token name requested by the job
+ # and this token must be defined in the server tokens attribute.
+ # script must be the full path or preceded by ./ for scripts located in sched_priv
+ # tokens: identifier script
+ # tokens: matlab ./matlab.sh
+
+ # dynamic_token_delay: Used to prevent jobs requesting dynamic tokens from
+ # starting too quickly, before a previously run job actually obtains the
+ # necessary licenses.  This parameter is the number of seconds to wait
+ # before running a job requesting the same dynamic token as a previous job.
+ # dynamic_token_delay: 15
+

<--------------------------- PATCH 3 ---------------------------->
diff -cr torque-2.3.6-p3/doc/ers/pbs_resources_all.so
torque-2.3.6-p4/doc/ers/pbs_resources_all.so
*** torque-2.3.6-p3/doc/ers/pbs_resources_all.so	2009-03-21
18:09:48.000000000 -0400
--- torque-2.3.6-p4/doc/ers/pbs_resources_all.so	2009-03-21
23:00:31.000000000 -0400
***************
*** 159,164 ****
--- 159,171 ----
  This resource is provided for use by the site's scheduling policy.
  The allowable values and effect on job placement is site dependent.
  Units: string.
+ .IP tokens
+ Allows a user to specify the quantity of a consumable resource
+ required by the job.  The format of this request is the token
+ name and count separated by a colon, e.g. matlab:2.
+ This resource is provided for use by the site scheduling policy.
+ The allowable values and effect on job placement is site dependent.
+ Units: string:float.
  .LP
  .Sh EXAMPLES
  qsub \-l nodes=15,walltime=2:00:00 script
diff -cr torque-2.3.6-p3/doc/man7/pbs_server_attributes.7.in
torque-2.3.6-p4/doc/man7/pbs_server_attributes.7.in
*** torque-2.3.6-p3/doc/man7/pbs_server_attributes.7.in	2009-03-21
18:09:47.000000000 -0400
--- torque-2.3.6-p4/doc/man7/pbs_server_attributes.7.in	2009-03-21
22:40:01.000000000 -0400
***************
*** 514,519 ****
--- 514,529 ----
  .if !\n(Pb .ig Ig
  [internal type: list]
  .Ig
+ .Al tokens
+ A list of tokens which specify the maximum allowable usage count
+ for a consumable resource.  A typical use of this feature would be to
+ specify the number of available licenses for a particular software
+ package, e.g. matlab:10.  This attribute is advisory to the Scheduler;
+ it is not enforced by the server.
+ Format: string:float; default value: none.
+ .if !\n(Pb .ig Ig
+ [internal type: list]
+ .Ig
  .RE
  .LP
  .if !\n(Pb .ig Ig


More information about the torquedev mailing list