[torquedev] Patches to add network license integration to C fifo
scheduler
Joshua Weage
joshua.weage at valartechnologies.com
Tue Mar 24 13:23:41 MDT 2009
Please find attached three patches for the C fifo scheduler.
The first patch changes the limit for the value of a token to 1000000,
via a new #defined value.
The second patch adds the ability to obtain the number of available
licenses/tokens via a script or program executed by the scheduler. It
also adds a submission delay feature for jobs requesting "dynamic
tokens" to give jobs enough time to obtain licenses before running a
subsequent job.
The third patch adds documentation for the tokens attribute/resource
request to the appropriate man pages.
I have not tested these patches extensively (as in extensive
operational use), but basic testing was successful.
Regards,
Joshua Weage, PE
Valar Technologies
http://www.valartechnologies.com/
<------------------------ PATCH 1 ------------------------->
diff -cr torque-2.3.6-p0/src/include/pbs_ifl.h torque-2.3.6-p1/src/include/pbs_ifl.h
*** torque-2.3.6-p0/src/include/pbs_ifl.h 2008-09-22 18:01:19.000000000 -0400
--- torque-2.3.6-p1/src/include/pbs_ifl.h 2009-03-21 17:46:02.000000000 -0400
***************
*** 372,377 ****
--- 372,378 ----
#define PBS_INTERACTIVE 1 /* Support of Interactive jobs */
#define PBS_TERM_BUF_SZ 80 /* Interactive term buffer size */
#define PBS_TERM_CCA 6 /* Interactive term cntl char array */
+ #define PBS_MAXTOKEN 1000000 /* maximum token value */
#define PBS_QS_VERSION 0x00020300 /* magic number used to determine
version of pbs job quick save struct */
/* the magic number is split into 4 8-bit chunks. the first 8 bits are
diff -cr torque-2.3.6-p0/src/lib/Libattr/attr_fn_tokens.c torque-2.3.6-p1/src/lib/Libattr/attr_fn_tokens.c
*** torque-2.3.6-p0/src/lib/Libattr/attr_fn_tokens.c 2008-09-08 11:43:08.000000000 -0400
--- torque-2.3.6-p1/src/lib/Libattr/attr_fn_tokens.c 2009-03-21 17:47:14.000000000 -0400
***************
*** 40,46 ****
{
count = atof(++colon);
! if (count > 0.0 && count < 1000.0)
{
ret = 0;
}
--- 40,46 ----
{
count = atof(++colon);
! if (count > 0.0 && count < PBS_MAXTOKEN)
{
ret = 0;
}
diff -cr torque-2.3.6-p0/src/server/req_tokens.c torque-2.3.6-p1/src/server/req_tokens.c
*** torque-2.3.6-p0/src/server/req_tokens.c 2008-09-08 11:43:06.000000000 -0400
--- torque-2.3.6-p1/src/server/req_tokens.c 2009-03-21 17:46:02.000000000 -0400
***************
*** 63,69 ****
count = atof(entry);
! if (count <= 0 || count > 1000)
{
err = PBSE_IVALREQ;
}
--- 63,69 ----
count = atof(entry);
! if (count <= 0 || count > PBS_MAXTOKEN)
{
err = PBSE_IVALREQ;
}
diff -cr torque-2.3.6-p0/src/server/resc_def_all.c torque-2.3.6-p1/src/server/resc_def_all.c
*** torque-2.3.6-p0/src/server/resc_def_all.c 2008-12-23 15:43:08.000000000 -0500
--- torque-2.3.6-p1/src/server/resc_def_all.c 2009-03-21 17:46:02.000000000 -0400
***************
*** 1044,1050 ****
colon++;
count = atof(colon);
! if ((count <= 0) || (count > 1000))
{
ret = PBSE_BADATVAL;
}
--- 1044,1050 ----
colon++;
count = atof(colon);
! if ((count <= 0) || (count > PBS_MAXTOKEN))
{
ret = PBSE_BADATVAL;
}
<----------------------- PATCH 2 ---------------------------->
diff -cr torque-2.3.6-p1/src/scheduler.cc/samples/fifo/check.c torque-2.3.6-p3/src/scheduler.cc/samples/fifo/check.c
*** torque-2.3.6-p1/src/scheduler.cc/samples/fifo/check.c 2009-03-21 18:16:22.000000000 -0400
--- torque-2.3.6-p3/src/scheduler.cc/samples/fifo/check.c 2009-03-22 15:49:52.000000000 -0400
***************
*** 79,84 ****
--- 79,85 ----
#include <stdio.h>
#include <stdlib.h>
+ #include <time.h>
#include "pbs_ifl.h"
#include "log.h"
#include <string.h>
***************
*** 92,97 ****
--- 93,99 ----
#include "globals.h"
#include "dedtime.h"
#include "token_acct.h"
+ #include "pbs_error.h"
/* Internal functions */
int check_server_max_run(server_info *sinfo);
***************
*** 773,778 ****
--- 775,804 ----
}
/*
+ * is_dynamic_token - Determine if token is in the dynamic_token list
+ *
+ * identifier - token identifier
+ *
+ * Returns the index or -1 on no match
+ */
+ int is_dynamic_token(char * identifier)
+ {
+ int i;
+
+ for (i=0;i<MAX_DYNTOKEN_SIZE;i++)
+ {
+ if (conf.dynamic_tokens[i].identifier != NULL)
+ {
+ if (strcmp(conf.dynamic_tokens[i].identifier,identifier)==0) return i;
+ }
+ else
+ return -1;
+ }
+
+ return -1;
+ }
+
+ /*
*
* check_token_utiliztion
*
***************
*** 802,807 ****
--- 828,834 ----
char token_log_message[TOKEN_ACCT_MAX_RCD];
int ret = SUCCESS;
+ int dyn_token = 0;
tokens_req = find_resource_req(jinfo -> resreq, ATTR_tokens);
***************
*** 860,869 ****
}
}
! /* Finally check if the requested plus the used exceeds the allowed */
! if (count_used + count_requested > max_count)
{
ret = SERVER_TOKEN_UTILIZATION;
/* Have to call these functions just for cleanup */
--- 887,939 ----
}
}
+ /* Dynamic token check added here to override previous token check. */
+ if ((i = is_dynamic_token(job_tokens->identifier)) >= 0)
+ {
+ FILE *script;
+ char line[256];
+ char *c;
! dyn_token = 1; /* dynamic token flag */
!
! /* check for necessary delay time */
! if (time(NULL) - conf.dynamic_tokens[i].time >= conf.dynamic_token_delay)
! {
! script = popen(conf.dynamic_tokens[i].script,"r");
!
! /* if script fails, set available licenses to 0 */
! max_count = 0.0;
! if (NULL != script)
! {
! c = fgets(line,255,script);
! if (NULL != c)
! {
! max_count = atof(line);
! }
! else
! {
! snprintf(line,255,"%s returned nothing or does not exist",conf.dynamic_tokens[i].script);
! log_err(PBSE_SYSTEM,"check_token_utilization",line);
! }
! }
! else
! {
! log_err(PBSE_SYSTEM,"check_token_utilization","popen() error");
! }
! pclose(script);
! }
! else
! {
! /* delay time not sufficient, do not block scheduler */
! snprintf(line,255,"token delay triggered for %s",job_tokens->identifier);
! log_err(PBSE_NONE,"check_token_utilization",line);
! max_count = 0.0;
! }
! }
!
! /* Finally check if the requested plus the used exceeds the allowed */
! if ((!dyn_token && (count_used + count_requested > max_count)) || (dyn_token && (count_requested > max_count)))
{
ret = SERVER_TOKEN_UTILIZATION;
/* Have to call these functions just for cleanup */
***************
*** 872,877 ****
--- 942,949 ----
}
else
{
+ /* track last successful use of a dynamic token */
+ if (dyn_token) conf.dynamic_tokens[i].time = time(NULL);
/* Write accounting record for the event of allocating tokens to this job */
token_account_record(TOKEN_ACCT_RUN, jinfo->name, tokens_req->res_str);
/* Write accounting record of what is currently being used */
diff -cr torque-2.3.6-p1/src/scheduler.cc/samples/fifo/config.h torque-2.3.6-p3/src/scheduler.cc/samples/fifo/config.h
*** torque-2.3.6-p1/src/scheduler.cc/samples/fifo/config.h 2009-03-21 18:16:22.000000000 -0400
--- torque-2.3.6-p3/src/scheduler.cc/samples/fifo/config.h 2009-03-22 14:55:18.000000000 -0400
***************
*** 119,124 ****
--- 119,126 ----
#define PARSE_HELP_STARVING_JOBS "help_starving_jobs"
#define PARSE_MAX_STARVE "max_starve"
#define PARSE_SORT_QUEUES "sort_queues"
+ #define PARSE_TOKEN "tokens"
+ #define PARSE_TOKEN_DELAY "dynamic_token_delay"
/* max sizes */
#define MAX_HOLIDAY_SIZE 50
***************
*** 127,133 ****
#define MAX_LOG_SIZE 100
#define MAX_RES_NAME_SIZE 256
#define MAX_RES_RET_SIZE 256
!
/* messages -
* INFO - messages printed via info_msg
--- 129,135 ----
#define MAX_LOG_SIZE 100
#define MAX_RES_NAME_SIZE 256
#define MAX_RES_RET_SIZE 256
! #define MAX_DYNTOKEN_SIZE 25
/* messages -
* INFO - messages printed via info_msg
diff -cr torque-2.3.6-p1/src/scheduler.cc/samples/fifo/data_types.h torque-2.3.6-p3/src/scheduler.cc/samples/fifo/data_types.h
*** torque-2.3.6-p1/src/scheduler.cc/samples/fifo/data_types.h 2009-03-21 18:16:22.000000000 -0400
--- torque-2.3.6-p3/src/scheduler.cc/samples/fifo/data_types.h 2009-03-22 15:00:21.000000000 -0400
***************
*** 109,114 ****
--- 109,116 ----
struct token;
+ struct dynamic_token;
+
typedef struct state_count state_count;
typedef struct server_info server_info;
***************
*** 131,136 ****
--- 133,140 ----
typedef struct token token;
+ typedef struct dynamic_token dynamic_token;
+
typedef RESOURCE_TYPE sch_resource_t;
/* since resource values and usage values are linked */
typedef sch_resource_t usage_t;
***************
*** 141,146 ****
--- 145,156 ----
float count; /* The number of tokens available of
type identifier */
};
+ struct dynamic_token
+ {
+ char* identifier; /* Token identifier */
+ char* script; /* Script to obtain available tokens */
+ time_t time; /* Last time this token was used */
+ };
struct state_count
{
***************
*** 480,485 ****
--- 490,499 ----
int log_filter; /* what events to filter out */
char ded_prefix[PBS_MAXQUEUENAME +1]; /* prefix to dedicated queues */
time_t max_starve; /* starving threshold */
+
+ struct dynamic_token dynamic_tokens[MAX_DYNTOKEN_SIZE]; /* dynamic tokens */
+
+ time_t dynamic_token_delay; /* job start delay for dynamic tokens */
};
/* for description of these bits, check the PBS admin guide or
scheduler IDS */
diff -cr torque-2.3.6-p1/src/scheduler.cc/samples/fifo/parse.c torque-2.3.6-p3/src/scheduler.cc/samples/fifo/parse.c
*** torque-2.3.6-p1/src/scheduler.cc/samples/fifo/parse.c 2009-03-21 18:16:22.000000000 -0400
--- torque-2.3.6-p3/src/scheduler.cc/samples/fifo/parse.c 2009-03-22 15:48:48.000000000 -0400
***************
*** 92,98 ****
#include "prime.h"
#include "node_info.h"
-
/*
*
* parse_config - parse the config file and set a struct config
--- 92,97 ----
***************
*** 121,126 ****
--- 120,126 ----
int linenum = 0; /* the current line number in the file */
int pkey_num = 1; /* number of prime time keys for multisort */
int npkey_num = 1; /* number of nonprime time keys for multisort */
+ int tok_count = 0; /* number of dynamic tokens */
int i;
if ((fp = fopen(fname, "r")) == NULL)
***************
*** 317,322 ****
--- 317,339 ----
}
}
}
+ else if (!strcmp(config_name, PARSE_TOKEN))
+ {
+ if (tok_count < MAX_DYNTOKEN_SIZE)
+ {
+ error = 0;
+
+ conf.dynamic_tokens[tok_count].identifier = string_dup(config_value);
+ conf.dynamic_tokens[tok_count].script = string_dup(prime_value);
+
+ tok_count++;
+ }
+ else error = 1;
+ }
+ else if (!strcmp(config_name, PARSE_TOKEN_DELAY))
+ {
+ conf.dynamic_token_delay = num;
+ }
}
else
error = 1;
***************
*** 341,346 ****
--- 358,365 ----
int
init_config(void)
{
+ int i;
+
memset(&conf, 0, sizeof(struct config));
memset(&cstat, 0, sizeof(struct status));
***************
*** 369,374 ****
--- 388,401 ----
else
init_non_prime_time();
+ for (i=0;i<MAX_DYNTOKEN_SIZE;i++) {
+ conf.dynamic_tokens[i].identifier = NULL;
+ conf.dynamic_tokens[i].script = NULL;
+ conf.dynamic_tokens[i].time = 0;
+ }
+
+ conf.dynamic_token_delay = 0;
+
return 1;
}
***************
*** 383,390 ****
--- 410,425 ----
int
reinit_config(void)
{
+ int i;
+
free(conf.prime_sort);
free(conf.non_prime_sort);
free_group_tree(conf.group_root);
+
+ for (i=0;i<MAX_DYNTOKEN_SIZE;i++) {
+ free(conf.dynamic_tokens[i].identifier);
+ free(conf.dynamic_tokens[i].script);
+ }
+
return init_config();
}
diff -cr torque-2.3.6-p1/src/scheduler.cc/samples/fifo/sched_config torque-2.3.6-p3/src/scheduler.cc/samples/fifo/sched_config
*** torque-2.3.6-p1/src/scheduler.cc/samples/fifo/sched_config 2009-03-21 18:16:22.000000000 -0400
--- torque-2.3.6-p3/src/scheduler.cc/samples/fifo/sched_config 2009-03-22 15:05:10.000000000 -0400
***************
*** 183,185 ****
--- 183,202 ----
# sync_time - the amount of time between syncing the usage information to disk
# NO PRIME OPTION
sync_time: 1:00:00
+
+ # tokens: defines a dynamic token, where the number of available tokens is
+ # obtained by running a script rather than a maximum count based on the server
+ # tokens attribute. This can be used to check for available licenses from
+ # a network license server.
+ # The identifier must be the same string as the token name requested by the job
+ # and this token must be defined in the server tokens attribute.
+ # script must be the full path or preceded by ./ for scripts located in sched_priv
+ # tokens: identifier script
+ # tokens: matlab ./matlab.sh
+
+ # dynamic_token_delay: Used to prevent jobs requesting dynamic tokens from
+ # starting too quickly, before a previously run job actually obtains the
+ # necessary licenses. This parameter is the number of seconds to wait
+ # before running a job requesting the same dynamic token as a previous job.
+ # dynamic_token_delay: 15
+
<--------------------------- PATCH 3 ---------------------------->
diff -cr torque-2.3.6-p3/doc/ers/pbs_resources_all.so torque-2.3.6-p4/doc/ers/pbs_resources_all.so
*** torque-2.3.6-p3/doc/ers/pbs_resources_all.so 2009-03-21 18:09:48.000000000 -0400
--- torque-2.3.6-p4/doc/ers/pbs_resources_all.so 2009-03-21 23:00:31.000000000 -0400
***************
*** 159,164 ****
--- 159,171 ----
This resource is provided for use by the site's scheduling policy.
The allowable values and effect on job placement is site dependent.
Units: string.
+ .IP tokens
+ Allows a user to specify the quantity of a consumable resource
+ required by the job. The format of this request is the token
+ name and count separated by a colon, e.g. matlab:2.
+ This resource is provided for use by the site scheduling policy.
+ The allowable values and effect on job placement is site dependent.
+ Units: string:float.
.LP
.Sh EXAMPLES
qsub \-l nodes=15,walltime=2:00:00 script
diff -cr torque-2.3.6-p3/doc/man7/pbs_server_attributes.7.in torque-2.3.6-p4/doc/man7/pbs_server_attributes.7.in
*** torque-2.3.6-p3/doc/man7/pbs_server_attributes.7.in 2009-03-21 18:09:47.000000000 -0400
--- torque-2.3.6-p4/doc/man7/pbs_server_attributes.7.in 2009-03-21 22:40:01.000000000 -0400
***************
*** 514,519 ****
--- 514,529 ----
.if !\n(Pb .ig Ig
[internal type: list]
.Ig
+ .Al tokens
+ A list of tokens which specify the maximum allowable usage count
+ for a consumable resource. A typical use of this feature would be to
+ specify the number of available licenses for a particular software
+ package, e.g. matlab:10. This attribute is advisory to the Scheduler;
+ it is not enforced by the server.
+ Format: string:float; default value: none.
+ .if !\n(Pb .ig Ig
+ [internal type: list]
+ .Ig
.RE
.LP
.if !\n(Pb .ig Ig
More information about the torquedev
mailing list