[torqueusers] How to reject a job in svr_startjob()?

Mike Coyne Mike.Coyne at PACCAR.com
Tue Mar 16 14:30:49 MDT 2010


In getting the TM interface to work with gssapi/gsiapi I needed to
pre-authenticate to each node so a copy of the credentials will be

available to be used by tm_spawn on the sister nodes. In doing this I
would also like to kill/defer the job if something has gone wrong

and the credentials on the pbsserver have expired prior to starting the
job. How, and with what, do I reject/abort the job? req_reject()?

 

/*

 * svr_startjob - place a job into running state by shipping it to MOM

 *   called by req_runjob()

 */

 

int svr_startjob(

 

...

#if defined(BOEING) || defined(GSSAPI)

  /* Verify that all the nodes are alive via a TCP connect. */

 

  /* NOTE: Copy the nodes into a temp string because strtok() is
destructive. */

 

  size = strlen(pjob->ji_wattr[(int)JOB_ATR_exec_host].at_val.at_str);

 

  hostlist = malloc(size + 1);

 

  if (hostlist == NULL)

    {

    sprintf(log_buffer, "could not allocate temporary buffer (malloc
failed) -- skipping TCP connect check");

    log_err(errno, id, log_buffer);

    }

  else

    {

    /* Get the first host. */

 

    strncpy(hostlist,
pjob->ji_wattr[(int)JOB_ATR_exec_host].at_val.at_str, size);

    hostlist[size] = '\0';

    nodestr = strtok(hostlist, "+");

    }

  lasthost = NULL;

  while (nodestr != NULL)

  {

              /* truncate from trailing slash on (if one exists). */

 

              if ((cp = strchr(nodestr, '/')) != NULL)

              {

                          cp[0] = '\0';

              }

/* No need to do duplicate node checks  only helps if nodestr is

 * in host name order, noticed to be generally the case */

              if(lasthost == NULL){

                          lasthost=nodestr;

              }else

              {

                          if((strlen(lasthost)==
strlen(nodestr))&&(strcmp(lasthost,nodestr)==0))

                          {

                                      nodestr = strtok(NULL, "+");

                                    continue;

                          }else{

                                      lasthost=nodestr;

                          }

              }

 

              /* Lookup IP address of host. */

 

              if ((hp = gethostbyname(nodestr)) == NULL)

              {

                          sprintf(log_buffer, "could not contact %s
(gethostbyname failed, errno: %d (%s))",

                                                  nodestr,

                                                  errno,

                                                  pbs_strerror(errno));

 

                          if (FailHost != NULL)

                                      strncpy(FailHost, nodestr, 1024);

 

                          if (EMsg != NULL)

                                      strncpy(EMsg, log_buffer, 1024);

 

                          log_record(

                                                  PBSEVENT_JOB,

                                                  PBS_EVENTCLASS_JOB,

                                                  pjob->ji_qs.ji_jobid,

                                                  log_buffer);

 

                          /* Add this host to the reject destination
list for the job */

 

                          bp = (badplace *)malloc(sizeof(badplace));

 

                          if (bp == NULL)

                          {

                                      log_err(errno, id,
msg_err_malloc);

 

                                      return;

                          }

 

                          CLEAR_LINK(bp->bp_link);

 

                          strcpy(bp->bp_dest, nodestr);

 

                          append_link(&pjob->ji_rejectdest,
&bp->bp_link, bp);

 

                          /* FAILURE - cannot lookup master compute host
*/

 

                          return(PBSE_RESCUNAV);

              }

 

              /* open a socket. */

#ifdef BOEING

...

else /* GSSAPI */

              cntype = ToServerDIS;

              hostaddr=get_hostaddr(nodestr);

              broke=0;

              if ((con = svr_connect(hostaddr, port, 0, cntype)) ==
PBS_NET_RC_FATAL)

              {

                          sprintf(log_buffer, "send_job failed to %lx
port %d",

                                                  hostaddr,

                                                  port);

 

                          /*        log_err(pbs_errno, id, log_buffer);
*/

                          /*        exit(1); */

                          /*push bad node info*/

                          broke=1;

 

              }

 

              if (con == PBS_NET_RC_RETRY)

              {

                          pbs_errno = 0; /* should retry */

 

                          continue;

              }

 

              if  ( globus_module_activate(GLOBUS_GSI_GSSAPI_MODULE)
!=GLOBUS_SUCCESS ||

 
globus_module_activate(GLOBUS_GSI_GSS_ASSIST_MODULE) !=GLOBUS_SUCCESS ){

                          /* return -1; */

                          return(PBSE_RESCUNAV);

              }

 

              DIS_tcp_setup(connection[con].ch_socket);

              if (encode_DIS_ReqHdr(connection[con].ch_socket,

                                      PBS_BATCH_ForwardCreds,

                                      pbs_current_user) ||

 
encode_DIS_JobId(connection[con].ch_socket,pjob->ji_qs.ji_jobid) ||

 
encode_DIS_ReqExtend(connection[con].ch_socket,0)) {

                          /*        exit(1); */

                          return(PBSE_RESCUNAV);

              }

              DIS_tcp_wflush(connection[con].ch_socket);

 

              /* do client gss auth */

              hostname = get_hostnamefromaddr(hostaddr);

              if (hostname == NULL) {

                          sprintf(log_buffer,"send job failed: couldn't
get hostname for %lx\n",hostaddr);

                          log_err(0,"svr_movejob
get_hostname",log_buffer);

                          /*        exit(1); */

                          broke=1;

              }

 

              /* Forward job's credentials to the server.

                         in a child, so ok to block */

              ccname = ccname_for_job(pjob->ji_qs.ji_jobid,path_creds);

              if (setenv("X509_USER_PROXY",ccname,1)) {

                          perror("Couldn't put X509_USER_PROXY into
environment");

                          /*        exit(1); */

                          return(PBSE_RESCUNAV);

              }

              if(getenv("GLOBUS_LOCATION") == NULL){

                          if
(setenv("GLOBUS_LOCATION","/opt/vdt/globus",1)) {

                                      perror("svr_movejob_gsi:Couldn't
put default  GLOBUS_LOCATION into environment");

                                      return(PBSE_RESCUNAV);

                          }

              }

              sprintf(log_buffer,"svr_movejob::hostname for %lx ccname
%s\n",hostaddr,ccname);

              log_err(0,"svr_movejob ccname",log_buffer);

              retries = 0;

              if(!broke){

                          while ((i =
pbsgss_client_authenticate(hostname, connection[con].ch_socket,1,0)) !=
0) {

                                      fprintf(stderr,"send job failed:
Couldn't authenticate as user to %s:%d : %d\n",hostname,con,i);

                                      if (retries++ > 2) {

                                                  /*          exit(1);
*/

                                                  broke=1;

                                      }

                          }

              }

-------------- next part --------------
An HTML attachment was scrubbed...
URL: http://www.supercluster.org/pipermail/torqueusers/attachments/20100316/e127805d/attachment-0001.html 


More information about the torqueusers mailing list