[torqueusers] How to reject a job in svr_startjob()?
Mike Coyne
Mike.Coyne at PACCAR.com
Tue Mar 16 14:30:49 MDT 2010
In getting the TM interface to work with gssapi/gsiapi I needed to
pre-authenticate to each node so a copy of the credentials will be
available to be used by a tm-spawn on the sister nodes. In doing this I
would also like to KILL / DEFER the job if something has gone wrong
and the credentials on the pbsserver have expired prior to starting the
job. How / what do I call to reject/abort the job? req_reject()?
/*
* svr_startjob - place a job into running state by shipping it to MOM
* called by req_runjob()
*/
int svr_startjob(
...
#if defined(BOEING) || defined(GSSAPI)
/* Verify that all the nodes are alive via a TCP connect. */
/* NOTE: Copy the nodes into a temp string because strtok() is
destructive. */
size = strlen(pjob->ji_wattr[(int)JOB_ATR_exec_host].at_val.at_str);
hostlist = malloc(size + 1);
if (hostlist == NULL)
{
sprintf(log_buffer, "could not allocate temporary buffer (malloc
failed) -- skipping TCP connect check");
log_err(errno, id, log_buffer);
}
else
{
/* Get the first host. */
strncpy(hostlist,
pjob->ji_wattr[(int)JOB_ATR_exec_host].at_val.at_str, size);
hostlist[size] = '\0';
nodestr = strtok(hostlist, "+");
}
lasthost = NULL;
while (nodestr != NULL)
{
/* truncate from trailing slash on (if one exists). */
if ((cp = strchr(nodestr, '/')) != NULL)
{
cp[0] = '\0';
}
/* No need to do duplicate node checks only helps if nodestr is
* in host name order, noticed to be generally the case */
if(lasthost == NULL){
lasthost=nodestr;
}else
{
if((strlen(lasthost)==
strlen(nodestr))&&(strcmp(lasthost,nodestr)==0))
{
nodestr = strtok(NULL, "+");
continue;
}else{
lasthost=nodestr;
}
}
/* Lookup IP address of host. */
if ((hp = gethostbyname(nodestr)) == NULL)
{
sprintf(log_buffer, "could not contact %s
(gethostbyname failed, errno: %d (%s))",
nodestr,
errno,
pbs_strerror(errno));
if (FailHost != NULL)
strncpy(FailHost, nodestr, 1024);
if (EMsg != NULL)
strncpy(EMsg, log_buffer, 1024);
log_record(
PBSEVENT_JOB,
PBS_EVENTCLASS_JOB,
pjob->ji_qs.ji_jobid,
log_buffer);
/* Add this host to the reject destination
list for the job */
bp = (badplace *)malloc(sizeof(badplace));
if (bp == NULL)
{
log_err(errno, id,
msg_err_malloc);
return;
}
CLEAR_LINK(bp->bp_link);
strcpy(bp->bp_dest, nodestr);
append_link(&pjob->ji_rejectdest,
&bp->bp_link, bp);
/* FAILURE - cannot lookup master compute host
*/
return(PBSE_RESCUNAV);
}
/* open a socket. */
#ifdef BOEING
...
else /* GSSAPI */
cntype = ToServerDIS;
hostaddr=get_hostaddr(nodestr);
broke=0;
if ((con = svr_connect(hostaddr, port, 0, cntype)) ==
PBS_NET_RC_FATAL)
{
sprintf(log_buffer, "send_job failed to %lx
port %d",
hostaddr,
port);
/* log_err(pbs_errno, id, log_buffer);
*/
/* exit(1); */
/*push bad node info*/
broke=1;
}
if (con == PBS_NET_RC_RETRY)
{
pbs_errno = 0; /* should retry */
continue;
}
if ( globus_module_activate(GLOBUS_GSI_GSSAPI_MODULE)
!=GLOBUS_SUCCESS ||
globus_module_activate(GLOBUS_GSI_GSS_ASSIST_MODULE) !=GLOBUS_SUCCESS ){
/* return -1; */
return(PBSE_RESCUNAV);
}
DIS_tcp_setup(connection[con].ch_socket);
if (encode_DIS_ReqHdr(connection[con].ch_socket,
PBS_BATCH_ForwardCreds,
pbs_current_user) ||
encode_DIS_JobId(connection[con].ch_socket,pjob->ji_qs.ji_jobid) ||
encode_DIS_ReqExtend(connection[con].ch_socket,0)) {
/* exit(1); */
return(PBSE_RESCUNAV);
}
DIS_tcp_wflush(connection[con].ch_socket);
/* do client gss auth */
hostname = get_hostnamefromaddr(hostaddr);
if (hostname == NULL) {
sprintf(log_buffer,"send job failed: couldn't
get hostname for %lx\n",hostaddr);
log_err(0,"svr_movejob
get_hostname",log_buffer);
/* exit(1); */
broke=1;
}
/* Forward job's credentials to the server.
in a child, so ok to block */
ccname = ccname_for_job(pjob->ji_qs.ji_jobid,path_creds);
if (setenv("X509_USER_PROXY",ccname,1)) {
perror("Couldn't put X509_USER_PROXY into
environment");
/* exit(1); */
return(PBSE_RESCUNAV);
}
if(getenv("GLOBUS_LOCATION") == NULL){
if
(setenv("GLOBUS_LOCATION","/opt/vdt/globus",1)) {
perror("svr_movejob_gsi:Couldn't
put default GLOBUS_LOCATION into environment");
return(PBSE_RESCUNAV);
}
}
sprintf(log_buffer,"svr_movejob::hostname for %lx ccname
%s\n",hostaddr,ccname);
log_err(0,"svr_movejob ccname",log_buffer);
retries = 0;
if(!broke){
while ((i =
pbsgss_client_authenticate(hostname, connection[con].ch_socket,1,0)) !=
0) {
fprintf(stderr,"send job failed:
Couldn't authenticate as user to %s:%d : %d\n",hostname,con,i);
if (retries++ > 2) {
/* exit(1);
*/
broke=1;
}
}
}
-------------- next part --------------
An HTML attachment was scrubbed...
URL: http://www.supercluster.org/pipermail/torqueusers/attachments/20100316/e127805d/attachment-0001.html
More information about the torqueusers
mailing list