[torqueusers] running jobs with stdout to terminal and stdin from
a script file
Chris Samuel
csamuel at vpac.org
Wed Aug 25 22:58:17 MDT 2004
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1
On Thu, 26 Aug 2004 12:28 am, Maestas, Christopher Daniel wrote:
> I know a couple people have made the request already (I'm one of them :-).
> I'll ask it again:
>
> Can we have the mpiexec patches into torque? Pretty please?
Hear hear. We've got users begging for it.
Here is a merge of that patch against Torque 1.1.0p0 done by Brett here at
VPAC (thanks Brett!).
WARNING: We have not tested it *at* *all*, if your cluster blows up, breaks
down or otherwise stops working don't call us. If it snaps you get to keep
both parts. Batteries not included. Do not divide by zero. Caveat emptor.
cheers,
Chris
- --
Christopher Samuel - (03)9925 4751 - VPAC Systems & Network Admin
Victorian Partnership for Advanced Computing http://www.vpac.org/
Bldg 91, 110 Victoria Street, Carlton South, VIC 3053, Australia
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.2.4 (GNU/Linux)
iD8DBQFBLW3pO2KABBYQAh8RAlisAJ9w+2OivdnBnw1ZRFG+yhdYUg6NtQCfafRF
G5qcr/hQ1euGkGncpMben/4=
=ueaf
-----END PGP SIGNATURE-----
-------------- next part --------------
diff -ruN --exclude='.nfs*' torque-1.1.0p0/README.mpiexec torque-1.1.0p0-bp/README.mpiexec
--- torque-1.1.0p0/README.mpiexec 1970-01-01 10:00:00.000000000 +1000
+++ torque-1.1.0p0-bp/README.mpiexec 2004-08-19 16:41:56.000000000 +1000
@@ -0,0 +1,24 @@
+
+Documentation of changes applied as part of the mpiexec patch.
+http://www.osc.edu/~pw/mpiexec/
+Last changed 04 Dec 2001.
+
+
+General bug fix:
+
+ src/lib/Libifl/tm.c
+
+ Do not call DIS_tcp_setup(-1) if connection to the local mom failed
+ after 5 tries.
+
+Functionality additions:
+
+ src/resmom/start_exec.c
+
+ Added extension to get stdin/out/err files from the environment rather
+ than always using /dev/null during tm_spawn.
+
+ src/cmds/pbs_demux.c
+
+ Do not require the first data on each new connection to be the PBS_JOBCOOKIE.
+
diff -ruN --exclude='.nfs*' torque-1.1.0p0/src/cmds/pbs_demux.c torque-1.1.0p0-bp/src/cmds/pbs_demux.c
--- torque-1.1.0p0/src/cmds/pbs_demux.c 2004-06-03 04:54:43.000000000 +1000
+++ torque-1.1.0p0-bp/src/cmds/pbs_demux.c 2004-08-19 16:41:56.000000000 +1000
@@ -106,10 +106,8 @@
struct routem {
enum rwhere r_where;
short r_nl;
- short r_first;
};
fd_set readset;
-char *cookie = 0;
void readit(int sock, struct routem *prm)
@@ -127,18 +125,6 @@
i = 0;
if ((amt = read(sock, buf, 256)) > 0) {
- if (prm->r_first == 1) {
-
- /* first data on connection must be the cookie to validate it */
-
- i = strlen(cookie);
- if (strncmp(buf, cookie, i) != 0) {
- (void)close(sock);
- prm->r_where = invalid;
- FD_CLR(sock, &readset);
- }
- prm->r_first = 0;
- }
for (pc = buf+i; pc < buf+amt; ++pc) {
#ifdef DEBUG
if (prm->r_nl) {
@@ -176,22 +162,12 @@
parent = getppid();
- cookie = getenv("PBS_JOBCOOKIE");
- if (cookie == 0) {
- fprintf(stderr, "%s: no PBS_JOBCOOKIE found in the env\n",
- argv[0]);
- exit(3);
- }
-#ifdef DEBUG
- printf("Cookie found in environment: %s\n", cookie);
-#endif
maxfd = sysconf(_SC_OPEN_MAX);
routem = (struct routem *)malloc(maxfd*sizeof(struct routem));
for (i=0; i<maxfd; ++i) {
(routem+i)->r_where = invalid;
(routem+i)->r_nl = 1;
- (routem+i)->r_first = 0;
}
(routem+main_sock_out)->r_where = new_out;
(routem+main_sock_err)->r_where = new_err;
@@ -244,7 +220,6 @@
newsock = accept(i, 0, 0);
(routem+newsock)->r_where = (routem+i)->r_where== new_out ? old_out : old_err;
FD_SET(newsock, &readset);
- (routem+newsock)->r_first = 1;
break;
case old_out:
case old_err:
diff -ruN --exclude='.nfs*' torque-1.1.0p0/src/lib/Libifl/tm.c torque-1.1.0p0-bp/src/lib/Libifl/tm.c
--- torque-1.1.0p0/src/lib/Libifl/tm.c 2004-06-03 04:54:44.000000000 +1000
+++ torque-1.1.0p0-bp/src/lib/Libifl/tm.c 2004-08-19 16:41:56.000000000 +1000
@@ -458,7 +458,8 @@
}
}
- DIS_tcp_setup(local_conn);
+ if (local_conn >= 0)
+ DIS_tcp_setup(local_conn);
return (local_conn);
}
diff -ruN --exclude='.nfs*' torque-1.1.0p0/src/resmom/start_exec.c torque-1.1.0p0-bp/src/resmom/start_exec.c
--- torque-1.1.0p0/src/resmom/start_exec.c 2004-06-03 04:54:44.000000000 +1000
+++ torque-1.1.0p0-bp/src/resmom/start_exec.c 2004-08-19 17:12:48.000000000 +1000
@@ -1858,6 +1858,42 @@
/*
+ * Scan the environment table (vtable.v_envp) for "envname=<port>" and
+ * connect to that port on the MS to carry one of the 3 stdio streams.
+ * Returns the connected fd (>= 0), -1 if no such variable is set, or -2
+ * if the value is not a valid TCP port number or the connect fails.
+ */
+static int
+search_env_and_open(const char *envname, u_long ipaddr)
+{
+    static char *id = "search_env_and_open";
+    size_t len = strlen(envname);
+    int i;
+    for (i = 0; i < vtable.v_used; i++) {
+        const char *cp = vtable.v_envp[i];
+        char *cq;
+        long port;
+        int fd;
+        /* skip other variables, including longer names sharing our prefix */
+        if (strncmp(cp, envname, len) != 0 || cp[len] != '=')
+            continue;
+        cp += len + 1;
+        port = strtol(cp, &cq, 10);
+        if (cq == cp || *cq != '\0' || port < 1 || port > 65535) {
+            log_err(errno, id, "improper value for MPIEXEC_STD*_PORT");
+            return -2;
+        }
+        if ((fd = open_demux(ipaddr, (int)port)) < 0) {
+            log_err(errno, id, "failed connect to mpiexec process on MS");
+            log_err(errno, id, vtable.v_envp[i]);
+            return -2;
+        }
+        return fd;
+    }
+    return -1;  /* not found */
+}
+
+/*
** Start a process for a spawn request. This will be different from
** a job's initial shell task in that the environment will be specified
** and no interactive code need be included.
@@ -1878,7 +1914,7 @@
int pipes[2], kid_read, kid_write, parent_read, parent_write;
int pts;
int i, j;
- int fd;
+ int fd0, fd1, fd2;
u_long ipaddr;
struct array_strings *vstrs;
struct startjob_rtn sjr;
@@ -2263,7 +2299,11 @@
** Set up stdin.
*/
- if ((fd = open("/dev/null", O_RDONLY)) == -1)
+ /* look through env for a port# on MS we should use for stdin */
+ if ((fd0 = search_env_and_open("MPIEXEC_STDIN_PORT", ipaddr)) == -2)
+ starter_return(kid_write, kid_read, JOB_EXEC_FAIL2, &sjr);
+ /* use /dev/null if no env var found */
+ if (fd0 < 0 && (fd0 = open("/dev/null", O_RDONLY)) == -1)
{
log_err(errno,"newtask","could not open dev/null");
@@ -2271,19 +2311,24 @@
}
else
{
- dup2(fd,0);
-
- if (fd > 0)
- close(fd);
+ (void)dup2(fd0, 0);
+ if (fd0 > 0)
+ (void)close(fd0);
}
+ /* look through env for a port# on MS we should use for stdout/err */
+ if ((fd1 = search_env_and_open("MPIEXEC_STDOUT_PORT", ipaddr)) == -2)
+ starter_return(kid_write, kid_read, JOB_EXEC_FAIL2, &sjr);
+ if ((fd2 = search_env_and_open("MPIEXEC_STDERR_PORT", ipaddr)) == -2)
+ starter_return(kid_write, kid_read, JOB_EXEC_FAIL2, &sjr);
+
if (pjob->ji_numnodes > 1)
{
/*
** Open sockets to demux proc for stdout and stderr.
*/
- if ((fd = open_demux(ipaddr,pjob->ji_stdout)) == -1)
+ if (fd1 < 0 && (fd1 = open_demux(ipaddr,pjob->ji_stdout)) == -1)
{
starter_return(kid_write,kid_read,JOB_EXEC_FAIL2,&sjr);
@@ -2292,12 +2337,12 @@
exit(1);
}
- dup2(fd,1);
+ dup2(fd1,1);
- if (fd > 1)
- close(fd);
+ if (fd1 > 1)
+ close(fd1);
- if ((fd = open_demux(ipaddr,pjob->ji_stderr)) == -1)
+ if (fd2 < 0 && (fd2 = open_demux(ipaddr,pjob->ji_stderr)) == -1)
{
starter_return(kid_write,kid_read,JOB_EXEC_FAIL2,&sjr);
@@ -2306,44 +2351,70 @@
exit(1);
}
- dup2(fd,2);
+ dup2(fd2,2);
- if (fd > 2)
- close(fd);
+ if (fd2 > 2)
+ close(fd2);
- write(1,pjob->ji_wattr[(int)JOB_ATR_Cookie].at_val.at_str,
- strlen(pjob->ji_wattr[(int)JOB_ATR_Cookie].at_val.at_str));
-
- write(2,pjob->ji_wattr[(int)JOB_ATR_Cookie].at_val.at_str,
- strlen(pjob->ji_wattr[(int)JOB_ATR_Cookie].at_val.at_str));
}
else if ((pjob->ji_wattr[(int)JOB_ATR_interactive].at_flags&ATR_VFLAG_SET) &&
(pjob->ji_wattr[(int)JOB_ATR_interactive].at_val.at_long > 0))
{
/* interactive job, single node, write to pty */
- if ((pts = open_pty(pjob)) < 0)
- {
- log_err(errno,id,"cannot open slave");
+ pts = -1;
+ if (fd1 < 0 || fd2 < 0) {
+ if ((pts = open_pty(pjob)) < 0)
+ {
+ log_err(errno,id,"cannot open slave");
- starter_return(kid_write,kid_read,JOB_EXEC_FAIL1,&sjr);
- }
+ starter_return(kid_write,kid_read,JOB_EXEC_FAIL1,&sjr);
+ }
+
+ if (fd1 < 0)
+ fd1 = pts;
+ if (fd2 < 0)
+ fd2 = pts;
- dup2(pts,1);
- dup2(pts,2);
+ }
+
+ (void)dup2(fd1, 1);
+ (void)dup2(fd2, 2);
+ if (fd1 != pts)
+ (void) close(fd1);
+ if (fd2 != pts)
+ (void) close(fd2);
}
else
{
/* normal batch job, single node, write straight to files */
- if (open_std_out_err(pjob) == -1)
- {
- log_err(errno,id,"cannot open stderr/stdout");
+ pts = -1;
+ if (fd1 < 0 || fd2 < 0) {
- starter_return(kid_write, kid_read,JOB_EXEC_FAIL1,&sjr);
- }
+ if (open_std_out_err(pjob) == -1)
+ {
+ log_err(errno,id,"cannot open stderr/stdout");
+
+ starter_return(kid_write, kid_read,JOB_EXEC_FAIL1,&sjr);
+ }
}
+ if (fd1 >= 0) {
+ (void) close(1);
+ (void) dup2(fd1, 1);
+ if (fd1 > 1)
+ (void) close(fd1);
+ }
+ if (fd2 >= 0) {
+ (void) close(2);
+ (void) dup2(fd2, 2);
+ if (fd2 > 2)
+ (void) close(fd2);
+ }
+
+ }
+
log_close(0);
starter_return(
More information about the torqueusers
mailing list