[TORQUEUsers] pbs_mom problem(s)

Chris Samuel csamuel@vpac.org
Mon, 4 Aug 2003 15:19:15 +1000


--Boundary-00=_TzeL/SeMQqWgTnM
Content-Type: Text/Plain;
  charset="us-ascii"
Content-Transfer-Encoding: quoted-printable
Content-Description: clearsigned data
Content-Disposition: inline

=2D----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

Hi all,

We've had a problem for a while where pbs_mom's seemed to die at random.

On the sc2 release I hacked in some catches for things like SEGV and caught=
 a=20
couple of core dumps, a backtrace of which looks like:

(gdb) where
#0  0x4004a0a1 in kill () from /lib/libc.so.6
#1  0x40049e99 in raise () from /lib/libc.so.6
#2  0x4004b364 in abort () from /lib/libc.so.6
#3  0x0804c2ba in catch_abort () at mom_main.c:1075
#4  <signal handler called>
#5  0x08053085 in im_request (stream=3D1, version=3D1) at mom_comm.c:1808
#6  0x0804d3e7 in do_rpp (stream=3D1) at mom_main.c:1576
#7  0x0804d48f in rpp_request (fd=3D8) at mom_main.c:1606
#8  0x0806d084 in wait_request (waittime=3D10) at net_server.c:241
#9  0x0804d8ef in finish_loop (waittime=3D10) at mom_main.c:1757
#10 0x0804f1e6 in main (argc=3D3, argv=3D0xbffff394) at mom_main.c:2588
#11 0x400391c4 in __libc_start_main () from /lib/libc.so.6

Note that this is a slightly patched version of pbs_mom from sc2, i.e. I've=
=20
added a signal handler for SIGSEGV, SIGBUS, etc - hence the signal handler=
=20
getting called (logging a SEGV).

To give that a bit of context, the area of code it seems to happen in is he=
re=20
(line 1808 is marked with ---> ):

                case    IM_KILL_JOB:
                        /*
                        ** Sender is sending a responce that a job
                        ** which needs to die has been given the ax.
                        ** I'm mother superior.
                        **
                        ** auxiliary info (
                        **      cput    int;
                        **      mem     int;
                        ** )
                        */
                        if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_HERE) =3D=
=3D 0) {
                                log_err(-1, id,
                                        "got KILL_JOB OKAY and I'm not MS");
                                goto err;
                        }
                        DBPRT(("%s: KILL_JOB %s OKAY\n", id, jobid))

 --->                  pjob->ji_resources[nodeidx-1].nr_cput =3D
                                disrul(stream, &ret);
                        if (ret !=3D DIS_SUCCESS)
                                goto err;

                        DBPRT(("%s: %s FINAL from %d cpu %lu sec mem %lu=20
kb\n",
                                id, jobid, nodeidx,
                                pjob->ji_resources[nodeidx-1].nr_cput,
                                pjob->ji_resources[nodeidx-1].nr_mem))

This seems to match what we've gleaned from our logs, in that it seems to b=
e=20
associated with trying to kill a job off (possibly one that has already=20
exit()ed or crashed).

OK - now onto Valgrind.

I've patched pbs_mom and pbs_sched to do the same as pbs_server does when t=
he=20
PBSDEBUG variable is set, i.e. don't fork into the background. Running that=
=20
version under valgrind flags the following error when gettting a job:

do_rpp: got an internal task manager request
im_request: stream 1 version 1
connect from 172.20.3.2:1023
im_request: JOIN_JOB 13919.brecca-m.vpac.org node 2
job_nodes: 0: node002/0
job_nodes: 1: node019/0
job_nodes: 2: node023/0
job_nodes: 3: node004/0
job: 13919.brecca-m.vpac.org numnodes=3D4 numvnod=3D4
=3D=3D10302=3D=3D Reading syms from /lib/libnss_compat-2.2.5.so
=3D=3D10302=3D=3D    object doesn't have any debug info
=3D=3D10302=3D=3D Syscall param write(buf) contains uninitialised or unaddr=
essable=20
byte(
s)
=3D=3D10302=3D=3D    at 0x4030E754: __libc_write (in /lib/libc-2.2.5.so)
=3D=3D10302=3D=3D    by 0x8056ADF: save_struct (./../server/attr_recov.c:18=
6)
=3D=3D10302=3D=3D    by 0x8056D70: save_attr (./../server/attr_recov.c:309)
=3D=3D10302=3D=3D    by 0x8059023: job_save (./../server/job_recov.c:242)
=3D=3D10302=3D=3D    Address 0x8085346 is not stack'd, malloc'd or free'd
=3D=3D10302=3D=3D
=3D=3D10302=3D=3D Syscall param write(buf) contains uninitialised or unaddr=
essable=20
byte(
s)
=3D=3D10302=3D=3D    at 0x4030E754: __libc_write (in /lib/libc-2.2.5.so)
=3D=3D10302=3D=3D    by 0x8056BD2: save_flush (./../server/attr_recov.c:234)
=3D=3D10302=3D=3D    by 0x805903C: job_save (./../server/job_recov.c:245)
=3D=3D10302=3D=3D    by 0x80516CB: im_request (mom_comm.c:1242)
=3D=3D10302=3D=3D    Address 0x808540E is not stack'd, malloc'd or free'd
mom_do_poll: entered
do_rpp: got an internal task manager request
im_request: stream 1 version 1
connect from 172.20.3.2:1023
im_request: KILL_JOB 13919.brecca-m.vpac.org
kill_jobs: Killing 1092562312

Interestingly it doesn't log a problem when it does the kill job for some=20
reason, but as I wrote above I suspect it's conditional upon the job alread=
y=20
having died on the node when pbs_mom gets told to kill it.

I've attached a patch against the p3 release at:

	http://www.supercluster.org/downloads/TORQUE/

cheers!
Chris
=2D --=20
 Chris Samuel -- VPAC Systems & Network Admin
 Victorian Partnership for Advanced Computing
 Bldg 91, 110 Victoria Street, Carlton South,
 VIC 3053, Australia - http://www.vpac.org/

=2D----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.2.2 (GNU/Linux)

iD8DBQE/LezXO2KABBYQAh8RAoupAJ9Tsd8UjsOzqUET3b3yvCkdja3sYwCdH0QN
RvoVdibQZ0yzt5atAa57rIQ=3D
=3D/wUL
=2D----END PGP SIGNATURE-----

--Boundary-00=_TzeL/SeMQqWgTnM
Content-Type: text/x-diff;
  charset="us-ascii";
  name="mainstream.diff"
Content-Transfer-Encoding: 7bit
Content-Disposition: attachment; filename="mainstream.diff"

Index: src/resmom/mom_main.c
===================================================================
RCS file: /usr/users6/vpac/cvs_sys/SuperCluster/TORQUE/src/resmom/mom_main.c,v
retrieving revision 1.1.1.1
diff -u -r1.1.1.1 mom_main.c
--- src/resmom/mom_main.c	4 Aug 2003 01:51:43 -0000	1.1.1.1
+++ src/resmom/mom_main.c	4 Aug 2003 05:14:12 -0000
@@ -1054,6 +1054,25 @@
 	return ret_string;
 }
 
+/*
+ * Catch core dump signals - set core size so we can see what happened!
+ *
+ * Chris Samuel - VPAC
+ * csamuel@vpac.org - 29th July 2003
+ */
+static void
+catch_abort(sig)
+	int sig;
+{
+	struct rlimit rlimit;
+
+	log_err (sig,"mom_main","Caught fatal core signal");
+	rlimit.rlim_cur = RLIM_INFINITY;
+	rlimit.rlim_max = RLIM_INFINITY;
+	(void)setrlimit(RLIMIT_CORE, &rlimit);
+	(void)abort();
+}
+
 static void
 catch_hup(sig)
 int	sig;
@@ -2157,44 +2176,47 @@
 	/* go into the background and become own session/process group */
 
 
-#ifndef DEBUG
-	mom_lock(lockfds, F_UNLCK);	/* unlock so child can relock */
+	if (getenv("PBSDEBUG") == NULL)
+	  {
+		mom_lock(lockfds, F_UNLCK);	/* unlock so child can relock */
 
-	 if (fork() > 0)
-		return (0);	/* parent goes away */
-
-	if (setsid() == -1) {
-		log_err(errno, msg_daemonname, "setsid failed");
-		return (2);
-	}
-	mom_lock(lockfds, F_WRLCK);	/* lock out other MOMs */
+		 if (fork() > 0)
+			return (0);	/* parent goes away */
 	
-	(void)fclose(stdin);
-	(void)fclose(stdout);
-	(void)fclose(stderr);
-	dummyfile = fopen("/dev/null", "r");
-	assert( (dummyfile != 0) && (fileno(dummyfile) == 0) );
-	dummyfile = fopen("/dev/null", "w");
-	assert( (dummyfile != 0) && (fileno(dummyfile) == 1) );
-	dummyfile = fopen("/dev/null", "w");
-	assert( (dummyfile != 0) && (fileno(dummyfile) == 2) );
-
-#else	/* DEBUG */
+		if (setsid() == -1) {
+			log_err(errno, msg_daemonname, "setsid failed");
+			return (2);
+		}
+		mom_lock(lockfds, F_WRLCK);	/* lock out other MOMs */
+		
+		(void)fclose(stdin);
+		(void)fclose(stdout);
+		(void)fclose(stderr);
+		dummyfile = fopen("/dev/null", "r");
+		assert( (dummyfile != 0) && (fileno(dummyfile) == 0) );
+		dummyfile = fopen("/dev/null", "w");
+		assert( (dummyfile != 0) && (fileno(dummyfile) == 1) );
+		dummyfile = fopen("/dev/null", "w");
+		assert( (dummyfile != 0) && (fileno(dummyfile) == 2) );
+	
+	  }
+	else
+	{
 #if	defined(_CRAY)
 	
-	/* CRAY cannot restart checkpointed job if MOM has controlling tty */
+		/* CRAY cannot restart checkpointed job if MOM has controlling tty */
 
-	(void)sprintf(log_buffer, "/tmp/pbs_mom.%d", getpid());
-	printf("Debug output will be in %s\n", log_buffer);
-
-	freopen(log_buffer, "w", stdout);
-	freopen(log_buffer, "w", stderr);
-	(void)ioctl(0, TCCLRCTTY, 0);
-	(void)close(0);
+		(void)sprintf(log_buffer, "/tmp/pbs_mom.%d", getpid());
+		printf("Debug output will be in %s\n", log_buffer);
+	
+		freopen(log_buffer, "w", stdout);
+		freopen(log_buffer, "w", stderr);
+		(void)ioctl(0, TCCLRCTTY, 0);
+		(void)close(0);
 #endif	/* _CRAY */
-	(void)setvbuf(stdout, NULL, _IOLBF, 0);
-	(void)setvbuf(stderr, NULL, _IOLBF, 0);
-#endif /* DEBUG */
+		(void)setvbuf(stdout, NULL, _IOLBF, 0);
+		(void)setvbuf(stderr, NULL, _IOLBF, 0);
+	}
 
 	/* write MOM's pid into lockfile */
 	(void)ftruncate(lockfds, (off_t)0);
@@ -2241,6 +2263,22 @@
 #ifdef _CRAY
 	sigaction(WJSIGNAL, &act, NULL);
 #endif
+	/*
+	 * Catch these signals to ensure we core dump even if
+	 * our rlimit for core dumps is set to 0 initially.
+	 *
+	 * Chris Samuel - VPAC
+	 * csamuel@vpac.org - 29th July 2003
+	 */
+
+	act.sa_handler = catch_abort;   /* make sure we core dump on these */
+	sigaction(SIGSEGV, &act, NULL);
+	sigaction(SIGBUS, &act, NULL);
+	sigaction(SIGFPE, &act, NULL);
+	sigaction(SIGILL, &act, NULL);
+	sigaction(SIGTRAP, &act, NULL);
+	sigaction(SIGSYS, &act, NULL);
+
 	act.sa_handler = catch_hup;	/* do a restart on SIGHUP */
 	sigaction(SIGHUP, &act, NULL);
 
Index: src/scheduler.cc/pbs_sched.c
===================================================================
RCS file: /usr/users6/vpac/cvs_sys/SuperCluster/TORQUE/src/scheduler.cc/pbs_sched.c,v
retrieving revision 1.1.1.1
diff -u -r1.1.1.1 pbs_sched.c
--- src/scheduler.cc/pbs_sched.c	4 Aug 2003 01:51:45 -0000	1.1.1.1
+++ src/scheduler.cc/pbs_sched.c	4 Aug 2003 05:14:13 -0000
@@ -167,6 +167,27 @@
 	exit(1);
 }
 
+/*
+ *  * Catch core dump signals - set core size so we can see what happened!
+ *   *
+ *    * Chris Samuel - VPAC
+ *     * csamuel@vpac.org - 29th July 2003
+ *      */
+
+static void
+catch_abort(sig)
+	int sig;
+{
+	struct rlimit rlimit;
+
+	log_err (sig,"pbs_sched","Caught fatal core signal");
+	rlimit.rlim_cur = RLIM_INFINITY;
+	rlimit.rlim_max = RLIM_INFINITY;
+	(void)setrlimit(RLIMIT_CORE, &rlimit);
+	(void)abort();
+}
+
+
 static int
 server_disconnect(connect)
 int connect;
@@ -714,6 +735,21 @@
 	sigaction(SIGTERM, &act, NULL);
 
 	/*
+	 * Catch these signals to ensure we core dump even if
+	 * our rlimit for core dumps is set to 0 initially.
+	 *
+	 * Chris Samuel - VPAC
+	 * csamuel@vpac.org - 29th July 2003
+	 */
+	act.sa_handler = catch_abort;   /* make sure we core dump on these */
+	sigaction(SIGSEGV, &act, NULL);
+	sigaction(SIGBUS, &act, NULL);
+	sigaction(SIGFPE, &act, NULL);
+	sigaction(SIGILL, &act, NULL);
+	sigaction(SIGTRAP, &act, NULL);
+	sigaction(SIGSYS, &act, NULL);
+
+	/*
 	 *  Local initialization stuff
 	 */
 	if (schedinit(argc, argv)) {
@@ -723,27 +759,30 @@
 		exit(1);
 	}
 
-#ifndef	DEBUG
-	lock_out(lockfds, F_UNLCK);
-	if ((pid = fork()) == -1) {     /* error on fork */
-		perror("fork");
-		exit(1);
+	if (getenv("PBSDEBUG") == NULL)
+	{
+		lock_out(lockfds, F_UNLCK);
+		if ((pid = fork()) == -1) {     /* error on fork */
+			perror("fork");
+			exit(1);
+		}
+		else if (pid > 0)               /* parent exits */
+			exit(0);
+		if ((pid =setsid()) == -1) {
+			perror("setsid");
+			exit(1);
+		}
+		lock_out(lockfds, F_WRLCK);
+		freopen(dbfile, "a", stdout);
+		setvbuf(stdout, NULL, _IOLBF, 0);
+		dup2(fileno(stdout), fileno(stderr));
 	}
-	else if (pid > 0)               /* parent exits */
-		exit(0);
-	if ((pid =setsid()) == -1) {
-		perror("setsid");
-		exit(1);
+	else
+	{
+		setvbuf(stdout, NULL, _IOLBF, 0);
+		setvbuf(stderr, NULL, _IOLBF, 0);
+		pid = getpid();
 	}
-	lock_out(lockfds, F_WRLCK);
-	freopen(dbfile, "a", stdout);
-	setvbuf(stdout, NULL, _IOLBF, 0);
-	dup2(fileno(stdout), fileno(stderr));
-#else
-	setvbuf(stdout, NULL, _IOLBF, 0);
-	setvbuf(stderr, NULL, _IOLBF, 0);
-	pid = getpid();
-#endif
 	freopen("/dev/null", "r", stdin);
 
 	/* write schedulers pid into lockfile */
Index: src/server/pbsd_init.c
===================================================================
RCS file: /usr/users6/vpac/cvs_sys/SuperCluster/TORQUE/src/server/pbsd_init.c,v
retrieving revision 1.1.1.2
diff -u -r1.1.1.2 pbsd_init.c
--- src/server/pbsd_init.c	4 Aug 2003 01:52:42 -0000	1.1.1.2
+++ src/server/pbsd_init.c	4 Aug 2003 05:14:14 -0000
@@ -185,6 +185,7 @@
 static void  init_abt_job A_((job *));
 static char *build_path A_((char *parent, char *name, char *subdir));
 static void  catch_child A_((int));
+static void  catch_abort A_((int));
 static void  change_logs A_((int));
 static int   chk_save_file A_((char *filename));
 static void  need_y_response A_((int));
@@ -300,6 +301,21 @@
 	}
 #endif
 
+	/*
+	 * Catch these signals to ensure we core dump even if
+	 * our rlimit for core dumps is set to 0 initially.
+	 *
+	 * Chris Samuel - VPAC
+	 * csamuel@vpac.org - 29th July 2003
+	 */
+	act.sa_handler = catch_abort;   /* make sure we core dump on these */
+	sigaction(SIGSEGV, &act, NULL);
+	sigaction(SIGBUS, &act, NULL);
+	sigaction(SIGFPE, &act, NULL);
+	sigaction(SIGILL, &act, NULL);
+	sigaction(SIGTRAP, &act, NULL);
+	sigaction(SIGSYS, &act, NULL);
+
 	act.sa_handler = catch_child;
 	if (sigaction( SIGCHLD, &act, &oact) != 0) {
 		log_err(errno, id, "sigaction for CHLD");
@@ -867,6 +883,26 @@
 		log_err(-1, "pbsd_init", logbuf);
 		(void)job_abt(pjob, logbuf);
 	}
+}
+
+/*
+ * Catch core dump signals - set core size so we can see what happened!
+ *
+ * Chris Samuel - VPAC
+ * csamuel@vpac.org - 29th July 2003
+ */
+
+static void
+catch_abort(sig)
+	int sig;
+{
+	struct rlimit rlimit;
+
+	log_err (sig,"mom_main","Caught fatal core signal");
+	rlimit.rlim_cur = RLIM_INFINITY;
+	rlimit.rlim_max = RLIM_INFINITY;
+	(void)setrlimit(RLIMIT_CORE, &rlimit);
+	(void)abort();
 }
 
 /*

--Boundary-00=_TzeL/SeMQqWgTnM--