[TORQUEUsers] pbs_mom problem(s)
Chris Samuel
csamuel@vpac.org
Mon, 4 Aug 2003 15:19:15 +1000
--Boundary-00=_TzeL/SeMQqWgTnM
Content-Type: Text/Plain;
charset="us-ascii"
Content-Transfer-Encoding: quoted-printable
Content-Description: clearsigned data
Content-Disposition: inline
=2D----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1
Hi all,
We've had a problem for a while where pbs_mom's seemed to die at random.
On the sc2 release I hacked in some catches for things like SEGV and caught=
a=20
couple of core dumps, a backtrace of which looks like:
(gdb) where
#0 0x4004a0a1 in kill () from /lib/libc.so.6
#1 0x40049e99 in raise () from /lib/libc.so.6
#2 0x4004b364 in abort () from /lib/libc.so.6
#3 0x0804c2ba in catch_abort () at mom_main.c:1075
#4 <signal handler called>
#5 0x08053085 in im_request (stream=3D1, version=3D1) at mom_comm.c:1808
#6 0x0804d3e7 in do_rpp (stream=3D1) at mom_main.c:1576
#7 0x0804d48f in rpp_request (fd=3D8) at mom_main.c:1606
#8 0x0806d084 in wait_request (waittime=3D10) at net_server.c:241
#9 0x0804d8ef in finish_loop (waittime=3D10) at mom_main.c:1757
#10 0x0804f1e6 in main (argc=3D3, argv=3D0xbffff394) at mom_main.c:2588
#11 0x400391c4 in __libc_start_main () from /lib/libc.so.6
Note that this is a slightly patched version of pbs_mom from sc2, i.e. I've=
=20
added a signal handler for SIGSEGV, SIGBUS, etc - hence the signal handler=
=20
getting called (logging a SEGV).
To give that a bit of context, the area of code it seems to happen in is he=
re=20
(line 1808 is marked with ---> ):
case IM_KILL_JOB:
/*
** Sender is sending a responce that a job
** which needs to die has been given the ax.
** I'm mother superior.
**
** auxiliary info (
** cput int;
** mem int;
** )
*/
if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_HERE) =3D=
=3D 0) {
log_err(-1, id,
"got KILL_JOB OKAY and I'm not MS");
goto err;
}
DBPRT(("%s: KILL_JOB %s OKAY\n", id, jobid))
---> pjob->ji_resources[nodeidx-1].nr_cput =3D
disrul(stream, &ret);
if (ret !=3D DIS_SUCCESS)
goto err;
DBPRT(("%s: %s FINAL from %d cpu %lu sec mem %lu=20
kb\n",
id, jobid, nodeidx,
pjob->ji_resources[nodeidx-1].nr_cput,
pjob->ji_resources[nodeidx-1].nr_mem))
This seems to match what we've gleaned from our logs, in that it seems to b=
e=20
associated with trying to kill a job off (possibly one that has already=20
exit()ed or crashed).
OK - now onto Valgrind.
I've patched pbs_mom and pbs_sched to do the same as pbs_server does when t=
he=20
PBSDEBUG variable is set, i.e. don't fork into the background. Running that=
=20
version under valgrind flags the following error when gettting a job:
do_rpp: got an internal task manager request
im_request: stream 1 version 1
connect from 172.20.3.2:1023
im_request: JOIN_JOB 13919.brecca-m.vpac.org node 2
job_nodes: 0: node002/0
job_nodes: 1: node019/0
job_nodes: 2: node023/0
job_nodes: 3: node004/0
job: 13919.brecca-m.vpac.org numnodes=3D4 numvnod=3D4
=3D=3D10302=3D=3D Reading syms from /lib/libnss_compat-2.2.5.so
=3D=3D10302=3D=3D object doesn't have any debug info
=3D=3D10302=3D=3D Syscall param write(buf) contains uninitialised or unaddr=
essable=20
byte(
s)
=3D=3D10302=3D=3D at 0x4030E754: __libc_write (in /lib/libc-2.2.5.so)
=3D=3D10302=3D=3D by 0x8056ADF: save_struct (./../server/attr_recov.c:18=
6)
=3D=3D10302=3D=3D by 0x8056D70: save_attr (./../server/attr_recov.c:309)
=3D=3D10302=3D=3D by 0x8059023: job_save (./../server/job_recov.c:242)
=3D=3D10302=3D=3D Address 0x8085346 is not stack'd, malloc'd or free'd
=3D=3D10302=3D=3D
=3D=3D10302=3D=3D Syscall param write(buf) contains uninitialised or unaddr=
essable=20
byte(
s)
=3D=3D10302=3D=3D at 0x4030E754: __libc_write (in /lib/libc-2.2.5.so)
=3D=3D10302=3D=3D by 0x8056BD2: save_flush (./../server/attr_recov.c:234)
=3D=3D10302=3D=3D by 0x805903C: job_save (./../server/job_recov.c:245)
=3D=3D10302=3D=3D by 0x80516CB: im_request (mom_comm.c:1242)
=3D=3D10302=3D=3D Address 0x808540E is not stack'd, malloc'd or free'd
mom_do_poll: entered
do_rpp: got an internal task manager request
im_request: stream 1 version 1
connect from 172.20.3.2:1023
im_request: KILL_JOB 13919.brecca-m.vpac.org
kill_jobs: Killing 1092562312
Interestingly it doesn't log a problem when it does the kill job for some=20
reason, but as I wrote above I suspect it's conditional upon the job alread=
y=20
having died on the node when pbs_mom gets told to kill it.
I've attached a patch against the p3 release at:
http://www.supercluster.org/downloads/TORQUE/
cheers!
Chris
=2D --=20
Chris Samuel -- VPAC Systems & Network Admin
Victorian Partnership for Advanced Computing
Bldg 91, 110 Victoria Street, Carlton South,
VIC 3053, Australia - http://www.vpac.org/
=2D----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.2.2 (GNU/Linux)
iD8DBQE/LezXO2KABBYQAh8RAoupAJ9Tsd8UjsOzqUET3b3yvCkdja3sYwCdH0QN
RvoVdibQZ0yzt5atAa57rIQ=3D
=3D/wUL
=2D----END PGP SIGNATURE-----
--Boundary-00=_TzeL/SeMQqWgTnM
Content-Type: text/x-diff;
charset="us-ascii";
name="mainstream.diff"
Content-Transfer-Encoding: 7bit
Content-Disposition: attachment; filename="mainstream.diff"
Index: src/resmom/mom_main.c
===================================================================
RCS file: /usr/users6/vpac/cvs_sys/SuperCluster/TORQUE/src/resmom/mom_main.c,v
retrieving revision 1.1.1.1
diff -u -r1.1.1.1 mom_main.c
--- src/resmom/mom_main.c 4 Aug 2003 01:51:43 -0000 1.1.1.1
+++ src/resmom/mom_main.c 4 Aug 2003 05:14:12 -0000
@@ -1054,6 +1054,25 @@
return ret_string;
}
+/*
+ * Catch core dump signals - set core size so we can see what happened!
+ *
+ * Chris Samuel - VPAC
+ * csamuel@vpac.org - 29th July 2003
+ */
+static void
+catch_abort(sig)
+ int sig;
+{
+ struct rlimit rlimit;
+
+ log_err (sig,"mom_main","Caught fatal core signal");
+ rlimit.rlim_cur = RLIM_INFINITY;
+ rlimit.rlim_max = RLIM_INFINITY;
+ (void)setrlimit(RLIMIT_CORE, &rlimit);
+ (void)abort();
+}
+
static void
catch_hup(sig)
int sig;
@@ -2157,44 +2176,47 @@
/* go into the background and become own session/process group */
-#ifndef DEBUG
- mom_lock(lockfds, F_UNLCK); /* unlock so child can relock */
+ if (getenv("PBSDEBUG") == NULL)
+ {
+ mom_lock(lockfds, F_UNLCK); /* unlock so child can relock */
- if (fork() > 0)
- return (0); /* parent goes away */
-
- if (setsid() == -1) {
- log_err(errno, msg_daemonname, "setsid failed");
- return (2);
- }
- mom_lock(lockfds, F_WRLCK); /* lock out other MOMs */
+ if (fork() > 0)
+ return (0); /* parent goes away */
- (void)fclose(stdin);
- (void)fclose(stdout);
- (void)fclose(stderr);
- dummyfile = fopen("/dev/null", "r");
- assert( (dummyfile != 0) && (fileno(dummyfile) == 0) );
- dummyfile = fopen("/dev/null", "w");
- assert( (dummyfile != 0) && (fileno(dummyfile) == 1) );
- dummyfile = fopen("/dev/null", "w");
- assert( (dummyfile != 0) && (fileno(dummyfile) == 2) );
-
-#else /* DEBUG */
+ if (setsid() == -1) {
+ log_err(errno, msg_daemonname, "setsid failed");
+ return (2);
+ }
+ mom_lock(lockfds, F_WRLCK); /* lock out other MOMs */
+
+ (void)fclose(stdin);
+ (void)fclose(stdout);
+ (void)fclose(stderr);
+ dummyfile = fopen("/dev/null", "r");
+ assert( (dummyfile != 0) && (fileno(dummyfile) == 0) );
+ dummyfile = fopen("/dev/null", "w");
+ assert( (dummyfile != 0) && (fileno(dummyfile) == 1) );
+ dummyfile = fopen("/dev/null", "w");
+ assert( (dummyfile != 0) && (fileno(dummyfile) == 2) );
+
+ }
+ else
+ {
#if defined(_CRAY)
- /* CRAY cannot restart checkpointed job if MOM has controlling tty */
+ /* CRAY cannot restart checkpointed job if MOM has controlling tty */
- (void)sprintf(log_buffer, "/tmp/pbs_mom.%d", getpid());
- printf("Debug output will be in %s\n", log_buffer);
-
- freopen(log_buffer, "w", stdout);
- freopen(log_buffer, "w", stderr);
- (void)ioctl(0, TCCLRCTTY, 0);
- (void)close(0);
+ (void)sprintf(log_buffer, "/tmp/pbs_mom.%d", getpid());
+ printf("Debug output will be in %s\n", log_buffer);
+
+ freopen(log_buffer, "w", stdout);
+ freopen(log_buffer, "w", stderr);
+ (void)ioctl(0, TCCLRCTTY, 0);
+ (void)close(0);
#endif /* _CRAY */
- (void)setvbuf(stdout, NULL, _IOLBF, 0);
- (void)setvbuf(stderr, NULL, _IOLBF, 0);
-#endif /* DEBUG */
+ (void)setvbuf(stdout, NULL, _IOLBF, 0);
+ (void)setvbuf(stderr, NULL, _IOLBF, 0);
+ }
/* write MOM's pid into lockfile */
(void)ftruncate(lockfds, (off_t)0);
@@ -2241,6 +2263,22 @@
#ifdef _CRAY
sigaction(WJSIGNAL, &act, NULL);
#endif
+ /*
+ * Catch these signals to ensure we core dump even if
+ * our rlimit for core dumps is set to 0 initially.
+ *
+ * Chris Samuel - VPAC
+ * csamuel@vpac.org - 29th July 2003
+ */
+
+ act.sa_handler = catch_abort; /* make sure we core dump on these */
+ sigaction(SIGSEGV, &act, NULL);
+ sigaction(SIGBUS, &act, NULL);
+ sigaction(SIGFPE, &act, NULL);
+ sigaction(SIGILL, &act, NULL);
+ sigaction(SIGTRAP, &act, NULL);
+ sigaction(SIGSYS, &act, NULL);
+
act.sa_handler = catch_hup; /* do a restart on SIGHUP */
sigaction(SIGHUP, &act, NULL);
Index: src/scheduler.cc/pbs_sched.c
===================================================================
RCS file: /usr/users6/vpac/cvs_sys/SuperCluster/TORQUE/src/scheduler.cc/pbs_sched.c,v
retrieving revision 1.1.1.1
diff -u -r1.1.1.1 pbs_sched.c
--- src/scheduler.cc/pbs_sched.c 4 Aug 2003 01:51:45 -0000 1.1.1.1
+++ src/scheduler.cc/pbs_sched.c 4 Aug 2003 05:14:13 -0000
@@ -167,6 +167,27 @@
exit(1);
}
+/*
+ * * Catch core dump signals - set core size so we can see what happened!
+ * *
+ * * Chris Samuel - VPAC
+ * * csamuel@vpac.org - 29th July 2003
+ * */
+
+static void
+catch_abort(sig)
+ int sig;
+{
+ struct rlimit rlimit;
+
+ log_err (sig,"pbs_sched","Caught fatal core signal");
+ rlimit.rlim_cur = RLIM_INFINITY;
+ rlimit.rlim_max = RLIM_INFINITY;
+ (void)setrlimit(RLIMIT_CORE, &rlimit);
+ (void)abort();
+}
+
+
static int
server_disconnect(connect)
int connect;
@@ -714,6 +735,21 @@
sigaction(SIGTERM, &act, NULL);
/*
+ * Catch these signals to ensure we core dump even if
+ * our rlimit for core dumps is set to 0 initially.
+ *
+ * Chris Samuel - VPAC
+ * csamuel@vpac.org - 29th July 2003
+ */
+ act.sa_handler = catch_abort; /* make sure we core dump on these */
+ sigaction(SIGSEGV, &act, NULL);
+ sigaction(SIGBUS, &act, NULL);
+ sigaction(SIGFPE, &act, NULL);
+ sigaction(SIGILL, &act, NULL);
+ sigaction(SIGTRAP, &act, NULL);
+ sigaction(SIGSYS, &act, NULL);
+
+ /*
* Local initialization stuff
*/
if (schedinit(argc, argv)) {
@@ -723,27 +759,30 @@
exit(1);
}
-#ifndef DEBUG
- lock_out(lockfds, F_UNLCK);
- if ((pid = fork()) == -1) { /* error on fork */
- perror("fork");
- exit(1);
+ if (getenv("PBSDEBUG") == NULL)
+ {
+ lock_out(lockfds, F_UNLCK);
+ if ((pid = fork()) == -1) { /* error on fork */
+ perror("fork");
+ exit(1);
+ }
+ else if (pid > 0) /* parent exits */
+ exit(0);
+ if ((pid =setsid()) == -1) {
+ perror("setsid");
+ exit(1);
+ }
+ lock_out(lockfds, F_WRLCK);
+ freopen(dbfile, "a", stdout);
+ setvbuf(stdout, NULL, _IOLBF, 0);
+ dup2(fileno(stdout), fileno(stderr));
}
- else if (pid > 0) /* parent exits */
- exit(0);
- if ((pid =setsid()) == -1) {
- perror("setsid");
- exit(1);
+ else
+ {
+ setvbuf(stdout, NULL, _IOLBF, 0);
+ setvbuf(stderr, NULL, _IOLBF, 0);
+ pid = getpid();
}
- lock_out(lockfds, F_WRLCK);
- freopen(dbfile, "a", stdout);
- setvbuf(stdout, NULL, _IOLBF, 0);
- dup2(fileno(stdout), fileno(stderr));
-#else
- setvbuf(stdout, NULL, _IOLBF, 0);
- setvbuf(stderr, NULL, _IOLBF, 0);
- pid = getpid();
-#endif
freopen("/dev/null", "r", stdin);
/* write schedulers pid into lockfile */
Index: src/server/pbsd_init.c
===================================================================
RCS file: /usr/users6/vpac/cvs_sys/SuperCluster/TORQUE/src/server/pbsd_init.c,v
retrieving revision 1.1.1.2
diff -u -r1.1.1.2 pbsd_init.c
--- src/server/pbsd_init.c 4 Aug 2003 01:52:42 -0000 1.1.1.2
+++ src/server/pbsd_init.c 4 Aug 2003 05:14:14 -0000
@@ -185,6 +185,7 @@
static void init_abt_job A_((job *));
static char *build_path A_((char *parent, char *name, char *subdir));
static void catch_child A_((int));
+static void catch_abort A_((int));
static void change_logs A_((int));
static int chk_save_file A_((char *filename));
static void need_y_response A_((int));
@@ -300,6 +301,21 @@
}
#endif
+ /*
+ * Catch these signals to ensure we core dump even if
+ * our rlimit for core dumps is set to 0 initially.
+ *
+ * Chris Samuel - VPAC
+ * csamuel@vpac.org - 29th July 2003
+ */
+ act.sa_handler = catch_abort; /* make sure we core dump on these */
+ sigaction(SIGSEGV, &act, NULL);
+ sigaction(SIGBUS, &act, NULL);
+ sigaction(SIGFPE, &act, NULL);
+ sigaction(SIGILL, &act, NULL);
+ sigaction(SIGTRAP, &act, NULL);
+ sigaction(SIGSYS, &act, NULL);
+
act.sa_handler = catch_child;
if (sigaction( SIGCHLD, &act, &oact) != 0) {
log_err(errno, id, "sigaction for CHLD");
@@ -867,6 +883,26 @@
log_err(-1, "pbsd_init", logbuf);
(void)job_abt(pjob, logbuf);
}
+}
+
+/*
+ * Catch core dump signals - set core size so we can see what happened!
+ *
+ * Chris Samuel - VPAC
+ * csamuel@vpac.org - 29th July 2003
+ */
+
+static void
+catch_abort(sig)
+ int sig;
+{
+ struct rlimit rlimit;
+
+ log_err (sig,"mom_main","Caught fatal core signal");
+ rlimit.rlim_cur = RLIM_INFINITY;
+ rlimit.rlim_max = RLIM_INFINITY;
+ (void)setrlimit(RLIMIT_CORE, &rlimit);
+ (void)abort();
}
/*
--Boundary-00=_TzeL/SeMQqWgTnM--