[torquedev] torque-4.1.0: qstat <queue> hangs
Martin Siegert
siegert at sfu.ca
Mon Jul 9 11:25:44 MDT 2012
Hi David,
it happens with all queues: route or execution.
And it never responds; strace shows constant polling:
This is the trace of "qstat" (completes successfully):
===<strace -f qstat>==================================
12809 execve("/usr/local/torque/bin/qstat", ["qstat"], [/* 57 vars */]) = 0
...
12809 connect(3, {sa_family=AF_INET, sin_port=htons(15001), sin_addr=inet_addr("172.18.0.40")}, 16) = 0
12809 getuid() = 211168
12809 open("/etc/passwd", O_RDONLY) = 5
12809 fcntl(5, F_GETFD) = 0
12809 fcntl(5, F_SETFD, FD_CLOEXEC) = 0
12809 fstat(5, {st_mode=S_IFREG|0644, st_size=2877, ...}) = 0
12809 mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x2ad146e41000
12809 read(5, "root:x:0:0:root:/root:/bin/bash\n"..., 4096) = 2877
12809 close(5) = 0
12809 munmap(0x2ad146e41000, 4096) = 0
12809 getsockname(3, {sa_family=AF_INET, sin_port=htons(55787), sin_addr=inet_addr("172.18.0.40")}, [16994614677449211920]) = 0
12809 rt_sigaction(SIGPIPE, {0x1, [PIPE], SA_RESTORER|SA_RESTART, 0x3a3e6302f0}, {SIG_DFL, [], 0}, 8) = 0
12809 socket(PF_INET, SOCK_STREAM, IPPROTO_IP) = 5
12809 setsockopt(5, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
12809 setsockopt(5, SOL_SOCKET, SO_LINGER, {onoff=0, linger=0}, 8) = 0
12809 setsockopt(5, SOL_SOCKET, SO_KEEPALIVE, [1], 4) = 0
12809 setsockopt(5, SOL_TCP, TCP_KEEPIDLE, [10], 4) = 0
12809 connect(5, {sa_family=AF_INET, sin_port=htons(15005), sin_addr=inet_addr("127.0.0.1")}, 16) = 0
12809 write(5, "3|dev|15001|1|7|siegert|55787|", 30) = 30
12809 ioctl(5, FIONREAD, [0]) = 0
12809 poll([{fd=5, events=POLLIN|POLLHUP}], 1, 100) = 1 ([{fd=5, revents=POLLIN}])
12809 recvfrom(5, "0|0||", 7, MSG_PEEK|MSG_DONTWAIT, NULL, NULL) = 5
12809 ioctl(5, FIONREAD, [5]) = 0
12809 read(5, "0", 1) = 1
12809 read(5, "|", 1) = 1
12809 ioctl(5, FIONREAD, [3]) = 0
12809 read(5, "0", 1) = 1
12809 read(5, "|", 1) = 1
12809 ioctl(5, FIONREAD, [1]) = 0
12809 read(5, "|", 1) = 1
12809 ioctl(5, FIONREAD, [0]) = 0
12809 close(5) = 0
12809 mmap(NULL, 266240, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x2ad14727b000
12809 mmap(NULL, 266240, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x2ad1472bc000
12809 write(3, "+2+22+19+7siegert+0+0+12+16summa"..., 43) = 43
12809 munmap(0x2ad14727b000, 266240) = 0
12809 munmap(0x2ad1472bc000, 266240) = 0
12809 brk(0x87d8000) = 0x87d8000
12809 brk(0x8818000) = 0x8818000
12809 ioctl(3, FIONREAD, [0]) = 0
12809 poll([{fd=3, events=POLLIN|POLLHUP}], 1, 100) = 1 ([{fd=3, revents=POLLIN}])
12809 recvfrom(3, "+2+2+0+", 7, MSG_PEEK|MSG_DONTWAIT, NULL, NULL) = 7
12809 ioctl(3, FIONREAD, [2376]) = 0
12809 read(3, "+2+2+0+0+6+2+2+87091.dev2+282+15"..., 2376) = 2376
12809 fstat(1, {st_mode=S_IFCHR|0620, st_rdev=makedev(136, 0), ...}) = 0
12809 mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x2ad14727b000
12809 write(1, "Job id Name "..., 76) = 76
12809 write(1, "------------------------- ------"..., 76) = 76
12809 write(1, "7091.dev ising"..., 87) = 87
12809 write(1, "7092.dev ising"..., 87) = 87
12809 write(3, "+2+22+59+7siegert", 17) = 17
12809 rt_sigaction(SIGALRM, {0x2ad146afcc78, [], SA_RESTORER, 0x3a3f20eb70}, {SIG_DFL, [], 0}, 8) = 0
12809 alarm(5) = 0
12809 fcntl(3, F_GETFL) = 0x2 (flags O_RDWR)
12809 read(3, "", 65536) = 0
12809 alarm(0) = 5
12809 rt_sigaction(SIGALRM, {SIG_DFL, [], SA_RESTORER, 0x3a3f20eb70}, NULL, 8) = 0
12809 brk(0x8798000) = 0x8798000
12809 close(3) = 0
12809 exit_group(0) = ?
===</strace -f qstat>==================================
and this is the trace of "qstat q1":
===<strace -f qstat q1>================================
12812 execve("/usr/local/torque/bin/qstat", ["qstat", "q1"], [/* 57 vars */]) = 0
...
12812 connect(3, {sa_family=AF_INET, sin_port=htons(15001), sin_addr=inet_addr("172.18.0.40")}, 16) = 0
12812 getuid() = 211168
12812 open("/etc/passwd", O_RDONLY) = 5
12812 fcntl(5, F_GETFD) = 0
12812 fcntl(5, F_SETFD, FD_CLOEXEC) = 0
12812 fstat(5, {st_mode=S_IFREG|0644, st_size=2877, ...}) = 0
12812 mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x2b49c1b1d000
12812 read(5, "root:x:0:0:root:/root:/bin/bash\n"..., 4096) = 2877
12812 close(5) = 0
12812 munmap(0x2b49c1b1d000, 4096) = 0
12812 getsockname(3, {sa_family=AF_INET, sin_port=htons(55789), sin_addr=inet_addr("172.18.0.40")}, [17138729865525067792]) = 0
12812 rt_sigaction(SIGPIPE, {0x1, [PIPE], SA_RESTORER|SA_RESTART, 0x3a3e6302f0}, {SIG_DFL, [], 0}, 8) = 0
12812 socket(PF_INET, SOCK_STREAM, IPPROTO_IP) = 5
12812 setsockopt(5, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
12812 setsockopt(5, SOL_SOCKET, SO_LINGER, {onoff=0, linger=0}, 8) = 0
12812 setsockopt(5, SOL_SOCKET, SO_KEEPALIVE, [1], 4) = 0
12812 setsockopt(5, SOL_TCP, TCP_KEEPIDLE, [10], 4) = 0
12812 connect(5, {sa_family=AF_INET, sin_port=htons(15005), sin_addr=inet_addr("127.0.0.1")}, 16) = 0
12812 write(5, "3|dev|15001|1|7|siegert|55789|", 30) = 30
12812 ioctl(5, FIONREAD, [0]) = 0
12812 poll([{fd=5, events=POLLIN|POLLHUP}], 1, 100) = 1 ([{fd=5, revents=POLLIN}])
12812 recvfrom(5, "0|0||", 7, MSG_PEEK|MSG_DONTWAIT, NULL, NULL) = 5
12812 ioctl(5, FIONREAD, [5]) = 0
12812 read(5, "0", 1) = 1
12812 read(5, "|", 1) = 1
12812 ioctl(5, FIONREAD, [3]) = 0
12812 read(5, "0", 1) = 1
12812 read(5, "|", 1) = 1
12812 ioctl(5, FIONREAD, [1]) = 0
12812 read(5, "|", 1) = 1
12812 ioctl(5, FIONREAD, [0]) = 0
12812 close(5) = 0
12812 mmap(NULL, 266240, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x2b49c1f57000
12812 mmap(NULL, 266240, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x2b49c1f98000
12812 write(3, "+2+22+51+7siegert+12+152+11desti"..., 68) = 68
12812 munmap(0x2b49c1f57000, 266240) = 0
12812 munmap(0x2b49c1f98000, 266240) = 0
12812 brk(0xf872000) = 0xf872000
12812 brk(0xf8b2000) = 0xf8b2000
12812 ioctl(3, FIONREAD, [0]) = 0
12812 poll([{fd=3, events=POLLIN|POLLHUP}], 1, 100) = 0 (Timeout)
12812 poll([{fd=3, events=POLLIN|POLLHUP}], 1, 100) = 0 (Timeout)
12812 poll([{fd=3, events=POLLIN|POLLHUP}], 1, 100) = 0 (Timeout)
12812 poll([{fd=3, events=POLLIN|POLLHUP}], 1, 100) = 0 (Timeout)
12812 poll([{fd=3, events=POLLIN|POLLHUP}], 1, 100) = 0 (Timeout)
12812 poll([{fd=3, events=POLLIN|POLLHUP}], 1, 100) = 0 (Timeout)
12812 poll([{fd=3, events=POLLIN|POLLHUP}], 1, 100) = 0 (Timeout)
12812 poll([{fd=3, events=POLLIN|POLLHUP}], 1, 100) = 0 (Timeout)
12812 poll([{fd=3, events=POLLIN|POLLHUP}], 1, 100) = 0 (Timeout)
12812 poll([{fd=3, events=POLLIN|POLLHUP}], 1, 100) = 0 (Timeout)
12812 poll([{fd=3, events=POLLIN|POLLHUP}], 1, 100) = 0 (Timeout)
12812 poll([{fd=3, events=POLLIN|POLLHUP}], 1, 100) = 0 (Timeout)
12812 poll([{fd=3, events=POLLIN|POLLHUP}], 1, 100) = 0 (Timeout)
12812 poll([{fd=3, events=POLLIN|POLLHUP}], 1, 100) = 0 (Timeout)
12812 poll([{fd=3, events=POLLIN|POLLHUP}], 1, 100) = 0 (Timeout)
12812 poll([{fd=3, events=POLLIN|POLLHUP}], 1, 100) = 0 (Timeout)
12812 poll([{fd=3, events=POLLIN|POLLHUP}], 1, 100) = 0 (Timeout)
12812 poll([{fd=3, events=POLLIN|POLLHUP}], 1, 100) = 0 (Timeout)
12812 poll([{fd=3, events=POLLIN|POLLHUP}], 1, 100) = 0 (Timeout)
12812 poll([{fd=3, events=POLLIN|POLLHUP}], 1, 100) = -1 EINTR (Interrupted system call)
12812 --- SIGINT (Interrupt) @ 0 (0) ---
12812 +++ killed by SIGINT +++
===</strace -f qstat q1>===============================
When I switch back to torque-4.0.2 "qstat q1" works as expected.
Cheers,
Martin
On Mon, Jul 09, 2012 at 10:23:38AM -0600, David Beer wrote:
>
> Martin,
> I haven't been able to reproduce this issue. Just to clarify - what
> kind of queue is it and what state is it in?
> Also, when you say hangs, do you mean it is slow to respond or do you
> mean it never responds?
> David
>
> On Fri, Jul 6, 2012 at 8:51 PM, Martin Siegert <[1]siegert at sfu.ca>
> wrote:
>
> Hi,
> I just installed torque-4.1.0 and have problems with the qstat
> command:
> qstat queuename
> hangs (whereas just "qstat" works).
> Cheers,
> Martin
> --
> Martin Siegert
> Simon Fraser University
> Burnaby, British Columbia
> Canada
> _______________________________________________
> torquedev mailing list
> [2]torquedev at supercluster.org
> [3]http://www.supercluster.org/mailman/listinfo/torquedev
>
> --
>
> David Beer | Software Engineer
>
> Adaptive Computing
Canada V5A 1S6
More information about the torquedev
mailing list