[torquedev] torque-4.1.0: qstat <queue> hangs

Martin Siegert siegert at sfu.ca
Mon Jul 9 11:25:44 MDT 2012


Hi David,

it happens with all queues: route or execution.
And it never responds; strace shows constant polling:

This is the trace of "qstat" (completes successfully):

===<strace -f qstat>==================================
12809 execve("/usr/local/torque/bin/qstat", ["qstat"], [/* 57 vars */]) = 0
...
12809 connect(3, {sa_family=AF_INET, sin_port=htons(15001), sin_addr=inet_addr("172.18.0.40")}, 16) = 0
12809 getuid()                          = 211168
12809 open("/etc/passwd", O_RDONLY)     = 5
12809 fcntl(5, F_GETFD)                 = 0
12809 fcntl(5, F_SETFD, FD_CLOEXEC)     = 0
12809 fstat(5, {st_mode=S_IFREG|0644, st_size=2877, ...}) = 0
12809 mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x2ad146e41000
12809 read(5, "root:x:0:0:root:/root:/bin/bash\n"..., 4096) = 2877
12809 close(5)                          = 0
12809 munmap(0x2ad146e41000, 4096)      = 0
12809 getsockname(3, {sa_family=AF_INET, sin_port=htons(55787), sin_addr=inet_addr("172.18.0.40")}, [16994614677449211920]) = 0
12809 rt_sigaction(SIGPIPE, {0x1, [PIPE], SA_RESTORER|SA_RESTART, 0x3a3e6302f0}, {SIG_DFL, [], 0}, 8) = 0
12809 socket(PF_INET, SOCK_STREAM, IPPROTO_IP) = 5
12809 setsockopt(5, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
12809 setsockopt(5, SOL_SOCKET, SO_LINGER, {onoff=0, linger=0}, 8) = 0
12809 setsockopt(5, SOL_SOCKET, SO_KEEPALIVE, [1], 4) = 0
12809 setsockopt(5, SOL_TCP, TCP_KEEPIDLE, [10], 4) = 0
12809 connect(5, {sa_family=AF_INET, sin_port=htons(15005), sin_addr=inet_addr("127.0.0.1")}, 16) = 0
12809 write(5, "3|dev|15001|1|7|siegert|55787|", 30) = 30
12809 ioctl(5, FIONREAD, [0])           = 0
12809 poll([{fd=5, events=POLLIN|POLLHUP}], 1, 100) = 1 ([{fd=5, revents=POLLIN}])
12809 recvfrom(5, "0|0||", 7, MSG_PEEK|MSG_DONTWAIT, NULL, NULL) = 5
12809 ioctl(5, FIONREAD, [5])           = 0
12809 read(5, "0", 1)                   = 1
12809 read(5, "|", 1)                   = 1
12809 ioctl(5, FIONREAD, [3])           = 0
12809 read(5, "0", 1)                   = 1
12809 read(5, "|", 1)                   = 1
12809 ioctl(5, FIONREAD, [1])           = 0
12809 read(5, "|", 1)                   = 1
12809 ioctl(5, FIONREAD, [0])           = 0
12809 close(5)                          = 0
12809 mmap(NULL, 266240, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x2ad14727b000
12809 mmap(NULL, 266240, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x2ad1472bc000
12809 write(3, "+2+22+19+7siegert+0+0+12+16summa"..., 43) = 43
12809 munmap(0x2ad14727b000, 266240)    = 0
12809 munmap(0x2ad1472bc000, 266240)    = 0
12809 brk(0x87d8000)                    = 0x87d8000
12809 brk(0x8818000)                    = 0x8818000
12809 ioctl(3, FIONREAD, [0])           = 0
12809 poll([{fd=3, events=POLLIN|POLLHUP}], 1, 100) = 1 ([{fd=3, revents=POLLIN}])
12809 recvfrom(3, "+2+2+0+", 7, MSG_PEEK|MSG_DONTWAIT, NULL, NULL) = 7
12809 ioctl(3, FIONREAD, [2376])        = 0
12809 read(3, "+2+2+0+0+6+2+2+87091.dev2+282+15"..., 2376) = 2376
12809 fstat(1, {st_mode=S_IFCHR|0620, st_rdev=makedev(136, 0), ...}) = 0
12809 mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x2ad14727b000
12809 write(1, "Job id                    Name  "..., 76) = 76
12809 write(1, "------------------------- ------"..., 76) = 76
12809 write(1, "7091.dev                   ising"..., 87) = 87
12809 write(1, "7092.dev                   ising"..., 87) = 87
12809 write(3, "+2+22+59+7siegert", 17) = 17
12809 rt_sigaction(SIGALRM, {0x2ad146afcc78, [], SA_RESTORER, 0x3a3f20eb70}, {SIG_DFL, [], 0}, 8) = 0
12809 alarm(5)                          = 0
12809 fcntl(3, F_GETFL)                 = 0x2 (flags O_RDWR)
12809 read(3, "", 65536)                = 0
12809 alarm(0)                          = 5
12809 rt_sigaction(SIGALRM, {SIG_DFL, [], SA_RESTORER, 0x3a3f20eb70}, NULL, 8) = 0
12809 brk(0x8798000)                    = 0x8798000
12809 close(3)                          = 0
12809 exit_group(0)                     = ?
===</strace -f qstat>==================================

and this is the trace of "qstat q1":

===<strace -f qstat q1>================================
12812 execve("/usr/local/torque/bin/qstat", ["qstat", "q1"], [/* 57 vars */]) = 0
...
12812 connect(3, {sa_family=AF_INET, sin_port=htons(15001), sin_addr=inet_addr("172.18.0.40")}, 16) = 0
12812 getuid()                          = 211168
12812 open("/etc/passwd", O_RDONLY)     = 5
12812 fcntl(5, F_GETFD)                 = 0
12812 fcntl(5, F_SETFD, FD_CLOEXEC)     = 0
12812 fstat(5, {st_mode=S_IFREG|0644, st_size=2877, ...}) = 0
12812 mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x2b49c1b1d000
12812 read(5, "root:x:0:0:root:/root:/bin/bash\n"..., 4096) = 2877
12812 close(5)                          = 0
12812 munmap(0x2b49c1b1d000, 4096)      = 0
12812 getsockname(3, {sa_family=AF_INET, sin_port=htons(55789), sin_addr=inet_addr("172.18.0.40")}, [17138729865525067792]) = 0
12812 rt_sigaction(SIGPIPE, {0x1, [PIPE], SA_RESTORER|SA_RESTART, 0x3a3e6302f0}, {SIG_DFL, [], 0}, 8) = 0
12812 socket(PF_INET, SOCK_STREAM, IPPROTO_IP) = 5
12812 setsockopt(5, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
12812 setsockopt(5, SOL_SOCKET, SO_LINGER, {onoff=0, linger=0}, 8) = 0
12812 setsockopt(5, SOL_SOCKET, SO_KEEPALIVE, [1], 4) = 0
12812 setsockopt(5, SOL_TCP, TCP_KEEPIDLE, [10], 4) = 0
12812 connect(5, {sa_family=AF_INET, sin_port=htons(15005), sin_addr=inet_addr("127.0.0.1")}, 16) = 0
12812 write(5, "3|dev|15001|1|7|siegert|55789|", 30) = 30
12812 ioctl(5, FIONREAD, [0])           = 0
12812 poll([{fd=5, events=POLLIN|POLLHUP}], 1, 100) = 1 ([{fd=5, revents=POLLIN}])
12812 recvfrom(5, "0|0||", 7, MSG_PEEK|MSG_DONTWAIT, NULL, NULL) = 5
12812 ioctl(5, FIONREAD, [5])           = 0
12812 read(5, "0", 1)                   = 1
12812 read(5, "|", 1)                   = 1
12812 ioctl(5, FIONREAD, [3])           = 0
12812 read(5, "0", 1)                   = 1
12812 read(5, "|", 1)                   = 1
12812 ioctl(5, FIONREAD, [1])           = 0
12812 read(5, "|", 1)                   = 1
12812 ioctl(5, FIONREAD, [0])           = 0
12812 close(5)                          = 0
12812 mmap(NULL, 266240, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x2b49c1f57000
12812 mmap(NULL, 266240, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x2b49c1f98000
12812 write(3, "+2+22+51+7siegert+12+152+11desti"..., 68) = 68
12812 munmap(0x2b49c1f57000, 266240)    = 0
12812 munmap(0x2b49c1f98000, 266240)    = 0
12812 brk(0xf872000)                    = 0xf872000
12812 brk(0xf8b2000)                    = 0xf8b2000
12812 ioctl(3, FIONREAD, [0])           = 0
12812 poll([{fd=3, events=POLLIN|POLLHUP}], 1, 100) = 0 (Timeout)
12812 poll([{fd=3, events=POLLIN|POLLHUP}], 1, 100) = 0 (Timeout)
12812 poll([{fd=3, events=POLLIN|POLLHUP}], 1, 100) = 0 (Timeout)
12812 poll([{fd=3, events=POLLIN|POLLHUP}], 1, 100) = 0 (Timeout)
12812 poll([{fd=3, events=POLLIN|POLLHUP}], 1, 100) = 0 (Timeout)
12812 poll([{fd=3, events=POLLIN|POLLHUP}], 1, 100) = 0 (Timeout)
12812 poll([{fd=3, events=POLLIN|POLLHUP}], 1, 100) = 0 (Timeout)
12812 poll([{fd=3, events=POLLIN|POLLHUP}], 1, 100) = 0 (Timeout)
12812 poll([{fd=3, events=POLLIN|POLLHUP}], 1, 100) = 0 (Timeout)
12812 poll([{fd=3, events=POLLIN|POLLHUP}], 1, 100) = 0 (Timeout)
12812 poll([{fd=3, events=POLLIN|POLLHUP}], 1, 100) = 0 (Timeout)
12812 poll([{fd=3, events=POLLIN|POLLHUP}], 1, 100) = 0 (Timeout)
12812 poll([{fd=3, events=POLLIN|POLLHUP}], 1, 100) = 0 (Timeout)
12812 poll([{fd=3, events=POLLIN|POLLHUP}], 1, 100) = 0 (Timeout)
12812 poll([{fd=3, events=POLLIN|POLLHUP}], 1, 100) = 0 (Timeout)
12812 poll([{fd=3, events=POLLIN|POLLHUP}], 1, 100) = 0 (Timeout)
12812 poll([{fd=3, events=POLLIN|POLLHUP}], 1, 100) = 0 (Timeout)
12812 poll([{fd=3, events=POLLIN|POLLHUP}], 1, 100) = 0 (Timeout)
12812 poll([{fd=3, events=POLLIN|POLLHUP}], 1, 100) = 0 (Timeout)
12812 poll([{fd=3, events=POLLIN|POLLHUP}], 1, 100) = -1 EINTR (Interrupted system call)
12812 --- SIGINT (Interrupt) @ 0 (0) ---
12812 +++ killed by SIGINT +++
===</strace -f qstat q1>===============================

When I switch back to torque-4.0.2, "qstat q1" works as expected.

Cheers,
Martin

On Mon, Jul 09, 2012 at 10:23:38AM -0600, David Beer wrote:
> 
>    Martin,
>    I haven't been able to reproduce this issue. Just to clarify - what
>    kind of queue is it and what state is it in?
>    Also, when you say hangs, do you mean it is slow to respond or do you
>    mean it never responds?
>    David
> 
>    On Fri, Jul 6, 2012 at 8:51 PM, Martin Siegert <[1]siegert at sfu.ca>
>    wrote:
> 
>      Hi,
>      I just installed torque-4.1.0 and have problems with the qstat
>      command:
>      qstat queuename
>      hangs (whereas just "qstat" works).
>      Cheers,
>      Martin
>      --
>      Martin Siegert
>      Simon Fraser University
>      Burnaby, British Columbia
>      Canada
>      _______________________________________________
>      torquedev mailing list
>      [2]torquedev at supercluster.org
>      [3]http://www.supercluster.org/mailman/listinfo/torquedev
> 
>    --
> 
>    David Beer | Software Engineer
> 
>    Adaptive Computing
Canada  V5A 1S6


More information about the torquedev mailing list