[torqueusers] Slow response of torque when jobs are running

Luc Vereecken Luc.Vereecken at chem.kuleuven.be
Mon Dec 7 18:39:38 MST 2009


Hi Chris,

I attach the strace -T output of qstat. The qstat output itself looked 
normal, with job numbers, running times, etc., so nothing special there.
The strace reveals that it all goes awry when accessing the 
/tmp/.torque-unix socket. Most of the time is lost on a poll (line 78) 
and a read (line 90); all other timings look normal.

That reminds me that there is something like a no-unix-sockets option 
in configure, iirc.

Thanks for looking into this.
Luc


At 11:56 PM 12/7/2009, Chris Samuel wrote:

>----- "Luc Vereecken" <Luc.Vereecken at chem.kuleuven.be> wrote:
>
> > I have upgraded my queuing system to torque-2.4.3-snap.200912031436,
> > and as far as I can tell, everything is working correctly. However,
> > when there are jobs running, response from torque commands, such as
> > pbsnodes, qstat, qdel, etc becomes very slow at times, sometimes
> > taking 30 seconds up to 5 minutes to do anything, both on the head
> > node and the compute nodes.
>
>Any chance of an strace -T of qstat when it is in
>this situation please ?
>
>The -T option is important, that says how long it
>was spending in each system call.
>
>cheers,
>Chris
>--
>Christopher Samuel - (03) 9925 4751 - Systems Manager
>  The Victorian Partnership for Advanced Computing
>  P.O. Box 201, Carlton South, VIC 3053, Australia
>VPAC is a not-for-profit Registered Research Agency
>_______________________________________________
>torqueusers mailing list
>torqueusers at supercluster.org
>http://www.supercluster.org/mailman/listinfo/torqueusers
-------------- next part --------------
execve("/usr/local/bin/qstat", ["qstat"], [/* 64 vars */]) = 0 <0.000065>
brk(0)                                  = 0x609000 <0.000003>
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x2b068f8b1000 <0.000004>
uname({sys="Linux", node="gweyring", ...}) = 0 <0.000003>
access("/etc/ld.so.preload", R_OK)      = -1 ENOENT (No such file or directory) <0.000004>
open("/etc/ld.so.cache", O_RDONLY)      = 3 <0.000005>
fstat(3, {st_mode=S_IFREG|0644, st_size=121589, ...}) = 0 <0.000002>
mmap(NULL, 121589, PROT_READ, MAP_PRIVATE, 3, 0) = 0x2b068f8b2000 <0.000004>
close(3)                                = 0 <0.000002>
open("/usr/local/lib/libtorque.so.2", O_RDONLY) = 3 <0.000010>
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\320\256\0\0\0\0\0\0"..., 832) = 832 <0.000006>
fstat(3, {st_mode=S_IFREG|0755, st_size=754131, ...}) = 0 <0.000002>
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x2b068f8d0000 <0.000004>
mmap(NULL, 3207360, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x2b068fab2000 <0.000004>
fadvise64(3, 0, 3207360, POSIX_FADV_WILLNEED) = 0 <0.000007>
mprotect(0x2b068fadc000, 2097152, PROT_NONE) = 0 <0.000006>
mmap(0x2b068fcdc000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x2a000) = 0x2b068fcdc000 <0.000005>
mmap(0x2b068fcde000, 929984, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x2b068fcde000 <0.000006>
close(3)                                = 0 <0.000003>
open("/lib64/libc.so.6", O_RDONLY)      = 3 <0.000005>
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0p\334\1\0\0\0\0\0"..., 832) = 832 <0.000003>
fstat(3, {st_mode=S_IFREG|0755, st_size=1354000, ...}) = 0 <0.000003>
mmap(NULL, 3424568, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x2b068fdc2000 <0.000004>
fadvise64(3, 0, 3424568, POSIX_FADV_WILLNEED) = 0 <0.000009>
mprotect(0x2b068fefe000, 2093056, PROT_NONE) = 0 <0.000004>
mmap(0x2b06900fd000, 20480, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x13b000) = 0x2b06900fd000 <0.000005>
mmap(0x2b0690102000, 16696, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x2b0690102000 <0.000004>
close(3)                                = 0 <0.000003>
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x2b0690107000 <0.000003>
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x2b0690108000 <0.000003>
arch_prctl(ARCH_SET_FS, 0x2b0690107b00) = 0 <0.000002>
mprotect(0x2b06900fd000, 12288, PROT_READ) = 0 <0.000005>
mprotect(0x2b068fcdc000, 4096, PROT_READ) = 0 <0.000005>
mprotect(0x607000, 4096, PROT_READ)     = 0 <0.000004>
munmap(0x2b068f8b2000, 121589)          = 0 <0.000007>
brk(0)                                  = 0x609000 <0.000003>
brk(0x62a000)                           = 0x62a000 <0.000003>
open("/var/torque/server_name", O_RDONLY) = 3 <0.000014>
fstat(3, {st_mode=S_IFREG|0644, st_size=9, ...}) = 0 <0.000002>
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x2b068f8b2000 <0.000004>
read(3, "gweyring\n", 4096)             = 9 <0.000010>
read(3, "", 4096)                       = 0 <0.000003>
close(3)                                = 0 <0.000003>
munmap(0x2b068f8b2000, 4096)            = 0 <0.000005>
socket(PF_FILE, SOCK_STREAM, 0)         = 3 <0.000008>
fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK)    = 0 <0.000003>
connect(3, {sa_family=AF_FILE, path="/var/run/nscd/socket"}, 110) = 0 <0.000025>
sendto(3, "\2\0\0\0\22\0\0\0\t\0\0\0services\0", 21, MSG_NOSIGNAL, NULL, 0) = 21 <0.000010>
poll([{fd=3, events=POLLIN|POLLERR|POLLHUP, revents=POLLIN|POLLHUP}], 1, 5000) = 1 <0.000003>
recvmsg(3, {msg_name(0)=NULL, msg_iov(1)=[{"services\0", 9}], msg_controllen=24, {cmsg_len=20, cmsg_level=SOL_SOCKET, cmsg_type=SCM_RIGHTS, {4}}, msg_flags=0}, 0) = 9 <0.000004>
fstat(4, {st_mode=S_IFREG|0600, st_size=217016, ...}) = 0 <0.000003>
pread(4, "\1\0\0\0h\0\0\0,\0\0\0\1\0\0\0\243\201\31K\0\0\0\0\323\0\0\0\0L\3\0"..., 104, 0) = 104 <0.000006>
mmap(NULL, 217016, PROT_READ, MAP_SHARED, 4, 0) = 0x2b068f8d1000 <0.000004>
close(4)                                = 0 <0.000002>
close(3)                                = 0 <0.000005>
getuid()                                = 5010 <0.000002>
socket(PF_FILE, SOCK_STREAM, 0)         = 3 <0.000004>
fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK)    = 0 <0.000002>
connect(3, {sa_family=AF_FILE, path="/var/run/nscd/socket"}, 110) = 0 <0.000010>
sendto(3, "\2\0\0\0\v\0\0\0\7\0\0\0passwd\0", 19, MSG_NOSIGNAL, NULL, 0) = 19 <0.000008>
poll([{fd=3, events=POLLIN|POLLERR|POLLHUP, revents=POLLIN|POLLHUP}], 1, 5000) = 1 <0.000003>
recvmsg(3, {msg_name(0)=NULL, msg_iov(1)=[{"passwd\0", 7}], msg_controllen=24, {cmsg_len=20, cmsg_level=SOL_SOCKET, cmsg_type=SCM_RIGHTS, {4}}, msg_flags=0}, 0) = 7 <0.000004>
fstat(4, {st_mode=S_IFREG|0600, st_size=217016, ...}) = 0 <0.000002>
pread(4, "\1\0\0\0h\0\0\0j\23\0\0\1\0\0\0\243\201\31K\0\0\0\0\323\0\0\0\0L\3\0"..., 104, 0) = 104 <0.000005>
mmap(NULL, 217016, PROT_READ, MAP_SHARED, 4, 0) = 0x2b068f906000 <0.000004>
close(4)                                = 0 <0.000003>
close(3)                                = 0 <0.000004>
uname({sys="Linux", node="gweyring", ...}) = 0 <0.000003>
socket(PF_FILE, SOCK_STREAM, 0)         = 3 <0.000004>
connect(3, {sa_family=AF_FILE, path="/tmp/.torque-unix"}, 19) = 0 <0.000010>
getuid()                                = 5010 <0.000002>
getgid()                                = 200 <0.000003>
getpid()                                = 6091 <0.000003>
sendmsg(3, {msg_name(0)=NULL, msg_iov(1)=[{"m", 1}], msg_controllen=28, {cmsg_len=28, cmsg_level=SOL_SOCKET, cmsg_type=SCM_CREDENTIALS{pid=6091, uid=5010, gid=200}}, msg_flags=0}, 0) = 1 <0.000003>
mmap(NULL, 266240, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x2b068f93b000 <0.000003>
mmap(NULL, 266240, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x2b068f97c000 <0.000003>
write(3, "+2+12+19+3luc+0+0+0", 19)     = 19 <0.000004>
poll([{fd=3, events=POLLIN|POLLHUP, revents=POLLIN}], 1, 10800000) = 1 <86.064923>
fcntl(3, F_GETFL)                       = 0x2 (flags O_RDWR) <0.000003>
read(3, "+2+1+0+0+62+55+22+1443402.gweyri"..., 262144) = 85573 <0.000049>
brk(0x64b000)                           = 0x64b000 <0.000004>
brk(0x66c000)                           = 0x66c000 <0.000004>
fstat(1, {st_mode=S_IFREG|0644, st_size=0, ...}) = 0 <0.000003>
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x2b068f9bd000 <0.000004>
write(1, "Job id                    Name  "..., 4096) = 4096 <0.000030>
write(3, "+2+12+59+3luc", 13)           = 13 <0.000004>
rt_sigaction(SIGALRM, {SIG_IGN}, {SIG_DFL}, 8) = 0 <0.000003>
alarm(10800)                            = 0 <0.000004>
fcntl(3, F_GETFL)                       = 0x2 (flags O_RDWR) <0.000003>
read(3, "", 65536)                      = 0 <69.001069>
alarm(0)                                = 10731 <0.000004>
rt_sigaction(SIGALRM, {SIG_DFL}, NULL, 8) = 0 <0.000003>
close(3)                                = 0 <0.000005>
write(1, "ng         \n43479.gweyring      "..., 786) = 786 <0.000032>
exit_group(0)                           = ?
-------------- next part --------------



More information about the torqueusers mailing list