[torqueusers] mpiexec problem (was Re: TORQUE torque-2.4.3-snap.200912031436.tar.gz available for download)

Martin MOKREJŠ mmokrejs at ribosome.natur.cuni.cz
Fri Dec 4 07:19:32 MST 2009


Troy Baer wrote:
> On Fri, 2009-12-04 at 00:34 +0100, Martin MOKREJŠ wrote:
>> Troy Baer wrote:
>>> Have you ever tried "mpiexec [mpiexec_args] -- prog [progr_args]" to
>>> keep mpiexec from trying to parse the program's arguments?  (That's a
>>> fairly standard UNIX trick to keep programs for being overly aggressive
>>> in parsing their command line arguments.)
>> I tried now ;-) but that fools osc-mpiexec and it dies and prints usage info. ;))
> 
> What was the complete command line you ran that gave this result?
> 
> The Gentoo ticket you filed doesn't have enough information to diagnose
> the problem IMHO.  You really ought to include the complete set of error
> messages that your sample job generated.

osc-mpiexec -v -v -n 4 -- blastpgp -v 3 -b 1 -d /home/me/protein.sequences.fa -i /home/me/Parameciumtetraurelia.fas -o /tmp/blast.out -a 4


Does the strace output convince you? ;-)


$ export MPIEXEC_COMM=mpich2-pmi; qsub -l nodes=1:ppn=4 -I
qsub: waiting for job 5738.nfssrv.cluster.local to start
qsub: job 5738.nfssrv.cluster.local ready

mmokrejs at node007 ~ $ strace -v -f -s 128 osc-mpiexec -v -v -n 4 -- blastpgp -v 3 -b 1 -d /tmp/nonexistingfile.fa -i /dev/null -o /tmp/blast.out -a 4
execve("/usr/bin/osc-mpiexec", ["osc-mpiexec", "-v", "-v", "-n", "4", "--", "blastpgp", "-v", "3", "-b", "1", "-d", "/tmp/nonexistingfile.fa", "-i", "/dev/null", "-o", "/tmp/blast.out", "-a", "4"], ["MANPATH=/etc/java-config-2/current-system-vm/man:/usr/local/share/man:/usr/share/man:/usr/share/binutils-data/x86_64-pc-linux-gn"..., "NCBI=/etc/ncbi", "MPD_CONF_FILE=@MPD_CONF_FILE_DIR@/mpd.conf", "PBS_VERSION=TORQUE-2.4.1b1", "TERM=xterm", "SHELL=/bin/bash", "PBS_JOBNAME=STDIN", "PBS_ENVIRONMENT=PBS_INTERACTIVE", "PBS_O_WORKDIR=/nfslarge/home/mmokrejs", "PBS_TASKNUM=1", "USER=mmokrejs", ..."PBS_O_HOME=/nfslarge/home/mmokrejs", "PBSLOGLEVEL=7", "PBSCOREDUMP=1", "PBS_MOMPORT=15003", "PLPLOT_LIB=/usr/share/EMBOSS/", "PAGER=/usr/bin/less", "CONFIG_PROTECT_MASK=/etc/sandbox.d /etc/env.d/java/ /etc/udev/rules.d /etc/fonts/fonts.conf /etc/terminfo /etc/ca-certificates.c"..., "XDG_CONFIG_DIRS=/etc/xdg", ..."PBS_O_QUEUE=batch", "PBS_O_LOGNAME=mmokrejs", "PATH=/usr/local/bin:/usr/bin:/bi
n:/opt/bin:/usr/x86_64-pc-linux-gnu/gcc-bin/4.1.2:/var/qmail/bin", "PBS_JOBCOOKIE=E75319466DE2BB99CDBC346FA1760876", ... "BLASTDB=/usr/share/ncbi/formatdb", "PBS_O_SHELL=/bin/bash", "PBS_SERVER_HOME=/var/spool/torque", ... "PBS_SERVER=nfssrv.cluster.local", "PBS_JOBID=5738.nfssrv.cluster.local", "PBSDEBUG=0", ... "SHLVL=1", "HOME=/nfslarge/home/mmokrejs", "QRNADB=/usr/share/qrna/data", "PBS_O_HOST=nfssrv.cluster.local", "PBS_VNODENUM=0", ... "LOGNAME=mmokrejs", "GCC_SPECS=", "CVS_RSH=ssh", "TORQUEKEEPCOMPLETED=1", ... "PBS_QUEUE=batch", ... "PBS_O_MAIL=/var/mail/mmokrejs", "R_HOME=/usr/lib64/R", "BLASTMAT=/usr/share/ncbi/data", ... "PBS_NODEFILE=/var/spool/torque/aux//5738.nfssrv.cluster.local", "PBS_O_PATH=/nfslarge/x86_64_linux26/usr/bin:/nfslarge/i386_linux26/usr/bin:/usr/local/bin:/usr/bin:/bin:/opt/bin:/usr/x86_64-pc-"..., "_=/usr/bin/strace"]) = 0
brk(0)                                  = 0x61d000
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f8f3526d000
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f8f3526c000
access("/etc/ld.so.preload", R_OK)      = -1 ENOENT (No such file or directory)
open("/etc/ld.so.cache", O_RDONLY)      = 3
fstat(3, {st_dev=makedev(8, 3), st_ino=65581, st_mode=S_IFREG|0644, st_nlink=1, st_uid=0, st_gid=0, st_blksize=4096, st_blocks=184, st_size=87427, st_atime=2009/12/02-10:31:49, st_mtime=2009/12/02-10:31:49, st_ctime=2009/12/02-10:31:49}) = 0
mmap(NULL, 87427, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7f8f35256000
close(3)                                = 0
open("/usr/lib/libtorque.so.2", O_RDONLY) = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\200\320\0\0\0\0\0\0@\0\0\0\0\0\0\0@\304\2\0\0\0\0\0\0\0\0\0@\0008\0\7\0@\0\32\0\31\0\1\0\0\0\5\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\374\231\2\0\0\0\0\0\374\231\2\0\0\0\0\0\0\0 \0\0\0\0\0\1\0\0\0\6\0\0\0\250"..., 832) = 832
fstat(3, {st_dev=makedev(8, 3), st_ino=1658093, st_mode=S_IFREG|0755, st_nlink=1, st_uid=0, st_gid=0, st_blksize=4096, st_blocks=368, st_size=182976, st_atime=2009/12/02-03:54:05, st_mtime=2009/12/02-03:54:05, st_ctime=2009/12/02-03:54:06}) = 0
mmap(NULL, 2457568, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f8f34df9000
mprotect(0x7f8f34e23000, 2097152, PROT_NONE) = 0
mmap(0x7f8f35023000, 12288, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x2a000) = 0x7f8f35023000
mmap(0x7f8f35026000, 176096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7f8f35026000
close(3)                                = 0
open("/lib/libc.so.6", O_RDONLY)        = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\0\347\1\0\0\0\0\0@\0\0\0\0\0\0\0p\233\25\0\0\0\0\0\0\0\0\0@\0008\0\v\0@\0E\0D\0\6\0\0\0\5\0\0\0@\0\0\0\0\0\0\0@\0\0\0\0\0\0\0@\0\0\0\0\0\0\0h\2\0\0\0\0\0\0h\2\0\0\0\0\0\0\10\0\0\0\0\0\0\0\3\0\0\0\4\0\0\0\20"..., 832) = 832
fstat(3, {st_dev=makedev(8, 3), st_ino=2525939, st_mode=S_IFREG|0755, st_nlink=1, st_uid=0, st_gid=0, st_blksize=4096, st_blocks=2784, st_size=1420464, st_atime=2009/12/02-02:59:10, st_mtime=2009/12/02-02:59:10, st_ctime=2009/12/02-02:59:20}) = 0
mmap(NULL, 3527320, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f8f34a9b000
mprotect(0x7f8f34bf0000, 2093056, PROT_NONE) = 0
mmap(0x7f8f34def000, 20480, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x154000) = 0x7f8f34def000
mmap(0x7f8f34df4000, 17048, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7f8f34df4000
close(3)                                = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f8f35255000
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f8f35254000
arch_prctl(ARCH_SET_FS, 0x7f8f352546f0) = 0
mprotect(0x7f8f34def000, 16384, PROT_READ) = 0
mprotect(0x7f8f35023000, 4096, PROT_READ) = 0
mprotect(0x61a000, 4096, PROT_READ)     = 0
mprotect(0x7f8f3526e000, 4096, PROT_READ) = 0
munmap(0x7f8f35256000, 87427)           = 0
brk(0)                                  = 0x61d000
brk(0x63e000)                           = 0x63e000
write(2, "Usage: osc-mpiexec [<args>] <executable> [<exe args>]...\n"..., 57Usage: osc-mpiexec [<args>] <executable> [<exe args>]...
) = 57
write(2, "   or: osc-mpiexec [<args>] -config[=]<file>\n"..., 45   or: osc-mpiexec [<args>] -config[=]<file>
) = 45
write(2, "   or: osc-mpiexec [<args>] -server\n"..., 36   or: osc-mpiexec [<args>] -server
) = 36
write(2, "  -n <numproc> : use only some of the allocated processors\n"..., 59  -n <numproc> : use only some of the allocated processors
) = 59
write(2, "     Default behavior allocates one process per allocated processor.\n"..., 69     Default behavior allocates one process per allocated processor.
) = 69
write(2, "  -verbose : be verbose about mpiexec operation\n"..., 48  -verbose : be verbose about mpiexec operation
) = 48
write(2, "  -nostdin : do not listen to stdin, allowing process to go into background\n"..., 76  -nostdin : do not listen to stdin, allowing process to go into background
) = 76
write(2, "  -allstdin : send stdin to all processes (default just proc #0)\n"..., 65  -allstdin : send stdin to all processes (default just proc #0)
) = 65
write(2, "  -nostdout : do not redirect stdout/stderr, but let pbs accumulate it\n"..., 71  -nostdout : do not redirect stdout/stderr, but let pbs accumulate it
) = 71
write(2, "  -comm (gm|mx|p4|ib|rai|pmi|lam|shmem|emp|portals|none) : choose MPI (default mpich-p4)\n"..., 89  -comm (gm|mx|p4|ib|rai|pmi|lam|shmem|emp|portals|none) : choose MPI (default mpich-p4)
) = 89
write(2, "    -mpich-p4-[no-]shmem : for MPICH/P4, specify if the library was\n                           compiled with shared memory suppor"..., 145    -mpich-p4-[no-]shmem : for MPICH/P4, specify if the library was
                           compiled with shared memory support (default yes)
) = 145
write(2, "  -pernode : allocate only one process per compute node\n"..., 56  -pernode : allocate only one process per compute node
) = 56
write(2, "  -npernode <nprocs> : allocate no more than <nprocs> processes per compute node\n"..., 81  -npernode <nprocs> : allocate no more than <nprocs> processes per compute node
) = 81
write(2, "  -nolocal : do not run any MPI processes on the local node\n"..., 60  -nolocal : do not run any MPI processes on the local node
) = 60
write(2, "  -transform-hostname[=]<sed expression> : use alternate names for MPI\n"..., 71  -transform-hostname[=]<sed expression> : use alternate names for MPI
) = 71
write(2, "  -transform-hostname-program[=]<executable> : use this script or program\n                                               to gener"..., 149  -transform-hostname-program[=]<executable> : use this script or program
                                               to generate alternate names
) = 149
write(2, "  -tv : debug using totalview (ensure it is in your path)\n"..., 58  -tv : debug using totalview (ensure it is in your path)
) = 58
write(2, "  -kill : kill other processes if any one process exits\n"..., 56  -kill : kill other processes if any one process exits
) = 56
write(2, "  -config[=]<file> : use heterogenous node specification file (\"-\" for stdin)\n"..., 78  -config[=]<file> : use heterogenous node specification file ("-" for stdin)
) = 78
write(2, "  -server : do not run any tasks, just serve other concurrent mpiexec clients\n"..., 78  -server : do not run any tasks, just serve other concurrent mpiexec clients
) = 78
write(2, "  -version : show version information\n"..., 38  -version : show version information
) = 38
write(2, "Version 0.83, configure options:  '--prefix=/usr' '--build=x86_64-pc-linux-gnu' '--host=x86_64-pc-linux-gnu' '--mandir=/usr/share"..., 528Version 0.83, configure options:  '--prefix=/usr' '--build=x86_64-pc-linux-gnu' '--host=x86_64-pc-linux-gnu' '--mandir=/usr/share/man' '--infodir=/usr/share/info' '--datadir=/usr/share' '--sysconfdir=/etc' '--localstatedir=/var/lib' '--libdir=/usr/lib64' '--with-default-comm=mpich-p4' '--with-pbs=/usr/' '--with-mpicc=/usr/bin/mpicc' '--with-mpif77=/usr/bin/mpif77' '--disable-mpich-rai' 'build_alias=x86_64-pc-linux-gnu' 'host_alias=x86_64-pc-linux-gnu' 'CFLAGS=-O2 -march=nocona -fomit-frame-pointer -pipe' 'LDFLAGS=-Wl,-O1'
) = 528
exit_group(1)                           = ?
mmokrejs at node007 ~ $ 



I brought this to the osc-mpiexec email list (please see the several postings on this in the archive,
http://email.osc.edu/pipermail/mpiexec/2009/001063.html). They claim they do not see a problem
in their code, and they say they cannot reproduce it.

Unfortunately, the Gentoo ebuild has some extra patches, so that may be why I have these problems.

However, I do not want to clutter the torqueusers archives at the moment, as it seems to me the problem
probably lies in osc-mpiexec or mpich2 ... although osc-mpiexec is not linked against mpich2
but rather against libtorque ... well, I do not know more at the moment. ;)

But thanks! There are definitely two issues with argument handling.
M.



More information about the torqueusers mailing list