[torqueusers] Moving from 32-bit to 64-bit builds of Torque ?
Martin Siegert
siegert at sfu.ca
Mon Jan 9 19:50:58 MST 2006
I have a wrapper script /usr/local/bin/gcc64 installed:
#!/bin/bash
# gcc wrapper that makes 64-bit the default target architecture.
# If the caller already passed -m32 or -m64, the arguments are handed to
# the real compiler unchanged; otherwise -m64 is prepended.
ARCH=''
for arg; do
    case "$arg" in
        -m32|-m64)
            # Remember the explicit architecture flag (last one wins).
            ARCH="$arg"
            ;;
    esac
done
if [ -n "$ARCH" ]; then
    # Caller chose an architecture explicitly: pass everything through.
    exec /usr/bin/gcc "$@"
else
    # No architecture flag given: default to 64-bit code generation.
    exec /usr/bin/gcc -m64 "$@"
fi
and have /usr/local/bin appear first in the PATH. I configure with
./configure --prefix=/usr/local/torque-2.0.0p3 \
--enable-docs \
--set-cc=gcc64 \
--set-cflags='-g' \
--set-server-home=/var/spool/torque \
--enable-syslog \
--enable-tcl-qstat \
--x-libraries=/usr/X11R6/lib64 \
--enable-gui
I also apply three patches (which I attach): torque-2.0.0p3-tcltk-64bit.patch
allows the gui stuff to compile on SLES9/PPC64, torque-2.0.0p3-rerun.patch
fixes a problem with jobs that are submitted with -r y (without it they
can get stuck indefinitely in the queue; Dave knows about this one, so
it may already be included in a more recent version - all my patches are for 2.0.0p3),
and torque-2.0.0p3-NCPUS.patch, which implements the PBS_NCPUS environment
variable.
With those patches torque compiles/works just fine for us on SLES9/PPC64.
[actually, you may need a few more 64bit libraries like tclx-64bit
to get the gui stuff in 64bit; if you want, I can ship you rpms]
Martin
--
Martin Siegert
Head, HPC at SFU
WestGrid Site Manager
Academic Computing Services phone: (604) 291-4691
Simon Fraser University fax: (604) 291-4242
Burnaby, British Columbia email: siegert at sfu.ca
Canada V5A 1S6
On Tue, Jan 10, 2006 at 01:17:28PM +1100, Chris Samuel wrote:
> Hi folks,
>
> SuSE SLES 9 gcc on PPC64 creates 32-bit code by default, not 64-bit. :-(
>
> So as we're doing our shutdown today I thought I'd take the opportunity to
> backup /usr/spool/PBS and upgrade Torque to the current build and swap to a
> 64-bit version so I can build 64-bit versions of LAM, Moab, etc against it.
>
> Unfortunately, attempting to restart the server results in the following
> errors:
>
> PBS_Server: Cannot allocate memory (12) in recov_attr, calloc failed
> PBS_Server: Cannot allocate memory (12) in svr_recov, error on recovering server attr
> PBS_Server: pbsd_init, Unable to read server database
> PBS_Server: PBS_Server, pbsd_init failed
>
> This looks horribly to me like Torque is writing/reading attributes, etc,
> as binary structures and that it's killing the pbs_server.
>
> So, what's the supported way to move from 32-bit to 64-bit builds ?
>
> Is it sufficient to dump the server config of a 32-bit build with:
>
> qmgr -c 'p s'
>
> or similar magic and then reimport it into a 64-bit build or is it
> something that just isn't possible with an existing queue of jobs ?
>
> Help!
>
> Chris
> --
> Christopher Samuel - (03)9925 4751 - VPAC Deputy Systems Manager
> Victorian Partnership for Advanced Computing http://www.vpac.org/
> Bldg 91, 110 Victoria Street, Carlton South, VIC 3053, Australia
> _______________________________________________
> torqueusers mailing list
> torqueusers at supercluster.org
> http://www.supercluster.org/mailman/listinfo/torqueusers
-------------- next part --------------
--- torque-2.0.0p3/configure.tcltk Tue Nov 29 17:04:49 2005
+++ torque-2.0.0p3/configure Thu Dec 1 12:01:28 2005
@@ -932,7 +932,7 @@
ac_LL_PATH=`echo "$PATH" | sed -e 's/:/ /g'`
for ac_libpath in $ac_LL_PATH /__XqqFrobozz ; do
ac_libpath=`echo $ac_libpath | sed -e 's;/bin$;;'`
- if test `/bin/ls ${ac_libpath}/lib/libtclx*.* 2> /dev/null | wc -l` -gt 0; then
+ if test `/bin/ls ${ac_libpath}/lib*/libtclx*.* 2> /dev/null | wc -l` -gt 0; then
tclx_dir="$ac_libpath"
break;
fi
@@ -967,7 +967,7 @@
ac_LL_PATH=`echo "$PATH" | sed -e 's/:/ /g'`
for ac_libpath in $ac_LL_PATH /__XqqFrobozz ; do
ac_libpath=`echo $ac_libpath | sed -e 's;/bin$;;'`
- if test `/bin/ls ${ac_libpath}/lib/libtcl*.* 2> /dev/null | wc -l` -gt 0; then
+ if test `/bin/ls ${ac_libpath}/lib*/libtcl*.* 2> /dev/null | wc -l` -gt 0; then
tcl_dir="$ac_libpath"
break;
fi
@@ -984,7 +984,7 @@
ac_LL_PATH=`echo "$PATH" | sed -e 's/:/ /g'`
for ac_libpath in $ac_LL_PATH /__XqqFrobozz ; do
ac_libpath=`echo $ac_libpath | sed -e 's;/bin$;;'`
- if test `/bin/ls ${ac_libpath}/lib/libtcl*.* 2> /dev/null | wc -l` -gt 0; then
+ if test `/bin/ls ${ac_libpath}/lib*/libtcl*.* 2> /dev/null | wc -l` -gt 0; then
tcl_dir="$ac_libpath"
break;
fi
@@ -996,7 +996,7 @@
TCL=1
TCL_DIR="$tcl_dir"
- count=`/bin/ls ${tcl_dir}/lib/libtk* 2> /dev/null | wc -l`
+ count=`/bin/ls ${tcl_dir}/lib*/libtk* 2> /dev/null | wc -l`
if test "$count" -gt 0 -a -n "$GUI"; then
TK=1
fi
@@ -1043,10 +1043,10 @@
{ echo "configure: error: cannot find Tcl version in $tcl_h" 1>&2; exit 1; };
TCL_LIB_VER="$TCL_VER"
- count=`/bin/ls -d $TCL_DIR/lib/libtcl${TCL_LIB_VER}.* 2> /dev/null | wc -l`
+ count=`/bin/ls -d $TCL_DIR/lib*/libtcl${TCL_LIB_VER}.* 2> /dev/null | wc -l`
if test "$count" -lt 1; then
TCL_LIB_VER=`echo $TCL_LIB_VER | sed -e 's/\.//'`
- count=`/bin/ls $TCL_DIR/lib/libtcl${TCL_LIB_VER}.* | wc -l`
+ count=`/bin/ls $TCL_DIR/lib*/libtcl${TCL_LIB_VER}.* | wc -l`
if test "$count" -lt 1; then
{ echo "configure: error: cannot find a Tcl library for version $TCL_VER" 1>&2; exit 1; }
fi
@@ -1084,10 +1084,10 @@
{ echo "configure: error: cannot find Tk version in $tk_h" 1>&2; exit 1; };
TK_LIB_VER="$TK_VER"
- count=`/bin/ls $TCL_DIR/lib/libtk${TK_LIB_VER}.* 2> /dev/null | wc -l`
+ count=`/bin/ls $TCL_DIR/lib*/libtk${TK_LIB_VER}.* 2> /dev/null | wc -l`
if test "$count" -lt 1; then
TK_LIB_VER=`echo $TK_LIB_VER | sed -e 's/\.//'`
- count=`/bin/ls $TCL_DIR/lib/libtk${TK_LIB_VER}.* | wc -l`
+ count=`/bin/ls $TCL_DIR/lib*/libtk${TK_LIB_VER}.* | wc -l`
if test "$count" -lt 1; then
{ echo "configure: error: cannot find a Tk library for version $TK_VER" 1>&2; exit 1; }
fi
-------------- next part --------------
--- torque-2.0.0p3/src/include/job.h.orig 2005-12-16 15:03:51.405965516 -0800
+++ torque-2.0.0p3/src/include/job.h 2005-12-16 14:36:07.894446964 -0800
@@ -441,8 +441,8 @@
time_t ji_rteretry; /* route retry time */
} ji_routet;
struct {
- int ji_fromsock; /* socket job coming over */
pbs_net_t ji_fromaddr; /* host job coming from */
+ int ji_fromsock; /* socket job coming over */
int ji_scriptsz; /* script size */
} ji_newt;
struct {
-------------- next part --------------
--- torque-2.0.0p3/src/resmom/start_exec.c.ncpus 2005-11-28 21:23:35.000000000 -0800
+++ torque-2.0.0p3/src/resmom/start_exec.c 2005-11-29 18:13:30.070752123 -0800
@@ -184,7 +184,8 @@
"PBS_TASKNUM",
"PBS_MOMPORT",
"PBS_NODEFILE",
- "TMPDIR" };
+ "TMPDIR",
+ "PBS_NCPUS" };
static int num_var_else = sizeof(variables_else) / sizeof(char *);
@@ -1589,6 +1590,8 @@
job *pjob;
task *ptask;
+ resource *resc;
+ long ncpus = 1;
struct passwd *pwdp;
@@ -1780,6 +1783,22 @@
log_buffer);
}
+ /* PBS_NCPUS
+ first check "ncpus" resource, then vnodes */
+
+ resc = find_resc_entry(
+ &pjob->ji_wattr[(int)JOB_ATR_resource],
+ find_resc_def(svr_resc_def,"ncpus",svr_resc_size));
+ if (resc != NULL) {
+ ncpus = resc->rs_value.at_val.at_long;
+ }
+ if (pjob->ji_numvnod > ncpus) {
+ ncpus = pjob->ji_numvnod;
+ }
+ sprintf(buf,"%d",
+ ncpus);
+ bld_env_variables(&vtable,variables_else[13],buf);
+
#if defined(PENABLE_CPUSETS) || defined(PENABLE_DYNAMIC_CPUSETS)
#ifdef PENABLE_DYNAMIC_CPUSETS
More information about the torqueusers
mailing list