Hi all,
I configured our site (GR-02-UoM) for MPI support following the
instructions in
http://goc.grid.sinica.edu.tw/gocwiki/MPI_Support_with_Torque
(Torque is our job manager), and everything seems to be OK.
However, when I tried executing the test job from
http://quattor.web.lal.in2p3.fr/packages/mpi
the job got stuck on one of the worker nodes until the proxy certificate
expired. The command used to submit the job was:
edg-job-submit --vo dteam --lrms pbs -r
alexander.it.uom.gr:2119/jobmanager-lcgpbs-dteam MPItest.jdl
Does anyone have any idea what the problem might be? (I include the files
below.)
Best regards
Kostas Georgakopoulos
University of Macedonia
MPItest.jdl:
# Job description for the MPI test job submitted with edg-job-submit.
Type = "Job";
# NOTE(review): "MPICH" presumably selects the MPICH flavour of MPI job
# handling on the resource broker/CE - confirm against the JDL reference.
JobType = "MPICH";
# NOTE(review): presumably the number of CPUs/nodes requested for the
# MPI run - confirm against the JDL reference.
NodeNumber = 8;
# Wrapper script shipped in the InputSandbox below.
Executable = "MPItest.sh";
# Passed to MPItest.sh as $1: the base name of the program, so the
# script compiles MPItest.c into the binary MPItest.
Arguments = "MPItest";
StdOutput = "test.out";
StdError = "test.err";
# Files sent with the job: the wrapper script and the C source it compiles.
InputSandbox = {"MPItest.sh","MPItest.c"};
# Files requested back: stdout/stderr of the wrapper plus mpiexec.out,
# which MPItest.sh writes when it runs the binary with mpiexec.
OutputSandbox = {"test.err","test.out","mpiexec.out"};
MPItest.sh:
#!/bin/sh
#
# MPItest.sh - compile a small MPI program and run it on the hosts
# allocated by the batch system, first with mpiexec and then with mpirun.
#
# Usage: MPItest.sh <name>
#   <name>  base name of the program: <name>.c is compiled into <name>.
#
# The host list is taken from $PBS_NODEFILE if it is set, or built from
# the whitespace-separated $LSB_HOSTS if that is set (LSB_HOSTS wins when
# both are present). The output of the mpiexec run goes to ./mpiexec.out.
#
# Exit status: non-zero if <name> is missing, the compilation fails or no
# usable hosts file is available.

# Trace every command. Kept in the script body rather than on the shebang
# line so that it still applies when the script is run as "sh MPItest.sh".
set -x

BANNER="***********************************************************************"

# Print the separator line. The string must be an argument of echo on the
# same line: on a line of its own the shell tries to execute it.
banner() {
  echo "$BANNER"
}

# the binary to execute
EXE=${1:?usage: MPItest.sh <program-name>}
WORKDIR=$(pwd)

banner
echo "Running on: ${HOSTNAME:-$(hostname)}"
echo "As: " "$(whoami)"
banner

banner
echo "Compiling binary: $EXE"
echo mpicc -o "${EXE}" "${EXE}.c"
# There is nothing to run if the compilation fails, so stop here.
if ! mpicc -o "${EXE}" "${EXE}.c"; then
  echo "Compilation of ${EXE}.c failed. Exiting..." >&2
  exit 1
fi
echo "*************************************"

if [ -n "$PBS_NODEFILE" ]; then
  echo "PBS Nodefile: $PBS_NODEFILE"
  HOST_NODEFILE=$PBS_NODEFILE
fi

if [ -n "$LSB_HOSTS" ]; then
  echo "LSF Hosts: $LSB_HOSTS"
  HOST_NODEFILE=${WORKDIR}/lsf_nodefile.$$
  # Word splitting of LSB_HOSTS is intentional: one host per output line.
  for host in ${LSB_HOSTS}; do
    echo "$host"
  done > "${HOST_NODEFILE}"
fi

if [ -z "$HOST_NODEFILE" ]; then
  echo "No hosts file defined. Exiting..."
  exit 1
fi
if [ ! -r "$HOST_NODEFILE" ]; then
  echo "Hosts file $HOST_NODEFILE is not readable. Exiting..." >&2
  exit 1
fi
banner

# One line per allocated slot; the arithmetic expansion strips any padding
# that wc may add around the number.
CPU_NEEDED=$(($(wc -l < "$HOST_NODEFILE")))
echo "Node count: $CPU_NEEDED"
echo "Nodes in $HOST_NODEFILE: "
cat "$HOST_NODEFILE"
banner

banner
echo "Checking ssh for each node:"
while IFS= read -r host || [ -n "$host" ]; do
  [ -n "$host" ] || continue
  echo "Checking $host..."
  # BatchMode makes ssh fail immediately instead of waiting forever for a
  # password or host-key confirmation that nobody can type in a batch job.
  # -n keeps ssh from swallowing the rest of the hosts file on stdin.
  if ! ssh -n -o BatchMode=yes "$host" hostname; then
    echo "WARNING: non-interactive ssh to $host failed" >&2
  fi
done < "$HOST_NODEFILE"
banner

banner
echo "Executing $EXE with mpiexec"
chmod 755 "$EXE"
mpiexec "${WORKDIR}/${EXE}" > mpiexec.out 2>&1
banner

banner
echo "Executing $EXE with mpirun"
chmod 755 "$EXE"
mpirun -np "$CPU_NEEDED" -machinefile "$HOST_NODEFILE" "${WORKDIR}/${EXE}"
banner
MPItest.c:
/* hello.c
*
* Simple "Hello World" program in MPI.
*
*/
#include "mpi.h"
#include <stdio.h>
/*
 * Entry point of the MPI "Hello World" test.
 *
 * Each process initialises MPI, looks up its own rank and the size of
 * MPI_COMM_WORLD, prints one greeting line on stdout and shuts MPI down.
 * Always returns 0.
 */
int main(int argc, char *argv[])
{
    int world_size; /* how many processes take part in the run */
    int my_rank;    /* rank of the calling process */

    /* MPI must be started before any other MPI call is made. */
    MPI_Init(&argc, &argv);

    /* The two queries are independent of each other. */
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);

    printf("Hello world! from processor %d out of %d\n", my_rank, world_size);

    /* Release MPI resources before leaving. */
    MPI_Finalize();

    return 0;
}
|