Print

Print


  Hi all,

  I configured our site (GR-02-UoM) for MPI support following the 
instructions in 
http://goc.grid.sinica.edu.tw/gocwiki/MPI_Support_with_Torque
(Torque is our job manager) and everything seems to be OK. 
However, when I tried executing the test job from 
http://quattor.web.lal.in2p3.fr/packages/mpi
the job got stuck on one of the worker nodes until the proxy certificate 
expired. The command used to submit the job was:

edg-job-submit --vo dteam --lrms pbs -r 
alexander.it.uom.gr:2119/jobmanager-lcgpbs-dteam MPItest.jdl

Does anyone have any idea what the problem might be? (I include the files 
below.)

Best regards
Kostas Georgakopoulos
University of Macedonia

MPItest.jdl:

# Job description for the MPI test: an MPICH job requesting 8 nodes.
Type = "Job";
JobType = "MPICH";
NodeNumber = 8;
# The wrapper script is the executable; "MPItest" is passed to it as $1.
Executable = "MPItest.sh";
Arguments = "MPItest";
StdOutput = "test.out";
StdError = "test.err";
# The wrapper script and the C source it compiles are listed as input.
InputSandbox = {"MPItest.sh","MPItest.c"};
# stdout/stderr plus mpiexec.out are listed as output.
OutputSandbox = {"test.err","test.out","mpiexec.out"};

MPItest.sh:

#!/bin/sh
#
# MPItest.sh - compile an MPI test program and run it with mpiexec and mpirun.
#
# Usage: MPItest.sh <name>
#   <name>   base name of the program; <name>.c in the current directory is
#            compiled with mpicc into the executable <name>.
#
# Environment:
#   PBS_NODEFILE   path to the node file (used as-is when set)
#   LSB_HOSTS      whitespace-separated host list (written to a node file)
#
# Output: diagnostics on stdout/stderr; mpiexec output goes to ./mpiexec.out.
# Exit status: 2 on usage error, 1 when compilation fails or no usable node
# list is available.

# Trace every command. Enabled here rather than on the shebang line so that
# tracing is not lost when the script is started as "sh MPItest.sh".
set -x

SEPARATOR="***********************************************************************"

# Print the separator line used between the sections of the report.
banner() {
  echo "$SEPARATOR"
}

# the binary to execute
EXE=$1
if [ -z "$EXE" ]; then
  echo "Usage: $0 <program-name>" >&2
  exit 2
fi

banner
# HOSTNAME is not guaranteed to be set by /bin/sh; fall back to hostname(1).
echo "Running on: ${HOSTNAME:-$(hostname)}"
echo "As:       " "$(whoami)"
banner

banner
echo "Compiling binary: $EXE"
echo mpicc -o "${EXE}" "${EXE}.c"
if ! mpicc -o "${EXE}" "${EXE}.c"; then
  # Nothing below can work without the binary.
  echo "Compilation of ${EXE}.c failed.  Exiting..." >&2
  exit 1
fi
echo "*************************************"

if [ "x$PBS_NODEFILE" != "x" ]; then
  echo "PBS Nodefile: $PBS_NODEFILE"
  HOST_NODEFILE=$PBS_NODEFILE
fi

if [ "x$LSB_HOSTS" != "x" ]; then
  echo "LSF Hosts: $LSB_HOSTS"
  HOST_NODEFILE=$(pwd)/lsf_nodefile.$$
  # LSB_HOSTS is deliberately left unquoted: it is split into one host per word.
  for host in ${LSB_HOSTS}; do
    printf '%s\n' "$host"
  done > "${HOST_NODEFILE}"
fi

if [ "x$HOST_NODEFILE" = "x" ]; then
  echo "No hosts file defined.  Exiting..." >&2
  exit 1
fi

if [ ! -r "$HOST_NODEFILE" ]; then
  echo "Hosts file $HOST_NODEFILE is not readable.  Exiting..." >&2
  exit 1
fi

banner
# One line per allocated slot, so the line count is the process count.
CPU_NEEDED=$(wc -l < "$HOST_NODEFILE")
echo "Node count: $CPU_NEEDED"
echo "Nodes in $HOST_NODEFILE: "
cat "$HOST_NODEFILE"
banner

banner
echo "Checking ssh for each node:"
# Word splitting of the file content is intended: one host per word.
for host in $(cat "$HOST_NODEFILE"); do
  echo "Checking $host..."
  # -n and BatchMode keep ssh from waiting on stdin or on a password /
  # passphrase prompt that nobody can answer inside a batch job.
  ssh -n -o BatchMode=yes "$host" hostname
done
banner

banner
echo "Executing $EXE with mpiexec"
chmod 755 "$EXE"
mpiexec "$(pwd)/$EXE" > mpiexec.out 2>&1
banner

banner
echo "Executing $EXE with mpirun"
mpirun -np "$CPU_NEEDED" -machinefile "$HOST_NODEFILE" "$(pwd)/$EXE"
banner

MPItest.c:

/*  hello.c
*
*  Simple "Hello World" program in MPI.
*
*/
 
#include "mpi.h"
#include <stdio.h>
int main(int argc, char *argv[])
{
 int rank;        /* rank of this process in MPI_COMM_WORLD */
 int world_size;  /* number of processes in MPI_COMM_WORLD */

 /* Start up MPI */
 MPI_Init(&argc, &argv);

 /* Query the communicator: how many processes there are, and which one we are */
 MPI_Comm_size(MPI_COMM_WORLD, &world_size);
 MPI_Comm_rank(MPI_COMM_WORLD, &rank);

 /* Each process prints one greeting line */
 printf ("Hello world! from processor %d out of %d\n", rank, world_size);

 /* Tear down MPI before exiting */
 MPI_Finalize();
 return 0;
}