All our CPUs have been stalled for about 3 days, full of LHCb jobs that are
trying to lcg-cp something; some jobs have been stalled since 2 Jan 05, most of
them since 4 Jan 05!
We have now killed the 157 stalled commands by hand, host by host, and things
now seem to be moving forward... but it is clear that it is not
desirable/possible to continue killing every single stalled process by hand.
Here I append a snapshot of the status of the lhcb processes on one of our
machines before the kills.
Cheers,
Sergio
[root@cmsfarm-01-02 root]# ps -efwww
UID PID PPID C STIME TTY TIME CMD
[...]
lhcb001 21170 9587 0 Jan04 ? 00:00:00 /usr/local/lsf/etc/res -d
/sw/lsf/mnt/conf -m t2-ce-01 /home/lhcb001/.lsbatch/1104802774.319450
lhcb001 21175 21170 0 Jan04 ? 00:00:00 /bin/sh
/home/lhcb001/.lsbatch/1104802774.319450
lhcb001 21178 21175 0 Jan04 ? 00:00:00 /bin/sh
/sw/lsf/mnt/conf/lsbatch/cmsfarm/configdir/lcg-setEnv4lsf.sh
/home/lhcb001/.lsbatch/1104802774.319450.shell
lhcb001 21274 21178 0 Jan04 ? 00:00:00 /bin/sh
/home/lhcb001/.lsbatch/1104802774.319450.shell
lhcb001 21278 21274 0 Jan04 ? 00:00:00 /usr/bin/perl -w
/tmp/bootstrap.TIHSt5 /home/lhcb001/ t2-ce-01.lnl.infn.it
/home/lhcb001/.globus/.gass_cache/local/md5/55/b328300f37b4bf678933619e2f139e/md5/07/11a5849bb298cad1f3039300b6d4d4/data
X509GPG:globus-cache-export.hUATjq.gpg /dev/null
/home/lhcb001/.globus/.gass_cache/local/md5/55/b328300f37b4bf678933619e2f139e/md5/55/ba4b3a33e96ba9b8f92e422d56fca9/data
stdoutftp
/home/lhcb001/.globus/.gass_cache/local/md5/55/b328300f37b4bf678933619e2f139e/md5/0f/df9adab19d98b2b67d24ccbfd7a4b3/data
stderrftp /home/lhcb001/.lcgjm/globus-cache-export.hUATjq
https://t2-ce-01.lnl.infn.it:20005/23807/1104802719/ /home/lhcb001/ NONE
https://lxb2010.cern.ch:20003/var/edgwl/jobcontrol/submit/JZ/JobWrapper.https_3a_2f_2flxb2010.cern.ch_3a9000_2fJZJM0HOw4UVyC8vaqQlmGw.sh
UI=000003:NS=0000000003:WM=000004:BH=0000000000:JSS=000003:LM=000000:LRMS=000000:APP=000000
lhcb001 21284 21278 0 Jan04 ? 00:00:00 /usr/bin/perl -w
/tmp/bootstrap.TIHSt5 /home/lhcb001/ t2-ce-01.lnl.infn.it
/home/lhcb001/.globus/.gass_cache/local/md5/55/b328300f37b4bf678933619e2f139e/md5/07/11a5849bb298cad1f3039300b6d4d4/data
X509GPG:globus-cache-export.hUATjq.gpg /dev/null
/home/lhcb001/.globus/.gass_cache/local/md5/55/b328300f37b4bf678933619e2f139e/md5/55/ba4b3a33e96ba9b8f92e422d56fca9/data
stdoutftp
/home/lhcb001/.globus/.gass_cache/local/md5/55/b328300f37b4bf678933619e2f139e/md5/0f/df9adab19d98b2b67d24ccbfd7a4b3/data
stderrftp /home/lhcb001/.lcgjm/globus-cache-export.hUATjq
https://t2-ce-01.lnl.infn.it:20005/23807/1104802719/ /home/lhcb001/ NONE
https://lxb2010.cern.ch:20003/var/edgwl/jobcontrol/submit/JZ/JobWrapper.https_3a_2f_2flxb2010.cern.ch_3a9000_2fJZJM0HOw4UVyC8vaqQlmGw.sh
UI=000003:NS=0000000003:WM=000004:BH=0000000000:JSS=000003:LM=000000:LRMS=000000:APP=000000
lhcb001 21537 21278 0 Jan04 ? 00:00:00 /usr/bin/perl -w
/tmp/bootstrap.TIHSt5 /home/lhcb001/ t2-ce-01.lnl.infn.it
/home/lhcb001/.globus/.gass_cache/local/md5/55/b328300f37b4bf678933619e2f139e/md5/07/11a5849bb298cad1f3039300b6d4d4/data
X509GPG:globus-cache-export.hUATjq.gpg /dev/null
/home/lhcb001/.globus/.gass_cache/local/md5/55/b328300f37b4bf678933619e2f139e/md5/55/ba4b3a33e96ba9b8f92e422d56fca9/data
stdoutftp
/home/lhcb001/.globus/.gass_cache/local/md5/55/b328300f37b4bf678933619e2f139e/md5/0f/df9adab19d98b2b67d24ccbfd7a4b3/data
stderrftp /home/lhcb001/.lcgjm/globus-cache-export.hUATjq
https://t2-ce-01.lnl.infn.it:20005/23807/1104802719/ /home/lhcb001/ NONE
https://lxb2010.cern.ch:20003/var/edgwl/jobcontrol/submit/JZ/JobWrapper.https_3a_2f_2flxb2010.cern.ch_3a9000_2fJZJM0HOw4UVyC8vaqQlmGw.sh
UI=000003:NS=0000000003:WM=000004:BH=0000000000:JSS=000003:LM=000000:LRMS=000000:APP=000000
lhcb001 21748 21278 0 Jan04 ? 00:00:00 bash
/home/lhcb001/globus-tmp.cmsfarm-01-02.21278.0/globus-tmp.cmsfarm-01-02.21278.0/local/md5/55/b328300f37b4bf678933619e2f139e/md5/e9/10023ddc8ed7a3b1257fc85cef948b/data
UI=000003:NS=0000000003:WM=000004:BH=0000000000:JSS=000003:LM=000000:LRMS=000000:APP=000000
lhcb001 21816 21748 0 Jan04 ? 00:00:00 bash
/home/lhcb001/globus-tmp.cmsfarm-01-02.21278.0/globus-tmp.cmsfarm-01-02.21278.0/local/md5/55/b328300f37b4bf678933619e2f139e/md5/e9/10023ddc8ed7a3b1257fc85cef948b/data
UI=000003:NS=0000000003:WM=000004:BH=0000000000:JSS=000003:LM=000000:LRMS=000000:APP=000000
lhcb001 21817 21816 0 Jan04 ? 00:00:00 /bin/sh ./lcg-dirac.sh
lhcb001 21818 21816 0 Jan04 ? 00:00:00 perl -e ? while (1) {?
$time_left = `grid-proxy-info -timeleft 2> /dev/null` || 0;? last if
($time_left <= 0);? sleep($time_left);? }?
kill(defined($ENV{"EDG_WL_NOSETPGRP"}) ? 9 : -9, 21817);? exit(1);?
lhcb001 22003 21817 0 Jan04 ? 00:00:00 python2.2
./DIRAC/scripts/dirac-agent -o Agent.ini
lhcb001 23530 9587 0 Jan04 ? 00:00:00 /usr/local/lsf/etc/res -d
/sw/lsf/mnt/conf -m t2-ce-01 /home/lhcb001/.lsbatch/1104808815.319471
lhcb001 23535 23530 0 Jan04 ? 00:00:00 /bin/sh
/home/lhcb001/.lsbatch/1104808815.319471
lhcb001 23538 23535 0 Jan04 ? 00:00:00 /bin/sh
/sw/lsf/mnt/conf/lsbatch/cmsfarm/configdir/lcg-setEnv4lsf.sh
/home/lhcb001/.lsbatch/1104808815.319471.shell
lhcb001 23634 23538 0 Jan04 ? 00:00:00 /bin/sh
/home/lhcb001/.lsbatch/1104808815.319471.shell
lhcb001 23638 23634 0 Jan04 ? 00:00:00 /usr/bin/perl -w
/tmp/bootstrap.QOWJNL /home/lhcb001/ t2-ce-01.lnl.infn.it
/home/lhcb001/.globus/.gass_cache/local/md5/fb/eb5c177e812c83fdd7a6bacda20769/md5/15/134170297bfd2ab5011afb98ad88af/data
X509GPG:globus-cache-export.bwnWd5.gpg /dev/null
/home/lhcb001/.globus/.gass_cache/local/md5/fb/eb5c177e812c83fdd7a6bacda20769/md5/5a/2e37a0ab7579644c2257161c7f88c0/data
stdoutftp
/home/lhcb001/.globus/.gass_cache/local/md5/fb/eb5c177e812c83fdd7a6bacda20769/md5/1e/72994a5131f7cec9ee62ecf356ea1b/data
stderrftp /home/lhcb001/.lcgjm/globus-cache-export.bwnWd5
https://t2-ce-01.lnl.infn.it:20000/9738/1104808730/ /home/lhcb001/ NONE
https://lcgrb02.ifae.es:20001/var/edgwl/jobcontrol/submit/pr/JobWrapper.https_3a_2f_2flcgrb02.ifae.es_3a9000_2fprsimH_5fjIFtzO07hiVPSGg.sh
UI=000003:NS=0000000003:WM=000004:BH=0000000000:JSS=000003:LM=000000:LRMS=000000:APP=000000
lhcb001 23644 23638 0 Jan04 ? 00:00:00 /usr/bin/perl -w
/tmp/bootstrap.QOWJNL /home/lhcb001/ t2-ce-01.lnl.infn.it
/home/lhcb001/.globus/.gass_cache/local/md5/fb/eb5c177e812c83fdd7a6bacda20769/md5/15/134170297bfd2ab5011afb98ad88af/data
X509GPG:globus-cache-export.bwnWd5.gpg /dev/null
/home/lhcb001/.globus/.gass_cache/local/md5/fb/eb5c177e812c83fdd7a6bacda20769/md5/5a/2e37a0ab7579644c2257161c7f88c0/data
stdoutftp
/home/lhcb001/.globus/.gass_cache/local/md5/fb/eb5c177e812c83fdd7a6bacda20769/md5/1e/72994a5131f7cec9ee62ecf356ea1b/data
stderrftp /home/lhcb001/.lcgjm/globus-cache-export.bwnWd5
https://t2-ce-01.lnl.infn.it:20000/9738/1104808730/ /home/lhcb001/ NONE
https://lcgrb02.ifae.es:20001/var/edgwl/jobcontrol/submit/pr/JobWrapper.https_3a_2f_2flcgrb02.ifae.es_3a9000_2fprsimH_5fjIFtzO07hiVPSGg.sh
UI=000003:NS=0000000003:WM=000004:BH=0000000000:JSS=000003:LM=000000:LRMS=000000:APP=000000
lhcb001 23897 23638 0 Jan04 ? 00:00:00 /usr/bin/perl -w
/tmp/bootstrap.QOWJNL /home/lhcb001/ t2-ce-01.lnl.infn.it
/home/lhcb001/.globus/.gass_cache/local/md5/fb/eb5c177e812c83fdd7a6bacda20769/md5/15/134170297bfd2ab5011afb98ad88af/data
X509GPG:globus-cache-export.bwnWd5.gpg /dev/null
/home/lhcb001/.globus/.gass_cache/local/md5/fb/eb5c177e812c83fdd7a6bacda20769/md5/5a/2e37a0ab7579644c2257161c7f88c0/data
stdoutftp
/home/lhcb001/.globus/.gass_cache/local/md5/fb/eb5c177e812c83fdd7a6bacda20769/md5/1e/72994a5131f7cec9ee62ecf356ea1b/data
stderrftp /home/lhcb001/.lcgjm/globus-cache-export.bwnWd5
https://t2-ce-01.lnl.infn.it:20000/9738/1104808730/ /home/lhcb001/ NONE
https://lcgrb02.ifae.es:20001/var/edgwl/jobcontrol/submit/pr/JobWrapper.https_3a_2f_2flcgrb02.ifae.es_3a9000_2fprsimH_5fjIFtzO07hiVPSGg.sh
UI=000003:NS=0000000003:WM=000004:BH=0000000000:JSS=000003:LM=000000:LRMS=000000:APP=000000
lhcb001 24108 23638 0 Jan04 ? 00:00:00 bash
/home/lhcb001/globus-tmp.cmsfarm-01-02.23638.0/globus-tmp.cmsfarm-01-02.23638.0/local/md5/fb/eb5c177e812c83fdd7a6bacda20769/md5/ab/ac6a5341a7d2be0aec1c1b2735680e/data
UI=000003:NS=0000000003:WM=000004:BH=0000000000:JSS=000003:LM=000000:LRMS=000000:APP=000000
lhcb001 24190 24108 0 Jan04 ? 00:00:00 bash
/home/lhcb001/globus-tmp.cmsfarm-01-02.23638.0/globus-tmp.cmsfarm-01-02.23638.0/local/md5/fb/eb5c177e812c83fdd7a6bacda20769/md5/ab/ac6a5341a7d2be0aec1c1b2735680e/data
UI=000003:NS=0000000003:WM=000004:BH=0000000000:JSS=000003:LM=000000:LRMS=000000:APP=000000
lhcb001 24191 24190 0 Jan04 ? 00:00:00 /bin/sh ./lcg-dirac.sh
lhcb001 24192 24190 0 Jan04 ? 00:00:00 perl -e ? while (1) {?
$time_left = `grid-proxy-info -timeleft 2> /dev/null` || 0;? last if
($time_left <= 0);? sleep($time_left);? }?
kill(defined($ENV{"EDG_WL_NOSETPGRP"}) ? 9 : -9, 24191);? exit(1);?
lhcb001 24377 24191 0 Jan04 ? 00:00:00 python2.2
./DIRAC/scripts/dirac-agent -o Agent.ini
lhcb001 4130 22003 0 Jan04 ? 00:00:00 lcg-cp --vo lhcb
file:///home/lhcb001/globus-tmp.cmsfarm-01-02.21278.0/WMS_cmsfarm-01-02_021748_https_3a_2f_2flxb2010.cern.ch_3a9000_2fJZJM0HOw4UVyC8vaqQlmGw/dirac_directory
gsiftp://castorgrid.cern.ch/castor/cern.ch/grid/lhcb/DC04/v2/00000747/dirac_directory
lhcb001 5353 24377 0 Jan04 ? 00:00:00 lcg-cp --vo lhcb
file:///home/lhcb001/globus-tmp.cmsfarm-01-02.23638.0/WMS_cmsfarm-01-02_024108_https_3a_2f_2flcgrb02.ifae.es_3a9000_2fprsimH_5fjIFtzO07hiVPSGg/dirac_directory
gsiftp://castorgrid.cern.ch/castor/cern.ch/grid/lhcb/DC04/v2/00000747/dirac_directory
[...]
Ian Stokes-Rees wrote:
> I think David Bouvet is right about this -- they are waiting to transfer
> files to CERN CASTOR GridFTP server, which I believe crashed yesterday.
> They sit in sleep loop which checks every 10-30 minutes to see if the
> data transfer at the end of the job can be done. If not, they sleep.
> They do this for about 12-18 hours then give up.
>
> We would like there to be a "better way", but unfortunately we don't
> trust any other "queued data transfer" mechanism in LCG yet, so we build
> it into our jobs. Of course the reason they do this is that if the
> data transfer fails then "everything" gets messed up -- the 12-18 hours
> already used to create the data is wasted, and we have to catch this
> failure and mark the job as "failed due to LCG" meaning it can be
> resubmitted, as opposed to "failed due to LHCb", which means someone
> needs to look at it to figure out what went wrong.
>
> Joel Closier and Andrei Tsaregorodtsev are also good people to contact
> about this if you see it happen at your site.
>
> Cheers,
>
> Ian.
>
> Dimitris Zilaskos wrote:
>
>> Hello ,
>>
>> I have a number of lhcb jobs sitting in my queue . They have been
>> sitting in that exact stage for more than 12 hours (the Time Use counter
>> is not increasing and the process that was eating cpu appears to have
>> completed its task). They appear to be waiting for something ( user
>> intervention?).
>> There were some same jobs 3-4 days ago that exhibited the same
>> behaviour but after around another 12 hours the jobs exited
>> successfully. I have mailed Ricardo Graciani who appears to have
>> submitted those jobs but I got no response. If someone knows what is
>> going on ... because our queues have been filled for days and no other
>> jobs can run (i.e. the job submission tests)
>>
>> Job id Name User Time Use S Queue
>> ---------------- ---------------- ---------------- -------- - -----
>> 8.node001 STDIN lhcb001 27:01:05 R infinite
>> 9.node001 STDIN lhcb001 27:37:44 R infinite
>> 10.node001 STDIN lhcb001 27:08:25 R infinite
>> 11.node001 STDIN lhcb001 27:33:07 R infinite
>> 12.node001 STDIN lhcb001 25:59:44 R infinite
>> 13.node001 STDIN lhcb001 26:29:33 R infinite
>> 14.node001 STDIN lhcb001 27:52:40 R infinite
>> 16.node001 STDIN lhcb001 27:13:36 R infinite
>> 17.node001 STDIN lhcb001 0 Q infinite
>> 18.node001 STDIN lhcb001 0 Q infinite
>> 19.node001 STDIN lhcb001 0 Q infinite
>> 20.node001 STDIN lhcb001 0 Q infinite
>> 21.node001 STDIN lhcb001 0 Q infinite
>> 23.node001 STDIN dteam004 0 Q short
>> (...)
>>
>>
>> Best regards ,
>> --
>> =============================================================================
>>
>>
>>
>> Dimitris Zilaskos
>>
>> Department of Physics @ Aristotle Univercity of Thessaloniki , Greece
>> PGP key : http://tassadar.physics.auth.gr/~dzila/pgp_public_key.asc
>> http://egnatia.ee.auth.gr/~dzila/pgp_public_key.asc
>> MD5sum : de2bd8f73d545f0e4caf3096894ad83f pgp_public_key.asc
>> =============================================================================
>>
>>
>
> --
> Ian Stokes-Rees [log in to unmask]
> Particle Physics, Oxford http://grid.physics.ox.ac.uk/~stokes
--
---------------------------------------------------------------------
Sergio Fantinel EGEE Project
---------------------------------------------------------------------
INFN - Lab. Naz. di Legnaro phone: +39 049 8068 489
viale dell'Università n. 2,
35020 Legnaro (PD) ITALY [log in to unmask]
---------------------------------------------------------------------
--
---------------------------------------------------------------------
Sergio Fantinel EGEE Project
---------------------------------------------------------------------
INFN - Lab. Naz. di Legnaro phone: +39 049 8068 489
viale dell'Università n. 2,
35020 Legnaro (PD) ITALY [log in to unmask]
---------------------------------------------------------------------
|