Thanks, Stephen, for replying!
Burke, S (Stephen) wrote:
>
> Are you also using the CERN BDII? It may be that machine is sick. I think
> CERN is on holiday today and tomorrow; maybe you could try a different BDII?
Yes, I was using CERN's BDII, but I just switched to NIKHEF's.
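(For reference, a quick way to check whether a given BDII answers at all
is a plain LDAP query. This is just a sketch, assuming the standard BDII
port 2170 and the usual LCG base DN; substitute whichever BDII host you
use for lcg-bdii.cern.ch:

  # Query the BDII for published CEs; a healthy BDII answers promptly
  ldapsearch -x -H ldap://lcg-bdii.cern.ch:2170 \
      -b "mds-vo-name=local,o=grid" \
      "(objectClass=GlueCE)" GlueCEUniqueID

If that times out or returns nothing, the BDII itself is sick.)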
>
>>Connecting to host grid-rb.physik.uni-wuppertal.de, port 7772
>>Logging to host grid-rb.physik.uni-wuppertal.de, port 9002
>>**** Error: NS_SUBMIT_FAIL ****
>>"SandboxIOException: Globus Ftp API Failure in creating remote
>>Directories." received when submitting a job to NS
>
>
> Is the disk full on the RB? Can you do a gridftp directly to the broker,
> e.g. to /tmp or from /etc/passwd?
That was my first guess, too. But:
[root@grid-rb root]# df
Filesystem 1k-blocks Used Available Use% Mounted on
/dev/hda2 37903148 5515440 30462320 16% /
/dev/hda1 38859 15244 21609 42% /boot
grid-lcfg2.physik.uni-wuppertal.de:/opt/local/linux/7.3
38456308 20621444 15881364 57%
/export/local/linux/7.3
quarks:/raid/scratch/grid-scratch/edguser
1470539320 1061866740 408672580 73% /home/edguser
quarks:/raid/scratch/grid-scratch/dteam002
1470539320 1061866740 408672580 73% /home/dteam002
quarks:/raid/scratch/grid-scratch/dzero001
1470539320 1061866740 408672580 73% /home/dzero001
quarks:/raid/scratch/grid-scratch/dteam001
1470539320 1061866740 408672580 73% /home/dteam001
bash-2.05a$ globus-url-copy gsiftp://grid-rb/etc/passwd file:///$PWD/passwd
bash-2.05a$ head passwd
root:x:0:0:root:/root:/bin/bash
bin:x:1:1:bin:/bin:/sbin/nologin
daemon:x:2:2:daemon:/sbin:/sbin/nologin
adm:x:3:4:adm:/var/adm:/sbin/nologin
lp:x:4:7:lp:/var/spool/lpd:/sbin/nologin
sync:x:5:0:sync:/sbin:/bin/sync
shutdown:x:6:0:shutdown:/sbin:/sbin/shutdown
halt:x:7:0:halt:/sbin:/sbin/halt
mail:x:8:12:mail:/var/spool/mail:/sbin/nologin
news:x:9:13:news:/var/spool/news:
bash-2.05a$ globus-url-copy file:///$PWD/list gsiftp://grid-rb/tmp/list
bash-2.05a$
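Since the NS error complains about creating remote directories rather
than about plain transfers, a closer test might be a GridFTP mkdir
against the broker. This is only a sketch, assuming the edg-gridftp-*
client tools are installed on the UI, and the path is a throwaway:

  # Create, list and remove a scratch directory over GridFTP
  edg-gridftp-mkdir gsiftp://grid-rb.physik.uni-wuppertal.de/tmp/gftp-test
  edg-gridftp-ls    gsiftp://grid-rb.physik.uni-wuppertal.de/tmp
  edg-gridftp-rmdir gsiftp://grid-rb.physik.uni-wuppertal.de/tmp/gftp-test

That exercises the directory creation the sandbox setup needs, not just
file copies.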
I also rebooted the machine to get everything back to a clean state.
>>Nearly 99% of these jobs ran into a "maximum RetryCount"!!!
>
>
> In itself that just tells you the jobs are failing somehow, try
> edg-job-get-logging-info -v 1 <jobid> to see what the real failure reasons
> were.
Here is one, but what does it tell me? (NOTE: THIS WAS DONE BEFORE THE
BDII CHANGE!)
bash-2.05a$ edg-job-status -v 2
https://grid-rb.physik.uni-wuppertal.de:9000/8L34KEbKInfYRe0lWPz4VQ
*************************************************************
BOOKKEEPING INFORMATION:
Status info for the Job :
https://grid-rb.physik.uni-wuppertal.de:9000/8L34KEbKInfYRe0lWPz4VQ
Current Status: Aborted
Status Reason: Job RetryCount (3) hit
Destination: grid-ce.physik.uni-wuppertal.de:2119/jobmanager-pbs-long
reached on: Thu Sep 9 19:23:18 2004
---
- cancelling = 0
- ce_node = grid-ce.physik.uni-wuppertal.de
- children_hist = 0
- children_num = 0
- condorId = 73867
- cpuTime = 0
- destination =
grid-ce.physik.uni-wuppertal.de:2119/jobmanager-pbs-long
- done_code = 1
- expectUpdate = 0
- jobtype = 0
- lastUpdateTime = Thu Sep 9 19:23:18 2004
- location = none
- network_server = grid-rb.physik.uni-wuppertal.de:7772
- owner = /O=GermanGrid/OU=UniWuppertal/CN=Torsten
Harenberg
- resubmitted = 0
- seed = uLU0BArrdV98O41PLThJ5Q
- subjob_failed = 0
---
- condor_jdl =
+stream_error = False
+edg_jobid =
"https://grid-rb.physik.uni-wuppertal.de:9000/8L34KEbKInfYRe0lWPz4VQ"
Arguments =
'UI=000003:NS=0000000003:WM=000016:BH=0000000000:JSS=000012:LM=000018:LRMS=000000:APP=000000'
GlobusScheduler = grid-ce.physik.uni-wuppertal.de:2119/jobmanager-pbs
Transfer_Executable = True
+ce_id = "grid-ce.physik.uni-wuppertal.de:2119/jobmanager-pbs-long"
Submit_Event_Notes =
(https://grid-rb.physik.uni-wuppertal.de:9000/8L34KEbKInfYRe0lWPz4VQ)
(UI=000003:NS=0000000003:WM=000016:BH=0000000000:JSS=000012:LM=000018:LRMS=000000:APP=000000)
(0)
Output =
/var/edgwl/jobcontrol/cond/8L/https_3a_2f_2fgrid-rb.physik.uni-wuppertal.de_3a9000_2f8L34KEbKInfYRe0lWPz4VQ/StandardOutput
Copy_to_Spool = False
Executable =
/var/edgwl/jobcontrol/submit/8L/JobWrapper.https_3a_2f_2fgrid-rb.physik.uni-wuppertal.de_3a9000_2f8L34KEbKInfYRe0lWPz4VQ.sh
X509UserProxy =
/opt/edg/var/spool/edg-wl-renewd/6e04245850e26027f3194c23e733ee10.4
error =
/var/edgwl/jobcontrol/cond/8L/https_3a_2f_2fgrid-rb.physik.uni-wuppertal.de_3a9000_2f8L34KEbKInfYRe0lWPz4VQ/StandardError
+LB_sequence_code =
"UI=000003:NS=0000000003:WM=000016:BH=0000000000:JSS=000012:LM=000018:LRMS=000000:APP=000000"
Notification = never
+stream_output = False
GlobusRSL =
(queue=long)(jobtype=single)(environment=(EDG_WL_JOBID
'https://grid-rb.physik.uni-wuppertal.de:9000/8L34KEbKInfYRe0lWPz4VQ'))
+Type = "job"
Universe = Globus
+UserSubjectName = "/O=GermanGrid/OU=UniWuppertal/CN=Torsten
Harenberg"
Log = /var/edgwl/logmonitor/CondorG.log/CondorG.1094734190.log
+CondorSubmitFile =
"/var/edgwl/jobcontrol/submit/8L/Condor.https_3a_2f_2fgrid-rb.physik.uni-wuppertal.de_3a9000_2f8L34KEbKInfYRe0lWPz4VQ.submit"
Queue 1
- jdl =
[
requirements = ( other.GlueCEStateStatus == "Production" );
RetryCount = 3;
edg_jobid =
"https://grid-rb.physik.uni-wuppertal.de:9000/8L34KEbKInfYRe0lWPz4VQ";
OutputSandboxPath =
"/var/edgwl/SandboxDir/8L/https_3a_2f_2fgrid-rb.physik.uni-wuppertal.de_3a9000_2f8L34KEbKInfYRe0lWPz4VQ/output";
MyProxyServer = "grid-rb.physik.uni-wuppertal.de";
SubmitTo =
"grid-ce.physik.uni-wuppertal.de:2119/jobmanager-pbs-long";
JobType = "normal";
Executable = "script.sh";
CertificateSubject = "/O=GermanGrid/OU=UniWuppertal/CN=Torsten
Harenberg";
X509UserProxy =
"/opt/edg/var/spool/edg-wl-renewd/6e04245850e26027f3194c23e733ee10.4";
StdOutput = "std.out";
OutputSandbox = { "std.out","std.err" };
InputSandboxPath =
"/var/edgwl/SandboxDir/8L/https_3a_2f_2fgrid-rb.physik.uni-wuppertal.de_3a9000_2f8L34KEbKInfYRe0lWPz4VQ/input";
LB_sequence_code =
"UI=000003:NS=0000000003:WM=000000:BH=0000000000:JSS=000000:LM=000000:LRMS=000000:APP=000000";
VirtualOrganisation = "dzero";
rank = -other.GlueCEStateEstimatedResponseTime;
Type = "job";
StdError = "std.err";
DefaultRank = -other.GlueCEStateEstimatedResponseTime;
InputSandbox = { "script.sh" };
InputData = { "lfn:dzero_mc_harenber_v1" };
DataAccessProtocol = { "gsiftp" }
]
- matched_jdl =
[
Arguments =
[
JobAd =
[
stream_error = false;
edg_jobid =
"https://grid-rb.physik.uni-wuppertal.de:9000/8L34KEbKInfYRe0lWPz4VQ";
GlobusScheduler =
"grid-ce.physik.uni-wuppertal.de:2119/jobmanager-pbs";
Transfer_Executable = true;
ce_id =
"grid-ce.physik.uni-wuppertal.de:2119/jobmanager-pbs-long";
Output =
"/var/edgwl/jobcontrol/cond/8L/https_3a_2f_2fgrid-rb.physik.uni-wuppertal.de_3a9000_2f8L34KEbKInfYRe0lWPz4VQ/StandardOutput";
Copy_to_Spool = false;
Executable =
"/var/edgwl/jobcontrol/submit/8L/JobWrapper.https_3a_2f_2fgrid-rb.physik.uni-wuppertal.de_3a9000_2f8L34KEbKInfYRe0lWPz4VQ.sh";
X509UserProxy =
"/opt/edg/var/spool/edg-wl-renewd/6e04245850e26027f3194c23e733ee10.4";
Error_ =
"/var/edgwl/jobcontrol/cond/8L/https_3a_2f_2fgrid-rb.physik.uni-wuppertal.de_3a9000_2f8L34KEbKInfYRe0lWPz4VQ/StandardError";
LB_sequence_code =
"UI=000003:NS=0000000003:WM=000016:BH=0000000000:JSS=000009:LM=000018:LRMS=000000:APP=000000";
Notification = "never";
stream_output = false;
GlobusRSL =
"(queue=long)(jobtype=single)(environment=(EDG_WL_JOBID
\'https://grid-rb.physik.uni-wuppertal.de:9000/8L34KEbKInfYRe0lWPz4VQ\'))";
Type = "job";
Universe = "Globus";
UserSubjectName = "/O=GermanGrid/OU=UniWuppertal/CN=Torsten
Harenberg";
Log = "/var/edgwl/logmonitor/CondorG.log/CondorG.log"
]
];
Command = "Submit";
Source = 2;
Protocol = "1.0.0"
]
- rsl =
(queue=long)(jobtype=single)(environment=(EDG_WL_JOBID
'https://grid-rb.physik.uni-wuppertal.de:9000/8L34KEbKInfYRe0lWPz4VQ'))
- stateEnterTimes =
Submitted : Thu Sep 9 19:15:00 2004
Waiting : Thu Sep 9 19:23:18 2004
Ready : Thu Sep 9 19:21:14 2004
Scheduled : Thu Sep 9 19:21:36 2004
Running : Thu Sep 9 19:22:55 2004
Done : Thu Sep 9 19:23:17 2004
Cleared : ---
Aborted : Thu Sep 9 19:23:18 2004
Cancelled : ---
*************************************************************
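(For the record, the per-attempt failure reasons, which the status
output hides behind "RetryCount hit", should come from the logging info
as Stephen suggests:

  edg-job-get-logging-info -v 2 \
      https://grid-rb.physik.uni-wuppertal.de:9000/8L34KEbKInfYRe0lWPz4VQ

with -v 1 or -v 2 controlling the amount of detail.)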
>
>>edg-job-get-output complained that it could only get 1 of the 2
>>files back. Only a second attempt then finally managed to get the
>>11-byte-long std.out! Wow!
>
>
> Could also be due to a full disk or a gridftp problem.
Is there any way to debug the GridFTP server? I guess the problem is
somewhere in there. Looking into /var/log/globus-gridftp.log didn't tell
me much, but I'm not very experienced with it. The lines all look like
this:
DATE=20040909202435.399998 HOST=grid-rb.physik.uni-wuppertal.de PROG=wuftpd NL.EVNT=FTP_INFO START=20040909202435.063830 USER=dteam002 FILE=/var/edgwl/SandboxDir/3H/https_3a_2f_2fgrid-rb.physik.uni-wuppertal.de_3a9000_2f3HxnrYSNBFMKM1poqwKiqA/input/.BrokerInfo BUFFER=16384 BLOCK=65536 NBYTES=714 VOLUME=/ STREAMS=1 STRIPES=1 DEST=1[130.206.11.207] TYPE=RETR CODE=226
DATE=20040909202438.905267 HOST=grid-rb.physik.uni-wuppertal.de PROG=wuftpd NL.EVNT=FTP_INFO START=20040909202438.533095 USER=dteam002 FILE=/var/edgwl/SandboxDir/3H/https_3a_2f_2fgrid-rb.physik.uni-wuppertal.de_3a9000_2f3HxnrYSNBFMKM1poqwKiqA/Maradona.output BUFFER=87380 BLOCK=65536 NBYTES=20 VOLUME=/ STREAMS=1 STRIPES=1 DEST=1[130.206.11.207] TYPE=STOR CODE=226
DATE=20040909202530.250401 HOST=grid-rb.physik.uni-wuppertal.de PROG=wuftpd NL.EVNT=FTP_INFO START=20040909202530.112010 USER=dteam002 FILE=/var/edgwl/SandboxDir/FQ/https_3a_2f_2fgrid-rb.physik.uni-wuppertal.de_3a9000_2fFQmIkahlfaZaOr0u6jw8uQ/input/.BrokerInfo BUFFER=16384 BLOCK=65536 NBYTES=714 VOLUME=/ STREAMS=1 STRIPES=1 DEST=1[131.169.223.58] TYPE=RETR CODE=226
DATE=20040909202532.942997 HOST=grid-rb.physik.uni-wuppertal.de PROG=wuftpd NL.EVNT=FTP_INFO START=20040909202532.807026 USER=dteam002 FILE=/var/edgwl/SandboxDir/FQ/https_3a_2f_2fgrid-rb.physik.uni-wuppertal.de_3a9000_2fFQmIkahlfaZaOr0u6jw8uQ/Maradona.output BUFFER=87380 BLOCK=65536 NBYTES=20 VOLUME=/ STREAMS=1 STRIPES=1 DEST=1[131.169.223.58] TYPE=STOR CODE=226
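All four entries above end in CODE=226, i.e. successful transfers, so
whatever fails must show up differently (or not at all) in this file. As
a rough filter, just a sketch over this netlogger-style log, one could
look for records with a non-226 result code:

  # Show the most recent transfers that did NOT complete successfully
  grep -v 'CODE=226' /var/log/globus-gridftp.log | tail

and perhaps cross-check /var/log/messages around the failure times,
where the wuftpd-based server may log errors via syslog.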
Do any of the experts see a problem?
Thanks and good night ;-)
Torsten
--
<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
<> <>
<> Torsten Harenberg [log in to unmask] <>
<> Bergische Universitaet <>
<> FB C - Physik Tel.: +49 (0)202 439-3521 <>
<> Gaussstr. 20 Fax : +49 (0)202 439-2811 <>
<> 42097 Wuppertal <>
<> <>
<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>