Hi to all,
urgent help needed: I submitted a lot of jobs through our gLite WMS to our
lcg-CE, and all of them have finished successfully (output sandboxes OK,
checked on the WMS); however, the WMS claims that the majority of them are
still in the Running state (6+ hours after they finished).
glite-job-logging-info confirms (example below) that this is just a
logging problem from the CE to the WMS - the WN logged that the job is done,
but the CE didn't. This is probably due to a crash of the logmonitor or
something like that - I tried restarting it on the WMS several times, and
verified that everything works for new jobs (they go to the "Done (Success)"
state).
Is there any easy way to convince the WMS that these jobs are finished?
Thanks, Antun
[antun@ce rho1]$ glite-job-logging-info -v 2
https://g01.phy.bg.ac.yu:9000/GiES4X_5UZFFYsRmN0OZ_Q
**********************************************************************
LOGGING INFORMATION:
Printing info for the Job :
https://g01.phy.bg.ac.yu:9000/GiES4X_5UZFFYsRmN0OZ_Q
---
Event: RegJob
- arrived = Wed Sep 6 14:47:31 2006 CEST
- host = ce.phy.bg.ac.yu
- ns = g01.phy.bg.ac.yu:7772
- nsubjobs = 0
- seed = uLU0BArrdV98O41PLThJ5Q
- source = UserInterface
- timestamp = Wed Sep 6 14:47:31 2006 CEST
- user = /DC=ORG/DC=SEE-GRID/O=People/O=UOB/CN=Antun
Balaz
---
Event: Transfer
- arrived = Wed Sep 6 14:47:31 2006 CEST
- dest_host = g01.phy.bg.ac.yu
- dest_instance = g01.phy.bg.ac.yu:7772
- destination = NetworkServer
- host = ce.phy.bg.ac.yu
- result = START
- source = UserInterface
- timestamp = Wed Sep 6 14:47:31 2006 CEST
- user = /DC=ORG/DC=SEE-GRID/O=People/O=UOB/CN=Antun
Balaz
---
Event: Transfer
- arrived = Wed Sep 6 14:47:34 2006 CEST
- dest_host = g01.phy.bg.ac.yu
- dest_instance = g01.phy.bg.ac.yu:7772
- destination = NetworkServer
- host = ce.phy.bg.ac.yu
- result = OK
- source = UserInterface
- timestamp = Wed Sep 6 14:47:34 2006 CEST
- user = /DC=ORG/DC=SEE-GRID/O=People/O=UOB/CN=Antun
Balaz
---
Event: Accepted
- arrived = Wed Sep 6 14:47:33 2006 CEST
- from = UserInterface
- from_host = g01.phy.bg.ac.yu
- host = g01.phy.bg.ac.yu
- source = NetworkServer
- src_instance = 7772
- timestamp = Wed Sep 6 14:47:33 2006 CEST
- user = /DC=ORG/DC=SEE-GRID/O=People/O=UOB/CN=Antun
Balaz
---
Event: EnQueued
- arrived = Wed Sep 6 14:47:34 2006 CEST
- host = g01.phy.bg.ac.yu
- queue = /var/glite/workload_manager/input.fl
- result = OK
- source = NetworkServer
- timestamp = Wed Sep 6 14:47:34 2006 CEST
- user = /DC=ORG/DC=SEE-GRID/O=People/O=UOB/CN=Antun
Balaz
---
Event: DeQueued
- arrived = Wed Sep 6 14:47:37 2006 CEST
- host = g01.phy.bg.ac.yu
- queue = /var/glite/workload_manager/input.fl
- source = WorkloadManager
- src_instance = 1977
- timestamp = Wed Sep 6 14:47:37 2006 CEST
- user = /DC=ORG/DC=SEE-GRID/O=People/O=UOB/CN=Antun
Balaz/CN=proxy/CN=proxy
---
Event: Match
- arrived = Wed Sep 6 14:47:45 2006 CEST
- dest_id = ce.phy.bg.ac.yu:2119/jobmanager-pbs-aegis
- host = g01.phy.bg.ac.yu
- source = WorkloadManager
- src_instance = 1977
- timestamp = Wed Sep 6 14:47:45 2006 CEST
- user = /DC=ORG/DC=SEE-GRID/O=People/O=UOB/CN=Antun
Balaz/CN=proxy/CN=proxy
---
Event: EnQueued
- arrived = Wed Sep 6 14:47:45 2006 CEST
- host = g01.phy.bg.ac.yu
- queue = /var/glite/jobcontrol/queue.fl
- reason = unavailable
- result = START
- source = WorkloadManager
- src_instance = 1977
- timestamp = Wed Sep 6 14:47:45 2006 CEST
- user = /DC=ORG/DC=SEE-GRID/O=People/O=UOB/CN=Antun
Balaz/CN=proxy/CN=proxy
---
Event: EnQueued
- arrived = Wed Sep 6 14:47:45 2006 CEST
- host = g01.phy.bg.ac.yu
- queue = /var/glite/jobcontrol/queue.fl
- reason = unavailable
- result = OK
- source = WorkloadManager
- src_instance = 1977
- timestamp = Wed Sep 6 14:47:45 2006 CEST
- user = /DC=ORG/DC=SEE-GRID/O=People/O=UOB/CN=Antun
Balaz/CN=proxy/CN=proxy
---
Event: DeQueued
- arrived = Wed Sep 6 14:47:47 2006 CEST
- host = g01.phy.bg.ac.yu
- local_jobid = unavailable
- queue = /var/glite/jobcontrol/queue.fl
- source = JobController
- src_instance = unique
- timestamp = Wed Sep 6 14:47:47 2006 CEST
- user = /DC=ORG/DC=SEE-GRID/O=People/O=UOB/CN=Antun
Balaz/CN=proxy/CN=proxy
---
Event: Transfer
- arrived = Wed Sep 6 14:47:47 2006 CEST
- dest_host = localhost
- dest_instance
= /var/glite/logmonitor/CondorG.log/CondorG.1157546690.log
- dest_jobid = unavailable
- destination = LogMonitor
- host = g01.phy.bg.ac.yu
- reason = unavailable
- result = START
- source = JobController
- src_instance = unique
- timestamp = Wed Sep 6 14:47:47 2006 CEST
- user = /DC=ORG/DC=SEE-GRID/O=People/O=UOB/CN=Antun
Balaz/CN=proxy/CN=proxy
---
Event: Transfer
- arrived = Wed Sep 6 14:47:47 2006 CEST
- dest_host = localhost
- dest_instance
= /var/glite/logmonitor/CondorG.log/CondorG.1157546690.log
- dest_jobid = 115804
- destination = LogMonitor
- host = g01.phy.bg.ac.yu
- reason = unavailable
- result = OK
- source = JobController
- src_instance = unique
- timestamp = Wed Sep 6 14:47:47 2006 CEST
- user = /DC=ORG/DC=SEE-GRID/O=People/O=UOB/CN=Antun
Balaz/CN=proxy/CN=proxy
---
Event: Running
- arrived = Wed Sep 6 14:48:24 2006 CEST
- host = wn13.phy.bg.ac.yu
- node = wn13.phy.bg.ac.yu
- source = LRMS
- timestamp = Wed Sep 6 14:48:07 2006 CEST
- user = /DC=ORG/DC=SEE-GRID/O=People/O=UOB/CN=Antun
Balaz
---
Event: Done
- arrived = Thu Sep 7 02:15:41 2006 CEST
- exit_code = 0
- host = wn13.phy.bg.ac.yu
- reason = (nil)
- source = LRMS
- status_code = OK
- timestamp = Thu Sep 7 02:15:06 2006 CEST
- user = /DC=ORG/DC=SEE-GRID/O=People/O=UOB/CN=Antun
Balaz
---
Event: Accepted
- arrived = Wed Sep 6 14:47:52 2006 CEST
- from = JobController
- from_host = localhost
- from_instance = unavailable
- host = g01.phy.bg.ac.yu
- local_jobid = 115804
- source = LogMonitor
- src_instance = unique
- timestamp = Wed Sep 6 14:47:52 2006 CEST
- user = /DC=ORG/DC=SEE-GRID/O=People/O=UOB/CN=Antun
Balaz/CN=proxy/CN=proxy
---
Event: Transfer
- arrived = Wed Sep 6 14:48:04 2006 CEST
- dest_host = ce.phy.bg.ac.yu:2119/jobmanager-pbs
- dest_instance
= /var/glite/logmonitor/CondorG.log/CondorG.1157546690.log
- dest_jobid = unavailable
- destination = LRMS
- host = g01.phy.bg.ac.yu
- reason = Job successfully submitted to Globus
- result = OK
- source = LogMonitor
- src_instance = unique
- timestamp = Wed Sep 6 14:48:04 2006 CEST
- user = /DC=ORG/DC=SEE-GRID/O=People/O=UOB/CN=Antun
Balaz/CN=proxy/CN=proxy
**********************************************************************
[antun@ce rho1]$ glite-job-status -v 3
https://g01.phy.bg.ac.yu:9000/GiES4X_5UZFFYsRmN0OZ_Q
*************************************************************
BOOKKEEPING INFORMATION:
Status info for the Job :
https://g01.phy.bg.ac.yu:9000/GiES4X_5UZFFYsRmN0OZ_Q
Current Status: Running
Status Reason: Job successfully submitted to Globus
Destination: ce.phy.bg.ac.yu:2119/jobmanager-pbs-aegis
Submitted: Wed Sep 6 14:47:31 2006 CEST
---
- cancelling = 0
- ce_node = wn13.phy.bg.ac.yu
- children_num = 0
- condorId = 115804
- cpuTime = 0
- destination = ce.phy.bg.ac.yu:2119/jobmanager-pbs-aegis
- done_code = 0
- expectUpdate = 0
- jobtype = 0
- lastUpdateTime = Thu Sep 7 02:15:06 2006 CEST
- location = LRMS/worknode/wn13.phy.bg.ac.yu
- network_server = g01.phy.bg.ac.yu:7772
- owner = /DC=ORG/DC=SEE-GRID/O=People/O=UOB/CN=Antun
Balaz
- Payload_Running = 0
- Possible_Ce_Nodes = wn13.phy.bg.ac.yu
- Possible Destinations = ce.phy.bg.ac.yu:2119/jobmanager-pbs-aegis
- resubmitted = 0
- seed = uLU0BArrdV98O41PLThJ5Q
- subjob_failed = 0
---
- children_hist = 0
- condor_jdl =
stream_error = False
+edg_jobid = "https://g01.phy.bg.ac.yu:9000/GiES4X_5UZFFYsRmN0OZ_Q"
Arguments
= 'UI=000002:NS=0000000003:WM=000004:BH=0000000000:JSS=000003:LM=000000:LRMS=
000000:APP=000000'
GlobusScheduler = ce.phy.bg.ac.yu:2119/jobmanager-pbs
Transfer_Executable = True
+ce_id = "ce.phy.bg.ac.yu:2119/jobmanager-pbs-aegis"
Output
= /var/glite/jobcontrol/condorio/Gi/https_3a_2f_2fg01.phy.bg.ac.yu_3a9000_2fG
iES4X_5f5UZFFYsRmN0OZ_5fQ/StandardOutput
Submit_Event_Notes =
(https://g01.phy.bg.ac.yu:9000/GiES4X_5UZFFYsRmN0OZ_Q)
(UI=000002:NS=0000000003:WM=000004:BH=0000000000:JSS=000003:LM=000000:LRMS=00
0000:APP=000000) (0)
Copy_to_Spool = False
Executable
= /var/glite/jobcontrol/submit/Gi/JobWrapper.https_3a_2f_2fg01.phy.bg.ac.yu_3
a9000_2fGiES4X_5f5UZFFYsRmN0OZ_5fQ.sh
X509UserProxy = /var/glite/spool/glite-
renewd/5ad9af98dabbbc2065a81f7ea87177b9.82
error
= /var/glite/jobcontrol/condorio/Gi/https_3a_2f_2fg01.phy.bg.ac.yu_3a9000_2fG
iES4X_5f5UZFFYsRmN0OZ_5fQ/StandardError
+LB_sequence_code
= "UI=000002:NS=0000000003:WM=000004:BH=0000000000:JSS=000003:LM=000000:LRMS=
000000:APP=000000"
Notification = never
stream_output = False
GlobusRSL = (queue=aegis)(jobtype=single)
+Type = "job"
Universe = grid
+UserSubjectName = "/DC=ORG/DC=SEE-GRID/O=People/O=UOB/CN=Antun Balaz"
Log = /var/glite/logmonitor/CondorG.log/CondorG.1157546690.log
grid_type = globus
+CondorSubmitFile
= "/var/glite/jobcontrol/submit/Gi/Condor.https_3a_2f_2fg01.phy.bg.ac.yu_3a90
00_2fGiES4X_5f5UZFFYsRmN0OZ_5fQ.submit"
Queue 1
- jdl =
[
requirements = ( ( RegExp("ce.phy.bg.ac.yu*",other.GlueCEUniqueID) )
&& ( other.GlueCEStateStatus == "Production" ) ) && (
other.GlueCEStateStatus == "Production" );
RetryCount = 3;
edg_jobid = "https://g01.phy.bg.ac.yu:9000/GiES4X_5UZFFYsRmN0OZ_Q";
Arguments = "";
MyProxyServer = "grid.phy.bg.ac.yu";
JobType = "normal";
Executable = "solar1-51.sh";
StdOutput = "stdout";
VOMS_FQAN = "/aegis/Role=NULL/Capability=NULL";
OutputSandbox = { "stdout","stderr","out1-51.gz" };
LB_sequence_code
= "UI=000002:NS=0000000000:WM=000000:BH=0000000000:JSS=000000:LM=000000:LRMS=
000000:APP=000000";
VirtualOrganisation = "aegis";
rank = -other.GlueCEStateEstimatedResponseTime;
Type = "job";
StdError = "stderr";
DefaultRank = -other.GlueCEStateEstimatedResponseTime;
InputSandbox = { "file:///home/antun/solar-prl2/rho1/solar1-
51.sh","file:///home/antun/solar-prl2/rho1/solar1","file:///home/antun/solar-
prl2/rho1/input1-51" }
]
- matched_jdl =
[
Arguments =
[
JobAd =
[
stream_error = false;
edg_jobid
= "https://g01.phy.bg.ac.yu:9000/GiES4X_5UZFFYsRmN0OZ_Q";
GlobusScheduler = "ce.phy.bg.ac.yu:2119/jobmanager-pbs";
Transfer_Executable = true;
ce_id = "ce.phy.bg.ac.yu:2119/jobmanager-pbs-aegis";
Output
= "/var/glite/jobcontrol/condorio/Gi/https_3a_2f_2fg01.phy.bg.ac.yu_3a9000_2f
GiES4X_5f5UZFFYsRmN0OZ_5fQ/StandardOutput";
Copy_to_Spool = false;
Executable
= "/var/glite/jobcontrol/submit/Gi/JobWrapper.https_3a_2f_2fg01.phy.bg.ac.yu_
3a9000_2fGiES4X_5f5UZFFYsRmN0OZ_5fQ.sh";
X509UserProxy = "/var/glite/spool/glite-
renewd/5ad9af98dabbbc2065a81f7ea87177b9.82";
Error_
= "/var/glite/jobcontrol/condorio/Gi/https_3a_2f_2fg01.phy.bg.ac.yu_3a9000_2f
GiES4X_5f5UZFFYsRmN0OZ_5fQ/StandardError";
LB_sequence_code
= "UI=000002:NS=0000000003:WM=000004:BH=0000000000:JSS=000000:LM=000000:LRMS=
000000:APP=000000";
Notification = "never";
stream_output = false;
GlobusRSL = "(queue=aegis)(jobtype=single)";
Type = "job";
Universe = "grid";
UserSubjectName = "/DC=ORG/DC=SEE-GRID/O=People/O=UOB/CN=Antun
Balaz";
Log = "/var/glite/logmonitor/CondorG.log/CondorG.log";
grid_type = "globus"
]
];
Command = "Submit";
Source = 2;
Protocol = "1.0.0"
]
- rsl =
(queue=aegis)(jobtype=single)
- stateEnterTimes =
Submitted : Wed Sep 6 14:47:31 2006 CEST
Waiting : Wed Sep 6 14:47:33 2006 CEST
Ready : Wed Sep 6 14:47:45 2006 CEST
Scheduled : Wed Sep 6 14:48:04 2006 CEST
Running : Wed Sep 6 14:48:07 2006 CEST
Done : ---
Cleared : ---
Aborted : ---
Cancelled : ---
Unknown : ---
*************************************************************
-----
Antun Balaz
Research Assistant
E-mail: [log in to unmask]
Web: http://scl.phy.bg.ac.yu/
Phone: +381 11 3160260, Ext. 152
Fax: +381 11 3162190
Scientific Computing Laboratory
Institute of Physics, Belgrade, Serbia
-----
|