david bouvet wrote:
> Here in Lyon IN2P3-CC, we encountered a strange behaviour with some jobs
> submitted on our CE setup with LCG2_3_0 on SL3.
>
> Jobs submitted to that CE get failed from the user point of view, but
> were still running or even queued in our batch system (BQS)
> Looking in the logs, there is no gram_job_state file for these jobs in
> /opt/globus/tmp/gram_job_state/ and in the trace file of our batch
> system the poll method is suddenly no longer called even though the job status
> is not ended:
>
> [2005/02/12-02:48:58 27906] bqs poll ending...
> [2005/02/12-02:48:58 27906] bqs poll starting...
> [2005/02/12-02:48:58 27906] setting BQSCLUSTER to anastasie
> [2005/02/12-02:48:58 27906] job='lcg0211123145-23035' queue='T' localUser='lhcb0
> 01' gridUser='/C=ES/O=DATAGRID-ES/O=UB/CN=Ricardo Graciani' status=RUNNING step=
> EOJSTART
> ...
> (=> then no more reference to that job !?!)
The gram_job_state file is removed when the job is cleaned up, which happens
when it was first considered done by the grid_monitor, which happens when the
job manager perl script, i.e. the BQS job manager script, considers the job done.
> In the meantime, looking in the RB logs, the following problem appears :
>
> "Cannot read JobWrapper output, both from Condor and from Maradona"
>
> and so the RB thinks the job is finished.
That is understandable: when the job is marked done prematurely, there will be
no user job exit status transferred to the RB, hence that error message.
> So it seems the grid monitor loses the jobs, or stops talking to our
> batch system.
>
> The questions are:
>
> * Do you know about this strange phenomenon? Is it related to the
> LCG2_3_0 release (we don't have this problem with our other CE in
> LCG2_2_0 in RH7.3)
There have been a few changes in the grid_monitor script;
I have attached the diffs, which may give a clue.
> We noticed also a "maxtime" defined on CE LCG2_3_0 not defined on
> CE LCG2_2_0:
>
> 0 S dteam004 27473 27472 0 75 0 - 1721 schedu 08:16 ?
> 00:00:01 perl /tmp/grid_manager_monitor_agent.dteam004.27472.1000
> --delete-self --maxtime=3600s
The grid_monitor has 2 components: a master process per user, and a child
process for each RB that has outstanding jobs on the CE for that user;
IIRC, when the last job from a particular RB has exited, the corresponding
child hangs around for at most "maxtime", in case new jobs arrive shortly.
--- 2_2_0/opt/condor/sbin/grid_monitor.sh Tue Feb 17 11:16:29 2004
+++ 2_3_0/opt/condor/sbin/grid_monitor.sh Tue Oct 5 16:21:18 2004
@@ -517,6 +517,10 @@
# allow starting a new agent
my $KEEPALIVE_CHECK_AGE = 120;
+# Time since last state change after which globus state
+# files may be considered stale. (And possibly removed)
+my $MAX_STATE_AGE = 86400;
+
# Globals for the job state information
my %AllJobs;
my %AllStateFiles;
@@ -774,10 +778,28 @@
{
my %prev;
my %prev_time;
- my %prev_cachetag_to_contact;
+ my %prev_state_time;
+ my %prev_contact_2_cachetag;
my ($prev_start,$prev_end);
local(*FL);
+
+ if (open(FL,"< ".$Config{TimeFile}))
+ {
+ while(<FL>)
+ {
+ chomp(my $line=$_);
+ if ($line =~ /^(\S+)\s+(\S+)\s+(\d+)\s+(\d+)$/)
+ {
+ my ($contact,$cachetag,$qtime,$schange) = ($1,$2,$3,$4);
+ $prev_time{$cachetag} = $qtime;
+ $prev_state_time{$cachetag} = $schange;
+ $prev_contact_2_cachetag{$contact} = $cachetag;
+ }
+ }
+ close(FL);
+ }
+
if (open(FL,"< ".$Config{OutputFile}))
{
while(<FL>)
@@ -796,25 +818,14 @@
if ($line =~ /^(\S+)\s+(\d+)$/)
{
my ($contact,$state) = ($1,$2);
- $prev{$contact} = $state;
+ if (defined $prev_contact_2_cachetag{$contact})
+ {
+ $prev{$prev_contact_2_cachetag{$contact}} = $state;
+ }
}
}
}
close(FL);
- if (open(FL,"< ".$Config{TimeFile}))
- {
- while(<FL>)
- {
- chomp(my $line=$_);
- if ($line =~ /^(\S+)\s+(\S+)\s+(\d+)$/)
- {
- my ($contact,$cachetag,$qtime) = ($1,$2,$3);
- $prev_time{$contact} = $qtime;
- $prev_cachetag_to_contact{$cachetag} = $contact;
- }
- }
- close(FL);
- }
}
# Record the current time..
@@ -981,9 +992,28 @@
}
else
{
- my $prev_contact;
- $prev_contact = $prev_cachetag_to_contact{$CacheTag} if exists $prev_cachetag_to_contact{$CacheTag};
- $AllJobs{$CacheTag}->{qtime} = $prev_time{$prev_contact} if defined $prev_contact && exists $prev_time{$prev_contact};
+ $AllJobs{$CacheTag}->{qtime} = $prev_time{$CacheTag} if defined $prev_time{$CacheTag};
+ }
+ }
+
+ # Remove state files for which the state hasn't changed for some time
+ foreach my $Job (keys %AllJobs)
+ {
+ my $Job_ref = $AllJobs{$Job}->{Job};
+ my $FullPath = $Job_ref->{FullPath};
+
+ if (defined $prev_state_time{$Job} && $PassStartTime-$prev_state_time{$Job}>$MAX_STATE_AGE)
+ {
+ if (defined $prev{$Job} &&
+ ($prev{$Job} == Globus::GRAM::JobState::DONE ||
+ $prev{$Job} == Globus::GRAM::JobState::FAILED ||
+ $prev{$Job} == Globus::GRAM::JobState::UNSUBMITTED)) {
+
+ delete $AllJobs{$Job};
+ delete $AllStateFiles{$FullPath};
+
+ unlink($FullPath, $FullPath.".lock");
+ }
}
}
@@ -993,6 +1023,7 @@
my $nfinished = 0;
my %Jobs;
my %query_state;
+ my %state_change;
foreach my $CacheTag (@sorted_cachetags)
{
@@ -1009,7 +1040,9 @@
my $max_time_allowed = $Config{Period};
$skip_rest = 1 if $nqueries>0 && $elapsed_time>$max_time_allowed;
- $skip_rest = 1 if $nfinished > $Config{Period}/2;
+
+ # don't limit the number of finished jobs per scan
+ # $skip_rest = 1 if $nfinished > $Config{Period}/2;
}
# Set the spool directory env
@@ -1036,41 +1069,49 @@
my $JobState;
- my $prev_contact;
- $prev_contact = $prev_cachetag_to_contact{$CacheTag} if exists $prev_cachetag_to_contact{$CacheTag};
if ( $skip_rest )
{
- if ( !defined $prev_contact || !exists $prev{$prev_contact} || $AllJobs{$CacheTag}->{mtime} > $AllJobs{$CacheTag}->{qtime} )
+ if ( !defined $prev{$CacheTag} || $AllJobs{$CacheTag}->{mtime} > $AllJobs{$CacheTag}->{qtime} )
{
$JobState = $Job_ref->{Status};
$query_state{$CacheTag} = -3;
}
else
{
- $JobState = $prev{$prev_contact};
+ $JobState = $prev{$CacheTag};
$query_state{$CacheTag} = -2;
}
}
else
{
- # Create a job description and job manager
- my $JobDescription = CreateJobDescription( $Job_ref );
- my $JobManager = CreateJobManager( $Job_ref, $JobDescription );
-
- # And, get the current state as best we can..
-
my $query_time = time();
- $JobState = GetJobState( $JobManager );
- $nqueries++;
+ # avoid call to jobmanager poll() if the job was already done or failed
+ if ( defined $prev{$CacheTag} &&
+ ( $prev{$CacheTag} == Globus::GRAM::JobState::DONE ||
+ $prev{$CacheTag} == Globus::GRAM::JobState::FAILED ))
+ {
+ $JobState = $prev{$CacheTag};
+ }
+ else
+ {
+ # Create a job description and job manager
+ my $JobDescription = CreateJobDescription( $Job_ref );
+ my $JobManager = CreateJobManager( $Job_ref, $JobDescription );
+
+ # And, get the current state as best we can..
+ $JobState = GetJobState( $JobManager );
+
+ $nqueries++;
+ }
$query_state{$CacheTag} = $query_time if defined $JobState;
- if (defined $prev_contact && exists $prev{$prev_contact})
+ if (defined $prev{$CacheTag})
{
if (defined $JobState)
{
- if ($prev{$prev_contact} != $JobState)
+ if ($prev{$CacheTag} != $JobState)
{
$nfinished++ if ($JobState == Globus::GRAM::JobState::DONE ||
$JobState == Globus::GRAM::JobState::FAILED);
@@ -1078,7 +1119,7 @@
}
else
{
- $JobState = $prev{$prev_contact};
+ $JobState = $prev{$CacheTag};
$query_state{$CacheTag} = -1;
}
}
@@ -1088,6 +1129,10 @@
if ( defined $JobState )
{
$Jobs{$CacheTag} = $JobState;
+ if ( defined $prev{$CacheTag} && $prev{$CacheTag} == $JobState )
+ {
+ $state_change{$CacheTag} = $prev_state_time{$CacheTag};
+ }
}
}
@@ -1113,16 +1158,21 @@
open( FILE, "> ".$Config{TimeFile} ) || die( "Can't write to data file ".$Config{TimeFile}.": $!" );
foreach my $Job ( sort keys %Jobs )
{
+ my $state_change_time;
+ $state_change_time = $state_change{$Job} if defined $state_change{$Job};
+
if ( $query_state{$Job} >= 0 )
{
- printf FILE "%-60s %-60s %10d\n", $AllJobs{$Job}->{Contact}, $Job, $query_state{$Job};
+ $state_change_time = $query_state{$Job} if !defined $state_change_time;
+ printf FILE "%-60s %-60s %10d %10d\n", $AllJobs{$Job}->{Contact}, $Job, $query_state{$Job}, $state_change_time;
}
else
{
my $new_time = $AllJobs{$Job}->{mtime};
$new_time = $AllJobs{$Job}->{qtime} if $new_time < $AllJobs{$Job}->{qtime};
+ $state_change_time = $new_time if !defined $state_change_time;
- printf FILE "%-60s %-60s %10d\n", $AllJobs{$Job}->{Contact}, $Job, $new_time;
+ printf FILE "%-60s %-60s %10d %10d\n", $AllJobs{$Job}->{Contact}, $Job, $new_time, $state_change_time;
}
}
close( FILE );
|