The scripts are attached.
#!/usr/bin/perl -w
#
# Detect and repair absent or deadlocked workload-management daemons:
# JC (job_controller), LM (log_monitor), WM (workload_manager), NS (ns_daemon).
#
use strict;
#
# typical content of /proc/locks in case of a deadlock:
#
# 1: POSIX ADVISORY READ 7119 03:02:2065024 0 EOF [...]
# 2: POSIX ADVISORY READ 7185 03:02:2065024 0 EOF [...]
# 3: POSIX ADVISORY READ 7184 03:02:2065024 0 EOF [...]
# 4: POSIX ADVISORY READ 7193 03:02:2065024 0 EOF [...]
# 5: POSIX ADVISORY READ 7192 03:02:2065024 0 EOF [...]
# 6: POSIX ADVISORY READ 7191 03:02:2065024 0 EOF [...]
# 7: POSIX ADVISORY READ 7190 03:02:2065024 0 EOF [...]
# 8: POSIX ADVISORY READ 7189 03:02:2065024 0 EOF [...]
# 9: POSIX ADVISORY READ 7188 03:02:2065024 0 EOF [...]
# 10: POSIX ADVISORY READ 7187 03:02:2065024 0 EOF [...]
# 11: POSIX ADVISORY READ 7052 03:02:2065024 0 EOF [...]
# 12: POSIX ADVISORY READ 7053 03:02:2065024 0 EOF [...]
# 13: POSIX ADVISORY READ 7186 03:02:2065024 0 EOF [...]
# 13: -> POSIX ADVISORY WRITE 7053 03:02:2065024 0 EOF [...]
# 13: -> POSIX ADVISORY WRITE 7052 03:02:2065024 0 EOF [...]
# 13: -> POSIX ADVISORY WRITE 7187 03:02:2065024 0 EOF [...]
# 13: -> POSIX ADVISORY WRITE 7188 03:02:2065024 0 EOF [...]
# 13: -> POSIX ADVISORY WRITE 7189 03:02:2065024 0 EOF [...]
# 13: -> POSIX ADVISORY WRITE 7190 03:02:2065024 0 EOF [...]
# 13: -> POSIX ADVISORY WRITE 7191 03:02:2065024 0 EOF [...]
# 13: -> POSIX ADVISORY WRITE 7192 03:02:2065024 0 EOF [...]
# 13: -> POSIX ADVISORY WRITE 7193 03:02:2065024 0 EOF [...]
# 13: -> POSIX ADVISORY WRITE 7184 03:02:2065024 0 EOF [...]
# 13: -> POSIX ADVISORY WRITE 7185 03:02:2065024 0 EOF [...]
# 13: -> POSIX ADVISORY WRITE 7119 03:02:2065024 0 EOF [...]
# 14: POSIX ADVISORY WRITE 7101 03:02:1950466 0 EOF [...]
# 15: POSIX ADVISORY WRITE 7077 03:02:2048475 0 EOF [...]
# 16: FLOCK ADVISORY WRITE 1796 03:02:2048385 0 EOF [...]
#
# some example processes involved:
#
# 7052 ? S 5:11 /opt/edg/bin/edg-wl-workload_manager [...]
# 7094 ? S 7:46 /opt/edg/bin/edg-wl-job_controller -c edg_wl.conf
# 7119 ? S 1:03 /opt/edg/bin/edg-wl-log_monitor -c edg_wl.conf
# 7185 ? S 0:01 /opt/edg/bin/edg-wl-ns_daemon
#
# inode numbers of the 2 possible files involved:
#
# 1032965 /tmp/jobcontrol/queue.fl
# 2065024 /tmp/workload_manager/input.fl
#
# Announce the run with a timestamp so successive log entries can be told
# apart.  The stamp is built in-process from localtime(), which avoids forking
# date(1) and the hard-coded "20" century prefix.
{
    my ($sec, $min, $hour, $mday, $mon, $year) = localtime;
    printf "%s starting at %04d/%02d/%02d %02d:%02d:%02d\n",
        $0, $year + 1900, $mon + 1, $mday, $hour, $min, $sec;
}

# Snapshot the kernel lock table.  If it cannot be read, warn and continue
# with an empty table so that the rest of the script still runs.
my @lock_status;
if (open my $locks_fh, '<', '/proc/locks') {
    @lock_status = <$locks_fh>;
    close $locks_fh;
}
else {
    warn "$0: cannot read /proc/locks: $!\n";
}

# Map each contended file (the "major:minor:inode" column) to the list of
# PIDs that are blocked waiting for a lock on it.
my %lock_seekers = _blocked_pids_by_file(@lock_status);

# Extract the blocked lock requests from /proc/locks lines.
#
# A blocked request is a line whose second field is '->', e.g.
#   13: -> POSIX ADVISORY WRITE 7053 03:02:2065024 0 EOF
# where field 5 is the waiting PID and field 6 identifies the file.
#
# Arguments: the lines of /proc/locks.
# Returns:   a hash (as a flat list) mapping file id => array ref of PIDs,
#            in the order the lines were given.
# Lines that are blank or too short to carry both fields are ignored.
sub _blocked_pids_by_file {
    my @lines = @_;
    my %pids_for;
    foreach my $line (@lines) {
        # split ' ' tolerates leading whitespace and runs of blanks/tabs.
        my @fields = split ' ', $line;
        next unless @fields >= 7 && $fields[1] eq '->';
        push @{ $pids_for{ $fields[6] } }, $fields[5];
    }
    return %pids_for;
}
# The four daemons that are checked for deadlock, keyed by the name looked
# for in ps output.  Per daemon:
#   stop_rank   sort key of its "stop" command among the queued stops
#   start_rank  sort key of its "start" command among the queued starts
#   script      init script that is run with "stop" / "start"
#   delay       argument of the "sleep" issued just before "start"
my %daemons = map {
    my ($name, $stop_rank, $start_rank, $script, $delay) = @$_;
    ( $name => {
        stop_rank  => $stop_rank,
        start_rank => $start_rank,
        script     => $script,
        delay      => $delay,
    } );
} (
    #  name                       stop  start  script                   delay
    [ 'edg-wl-job_controller',    3,    950,   '/etc/init.d/edg-wl-jc', 1  ],
    [ 'edg-wl-log_monitor',       2,    951,   '/etc/init.d/edg-wl-lm', 1  ],
    [ 'edg-wl-ns_daemon',         1,    952,   '/etc/init.d/edg-wl-ns', 60 ],
    [ 'edg-wl-workload_manager',  4,    940,   '/etc/init.d/edg-wl-wm', 1  ],
);
# Queues of commands to run later, keyed by rank so that each rank holds at
# most one command.
my (%stop_cmds, %start_cmds);

# For every file that has blocked lock requests, work out which of the known
# daemons are among the waiters.
for my $contended (keys %lock_seekers) {
    my %waiting;
    for my $blocked_pid (@{ $lock_seekers{$contended} }) {
        # ps output (header plus command line) for the blocked process.
        my $ps_out = `ps ww$blocked_pid`;
        DAEMON:
        for my $daemon (keys %daemons) {
            next DAEMON unless $ps_out =~ /$daemon/;
            $waiting{$daemon} = 1;
            last DAEMON;
        }
    }

    # Only act when more than one distinct daemon is blocked on this file.
    next if keys(%waiting) <= 1;

    # Queue a stop and a delayed start for each daemon involved.
    for my $daemon (keys %waiting) {
        my ($stop_rank, $start_rank, $script, $delay)
            = @{ $daemons{$daemon} }{qw(stop_rank start_rank script delay)};
        $stop_cmds{$stop_rank}   = "$script stop";
        $start_cmds{$start_rank} = "sleep $delay; $script start";
        print "File $contended wanted by $daemon\n";
    }
}
#
# Examples of the remaining "edguser" processes:
#
# edguser 6976 [...] Oct24 0:54 /opt/edg/sbin/edg-wl-logd -k [...]
# edguser 7100 [...] Oct24 0:33 /opt/edg/sbin/edg-wl-renewd -r [...]
# edguser 8142 [...] Oct24 15:54 /opt/edg/sbin/edg-wl-interlogd -k [...]
# edguser 6012 [...] Oct27 1:37 /opt/condor/sbin/condor_master
# edguser 6013 [...] Oct27 3:33 condor_schedd -f
# edguser 15681 [...] Nov02 0:02 /opt/edg/sbin/edg-wl-bkserverd
#
# Further processes that are only checked for presence, keyed by the name
# looked for in ps output.  Per entry:
#   start_rank  sort key of its "start" command among the queued starts
#   script      init script that is run with "start"
#   delay       argument of the "sleep" issued just before "start"
# Several names may share one script and rank; they then queue the same
# command only once.
my %other_daemons = map {
    my ($name, $start_rank, $script, $delay) = @$_;
    ( $name => {
        start_rank => $start_rank,
        script     => $script,
        delay      => $delay,
    } );
} (
    #  name                start  script                             delay
    [ 'condor_master',     950,   '/etc/init.d/edg-wl-jc',           1 ],
    [ 'condor_schedd',     950,   '/etc/init.d/edg-wl-jc',           1 ],
    [ 'edg-wl-bkserverd',  920,   '/etc/init.d/edg-wl-lbserver',     1 ],
    [ 'edg-wl-interlogd',  930,   '/etc/init.d/edg-wl-locallogger',  1 ],
    [ 'edg-wl-logd',       930,   '/etc/init.d/edg-wl-locallogger',  1 ],
    [ 'edg-wl-renewd',     953,   '/etc/init.d/edg-wl-proxyrenewal', 1 ],
);
# All ps lines that begin with "edguser", kept as one string for matching.
my $processes = `ps auxww | grep '^edguser'`;

# Queue a delayed start for every known name that does not occur anywhere in
# that listing.  Both tables are walked in sorted name order.
for my $table (\%daemons, \%other_daemons) {
    for my $daemon (sort keys %$table) {
        next if $processes =~ /$daemon/;
        my $conf = $table->{$daemon};
        $start_cmds{ $conf->{start_rank} }
            = "sleep $conf->{delay}; $conf->{script} start";
        print "Absent: $daemon\n";
    }
}
# Run the queued commands: every stop first, then every start.  Within each
# queue the commands run in ascending numeric rank order; a plain string sort
# would misplace ranks that differ in their number of digits (e.g. 10 before
# 2).  Each command is echoed before it is run, and a failure is reported on
# STDERR without aborting the remaining commands.
foreach my $queue (\%stop_cmds, \%start_cmds) {
    foreach my $rank (sort { $a <=> $b } keys %$queue) {
        my $cmd = $queue->{$rank};
        print "$cmd\n";
        next if system($cmd) == 0;
        if ($? == -1) {
            warn "$0: could not run '$cmd': $!\n";
        }
        else {
            warn "$0: '$cmd' failed (wait status $?)\n";
        }
    }
}
|