#!/usr/bin/perl -w
#
# deal with absence of or deadlock between JC/LM/WM/NS
#
# Overview of what this script does:
#
#   1. Read /proc/locks and collect, per locked file, the PIDs that are
#      blocked waiting for a lock (the lines marked with "->").
#   2. If the waiters on one file belong to more than one of the daemons
#      listed in %daemons, schedule a stop and a start for each of those
#      daemons.
#   3. Schedule a start for every daemon in %daemons or %other_daemons that
#      does not show up in the "edguser" lines of the process list.
#   4. Run the scheduled stop commands, then the scheduled start commands,
#      each set in ascending numeric rank order.
#

use strict;
use warnings;
use POSIX qw(strftime);

# Unbuffered STDOUT keeps our own messages in order with whatever the
# init scripts started below print.
$| = 1;

#
# typical content of /proc/locks in case of a deadlock:
#
#  1: POSIX  ADVISORY  READ  7119 03:02:2065024 0 EOF [...]
#  2: POSIX  ADVISORY  READ  7185 03:02:2065024 0 EOF [...]
#  3: POSIX  ADVISORY  READ  7184 03:02:2065024 0 EOF [...]
#  4: POSIX  ADVISORY  READ  7193 03:02:2065024 0 EOF [...]
#  5: POSIX  ADVISORY  READ  7192 03:02:2065024 0 EOF [...]
#  6: POSIX  ADVISORY  READ  7191 03:02:2065024 0 EOF [...]
#  7: POSIX  ADVISORY  READ  7190 03:02:2065024 0 EOF [...]
#  8: POSIX  ADVISORY  READ  7189 03:02:2065024 0 EOF [...]
#  9: POSIX  ADVISORY  READ  7188 03:02:2065024 0 EOF [...]
# 10: POSIX  ADVISORY  READ  7187 03:02:2065024 0 EOF [...]
# 11: POSIX  ADVISORY  READ  7052 03:02:2065024 0 EOF [...]
# 12: POSIX  ADVISORY  READ  7053 03:02:2065024 0 EOF [...]
# 13: POSIX  ADVISORY  READ  7186 03:02:2065024 0 EOF [...]
# 13: -> POSIX  ADVISORY  WRITE 7053 03:02:2065024 0 EOF [...]
# 13: -> POSIX  ADVISORY  WRITE 7052 03:02:2065024 0 EOF [...]
# 13: -> POSIX  ADVISORY  WRITE 7187 03:02:2065024 0 EOF [...]
# 13: -> POSIX  ADVISORY  WRITE 7188 03:02:2065024 0 EOF [...]
# 13: -> POSIX  ADVISORY  WRITE 7189 03:02:2065024 0 EOF [...]
# 13: -> POSIX  ADVISORY  WRITE 7190 03:02:2065024 0 EOF [...]
# 13: -> POSIX  ADVISORY  WRITE 7191 03:02:2065024 0 EOF [...]
# 13: -> POSIX  ADVISORY  WRITE 7192 03:02:2065024 0 EOF [...]
# 13: -> POSIX  ADVISORY  WRITE 7193 03:02:2065024 0 EOF [...]
# 13: -> POSIX  ADVISORY  WRITE 7184 03:02:2065024 0 EOF [...]
# 13: -> POSIX  ADVISORY  WRITE 7185 03:02:2065024 0 EOF [...]
# 13: -> POSIX  ADVISORY  WRITE 7119 03:02:2065024 0 EOF [...]
# 14: POSIX  ADVISORY  WRITE 7101 03:02:1950466 0 EOF [...]
# 15: POSIX  ADVISORY  WRITE 7077 03:02:2048475 0 EOF [...]
# 16: FLOCK  ADVISORY  WRITE 1796 03:02:2048385 0 EOF [...]
#
# some example processes involved:
#
#  7052 ?        S      5:11 /opt/edg/bin/edg-wl-workload_manager [...]
#  7094 ?        S      7:46 /opt/edg/bin/edg-wl-job_controller -c edg_wl.conf
#  7119 ?        S      1:03 /opt/edg/bin/edg-wl-log_monitor -c edg_wl.conf
#  7185 ?        S      0:01 /opt/edg/bin/edg-wl-ns_daemon
#
# inode numbers of the 2 possible files involved:
#
# 1032965 /tmp/jobcontrol/queue.fl
# 2065024 /tmp/workload_manager/input.fl
#

print "$0 starting at ", strftime('%Y/%m/%d %H:%M:%S', localtime), "\n";

#
# Map each contended file (the "major:minor:inode" field) to the list of
# PIDs blocked on it.  Only the "->" (waiting) lines are of interest.
#
my %lock_seekers;

if (open my $locks_fh, '<', '/proc/locks') {
    while (my $line = <$locks_fh>) {
        # split ' ' tolerates leading blanks and runs of whitespace
        my @fields = split ' ', $line;

        next unless @fields >= 7 && $fields[1] eq '->';

        my ($pid, $file) = @fields[5, 6];

        # the PID ends up on a shell command line below: digits only
        next unless $pid =~ /\A\d+\z/;

        push @{ $lock_seekers{$file} }, $pid;
    }
    close $locks_fh;
}
else {
    # best effort: without lock data only the absence check is done
    warn "$0: cannot read /proc/locks: $!\n";
}

#
# Daemons that may block each other.  Commands are executed in ascending
# rank order; commands with the same rank collapse into one.
# NOTE(review): the reason for the 60 s delay before starting the
# ns_daemon is not visible here - confirm before changing it.
#
my %daemons = (
    'edg-wl-job_controller' => {
        'stop_rank'  => 3,
        'start_rank' => 950,
        'script'     => '/etc/init.d/edg-wl-jc',
        'delay'      => 1,
    },
    'edg-wl-log_monitor' => {
        'stop_rank'  => 2,
        'start_rank' => 951,
        'script'     => '/etc/init.d/edg-wl-lm',
        'delay'      => 1,
    },
    'edg-wl-ns_daemon' => {
        'stop_rank'  => 1,
        'start_rank' => 952,
        'script'     => '/etc/init.d/edg-wl-ns',
        'delay'      => 60,
    },
    'edg-wl-workload_manager' => {
        'stop_rank'  => 4,
        'start_rank' => 940,
        'script'     => '/etc/init.d/edg-wl-wm',
        'delay'      => 1,
    },
);

my (%stop_cmds, %start_cmds);

foreach my $file (sort keys %lock_seekers) {
    # which of the known daemons are waiting for this file?
    my %seen;

    foreach my $pid (@{ $lock_seekers{$file} }) {
        my $ps_output = `ps ww$pid`;

        next unless defined $ps_output;

        foreach my $daemon (sort keys %daemons) {
            if ($ps_output =~ /\Q$daemon\E/) {
                $seen{$daemon} = 1;
                last;
            }
        }
    }

    # waiters from a single daemon are not acted upon
    next unless scalar(keys %seen) > 1;

    foreach my $daemon (sort keys %seen) {
        my $params = $daemons{$daemon};

        $stop_cmds{ $params->{'stop_rank'} }   = "$params->{'script'} stop";
        $start_cmds{ $params->{'start_rank'} } = start_command($params);

        print "File $file wanted by $daemon\n";
    }
}

#
# Examples of the remaining "edguser" processes:
#
# edguser   6976 [...] Oct24   0:54 /opt/edg/sbin/edg-wl-logd -k [...]
# edguser   7100 [...] Oct24   0:33 /opt/edg/sbin/edg-wl-renewd -r [...]
# edguser   8142 [...] Oct24  15:54 /opt/edg/sbin/edg-wl-interlogd -k [...]
# edguser   6012 [...] Oct27   1:37 /opt/condor/sbin/condor_master
# edguser   6013 [...] Oct27   3:33 condor_schedd -f
# edguser  15681 [...] Nov02   0:02 /opt/edg/sbin/edg-wl-bkserverd
#

#
# Daemons that are only checked for absence (no stop rank).  Several
# entries share a script and a start rank, so that script is started at
# most once per run.
#
my %other_daemons = (
    'condor_master' => {
        'start_rank' => 950,
        'script'     => '/etc/init.d/edg-wl-jc',
        'delay'      => 1,
    },
    'condor_schedd' => {
        'start_rank' => 950,
        'script'     => '/etc/init.d/edg-wl-jc',
        'delay'      => 1,
    },
    'edg-wl-bkserverd' => {
        'start_rank' => 920,
        'script'     => '/etc/init.d/edg-wl-lbserver',
        'delay'      => 1,
    },
    'edg-wl-interlogd' => {
        'start_rank' => 930,
        'script'     => '/etc/init.d/edg-wl-locallogger',
        'delay'      => 1,
    },
    'edg-wl-logd' => {
        'start_rank' => 930,
        'script'     => '/etc/init.d/edg-wl-locallogger',
        'delay'      => 1,
    },
    'edg-wl-renewd' => {
        'start_rank' => 953,
        'script'     => '/etc/init.d/edg-wl-proxyrenewal',
        'delay'      => 1,
    },
);

#
# Filter the process list in Perl so that the exit status seen here is
# the one of ps itself: a failing ps must not be mistaken for "no daemon
# is running", which would start every service.
#
my @ps_lines = `ps auxww`;

if ($? != 0) {
    warn "$0: 'ps auxww' failed (wait status $?), absence check skipped\n";
}
else {
    my $processes = join '', grep { /^edguser/ } @ps_lines;

    foreach my $table (\%daemons, \%other_daemons) {
        foreach my $daemon (sort keys %$table) {
            next if $processes =~ /\Q$daemon\E/;

            my $params = $table->{$daemon};

            $start_cmds{ $params->{'start_rank'} } = start_command($params);

            print "Absent: $daemon\n";
        }
    }
}

run_in_rank_order(\%stop_cmds);
run_in_rank_order(\%start_cmds);

#
# Build the shell command that waits for the configured delay and then
# starts the service described by the given parameter hash reference.
#
sub start_command {
    my ($params) = @_;

    return "sleep $params->{'delay'}; $params->{'script'} start";
}

#
# Print and execute the commands of a rank => command hash reference in
# ascending numeric rank order.  A failing command is reported on STDERR
# and does not prevent the remaining commands from running.
#
sub run_in_rank_order {
    my ($cmd_for) = @_;

    foreach my $rank (sort { $a <=> $b } keys %$cmd_for) {
        my $cmd = $cmd_for->{$rank};

        print "$cmd\n";

        system($cmd) == 0
            or warn "$0: '$cmd' failed (wait status $?)\n";
    }

    return;
}