#!/usr/local/bin/perl
#
# Name:
#    $0
#
# Synopsis
#    Run a command and check it periodically for hanging.  $0 forks a
#    second process which checks the accumulated cpu time of the passed
#    command.  It does not add up accumulated cpu time of children of the
#    passed command.  It assumes that if the cpu time does not grow, the
#    command has hung.
#
# Options
#    see definition of &usage, or guess.
#
# History:
#    0111.13 by David Mackintosh
#       initial revision
#    0104.05 by David Mackintosh
#       added option to mail at exit with success or failure
#

# Useful values.
# $BASENAME is the script name with any directory part removed;
# backslashes are normalised to slashes first so DOS-style paths work too.
$BASENAME = $0;
$BASENAME =~ s|\\|/|g;
if ($BASENAME =~ m|(.*/)(.*)|) {
    $BASENAME = $2;
}

# unbuffered STDOUT so progress lines appear as they happen
$| = 1;

# number of times to attempt running the command
$MAXATTEMPTS = 10;

# how long to sleep in seconds between cpu time checks
$SNOOZE = 100;

#
# code here.
# Options are only recognised before the first word of the command;
# once $command is non-empty every remaining argument is appended to it.
while ($#ARGV > -1) {
    $_ = shift @ARGV;
    if (!$command) {
        # every option except --HELP consumes the following argument
        if (/^-(?:DEBUG_TO|SNOOZE|ATTEMPT|m)$/ and $#ARGV < 0) {
            &usage("$_ (missing value)");
        }
        if (/^-DEBUG_TO$/) {
            $DEBUG = 1;
            $LOG = shift @ARGV;
            &debug("Debugging on");
            next;
        }
        if (/^-SNOOZE$/) {
            $SNOOZE = shift @ARGV;
            # a zero or non-numeric snooze would make the watchdog spin
            &usage("-SNOOZE $SNOOZE") if ($SNOOZE !~ /^[1-9]\d*$/);
            next;
        }
        if (/^-ATTEMPT$/) {
            $MAXATTEMPTS = shift @ARGV;
            # zero attempts would silently skip the command altogether
            &usage("-ATTEMPT $MAXATTEMPTS") if ($MAXATTEMPTS !~ /^[1-9]\d*$/);
            next;
        }
        if (/^-m$/) {
            $MailTo = shift @ARGV;
            next;
        }
        if (/^--HELP$/) {
            &usage("__SHOW_HELP");
        }
    }
    # The leading blank is deliberate: the watchdog splits $command on
    # single spaces and takes the second field as the program name.
    $command = $command . " $_";
    # &usage("$_");
}

&usage("__SHOW_HELP") if (!$command);

#
# display the help message.
# I find it useful to leave this subroutine here, next to the parameter
# parsing, as it makes it much easier to keep the help and the
# actual parsing in sync.
sub usage {
    # Print the help screen.
    #   $gripe - "__SHOW_HELP" to print the help and exit 0; anything else
    #            is reported as an argument that was not understood and the
    #            script exits through &die.
    my ($gripe) = @_;
    print <<"EO_USAGE";
$BASENAME [options] command args
    -ATTEMPT x      only attempt the command x times, default 10
    -SNOOZE x       sleep x seconds between cpu time tests, default 100
    -DEBUG_TO file  append debugging output to file
    -m address[,address[...]]
                    email address(es) with notification of success or
                    failure on completion
    --HELP          print this screen

Examples:
    $BASENAME command glark glark glark
    $BASENAME -ATTEMPT 5 -SNOOZE 30 command glark glark glark
EO_USAGE
    exit 0 if ($gripe eq "__SHOW_HELP");
    &die("Don't understand:$gripe");
}

#
# Current time as YYYYMMDD.HHMM, obtained by running the $TIME command.
sub timeStamp {
    my $stamp = `$TIME`;
    chomp $stamp;
    return $stamp;
}

#
# Convert a ps(1) cpu time of the form [[dd-]hh:]mm:ss into seconds.
# Fields are read from the right so that the plain mm:ss form keeps
# working while the longer forms are no longer misread.
sub cpuSeconds {
    my ($stamp) = @_;
    my $days = 0;
    if ($stamp =~ s/^(\d+)-//) {
        $days = $1;
    }
    my ($sec, $min, $hour) = reverse split(/:/, $stamp);
    return ((($days * 24 + ($hour || 0)) * 60 + ($min || 0)) * 60) + ($sec || 0);
}

$TIME = "date '+\%Y\%m\%d.\%H\%M'";
&debug("command:$command");
$output = "- - - $0: " . &timeStamp() . ": Command: $command\n";

$ppid = $$;
&debug("My PID is $ppid -- preparing to fork");
$pid = fork();
$done = 0;
$attempts = 0;

# fork() reports failure with undef, never with a negative number
&die("can't fork watchdog:$!") if (!defined $pid);
&watchdog() if ($pid == 0);

while (!$done and $attempts < $MAXATTEMPTS) {
    $attempts++;
    &debug("attempt $attempts for $command");
    $result = system($command);
    $why = $!;
    $time = &timeStamp();

    # $result is a wait status: low 7 bits = terminating signal,
    # high byte = exit code, -1 = the command could not be started.
    $killed = ($result != -1 and ($result & 127) == 9);

    if (!$killed) {
        &debug("ended -- result $result");
        $done = 1;
        if ($result == -1) {
            $tombstone = "-1 (could not run command: $why)";
        }
        elsif ($result & 127) {
            $tombstone = "signal " . ($result & 127);
        }
        else {
            $tombstone = $result >> 8;
        }
        if ($result != 0) {
            $event = "! ! ! $0: $time: child died with result $tombstone\n";
        }
        else {
            $event = "- - - $0: $time: operation completed\n";
        }
        print $event;
        $output = $output . $event;
        &mailout();
    }
    elsif ($attempts < $MAXATTEMPTS) {
        &debug("looks like it was -9'd, retrying");
        $event = "* * * $0: $time: restarting child\n";
        print $event;
        $output = $output . $event;
    }
    else {
        $event = "! ! ! $0: $time: giving up\n";
        print $event;
        $output = $output . $event;
        &mailout();
        $done = 1;
    }
}

# terminate the watchdog
kill(9, $pid);

# pass the result of the command back to the caller.
# exit() only keeps the low 8 bits, so the raw wait status must be decoded
# first; otherwise every ordinary non-zero exit code (N*256) becomes 0.
if (!defined $result) {
    $status = 0;                    # the command was never attempted
}
elsif ($result == -1) {
    $status = 255;                  # could not be started
}
elsif ($result & 127) {
    $status = $result & 127;        # killed by a signal: report its number
}
else {
    $status = $result >> 8;         # normal exit: the command's own code
}
exit($status);

#
# watchdog
# Runs in the forked child and never returns.  Every $SNOOZE seconds it
# looks through the process table for a child of $ppid whose first argument
# word equals the first word of $command, and kills it with signal 9 when
# its cpu time has not grown since the previous check.  Exits through &die
# when no such process is found.
sub watchdog {
    &debug("WD:starting watchdog $$ child of $ppid");
    my $lastSeen = 0;

    # $command starts with a blank, so field 0 is empty and field 1 is
    # the program name.
    my ($unused, $tag) = split(/ /, $command);

    while (1) {
        # sleep first to let things get started
        sleep $SNOOZE;

        # collect
        my $ok = 0;
        my ($tpid, $tppid, $rtime, @junk);
        open(my $ps, '-|', '/usr/bin/ps', '-eo', 'pid,ppid,time,args')
            or &die("can't read processes:$!");
        while (<$ps>) {
            chomp;
            s/^\s+//;
            ($tpid, $tppid, $rtime, @junk) = split(/\s+/, $_);
            &debug("WD:considering $_");
            &debug("WD:checking for ppid $ppid = $tppid");
            next if ($tppid != $ppid);
            my $arg = shift(@junk);
            &debug("WD:checking for tag $tag = $arg");
            next if ($arg ne $tag);
            &debug("WD:using $_");
            $ok = 1;
            last;
        }
        close($ps);
        &die("WD:can't find sibbling") if (!$ok);

        &debug("WD:watching $tpid");
        &debug("WD:cpu time $rtime");
        $rtime = &cpuSeconds($rtime);
        &debug("WD:time in seconds:$rtime");

        # act
        &debug("WD:comparing to:$lastSeen");
        if ($rtime > $lastSeen) {
            &debug("WD:time increased");
            $lastSeen = $rtime;
            next;
        }
        &debug("WD:killing sibling");
        kill(9, $tpid);
        # the restarted command begins again from zero cpu time
        $lastSeen = 0;
    }
}

#
# spit out the result, mail if necessary
# Sends $output to the address(es) in $MailTo with $command as the subject.
# Does nothing when -m was not given.  mailx is started without a shell so
# that quotes, dollars or backticks in the command cannot be reinterpreted.
sub mailout {
    return if (!$MailTo);
    open(my $mail, '|-', 'mailx', '-s', $command, split(' ', $MailTo))
        or &die("can't mail");
    print $mail $output;
    close($mail) or &warn("mailx exited with status $?");
}

#
# an easy way to change those back-ticks into
# system() calls when we fuck up.
# Runs the last argument as a shell command and returns its output.
sub doThis {
    my $thing = pop @_;
    &debug("doing [$thing]");
    # my $result = system($thing);
    my $result = `$thing`;
    &debug("finished shelling; output:\n$result");
    &debug("output ends");
    return $result;
}

#
# Open our log file.
sub openLog {
    # Creates $LOGPATH$BASENAME.<pid>.log and leaves it open on the LOG
    # handle that &logItem prints to.  Exits through &die when $LOGPATH is
    # unset or the file cannot be opened.
    if (!$LOGPATH) {
        &die("Set logpath in script");
    }
    if (!open(LOG, '>', "$LOGPATH$BASENAME.$$.log")) {
        &die("log $LOGPATH$BASENAME.$$.log");
    }
}

#
# Add an item to the open log
# Writes the last argument plus a newline to LOG, but only when $LOGPATH
# is set.
sub logItem {
    my $gripe = pop(@_);
    print LOG "$gripe\n" if ($LOGPATH);
}

#
# A (braindead) undertaker.
# Reports "fatal:<message>" and exits with status 1.  &warn already writes
# the message to the log, so it is not logged a second time here.
# NOTE: this sub shares its name with the builtin, so it must always be
# called as &die(...).
sub die {
    my $gripe = pop(@_);
    &warn("fatal:$gripe");
    exit 1;
}

#
# A (braindead) friend for our undertaker.
# Prints "<basename>:<message>" on STDERR and records the message in the log.
# Must be called as &warn(...) to avoid the builtin.
sub warn {
    my $gripe = pop(@_);
    print STDERR "$BASENAME:$gripe\n";
    &logItem($gripe);
}

#
# A simple debug outputer.
# Hands "DEBUG:<message>" to &log when $DEBUG is set.
sub debug {
    my $gripe = pop(@_);
    $DEBUG && &log("DEBUG:$gripe");
}

# a simple debug logger.
# Appends "<pid>:<message>" to the file named by $LOG, opening and closing
# it on every call.  It uses its own private handle: sharing LOG with
# &openLog/&logItem would close the main log at the first debug message.
# An unwritable debug file is ignored, as debugging output is best-effort.
# Must be called as &log(...) to avoid the builtin.
sub log {
    return if (!$LOG);
    my $gripe = shift @_;
    open(my $debugLog, '>>', $LOG) or return;
    print $debugLog "$$:$gripe\n";
    close($debugLog);
}