From cffca65277f8d696f358e2b9e9f3fc77da07a007 Mon Sep 17 00:00:00 2001 From: Andy Bavier Date: Wed, 22 Dec 2004 18:57:48 +0000 Subject: [PATCH] Beginning of changes to allow pl_moom to monitor, adjust bandwidth --- Makefile | 4 +- pl_mom.pl | 265 +++++++++++++++++++++++++++++++++++++--------------- pl_mom.spec | 4 +- 3 files changed, 194 insertions(+), 79 deletions(-) diff --git a/Makefile b/Makefile index dfe3e77..06048a0 100644 --- a/Makefile +++ b/Makefile @@ -1,9 +1,9 @@ # @COPYRIGHT@ # -# $Id: Makefile,v 1.1.1.1 2004/10/29 18:03:29 acb Exp $ +# $Id: Makefile,v 1.2 2004/11/17 20:24:11 acb Exp $ BASE=pl_mom -VERSION=0.2 +VERSION=0.3 TARFILE=$(BASE)-$(VERSION).tgz all: pl_mom.pl pl_mom diff --git a/pl_mom.pl b/pl_mom.pl index d441393..55f8c9e 100755 --- a/pl_mom.pl +++ b/pl_mom.pl @@ -9,6 +9,7 @@ $debug = 0; $proc = "pl_mom"; $alias_addr = "pl-mom\@planet-lab.org"; $from_addr = "support\@planet-lab.org"; +$bwcap = "1.5Mbit"; if (! $debug) { $kill_thresh = 90; @@ -16,20 +17,29 @@ if (! $debug) { $log_thresh = 85; $change_thresh = 5; $min_thresh = 10; + #$byte_cutoff = 16000000000; # 16GB + $byte_cutoff = 5000000000; # 5GB + $bwmon_sleep = 900; $sendmail = "/usr/sbin/sendmail -t -f$from_addr"; $pidfile = "/var/run/$proc.pid"; $rebootfile = "/var/lib/misc/pl_mom.reboot"; + $daily_log = "/var/lib/misc/pl_mom.daily"; + $daily_stamp = "/var/lib/misc/pl_mom.stamp"; } else { $kill_thresh = 2; $reboot_thresh = 20; $log_thresh = 2; $change_thresh = 5; $min_thresh = 10; + $byte_cutoff = 16000; + $bwmon_sleep = 10; $sendmail = "cat"; $pidfile = "./$proc.pid"; $rebootfile = "./pl_mom.reboot"; + $daily_log = "./pl_mom.daily"; + $daily_stamp = "./pl_mom.stamp"; } $sleep = 30; @@ -41,6 +51,7 @@ if (! $debug) { system("echo $$ > $pidfile"); +# Check to see whether pl_mom rebooted the node if (-e $rebootfile) { unlink($rebootfile); syslog ("warning", "pl_mom: Sending shutdown mail"); @@ -53,6 +64,12 @@ if (! $pid) { reboot_kicker(); die (0); } +$pid = fork(); +if (! $pid) { + syslog ("info", "pl_mom: Launching bandwidth monitor"); + bandwidth_monitor(); + die (0); +} while (1) { $used = int(swap_used()); @@ -115,18 +132,97 @@ sub reboot_kicker { } } -sub shutdown_mail { - my $hostname = hostname(); - my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdat) - = localtime(time); - my $date = sprintf("%4d-%02d-%02d %02d:%02d:%02d", - $year+1900, $mon+1, $mday, $hour, $min, $sec); - my $to = "To: $alias_addr\n"; +sub bandwidth_monitor { + while (1) { + # See if a new day has started for bandwidth monitoring + chomp($now = `date -u +%D`); + if (-e $daily_stamp) { + chomp($stamp = `cat $daily_stamp`); + } + if (! defined($stamp) || !($stamp =~ $now)) { + open (STAMP, ">$daily_stamp") || + die "Can't open file $daily_stamp for writing: $!\n"; + print STAMP "$now\n"; + close STAMP; + unlink ($daily_log); + + # Could save the list of capped slices in a file in order to + # avoid re-sending mails if the daemon restarts. + # Also may want a list of slices that are exempt from capping. + if (defined(%Cap)) { + undef %Cap; + } + + # Need to reset bandwidth limits here + + syslog("info", "pl_mom: Beginning bandwidth monitoring for $now"); + } + + # Get baseline counts + `touch $daily_log`; + open (BASE, "+<$daily_log") || + print "Cannot open $daily_log; $!\n"; + while () { + my ($sliceid, $bytecount) = split(/ /); + $Start{$sliceid} = $bytecount; + } + + $status = `tc -s -d qdisc show`; + @lines = split(/\n/, $status); + for ($i = 0; $i < @lines; $i++) { + if ($lines[$i] =~ /qdisc pfifo/) { + $lines[$i] =~ s/^ +//; + @fields = split(/ /, $lines[$i]); + $slice = $fields[2]; + $slice =~ s/://; + + if ($slice != 9999) { + $lines[$i+1] =~ s/^ +//; + @fields = split(/ /, $lines[$i+1]); + $bytes = $fields[1]; + #if ($bytes) {print "Slice $slice sent $bytes bytes\n";} + + if (! defined($Start{$slice})) { + print BASE "$slice $bytes\n"; + $Start{$slice} = $bytes; + } + $Now{$slice} = $bytes; + } + } + } + close (BASE); + + foreach $slice ( sort (keys %Start) ) { + if (defined $Now{$slice}) { + $today = $Now{$slice} - $Start{$slice}; + if ($today >= $byte_cutoff && ! defined($Cap{$slice})) { + $Cap{$slice} = "sent"; + $slicename = get_slice_name($slice); + if ($slicename) { + bw_cap_mail($slicename); + } else { + syslog("warning", "pl_mom: Could not find slice name". + " for slice ID $slice"); + } + # Add bandwidth cap here + } + } else { + # The /proc/virtual/ directory is gone... + } + } + + sleep($bwmon_sleep); + } +} + +sub send_mail { + # Arg 0: recipient addresses, comma-separated string + # Arg 1: subject line + # Arg 2: body of message + my $to = "To: $_[0]\n"; my $from = "From: $from_addr\n"; - my $subject = "Subject: $proc rebooted $hostname\n"; - my $msg ="Swap space was exhausted on $hostname and so $proc rebooted ". - "it.\n\nAs of $date, the node has successfully come back online.\n". - "\n$date $hostname reboot\n"; + my $subject = "Subject: $_[1]\n"; + my $msg = $_[2]; open(SENDMAIL, "|$sendmail") or die "Cannot open $sendmail: $!"; print SENDMAIL $to; @@ -137,34 +233,75 @@ sub shutdown_mail { close(SENDMAIL); } +sub get_date { + my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdat) + = localtime(time); + my $date = sprintf("%4d-%02d-%02d %02d:%02d:%02d", + $year+1900, $mon+1, $mday, $hour, $min, $sec); + return $date; +} + +sub get_slice_name { + # Arg 0: slice ID + + # Need to map slice id to slice name; is there a sensor? + # For now, get it from /etc/passwd + my $name = ""; + open (PASSWD, ") { + my ($slicename, $passwd, $sliceid) = split(/:/); + if ($sliceid == $_[0]) { + $name = $slicename; + } + } + close PASSWD; + return $name; +} + +sub bw_cap_mail { + my $hostname = hostname(); + my $date = get_date(); + my $sent = int($byte_cutoff/1000000000); + + # Put this here because this is where we have the + syslog("warning", "pl_mom: Capping bandwidth of slice ". + "$slicename at $bwcap until midnight GMT."); + + send_mail("acb\@planet-lab.org", + "$proc capped bandwidth of slice $slicename on $hostname", + "Slice $slicename has transmitted more than ${sent}GB today". + " on $hostname. ". + "Its bandwidth will be capped at $bwcap until midnight GMT.". + "\n\n$date $hostname bwcap $slicename\n"); +} + +sub shutdown_mail { + my $hostname = hostname(); + my $date = get_date(); + send_mail($alias_addr, + "$proc rebooted $hostname", + "Swap space was exhausted on $hostname and so $proc rebooted ". + "it.\n\nAs of $date, the node has successfully come back ". + "online.\n\n$date $hostname reboot\n"); +} + sub slice_reset_mail { my $hog = $_[0]; my $top = $_[1]; my $hog_mem = sprintf ("%0.f", $Slice{$hog}{pmem}/1000); my $hog_pct = $Slice{$hog}{mem_pct}; my $hostname = hostname(); - my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdat) - = localtime(time); - my $date = sprintf("%4d-%02d-%02d %02d:%02d:%02d", - $year+1900, $mon+1, $mday, $hour, $min, $sec); - my $to = "To: $alias_addr, $hog\@slices.planet-lab.org\n"; - my $from = "From: $from_addr\n"; - my $subject = "Subject: $proc reset slice $hog on $hostname\n"; - my $msg = "As of $date, swap space is nearly exhausted on $hostname.\n\n". - "Slice $hog is being reset since it is the largest consumer ". - "of physical memory at ${hog_mem}MB ($hog_pct%).\n\n". - "Please reply to this message explaining the nature of your ". - "experiment, and what you are doing to address the problem.\n". - "\nOutput of 'top -b -n 1' in your slice prior to reset:\n". - "$top\n\n$date $hostname reset $hog\n"; - - open(SENDMAIL, "|$sendmail") or die "Cannot open $sendmail: $!"; - print SENDMAIL $to; - print SENDMAIL $from; - print SENDMAIL $subject; - print SENDMAIL "Content-type: text/plain\n\n"; - print SENDMAIL $msg; - close(SENDMAIL); + my $date = get_date(); + send_mail("$alias_addr, $hog\@slices.planet-lab.org", + "$proc reset slice $hog on $hostname", + "As of $date, swap space is nearly exhausted on $hostname.\n\n". + "Slice $hog is being reset since it is the largest consumer ". + "of physical memory at ${hog_mem}MB ($hog_pct%).\n\n". + "Please reply to this message explaining the nature of your ". + "experiment, and what you are doing to address the problem.\n". + "\nOutput of 'top -b -n 1' in your slice prior to reset:\n". + "$top\n\n$date $hostname reset $hog\n"); } sub slice_warning_mail { @@ -172,32 +309,21 @@ sub slice_warning_mail { my $hog_mem = sprintf ("%0.f", $Slice{$hog}{pmem}/1000); my $hog_pct = $Slice{$hog}{mem_pct}; my $hostname = hostname(); - my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdat) - = localtime(time); - my $date = sprintf("%4d-%02d-%02d %02d:%02d:%02d", - $year+1900, $mon+1, $mday, $hour, $min, $sec); + my $date = get_date(); if ($hog =~ /^root$/) { - $to = "To: $alias_addr\n"; + $to = $alias_addr; } else { - $to = "To: $alias_addr, $hog\@slices.planet-lab.org\n"; + $to = "$alias_addr, $hog\@slices.planet-lab.org"; } - my $from = "From: $from_addr\n"; - my $subject = "Subject: $proc may reset slice $hog on $hostname\n"; - my $msg = "As of $date, swap space is over $log_thresh% full on ". - "$hostname.\n\nSlice $hog is the largest consumer ". - "of physical memory at ${hog_mem}MB ($hog_pct%).\n". - "Please check the memory usage of your slice to avoid a reset.\n". - "\n$date $hostname warning $hog\n"; - - open(SENDMAIL, "|$sendmail") or die "Cannot open $sendmail: $!"; - print SENDMAIL $to; - print SENDMAIL $from; - print SENDMAIL $subject; - print SENDMAIL "Content-type: text/plain\n\n"; - print SENDMAIL $msg; - close(SENDMAIL); + send_mail($to, + "$proc may reset slice $hog on $hostname", + "As of $date, swap space is over $log_thresh% full on ". + "$hostname.\n\nSlice $hog is the largest consumer ". + "of physical memory at ${hog_mem}MB ($hog_pct%).\n". + "Please check the memory usage of your slice to avoid a ". + "reset.\n\n$date $hostname warning $hog\n"); } sub unkillable_alarm_mail { @@ -205,32 +331,21 @@ sub unkillable_alarm_mail { my $hog_mem = sprintf ("%0.f", $Slice{$hog}{pmem}/1000); my $hog_pct = $Slice{$hog}{mem_pct}; my $hostname = hostname(); - my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdat) - = localtime(time); - my $date = sprintf("%4d-%02d-%02d %02d:%02d:%02d", - $year+1900, $mon+1, $mday, $hour, $min, $sec); + my $date = get_date(); if ($hog =~ /^root$/) { - $to = "To: $alias_addr\n"; + $to = $alias_addr; } else { - $to = "To: $alias_addr, $hog\@slices.planet-lab.org\n"; + $to = "$alias_addr, $hog\@slices.planet-lab.org"; } - my $from = "From: $from_addr\n"; - my $subject = "Subject: $proc: alarm for slice $hog on $hostname\n"; - my $msg = "As of $date, swap space is over $log_thresh% full on ". - "$hostname.\n\nSlice $hog is the largest consumer ". - "of physical memory at ${hog_mem}MB ($hog_pct%).\n". - "The slice will not be reset, but please verify its behavior.\n". - "\n$date $hostname alarm $hog\n"; - - open(SENDMAIL, "|$sendmail") or die "Cannot open $sendmail: $!"; - print SENDMAIL $to; - print SENDMAIL $from; - print SENDMAIL $subject; - print SENDMAIL "Content-type: text/plain\n\n"; - print SENDMAIL $msg; - close(SENDMAIL); + send_mail($to, + "$proc: alarm for slice $hog on $hostname", + "As of $date, swap space is over $log_thresh% full on ". + "$hostname.\n\nSlice $hog is the largest consumer ". + "of physical memory at ${hog_mem}MB ($hog_pct%).\n". + "The slice will not be reset, but please verify its behavior.\n". + "\n$date $hostname alarm $hog\n"); } sub slice_reset { diff --git a/pl_mom.spec b/pl_mom.spec index 1083ba2..127fdc0 100644 --- a/pl_mom.spec +++ b/pl_mom.spec @@ -1,6 +1,6 @@ %define name pl_mom -%define version 0.2 -%define release 16.planetlab%{?date:.%{date}} +%define version 0.3 +%define release 1.planetlab%{?date:.%{date}} Summary: PlanetLab mom -- Cleans up your mess Name: %{name} -- 2.43.0