X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=pl_mom.pl;h=55da825421cfd05761bd8b84de74591491902991;hb=refs%2Fheads%2Fplanetlab-3_2-branch;hp=86edd297a602e87697bebc4b9124201ab45dd010;hpb=e672b904df6219c2006ceb8e75abe46d21969344;p=mom.git diff --git a/pl_mom.pl b/pl_mom.pl index 86edd29..55da825 100755 --- a/pl_mom.pl +++ b/pl_mom.pl @@ -3,11 +3,12 @@ use POSIX qw(setsid); use Sys::Syslog; use Sys::Hostname; -use LWP::Simple; +#use LWP::Simple; $debug = 0; $proc = "pl_mom"; -$alias = "pl-mom"; +$alias_addr = "pl-mom\@planet-lab.org"; +$from_addr = "support\@planet-lab.org"; if (! $debug) { $kill_thresh = 90; @@ -15,20 +16,37 @@ if (! $debug) { $log_thresh = 85; $change_thresh = 5; $min_thresh = 10; + #$bwcap_default = "off"; + $bwcap_default = "1.5Mbit"; + $cutoff_default = "16200000000"; # 16GB, for 1.5Mbit cap + $bwmon_sleep = 900; - $sendmail = "/usr/sbin/sendmail -t"; + $sendmail = "/usr/sbin/sendmail -t -f$from_addr"; + $vservers = "/etc/vservers"; $pidfile = "/var/run/$proc.pid"; $rebootfile = "/var/lib/misc/pl_mom.reboot"; + $daily_log = "/var/lib/misc/pl_mom.daily"; + $daily_stamp = "/var/lib/misc/pl_mom.stamp"; + $configfile = "/etc/planetlab/pl_mom.conf"; + $capfile = "/var/lib/misc/pl_mom.oldcaps"; } else { $kill_thresh = 2; $reboot_thresh = 20; $log_thresh = 2; $change_thresh = 5; - $min_thresh = 10; + $min_thresh = 2; + $bwcap_default = "1Kbit"; + $cutoff_default = "10800"; + $bwmon_sleep = 10; $sendmail = "cat"; + $vservers = "./debug"; $pidfile = "./$proc.pid"; - $rebootfile = "./pl_mom.reboot"; + $rebootfile = "./debug/pl_mom.reboot"; + $daily_log = "./debug/pl_mom.daily"; + $daily_stamp = "./debug/pl_mom.stamp"; + $configfile = "./debug/pl_mom.conf"; + $capfile = "./debug/pl_mom.oldcaps"; } $sleep = 30; @@ -40,6 +58,9 @@ if (! $debug) { system("echo $$ > $pidfile"); +read_config_file(); + +# Check to see whether pl_mom rebooted the node if (-e $rebootfile) { unlink($rebootfile); syslog ("warning", "pl_mom: Sending shutdown mail"); @@ -52,6 +73,16 @@ if (! $pid) { reboot_kicker(); die (0); } +$pid = fork(); +if (! $pid) { + syslog ("info", "pl_mom: Launching bandwidth monitor"); + if ($bwcap_default =~ /off/) { + syslog("info", "pl_mom: Max rate unlimited by default"); + } + reset_bandwidth_caps(); + bandwidth_monitor(); + die (0); +} while (1) { $used = int(swap_used()); @@ -78,12 +109,15 @@ while (1) { #slice_warning_mail($hog); } } else { + my $id = `id -u $hog`; + chomp($id); + my $top = `/usr/sbin/chcontext --ctx $id /usr/bin/top -b -n 1`; syslog ("warning", "pl_mom: Resetting slice $hog"); if (! $debug) { slice_reset($hog); } syslog ("warning", "pl_mom: Sending mail to slice $hog"); - slice_reset_mail($hog); + slice_reset_mail($hog, $top); } } } @@ -102,7 +136,8 @@ sub reboot_kicker { system("touch $rebootfile"); if (! $debug) { - system("shutdown -r now"); + #system("shutdown -r now"); + system("/bin/sync; /sbin/reboot -f"); } die (0); } @@ -111,18 +146,195 @@ sub reboot_kicker { } } -sub shutdown_mail { - my $hostname = hostname(); - my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdat) - = localtime(time); - my $date = sprintf("%4d-%02d-%02d %02d:%02d:%02d", - $year+1900, $mon+1, $mday, $hour, $min, $sec); - my $to = "To: $alias\@planet-lab.org\n"; - my $from = "From: support\@planet-lab.org\n"; - my $subject = "Subject: $proc rebooted $hostname\n"; - my $msg ="Swap space was exhausted on $hostname and so $proc rebooted ". - "it.\n\nAs of $date, the node has successfully come back online.\n". - "\n$date $hostname reboot\n"; +sub bandwidth_monitor { + while (1) { + # See if a new day has started for bandwidth monitoring + chomp($now = `date -u +%D`); + if (-e $daily_stamp) { + chomp($stamp = `cat $daily_stamp`); + } + if (! defined($stamp) || !($stamp =~ $now)) { + open (STAMP, ">$daily_stamp") || + die "Can't open file $daily_stamp for writing: $!\n"; + print STAMP "$now\n"; + close STAMP; + unlink ($daily_log); + + # Could save the list of capped slices in a file in order to + # avoid re-sending mails if the daemon restarts. + # Also may want a list of slices that are exempt from capping. + if (defined(%Start)) { undef %Start; } + if (defined(%Now)) { undef %Now; } + if (defined(%Cap)) { undef %Cap; } + + reset_bandwidth_caps(); + + syslog("info", "pl_mom: Beginning bandwidth monitoring for $now"); + } + + get_slice_names(); + get_baseline_counts(); + get_slice_limits(); + + foreach $slice ( sort (keys %Start) ) { + if (defined $Now{$slice}) { + $today = $Now{$slice} - $Start{$slice}; + if (! (defined ($Cutoff{$slice})||$bwcap_default =~ /off/)) { + $Cutoff{$slice} = $cutoff_default; + $Maxrate{$slice} = $bwcap_default; + } + if ($debug) { + if ($today) { + $cutoff = defined($Cutoff{$slice}) + ? $Cutoff{$slice} : ""; + print "Slice $slice sent $today bytes; ". + "cutoff $cutoff\n"; + } + } + if (defined ($Cutoff{$slice}) && + $today >= $Cutoff{$slice} && + ! defined($Cap{$slice})) { + $Cap{$slice} = "sent"; + bw_cap_mail($slice); + if (! $debug) { + log_bandwidth_cap($slice, $Maxrate{$slice}); + cap_bandwidth($slice, $Maxrate{$slice}); + } + } + } else { + # Token bucket for this slice is gone! + } + } + + sleep($bwmon_sleep); + } +} + +sub read_config_file { + if (-e $configfile) { + open (CONFIG, "<$configfile") || + print "Cannot open $configfile; $!\n"; + while () { + if (m/^(.*)=(.*)$/) { + ${$1} = $2; + if ($debug) { + print "read_config_file: $1 = ${$1}\n"; + } + } + } + close CONFIG; + } +} + +sub get_slice_names { + # Read slice names from /etc/passwd + if (defined (%Name)) { undef %Name; } + open (PASSWD, ") { + my ($slicename, $passwd, $sliceid) = split(/:/); + $Name{$sliceid} = $slicename; + } + close PASSWD; +} + +sub get_baseline_counts { + `touch $daily_log`; + open (BASE, "+<$daily_log") || + print "Cannot open $daily_log; $!\n"; + while () { + my ($slice, $bytecount) = split(/ /); + $Start{$slice} = $bytecount; + } + + my $status = `tc -s -d qdisc show`; + my $sliceid = "9999"; + @Lines = split(/\n/, $status); + foreach $line ( @Lines ) { + if ($line =~ /qdisc pfifo (.*): dev/) { + $sliceid = $1; + } else { + if ($line =~ /Sent (.*) bytes/) { + my $bytes = $1; + if ($sliceid != 9999) { + my $slice = $Name{$sliceid}; + if ($debug && $bytes) { + print "Slice: $slice ($sliceid), bytes $bytes\n"; + } + if (! defined($Start{$slice})) { + print BASE "$slice $bytes\n"; + $Start{$slice} = $bytes; + } + $Now{$slice} = $bytes; + } + } + } + } + close (BASE); +} + +sub get_slice_limits { + if (defined %Maxrate) { undef %Maxrate; } + if (defined %Cutoff) { undef %Cutoff; } + if (-e $vservers) { + my $result = `grep -H "^BWMAXRATE" $vservers/*.conf`; + chomp ($result); + my @Lines = split(/\n/,$result); + foreach $line ( @Lines ) { + if ($line =~ /\/([^\/]*).conf:BWMAXRATE=(.*)[Mm]bit/) { + $slice = $1; + $limit = $2."Mbit"; + $cutoff = ($2 * 1000000 * 86400)/8; + } else { + if ($line =~ /\/([^\/]*).conf:BWMAXRATE=(.*)[Kk]bit/) { + $slice = $1; + $limit = $2."Kbit"; + $cutoff = ($2 * 1000 * 86400)/8; + } else { + die "Could not parse line $line"; + } + } + $Maxrate{$slice} = $limit; + $Cutoff{$slice} = $cutoff; + if ($debug) { + print "Slice $slice, maxrate $Maxrate{$slice}, ". + "cutoff $Cutoff{$slice}\n"; + } + } + } +} + +sub reset_bandwidth_caps { + if (-e $capfile) { + open(CAP, "<$capfile") or die "Cannot open $capfile: $!"; + while () { + chomp(); + ($slicename, $oldcap) = split(/ /); + syslog("info", "pl_mom: Restoring bandwidth cap of $oldcap ". + "to $slicename"); + cap_bandwidth ($slicename, $oldcap); + } + close CAP; + unlink($capfile); + } +} + +sub log_bandwidth_cap { + ($slicename, $cap) = @_; + syslog("warning", "pl_mom: Capping bandwidth of slice ". + "$slicename at $cap until midnight GMT."); + # Save current cap to $capfile + system("echo $slicename `bwlimit getcap $slicename` >> $capfile"); +} + +sub send_mail { + # Arg 0: recipient addresses, comma-separated string + # Arg 1: subject line + # Arg 2: body of message + my $to = "To: $_[0]\n"; + my $from = "From: $from_addr\n"; + my $subject = "Subject: $_[1]\n"; + my $msg = $_[2]; open(SENDMAIL, "|$sendmail") or die "Cannot open $sendmail: $!"; print SENDMAIL $to; @@ -133,32 +345,61 @@ sub shutdown_mail { close(SENDMAIL); } -sub slice_reset_mail { - my $hog = $_[0]; - my $hog_mem = sprintf ("%0.f", $Slice{$hog}{pmem}/1000); - my $hog_pct = $Slice{$hog}{mem_pct}; - my $hostname = hostname(); +sub cap_bandwidth { + ($slicename, $cap) = @_; + system("bwlimit setcap $slicename $cap"); + system("bwlimit on $slicename"); +} + +sub get_date { my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdat) = localtime(time); my $date = sprintf("%4d-%02d-%02d %02d:%02d:%02d", $year+1900, $mon+1, $mday, $hour, $min, $sec); - my $to = "To: $alias\@planet-lab.org, $hog\@slices.planet-lab.org\n"; - my $from = "From: support\@planet-lab.org\n"; - my $subject = "Subject: $proc resetting slice $hog on $hostname\n"; - my $msg = "As of $date, swap space is nearly exhausted on $hostname.\n\n". - "Slice $hog is being reset since it is the largest consumer ". - "of physical memory at ${hog_mem}MB ($hog_pct%).\n\n". - "Please reply to this message explaining the nature of your ". - "experiment, and what you are doing to address the problem.\n". - "\n$date $hostname reset $hog\n"; + return $date; +} - open(SENDMAIL, "|$sendmail") or die "Cannot open $sendmail: $!"; - print SENDMAIL $to; - print SENDMAIL $from; - print SENDMAIL $subject; - print SENDMAIL "Content-type: text/plain\n\n"; - print SENDMAIL $msg; - close(SENDMAIL); +sub bw_cap_mail { + my ($slicename) = @_; + my $hostname = hostname(); + my $date = get_date(); + my $sent = int($Cutoff{$slicename}/(1024*1024)); + my $bwcap = $Maxrate{$slicename}; + + send_mail("$alias_addr, $slicename\@slices.planet-lab.org", + "$proc capped bandwidth of slice $slicename on $hostname", + "Slice $slicename has transmitted more than ${sent}MB today". + " on $hostname. ". + "Its bandwidth will be capped at $bwcap until midnight GMT.". + "\n\n$date $hostname bwcap $slicename\n"); +} + +sub shutdown_mail { + my $hostname = hostname(); + my $date = get_date(); + send_mail($alias_addr, + "$proc rebooted $hostname", + "Swap space was exhausted on $hostname and so $proc rebooted ". + "it.\n\nAs of $date, the node has successfully come back ". + "online.\n\n$date $hostname reboot\n"); +} + +sub slice_reset_mail { + my $hog = $_[0]; + my $top = $_[1]; + my $hog_mem = sprintf ("%0.f", $Slice{$hog}{pmem}/1000); + my $hog_pct = $Slice{$hog}{mem_pct}; + my $hostname = hostname(); + my $date = get_date(); + send_mail("$alias_addr, $hog\@slices.planet-lab.org", + "$proc reset slice $hog on $hostname", + "As of $date, swap space is nearly exhausted on $hostname.\n\n". + "Slice $hog is being reset since it is the largest consumer ". + "of physical memory at ${hog_mem}MB ($hog_pct%).\n\n". + "Please reply to this message explaining the nature of your ". + "experiment, and what you are doing to address the problem.\n". + "\nOutput of 'top -b -n 1' in your slice prior to reset:\n". + "$top\n\n$date $hostname reset $hog\n"); } sub slice_warning_mail { @@ -166,32 +407,21 @@ sub slice_warning_mail { my $hog_mem = sprintf ("%0.f", $Slice{$hog}{pmem}/1000); my $hog_pct = $Slice{$hog}{mem_pct}; my $hostname = hostname(); - my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdat) - = localtime(time); - my $date = sprintf("%4d-%02d-%02d %02d:%02d:%02d", - $year+1900, $mon+1, $mday, $hour, $min, $sec); + my $date = get_date(); if ($hog =~ /^root$/) { - $to = "To: $alias\@planet-lab.org\n"; + $to = $alias_addr; } else { - $to = "To: $alias\@planet-lab.org, $hog\@slices.planet-lab.org\n"; + $to = "$alias_addr, $hog\@slices.planet-lab.org"; } - my $from = "From: support\@planet-lab.org\n"; - my $subject = "Subject: $proc may reset slice $hog on $hostname\n"; - my $msg = "As of $date, swap space is over $log_thresh% full on ". - "$hostname.\n\nSlice $hog is the largest consumer ". - "of physical memory at ${hog_mem}MB ($hog_pct%).\n". - "Please check the memory usage of your slice to avoid a reset.\n". - "\n$date $hostname warning $hog\n"; - - open(SENDMAIL, "|$sendmail") or die "Cannot open $sendmail: $!"; - print SENDMAIL $to; - print SENDMAIL $from; - print SENDMAIL $subject; - print SENDMAIL "Content-type: text/plain\n\n"; - print SENDMAIL $msg; - close(SENDMAIL); + send_mail($to, + "$proc may reset slice $hog on $hostname", + "As of $date, swap space is over $log_thresh% full on ". + "$hostname.\n\nSlice $hog is the largest consumer ". + "of physical memory at ${hog_mem}MB ($hog_pct%).\n". + "Please check the memory usage of your slice to avoid a ". + "reset.\n\n$date $hostname warning $hog\n"); } sub unkillable_alarm_mail { @@ -199,32 +429,21 @@ sub unkillable_alarm_mail { my $hog_mem = sprintf ("%0.f", $Slice{$hog}{pmem}/1000); my $hog_pct = $Slice{$hog}{mem_pct}; my $hostname = hostname(); - my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdat) - = localtime(time); - my $date = sprintf("%4d-%02d-%02d %02d:%02d:%02d", - $year+1900, $mon+1, $mday, $hour, $min, $sec); + my $date = get_date(); if ($hog =~ /^root$/) { - $to = "To: $alias\@planet-lab.org\n"; + $to = $alias_addr; } else { - $to = "To: $alias\@planet-lab.org, $hog\@slices.planet-lab.org\n"; + $to = "$alias_addr, $hog\@slices.planet-lab.org"; } - my $from = "From: support\@planet-lab.org\n"; - my $subject = "Subject: $proc: alarm for slice $hog on $hostname\n"; - my $msg = "As of $date, swap space is over $log_thresh% full on ". - "$hostname.\n\nSlice $hog is the largest consumer ". - "of physical memory at ${hog_mem}MB ($hog_pct%).\n". - "The slice will not be reset, but please verify its behavior.\n". - "\n$date $hostname alarm $hog\n"; - - open(SENDMAIL, "|$sendmail") or die "Cannot open $sendmail: $!"; - print SENDMAIL $to; - print SENDMAIL $from; - print SENDMAIL $subject; - print SENDMAIL "Content-type: text/plain\n\n"; - print SENDMAIL $msg; - close(SENDMAIL); + send_mail($to, + "$proc: alarm for slice $hog on $hostname", + "As of $date, swap space is over $log_thresh% full on ". + "$hostname.\n\nSlice $hog is the largest consumer ". + "of physical memory at ${hog_mem}MB ($hog_pct%).\n". + "The slice will not be reset, but please verify its behavior.\n". + "\n$date $hostname alarm $hog\n"); } sub slice_reset { @@ -250,9 +469,10 @@ sub swap_used { sub get_slice_info { if (! $debug) { - $content = get "http://127.0.0.1:3100/slicestat"; + $content = `curl -s http://127.0.0.1:3100/slicestat`; } else { - $content = `cat ../pl_mom-deploy/slicestat` + #$content = `cat ../pl_mom-deploy/slicestat` + $content = `curl -s http://127.0.0.1:3100/slicestat`; } my @lines = split(/\n/, $content); %Slice = ();