use POSIX qw(setsid);
use Sys::Syslog;
use Sys::Hostname;
-use LWP::Simple;
+#use LWP::Simple;
$debug = 0;
$proc = "pl_mom";
-$alias = "pl-mom";
+$alias_addr = "pl-mom\@planet-lab.org";
+$from_addr = "support\@planet-lab.org";
+$bwcap = "1.5Mbit";
if (! $debug) {
$kill_thresh = 90;
$log_thresh = 85;
$change_thresh = 5;
$min_thresh = 10;
+ $byte_cutoff = 16000000000; # 16GB
+ #$byte_cutoff = 5000000000; # 5GB
+ $bwmon_sleep = 900;
- $sendmail = "/usr/sbin/sendmail -t";
+ $sendmail = "/usr/sbin/sendmail -t -f$from_addr";
$pidfile = "/var/run/$proc.pid";
$rebootfile = "/var/lib/misc/pl_mom.reboot";
+ $daily_log = "/var/lib/misc/pl_mom.daily";
+ $daily_stamp = "/var/lib/misc/pl_mom.stamp";
} else {
$kill_thresh = 2;
$reboot_thresh = 20;
$log_thresh = 2;
$change_thresh = 5;
$min_thresh = 10;
+ $byte_cutoff = 16000;
+ $bwmon_sleep = 10;
$sendmail = "cat";
$pidfile = "./$proc.pid";
$rebootfile = "./pl_mom.reboot";
+ $daily_log = "./pl_mom.daily";
+ $daily_stamp = "./pl_mom.stamp";
}
$sleep = 30;
system("echo $$ > $pidfile");
+# Check to see whether pl_mom rebooted the node
if (-e $rebootfile) {
unlink($rebootfile);
syslog ("warning", "pl_mom: Sending shutdown mail");
reboot_kicker();
die (0);
}
+$pid = fork();
+if (! $pid) {
+ syslog ("info", "pl_mom: Launching bandwidth monitor");
+ bandwidth_monitor();
+ die (0);
+}
while (1) {
$used = int(swap_used());
#slice_warning_mail($hog);
}
} else {
+ my $id = `id -u $hog`;
+ chomp($id);
+ my $top = `chcontext --ctx $id top -b -n 1`;
syslog ("warning", "pl_mom: Resetting slice $hog");
if (! $debug) {
slice_reset($hog);
}
syslog ("warning", "pl_mom: Sending mail to slice $hog");
- slice_reset_mail($hog);
+ slice_reset_mail($hog, $top);
}
}
}
}
}
-sub shutdown_mail {
- my $hostname = hostname();
- my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdat)
- = localtime(time);
- my $date = sprintf("%4d-%02d-%02d %02d:%02d:%02d",
- $year+1900, $mon+1, $mday, $hour, $min, $sec);
- my $to = "To: $alias\@planet-lab.org\n";
- my $from = "From: support\@planet-lab.org\n";
- my $subject = "Subject: $proc rebooted $hostname\n";
- my $msg ="Swap space was exhausted on $hostname and so $proc rebooted ".
- "it.\n\nAs of $date, the node has successfully come back online.\n".
- "\n$date $hostname reboot\n";
+sub bandwidth_monitor {
+ while (1) {
+ # See if a new day has started for bandwidth monitoring
+ chomp($now = `date -u +%D`);
+ if (-e $daily_stamp) {
+ chomp($stamp = `cat $daily_stamp`);
+ }
+ if (! defined($stamp) || !($stamp =~ $now)) {
+ open (STAMP, ">$daily_stamp") ||
+ die "Can't open file $daily_stamp for writing: $!\n";
+ print STAMP "$now\n";
+ close STAMP;
+ unlink ($daily_log);
+
+ # Could save the list of capped slices in a file in order to
+ # avoid re-sending mails if the daemon restarts.
+ # Also may want a list of slices that are exempt from capping.
+ if (defined(%Start)) { undef %Start; }
+ if (defined(%Now)) { undef %Now; }
+
+ if (defined(%Cap)) {
+ # Reset bandwidth limits here
+ chomp($cap = `cat /etc/planetlab/bwcap`);
+ foreach $slice ( sort ( keys %Cap ) ) {
+ cap_bandwidth ($slice, $cap);
+ }
+ undef %Cap;
+ }
+
+ syslog("info", "pl_mom: Beginning bandwidth monitoring for $now");
+ }
+
+ # Get baseline counts
+ `touch $daily_log`;
+ open (BASE, "+<$daily_log") ||
+ print "Cannot open $daily_log; $!\n";
+ while (<BASE>) {
+ my ($sliceid, $bytecount) = split(/ /);
+ $Start{$sliceid} = $bytecount;
+ }
+
+ $status = `tc -s -d qdisc show`;
+ @lines = split(/\n/, $status);
+ for ($i = 0; $i < @lines; $i++) {
+ if ($lines[$i] =~ /qdisc pfifo/) {
+ $lines[$i] =~ s/^ +//;
+ @fields = split(/ /, $lines[$i]);
+ $slice = $fields[2];
+ $slice =~ s/://;
+
+ if ($slice != 9999) {
+ $lines[$i+1] =~ s/^ +//;
+ @fields = split(/ /, $lines[$i+1]);
+ $bytes = $fields[1];
+ #if ($bytes) {print "Slice $slice sent $bytes bytes\n";}
+
+ if (! defined($Start{$slice})) {
+ print BASE "$slice $bytes\n";
+ $Start{$slice} = $bytes;
+ }
+ $Now{$slice} = $bytes;
+ }
+ }
+ }
+ close (BASE);
+
+ foreach $slice ( sort (keys %Start) ) {
+ if (defined $Now{$slice}) {
+ $today = $Now{$slice} - $Start{$slice};
+ if ($today >= $byte_cutoff && ! defined($Cap{$slice})) {
+ $Cap{$slice} = "sent";
+ $slicename = get_slice_name($slice);
+ if ($slicename) {
+ bw_cap_mail($slicename);
+ # Cap bandwidth here
+ cap_bandwidth($slicename, $bwcap);
+ } else {
+ syslog("warning", "pl_mom: Could not find slice name".
+ " for slice ID $slice");
+ }
+ }
+ } else {
+ # The /proc/virtual/<id> directory is gone...
+ }
+ }
+
+ sleep($bwmon_sleep);
+ }
+}
+
+sub send_mail {
+ # Arg 0: recipient addresses, comma-separated string
+ # Arg 1: subject line
+ # Arg 2: body of message
+ my $to = "To: $_[0]\n";
+ my $from = "From: $from_addr\n";
+ my $subject = "Subject: $_[1]\n";
+ my $msg = $_[2];
open(SENDMAIL, "|$sendmail") or die "Cannot open $sendmail: $!";
print SENDMAIL $to;
close(SENDMAIL);
}
-sub slice_reset_mail {
- my $hog = $_[0];
- my $hog_mem = sprintf ("%0.f", $Slice{$hog}{pmem}/1000);
- my $hog_pct = $Slice{$hog}{mem_pct};
- my $hostname = hostname();
+sub cap_bandwidth {
+ # Arg 0: slice name
+ # Arg 1: bandwidth cap for 'tc'
+ `bwlimit setcap $_[0] $_[1]`;
+ `bwlimit on $_[0]`;
+}
+
+sub get_date {
my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdat)
= localtime(time);
my $date = sprintf("%4d-%02d-%02d %02d:%02d:%02d",
$year+1900, $mon+1, $mday, $hour, $min, $sec);
- my $to = "To: $alias\@planet-lab.org, $hog\@slices.planet-lab.org\n";
- my $from = "From: support\@planet-lab.org\n";
- my $subject = "Subject: $proc resetting slice $hog on $hostname\n";
- my $msg = "As of $date, swap space is nearly exhausted on $hostname.\n\n".
- "Slice $hog is being reset since it is the largest consumer ".
- "of physical memory at ${hog_mem}MB ($hog_pct%).\n\n".
- "Please reply to this message explaining the nature of your ".
- "experiment, and what you are doing to address the problem.\n".
- "\n$date $hostname reset $hog\n";
+ return $date;
+}
- open(SENDMAIL, "|$sendmail") or die "Cannot open $sendmail: $!";
- print SENDMAIL $to;
- print SENDMAIL $from;
- print SENDMAIL $subject;
- print SENDMAIL "Content-type: text/plain\n\n";
- print SENDMAIL $msg;
- close(SENDMAIL);
+sub get_slice_name {
+ # Arg 0: slice ID
+
+ # Need to map slice id to slice name; is there a sensor?
+ # For now, get it from /etc/passwd
+ my $name = "";
+ open (PASSWD, "</etc/passwd") ||
+ print "Cannot open /etc/passwd; $!\n";
+ while (<PASSWD>) {
+ my ($slicename, $passwd, $sliceid) = split(/:/);
+ if ($sliceid == $_[0]) {
+ $name = $slicename;
+ }
+ }
+ close PASSWD;
+ return $name;
+}
+
+sub bw_cap_mail {
+ my $hostname = hostname();
+ my $date = get_date();
+ my $sent = int($byte_cutoff/1000000000);
+
+ # Put this here because this is where we have the
+ syslog("warning", "pl_mom: Capping bandwidth of slice ".
+ "$slicename at $bwcap until midnight GMT.");
+
+ send_mail($alias_addr,
+ "$proc capped bandwidth of slice $slicename on $hostname",
+ "Slice $slicename has transmitted more than ${sent}GB today".
+ " on $hostname. ".
+ "Its bandwidth will be capped at $bwcap until midnight GMT.".
+ "\n\n$date $hostname bwcap $slicename\n");
+}
+
+sub shutdown_mail {
+ my $hostname = hostname();
+ my $date = get_date();
+ send_mail($alias_addr,
+ "$proc rebooted $hostname",
+ "Swap space was exhausted on $hostname and so $proc rebooted ".
+ "it.\n\nAs of $date, the node has successfully come back ".
+ "online.\n\n$date $hostname reboot\n");
+}
+
+sub slice_reset_mail {
+ my $hog = $_[0];
+ my $top = $_[1];
+ my $hog_mem = sprintf ("%0.f", $Slice{$hog}{pmem}/1000);
+ my $hog_pct = $Slice{$hog}{mem_pct};
+ my $hostname = hostname();
+ my $date = get_date();
+ send_mail("$alias_addr, $hog\@slices.planet-lab.org",
+ "$proc reset slice $hog on $hostname",
+ "As of $date, swap space is nearly exhausted on $hostname.\n\n".
+ "Slice $hog is being reset since it is the largest consumer ".
+ "of physical memory at ${hog_mem}MB ($hog_pct%).\n\n".
+ "Please reply to this message explaining the nature of your ".
+ "experiment, and what you are doing to address the problem.\n".
+ "\nOutput of 'top -b -n 1' in your slice prior to reset:\n".
+ "$top\n\n$date $hostname reset $hog\n");
}
sub slice_warning_mail {
my $hog_mem = sprintf ("%0.f", $Slice{$hog}{pmem}/1000);
my $hog_pct = $Slice{$hog}{mem_pct};
my $hostname = hostname();
- my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdat)
- = localtime(time);
- my $date = sprintf("%4d-%02d-%02d %02d:%02d:%02d",
- $year+1900, $mon+1, $mday, $hour, $min, $sec);
+ my $date = get_date();
if ($hog =~ /^root$/) {
- $to = "To: $alias\@planet-lab.org\n";
+ $to = $alias_addr;
} else {
- $to = "To: $alias\@planet-lab.org, $hog\@slices.planet-lab.org\n";
+ $to = "$alias_addr, $hog\@slices.planet-lab.org";
}
- my $from = "From: support\@planet-lab.org\n";
- my $subject = "Subject: $proc may reset slice $hog on $hostname\n";
- my $msg = "As of $date, swap space is over $log_thresh% full on ".
- "$hostname.\n\nSlice $hog is the largest consumer ".
- "of physical memory at ${hog_mem}MB ($hog_pct%).\n".
- "Please check the memory usage of your slice to avoid a reset.\n".
- "\n$date $hostname warning $hog\n";
-
- open(SENDMAIL, "|$sendmail") or die "Cannot open $sendmail: $!";
- print SENDMAIL $to;
- print SENDMAIL $from;
- print SENDMAIL $subject;
- print SENDMAIL "Content-type: text/plain\n\n";
- print SENDMAIL $msg;
- close(SENDMAIL);
+ send_mail($to,
+ "$proc may reset slice $hog on $hostname",
+ "As of $date, swap space is over $log_thresh% full on ".
+ "$hostname.\n\nSlice $hog is the largest consumer ".
+ "of physical memory at ${hog_mem}MB ($hog_pct%).\n".
+ "Please check the memory usage of your slice to avoid a ".
+ "reset.\n\n$date $hostname warning $hog\n");
}
sub unkillable_alarm_mail {
my $hog_mem = sprintf ("%0.f", $Slice{$hog}{pmem}/1000);
my $hog_pct = $Slice{$hog}{mem_pct};
my $hostname = hostname();
- my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdat)
- = localtime(time);
- my $date = sprintf("%4d-%02d-%02d %02d:%02d:%02d",
- $year+1900, $mon+1, $mday, $hour, $min, $sec);
+ my $date = get_date();
if ($hog =~ /^root$/) {
- $to = "To: $alias\@planet-lab.org\n";
+ $to = $alias_addr;
} else {
- $to = "To: $alias\@planet-lab.org, $hog\@slices.planet-lab.org\n";
+ $to = "$alias_addr, $hog\@slices.planet-lab.org";
}
- my $from = "From: support\@planet-lab.org\n";
- my $subject = "Subject: $proc: alarm for slice $hog on $hostname\n";
- my $msg = "As of $date, swap space is over $log_thresh% full on ".
- "$hostname.\n\nSlice $hog is the largest consumer ".
- "of physical memory at ${hog_mem}MB ($hog_pct%).\n".
- "The slice will not be reset, but please verify its behavior.\n".
- "\n$date $hostname alarm $hog\n";
-
- open(SENDMAIL, "|$sendmail") or die "Cannot open $sendmail: $!";
- print SENDMAIL $to;
- print SENDMAIL $from;
- print SENDMAIL $subject;
- print SENDMAIL "Content-type: text/plain\n\n";
- print SENDMAIL $msg;
- close(SENDMAIL);
+ send_mail($to,
+ "$proc: alarm for slice $hog on $hostname",
+ "As of $date, swap space is over $log_thresh% full on ".
+ "$hostname.\n\nSlice $hog is the largest consumer ".
+ "of physical memory at ${hog_mem}MB ($hog_pct%).\n".
+ "The slice will not be reset, but please verify its behavior.\n".
+ "\n$date $hostname alarm $hog\n");
}
sub slice_reset {
sub get_slice_info {
if (! $debug) {
- $content = get "http://127.0.0.1:3100/slicestat";
+ #$content = get "http://127.0.0.1:3100/slicestat";
+ $content = `curl -s http://127.0.0.1:3100/slicestat`;
} else {
$content = `cat ../pl_mom-deploy/slicestat`
}