--- /dev/null
+#!/usr/bin/perl -w
+
+use POSIX qw(setsid);
+use Sys::Syslog;
+use Sys::Hostname;
+use LWP::Simple;
+
+$debug = 0;
+$proc = "pl_mom";
+$alias = "pl-mom";
+
+if (! $debug) {
+ $kill_thresh = 90;
+ $reboot_thresh = 95;
+ $log_thresh = 85;
+ $change_thresh = 5;
+ $min_thresh = 10;
+
+ $sendmail = "/usr/sbin/sendmail -t";
+ $pidfile = "/var/run/$proc.pid";
+ $rebootfile = "/var/lib/misc/pl_mom.reboot";
+} else {
+ $kill_thresh = 2;
+ $reboot_thresh = 20;
+ $log_thresh = 2;
+ $change_thresh = 5;
+ $min_thresh = 10;
+
+ $sendmail = "cat";
+ $pidfile = "./$proc.pid";
+ $rebootfile = "./pl_mom.reboot";
+}
+
+$sleep = 30;
+
+# daemonize the program
+if (! $debug) {
+ &daemonize;
+}
+
+system("echo $$ > $pidfile");
+
+if (-e $rebootfile) {
+ unlink($rebootfile);
+ syslog ("warning", "pl_mom: Sending shutdown mail");
+ shutdown_mail();
+}
+
+my $pid = fork();
+if (! $pid) {
+ syslog ("info", "pl_mom: Launching reboot kicker");
+ reboot_kicker();
+ die (0);
+}
+
+while (1) {
+ $used = int(swap_used());
+
+ if (defined($old_used)) {
+ if ($used >= $old_used + $change_thresh) {
+ syslog ("info", "pl_mom: %d%% swap consumed in last %d seconds",
+ $used - $old_used, $sleep);
+ }
+ }
+
+ if ($used >= $log_thresh) {
+ if (! defined($old_used) || $used != $old_used) {
+ syslog ("info", "pl_mom: Swap used: %d%%", $used);
+ }
+ get_slice_info();
+ my $hog = memory_hog();
+ if ($hog) {
+ if ($used < $kill_thresh) {
+ if (! defined($Warning{$hog})) {
+ $Warning{$hog} = "sent";
+ syslog ("warning", "pl_mom: Slice $hog is ".
+ "using $Slice{$hog}{mem_pct}%% of memory");
+ #slice_warning_mail($hog);
+ }
+ } else {
+ syslog ("warning", "pl_mom: Resetting slice $hog");
+ if (! $debug) {
+ slice_reset($hog);
+ }
+ syslog ("warning", "pl_mom: Sending mail to slice $hog");
+ slice_reset_mail($hog);
+ }
+ }
+ }
+
+ sleep ($sleep);
+
+ $old_used = $used;
+}
+
+sub reboot_kicker {
+ while (1) {
+ $used = swap_used();
+
+ if ($used >= $reboot_thresh) {
+ syslog ("warning", "pl_mom: Rebooting node");
+
+ system("touch $rebootfile");
+ if (! $debug) {
+ system("shutdown -r now");
+ }
+ die (0);
+ }
+
+ sleep (1);
+ }
+}
+
+sub shutdown_mail {
+ my $hostname = hostname();
+ my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdat)
+ = localtime(time);
+ my $date = sprintf("%4d-%02d-%02d %02d:%02d:%02d",
+ $year+1900, $mon+1, $mday, $hour, $min, $sec);
+ my $to = "To: $alias\@planet-lab.org\n";
+ my $from = "From: support\@planet-lab.org\n";
+ my $subject = "Subject: $proc rebooted $hostname\n";
+ my $msg ="Swap space was exhausted on $hostname and so $proc rebooted ".
+ "it.\n\nAs of $date, the node has successfully come back online.\n".
+ "\n$date $hostname reboot\n";
+
+ open(SENDMAIL, "|$sendmail") or die "Cannot open $sendmail: $!";
+ print SENDMAIL $to;
+ print SENDMAIL $from;
+ print SENDMAIL $subject;
+ print SENDMAIL "Content-type: text/plain\n\n";
+ print SENDMAIL $msg;
+ close(SENDMAIL);
+}
+
+sub slice_reset_mail {
+ my $hog = $_[0];
+ my $hog_mem = sprintf ("%0.f", $Slice{$hog}{pmem}/1000);
+ my $hog_pct = $Slice{$hog}{mem_pct};
+ my $hostname = hostname();
+ my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdat)
+ = localtime(time);
+ my $date = sprintf("%4d-%02d-%02d %02d:%02d:%02d",
+ $year+1900, $mon+1, $mday, $hour, $min, $sec);
+ my $to = "To: $alias\@planet-lab.org, $hog\@slices.planet-lab.org\n";
+ my $from = "From: support\@planet-lab.org\n";
+ my $subject = "Subject: $proc resetting slice $hog on $hostname\n";
+ my $msg = "As of $date, swap space is nearly exhausted on $hostname.\n\n".
+ "Slice $hog is being reset since it is the largest consumer ".
+ "of physical memory at ${hog_mem}MB ($hog_pct%).\n\n".
+ "Please reply to this message explaining the nature of your ".
+ "experiment, and what you are doing to address the problem.\n".
+ "\n$date $hostname reset $hog\n";
+
+ open(SENDMAIL, "|$sendmail") or die "Cannot open $sendmail: $!";
+ print SENDMAIL $to;
+ print SENDMAIL $from;
+ print SENDMAIL $subject;
+ print SENDMAIL "Content-type: text/plain\n\n";
+ print SENDMAIL $msg;
+ close(SENDMAIL);
+}
+
+sub slice_warning_mail {
+ my $hog = $_[0];
+ my $hog_mem = sprintf ("%0.f", $Slice{$hog}{pmem}/1000);
+ my $hog_pct = $Slice{$hog}{mem_pct};
+ my $hostname = hostname();
+ my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdat)
+ = localtime(time);
+ my $date = sprintf("%4d-%02d-%02d %02d:%02d:%02d",
+ $year+1900, $mon+1, $mday, $hour, $min, $sec);
+
+ if ($hog =~ /^root$/) {
+ $to = "To: $alias\@planet-lab.org\n";
+ } else {
+ $to = "To: $alias\@planet-lab.org, $hog\@slices.planet-lab.org\n";
+ }
+
+ my $from = "From: support\@planet-lab.org\n";
+ my $subject = "Subject: $proc may reset slice $hog on $hostname\n";
+ my $msg = "As of $date, swap space is over $log_thresh% full on ".
+ "$hostname.\n\nSlice $hog is the largest consumer ".
+ "of physical memory at ${hog_mem}MB ($hog_pct%).\n".
+ "Please check the memory usage of your slice to avoid a reset.\n".
+ "\n$date $hostname warning $hog\n";
+
+ open(SENDMAIL, "|$sendmail") or die "Cannot open $sendmail: $!";
+ print SENDMAIL $to;
+ print SENDMAIL $from;
+ print SENDMAIL $subject;
+ print SENDMAIL "Content-type: text/plain\n\n";
+ print SENDMAIL $msg;
+ close(SENDMAIL);
+}
+
+sub unkillable_alarm_mail {
+ my $hog = $_[0];
+ my $hog_mem = sprintf ("%0.f", $Slice{$hog}{pmem}/1000);
+ my $hog_pct = $Slice{$hog}{mem_pct};
+ my $hostname = hostname();
+ my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdat)
+ = localtime(time);
+ my $date = sprintf("%4d-%02d-%02d %02d:%02d:%02d",
+ $year+1900, $mon+1, $mday, $hour, $min, $sec);
+
+ if ($hog =~ /^root$/) {
+ $to = "To: $alias\@planet-lab.org\n";
+ } else {
+ $to = "To: $alias\@planet-lab.org, $hog\@slices.planet-lab.org\n";
+ }
+
+ my $from = "From: support\@planet-lab.org\n";
+ my $subject = "Subject: $proc: alarm for slice $hog on $hostname\n";
+ my $msg = "As of $date, swap space is over $log_thresh% full on ".
+ "$hostname.\n\nSlice $hog is the largest consumer ".
+ "of physical memory at ${hog_mem}MB ($hog_pct%).\n".
+ "The slice will not be reset, but please verify its behavior.\n".
+ "\n$date $hostname alarm $hog\n";
+
+ open(SENDMAIL, "|$sendmail") or die "Cannot open $sendmail: $!";
+ print SENDMAIL $to;
+ print SENDMAIL $from;
+ print SENDMAIL $subject;
+ print SENDMAIL "Content-type: text/plain\n\n";
+ print SENDMAIL $msg;
+ close(SENDMAIL);
+}
+
+sub slice_reset {
+ my $slice = $_[0];
+ my $sliceid = $Slice{$slice}{ctx};
+ system("chcontext --ctx $sliceid sudo kill -9 -1");
+ system("/etc/init.d/vserver-init start $slice");
+}
+
+sub swap_used {
+ open (SWAP, "</proc/swaps") ||
+ die "Cannot open /proc/swaps; $!\n";
+
+ $line = <SWAP>;
+ $line = <SWAP>;
+ $line =~ s/[\t ]+/ /g;
+
+ my ($filename, $type, $size, $used, $priority) = split(/ /, $line);
+ close SWAP;
+
+ return 100*($used/$size);
+}
+
+sub get_slice_info {
+ if (! $debug) {
+ $content = get "http://127.0.0.1:3100/slicestat";
+ } else {
+ $content = `cat ../pl_mom-deploy/slicestat`
+ }
+ my @lines = split(/\n/, $content);
+ %Slice = ();
+ foreach $line (@lines) {
+ my ($slice, $ctx, $cpu_pct, $mem_pct, $pmem, $vmem, $ntasks)
+ = split(/,/,$line);
+ $Slice{$slice}{ctx} = $ctx;
+ $Slice{$slice}{cpu_pct} = $cpu_pct;
+ $Slice{$slice}{mem_pct} = $mem_pct;
+ $Slice{$slice}{pmem} = $pmem;
+ $Slice{$slice}{vmem} = $vmem;
+ $Slice{$slice}{ntasks} = $ntasks;
+ }
+}
+
+sub memory_hog {
+ @keys = sort { $Slice{$b}{mem_pct} <=> $Slice{$a}{mem_pct} } (keys %Slice);
+ foreach $key (@keys) {
+ if ($Slice{$key}{mem_pct} >= $min_thresh) {
+ if ($key =~ /^root$/ || $key =~ /slicestat/ || $key =~ /netflow/) {
+ if (! defined ($Warning{$key})) {
+ $Warning{$key} = "sent";
+ syslog ("warning", "pl_mom: Sending alarm mail to ".
+ "unkillable slice $key, using ".
+ "$Slice{$key}{mem_pct}%% of memory");
+ unkillable_alarm_mail($key);
+ }
+ } else {
+ return $key;
+ }
+ } else {
+ #syslog ("info", "pl_mom: No killable slice using > ".
+ # "$min_thresh%% memory");
+ return;
+ }
+ }
+}
+
+sub daemonize {
+ chdir '/' or die "Can't chdir to /: $!";
+ open STDIN, '/dev/null' or die "Can't read /dev/null: $!";
+ open STDOUT, '>>/dev/null' or die "Can't write to /dev/null: $!";
+ open STDERR, '>>/dev/null' or die "Can't write to /dev/null: $!";
+ defined(my $pid = fork) or die "Can't fork: $!";
+ exit if $pid;
+ setsid or die "Can't start a new session: $!";
+ umask 0;
+}