#!/usr/bin/perl -w

use POSIX qw(setsid);
use Sys::Syslog;
use Sys::Hostname;
#use LWP::Simple;

$debug = 0;
$proc = "pl_mom";
$alias_addr = "pl-mom\@planet-lab.org";
$from_addr = "support\@planet-lab.org";

if (! $debug) {
    $kill_thresh = 90;
    $reboot_thresh = 95;
    $log_thresh = 85;
    $change_thresh = 5;
    $min_thresh = 10;
    #$bwcap_default = "off";
    $bwcap_default = "1.5Mbit";
    $cutoff_default = "16200000000";  # 16GB, for 1.5Mbit cap
    $bwmon_sleep = 900;

    $sendmail = "/usr/sbin/sendmail -t -f$from_addr";
    $vservers = "/etc/vservers";
    $pidfile = "/var/run/$proc.pid";
    $rebootfile = "/var/lib/misc/pl_mom.reboot";
    $daily_log = "/var/lib/misc/pl_mom.daily";
    $daily_stamp = "/var/lib/misc/pl_mom.stamp";
    $configfile = "/etc/planetlab/pl_mom.conf";
    $capfile = "/var/lib/misc/pl_mom.oldcaps";
} else {
    $kill_thresh = 2;
    $reboot_thresh = 20;
    $log_thresh = 2;
    $change_thresh = 5;
    $min_thresh = 2;
    $bwcap_default = "1Kbit";
    $cutoff_default = "10800";
    $bwmon_sleep = 10;

    $sendmail = "cat";
    $vservers = "./debug";
    $pidfile = "./$proc.pid";
    $rebootfile = "./debug/pl_mom.reboot";
    $daily_log = "./debug/pl_mom.daily";
    $daily_stamp = "./debug/pl_mom.stamp";
    $configfile = "./debug/pl_mom.conf";
    $capfile = "./debug/pl_mom.oldcaps";
}

$sleep = 30;

# daemonize the program
if (! $debug) {
    &daemonize;
}

system("echo $$ > $pidfile");

read_config_file();

# Check to see whether pl_mom rebooted the node
if (-e $rebootfile) {
    unlink($rebootfile);
    syslog ("warning", "pl_mom: Sending shutdown mail");
    shutdown_mail();
}

my $pid = fork();
if (! $pid) {
    syslog ("info", "pl_mom: Launching reboot kicker");
    reboot_kicker();
    die (0);
}
$pid = fork();
if (! $pid) {
    syslog ("info", "pl_mom: Launching bandwidth monitor");
    if ($bwcap_default =~ /off/) {
	syslog("info", "pl_mom: Max rate unlimited by default");
    }
    reset_bandwidth_caps();
    bandwidth_monitor();
    die (0);
}

while (1) {
    $used = int(swap_used());

    if (defined($old_used)) {
	if ($used >= $old_used + $change_thresh) {
	    syslog ("info", "pl_mom: %d%% swap consumed in last %d seconds",
		    $used - $old_used, $sleep);
    	}
    }

    if ($used >= $log_thresh) {
	if (! defined($old_used) || $used != $old_used) {
	    syslog ("info", "pl_mom: Swap used: %d%%", $used);
	}
	get_slice_info();
	my $hog = memory_hog();
	if ($hog) {
	    if ($used < $kill_thresh) {
		if (! defined($Warning{$hog})) {
		    $Warning{$hog} = "sent";
		    syslog ("warning", "pl_mom: Slice $hog is ".
			    "using $Slice{$hog}{mem_pct}%% of memory");
		    #slice_warning_mail($hog);
		}
	    } else {
		my $id = `id -u $hog`;
		chomp($id);
		my $top = `/usr/sbin/chcontext --ctx $id /usr/bin/top -b -n 1`;
		syslog ("warning", "pl_mom: Resetting slice $hog");
		if (! $debug) {
		    slice_reset($hog);
		}
		syslog ("warning", "pl_mom: Sending mail to slice $hog");
		slice_reset_mail($hog, $top);
	    }
	}
    }
    
    sleep ($sleep);

    $old_used = $used;
}

sub reboot_kicker {
    while (1) {
	$used = swap_used();

	if ($used >= $reboot_thresh) {
	    syslog ("warning", "pl_mom: Rebooting node");

	    system("touch $rebootfile");
	    if (! $debug) {
		#system("shutdown -r now");
		system("/bin/sync; /sbin/reboot -f");
	    }
	    die (0);
	}
	
	sleep (1);
    }
}

sub bandwidth_monitor {
    while (1) {
        # See if a new day has started for bandwidth monitoring
	chomp($now = `date -u +%D`);
	if (-e $daily_stamp) {
	    chomp($stamp = `cat $daily_stamp`);
	}
	if (! defined($stamp) || !($stamp =~ $now)) {
	    open (STAMP, ">$daily_stamp") || 
		die "Can't open file $daily_stamp for writing: $!\n";
	    print STAMP "$now\n";
	    close STAMP;
	    unlink ($daily_log);

	    # Could save the list of capped slices in a file in order to
	    # avoid re-sending mails if the daemon restarts.
	    # Also may want a list of slices that are exempt from capping.
	    if (defined(%Start)) { undef %Start; }
	    if (defined(%Now))   { undef %Now; }
	    if (defined(%Cap))   { undef %Cap; }

	    reset_bandwidth_caps();

	    syslog("info", "pl_mom: Beginning bandwidth monitoring for $now");
	}

	get_slice_names();
	get_baseline_counts();
	get_slice_limits();

	foreach $slice ( sort (keys %Start) ) {
	    if (defined $Now{$slice}) {
		$today = $Now{$slice} - $Start{$slice};
		if (! (defined ($Cutoff{$slice})||$bwcap_default =~ /off/)) {
		    $Cutoff{$slice} = $cutoff_default;
		    $Maxrate{$slice} = $bwcap_default;
		}
		if ($debug) {
		    if ($today) {
			$cutoff = defined($Cutoff{$slice}) 
			    ? $Cutoff{$slice} : "<none>"; 
			print "Slice $slice sent $today bytes; ".
			    "cutoff $cutoff\n";
		    }
		}
		if (defined ($Cutoff{$slice}) && 
		    $today >= $Cutoff{$slice} && 
		    ! defined($Cap{$slice})) {
		    $Cap{$slice} = "sent";
		    bw_cap_mail($slice);
		    log_bandwidth_cap($slice, $Maxrate{$slice});
		    cap_bandwidth($slice, $Maxrate{$slice});
		}
	    } else {
		# Token bucket for this slice is gone!
	    }
	}

	sleep($bwmon_sleep);
    }
}

sub read_config_file {
    if (-e $configfile) {
	open (CONFIG, "<$configfile") ||
	    print "Cannot open $configfile; $!\n";
	while (<CONFIG>) {
	    if (m/^(.*)=(.*)$/) {
		${$1} = $2;
		if ($debug) {
		    print "read_config_file: $1 = ${$1}\n";
		}
	    }
	}
	close CONFIG;
    }
}

sub get_slice_names {
    # Read slice names from /etc/passwd
    if (defined (%Name)) { undef %Name; }
    open (PASSWD, "</etc/passwd") ||
	print "Cannot open /etc/passwd; $!\n";
    while (<PASSWD>) {
	my ($slicename, $passwd, $sliceid) = split(/:/);
	$Name{$sliceid} = $slicename;
    }
    close PASSWD;
}

sub get_baseline_counts {
    `touch $daily_log`;
    open (BASE, "+<$daily_log") ||
	print "Cannot open $daily_log; $!\n";
    while (<BASE>) {
	my ($slice, $bytecount) = split(/ /);
	$Start{$slice} = $bytecount;
    }

    my $status = `tc -s -d qdisc show`;
    my $sliceid = 0xffff;
    @Lines = split(/\n/, $status);
    foreach $line ( @Lines ) {
	if ($line =~ /qdisc pfifo (.*): dev/) {
	    $sliceid = hex($1);
	    # "Capped" buckets all begin with 0x1000. Ignore the root
	    # (0x1000) and default (0x1fff) buckets, as well as
	    # "exempt" buckets that begin with 0x2000 (or anything
	    # other than 0x1000).
	    if (($sliceid & 0xf000) == 0x1000 &&
		$sliceid != 0x1000 && $sliceid != 0x1fff) {
		$sliceid = $sliceid & 0x0fff;
	    } else {
		$sliceid = 0xffff;
	    }
	} else {
	    if ($line =~ /Sent (.*) bytes/) {
		my $bytes = $1;
		if ($sliceid != 0xffff) {
		    my $slice = $Name{$sliceid};
		    if ($debug && $bytes) {
			print "Slice: $slice ($sliceid), bytes $bytes\n";
		    }
		    if (! defined($Start{$slice})) {
			print BASE "$slice $bytes\n";
			$Start{$slice} = $bytes;
		    }
		    $Now{$slice} = $bytes;
		}
	    }
	}
    }
    close (BASE);
}

sub get_slice_limits {
    if (defined %Maxrate) { undef %Maxrate; }
    if (defined %Cutoff)  { undef %Cutoff; }
    if (-e $vservers) {
	my $result = `grep -H "^BWMAXRATE" $vservers/*.conf`;
	chomp ($result);
	my @Lines = split(/\n/,$result);
	foreach $line ( @Lines ) {
	    if ($line =~ /\/([^\/]*).conf:BWMAXRATE=(.*)[Mm]bit/) {
		$slice = $1;
		$limit = $2."Mbit";
		$cutoff = ($2 * 1000000 * 86400)/8;
	    } else {
		if ($line =~ /\/([^\/]*).conf:BWMAXRATE=(.*)[Kk]bit/) {
		    $slice = $1;
		    $limit = $2."Kbit";
		    $cutoff = ($2 * 1000 * 86400)/8;
		} else {
		    die "Could not parse line $line";
		}
	    }
	    $Maxrate{$slice} = $limit;
	    $Cutoff{$slice} = $cutoff;
	}
    }
}

sub reset_bandwidth_caps {
    if (-e $capfile) {
	open(CAP, "<$capfile") or die "Cannot open $capfile: $!";
	while (<CAP>) {
	    chomp();
	    ($slicename, $oldcap) = split(/ /);
	    syslog("info", "pl_mom: Restoring bandwidth cap of $oldcap ".
		   "to $slicename");
	    cap_bandwidth ($slicename, $oldcap);
	}
	close CAP;
	unlink($capfile);
    }
}

sub log_bandwidth_cap {
    ($slicename, $cap) = @_;
    syslog("warning", "pl_mom: Capping bandwidth of slice ".
	   "$slicename at $cap until midnight GMT.");
    # Save current cap to $capfile
    system("echo $slicename `bwlimit getcap $slicename` >> $capfile");
}

sub send_mail {
    # Arg 0: recipient addresses, comma-separated string
    # Arg 1: subject line
    # Arg 2: body of message
    my $to = "To: $_[0]\n";
    my $from = "From: $from_addr\n";
    my $subject = "Subject: $_[1]\n";
    my $msg = $_[2];

    if ($debug) {
	print $to;
	print $subject;
    } else {
	open(SENDMAIL, "|$sendmail") or die "Cannot open $sendmail: $!";
	print SENDMAIL $to;
	print SENDMAIL $from;
	print SENDMAIL $subject;
	print SENDMAIL "Content-type: text/plain\n\n";
	print SENDMAIL $msg;
	close(SENDMAIL);
    }
}

sub cap_bandwidth {
    ($slicename, $cap) = @_;
    system("bwlimit setcap $slicename $cap");
    system("bwlimit on $slicename");
}

sub get_date {
    my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdat) 
	= localtime(time);
    my $date = sprintf("%4d-%02d-%02d %02d:%02d:%02d", 
		       $year+1900, $mon+1, $mday, $hour, $min, $sec);
    return $date;
}

sub bw_cap_mail {
    my ($slicename) = @_;
    my $hostname = hostname();
    my $date = get_date();
    my $sent = int($Cutoff{$slicename}/(1024*1024));
    my $bwcap = $Maxrate{$slicename};

    send_mail("$alias_addr, $slicename\@slices.planet-lab.org",
	      "$proc capped bandwidth of slice $slicename on $hostname",
	      "Slice $slicename has transmitted more than ${sent}MB today".
	      " on $hostname.  ".
	      "Its bandwidth will be capped at $bwcap until midnight GMT.".
	      "\n\n$date $hostname bwcap $slicename\n");
}

sub shutdown_mail {
    my $hostname = hostname(); 
    my $date = get_date();
    send_mail($alias_addr, 
	      "$proc rebooted $hostname", 
	      "Swap space was exhausted on $hostname and so $proc rebooted ".
	      "it.\n\nAs of $date, the node has successfully come back ".
	      "online.\n\n$date $hostname reboot\n");
}

sub slice_reset_mail {
    my $hog = $_[0];
    my $top = $_[1];
    my $hog_mem = sprintf ("%0.f", $Slice{$hog}{pmem}/1000);
    my $hog_pct = $Slice{$hog}{mem_pct};
    my $hostname = hostname(); 
    my $date = get_date();
    send_mail("$alias_addr, $hog\@slices.planet-lab.org",
	      "$proc reset slice $hog on $hostname",
	      "As of $date, swap space is nearly exhausted on $hostname.\n\n".
	      "Slice $hog is being reset since it is the largest consumer ".
	      "of physical memory at ${hog_mem}MB ($hog_pct%).\n\n".
	      "Please reply to this message explaining the nature of your ".
	      "experiment, and what you are doing to address the problem.\n".
	      "\nOutput of 'top -b -n 1' in your slice prior to reset:\n".
	      "$top\n\n$date $hostname reset $hog\n");
}

sub slice_warning_mail {
    my $hog = $_[0];
    my $hog_mem = sprintf ("%0.f", $Slice{$hog}{pmem}/1000);
    my $hog_pct = $Slice{$hog}{mem_pct};
    my $hostname = hostname(); 
    my $date = get_date();

    if ($hog =~ /^root$/) {
	$to = $alias_addr;
    } else {
	$to = "$alias_addr, $hog\@slices.planet-lab.org";
    }	

    send_mail($to,
	      "$proc may reset slice $hog on $hostname",
	      "As of $date, swap space is over $log_thresh% full on ".
	      "$hostname.\n\nSlice $hog is the largest consumer ".
	      "of physical memory at ${hog_mem}MB ($hog_pct%).\n".
	      "Please check the memory usage of your slice to avoid a ".
	      "reset.\n\n$date $hostname warning $hog\n");
}

sub unkillable_alarm_mail {
    my $hog = $_[0];
    my $hog_mem = sprintf ("%0.f", $Slice{$hog}{pmem}/1000);
    my $hog_pct = $Slice{$hog}{mem_pct};
    my $hostname = hostname(); 
    my $date = get_date();

    if ($hog =~ /^root$/) {
	$to = $alias_addr;
    } else {
	$to = "$alias_addr, $hog\@slices.planet-lab.org";
    }

    send_mail($to,
	      "$proc: alarm for slice $hog on $hostname",
	      "As of $date, swap space is over $log_thresh% full on ".
	      "$hostname.\n\nSlice $hog is the largest consumer ".
	      "of physical memory at ${hog_mem}MB ($hog_pct%).\n".
	      "The slice will not be reset, but please verify its behavior.\n".
	      "\n$date $hostname alarm $hog\n");
}

sub slice_reset {
    my $slice = $_[0];
    my $sliceid = $Slice{$slice}{ctx};
    system("chcontext --ctx $sliceid sudo kill -9 -1");
    system("/etc/init.d/vserver-init start $slice");
}

sub swap_used {
    open (SWAP, "</proc/swaps") ||
        die "Cannot open /proc/swaps; $!\n";

    $line = <SWAP>;
    $line = <SWAP>;
    $line =~ s/[\t ]+/ /g;

    my ($filename, $type, $size, $used, $priority) = split(/ /, $line);
    close SWAP;
 
    return 100*($used/$size);
}

sub get_slice_info {
    if (! $debug) {
	$content = `curl -s http://127.0.0.1:3100/slicestat`;
    } else {
	#$content = `cat ../pl_mom-deploy/slicestat`
	$content = `curl -s http://127.0.0.1:3100/slicestat`;
    }
    my @lines = split(/\n/, $content);
    %Slice = ();
    foreach $line (@lines) {
	my ($slice, $ctx, $cpu_pct, $mem_pct, $pmem, $vmem, $ntasks) 
	    = split(/,/,$line);
	$Slice{$slice}{ctx} = $ctx;
	$Slice{$slice}{cpu_pct} = $cpu_pct;
	$Slice{$slice}{mem_pct} = $mem_pct;
	$Slice{$slice}{pmem} = $pmem;
	$Slice{$slice}{vmem} = $vmem;
	$Slice{$slice}{ntasks} = $ntasks;
    }
}

sub memory_hog {
    @keys = sort { $Slice{$b}{mem_pct} <=> $Slice{$a}{mem_pct} } (keys %Slice);
    foreach $key (@keys) {
	if ($Slice{$key}{mem_pct} >= $min_thresh) {
	    if ($key =~ /^root$/ || $key =~ /slicestat/ || $key =~ /netflow/) {
		if (! defined ($Warning{$key})) {
		    $Warning{$key} = "sent";
		    syslog ("warning", "pl_mom: Sending alarm mail to ".
			    "unkillable slice $key, using ".
			    "$Slice{$key}{mem_pct}%% of memory");
		    unkillable_alarm_mail($key);
		}
	    } else {
		return $key;
	    }
	} else {
	    #syslog ("info", "pl_mom: No killable slice using > ".
	    #    "$min_thresh%% memory");
	    return;
	}
    }
}

sub daemonize {
    chdir '/'                 or die "Can't chdir to /: $!";
    open STDIN, '/dev/null'   or die "Can't read /dev/null: $!";
    open STDOUT, '>>/dev/null' or die "Can't write to /dev/null: $!";
    open STDERR, '>>/dev/null' or die "Can't write to /dev/null: $!";
    defined(my $pid = fork)   or die "Can't fork: $!";
    exit if $pid;
    setsid                    or die "Can't start a new session: $!";
    umask 0;
}
