From 7ad743a954836ff62bfc9670b4d201e0a6b6c69b Mon Sep 17 00:00:00 2001 From: Mark Huang Date: Fri, 28 Apr 2006 19:11:08 +0000 Subject: [PATCH] - rewrite, rename to swapmon.py and swapmon.init --- pl_mom | 74 -------- pl_mom.pl | 532 ------------------------------------------------------ 2 files changed, 606 deletions(-) delete mode 100755 pl_mom delete mode 100755 pl_mom.pl diff --git a/pl_mom b/pl_mom deleted file mode 100755 index a41c481..0000000 --- a/pl_mom +++ /dev/null @@ -1,74 +0,0 @@ -#!/bin/sh -# -# chkconfig: 345 98 02 -# description: pl_mom (daemon of death) startup script -# - -CODE='/usr/local/planetlab/bin/pl_mom.pl' -PROC='pl_mom' - -. /etc/rc.d/init.d/functions - -RETVAL=0 - -pidfile=/var/run/$PROC.pid - -check_status() { - pid=`cat $pidfile 2>/dev/null` - # - # this eliminates a race condition between checking existence of pidfile - # and reading its value - # - [ -n "$pid" -a -d /proc/$pid ] -} - -case "$1" in - start) - echo -n "starting $PROC:" - [ -r $CODE ] || action "code missing" /bin/false || exit 1 - pid=`cat $pidfile 2>/dev/null` - if [ -n "$pid" ]; then - # check whether process really exists - # yes - don't try to start - [ -d /proc/$pid ] && action "already running" /bin/true && exit 1 - - # no - PID file is stale - rm -f $pidfile - fi - - $CODE - sleep 1 - - cmd=success - check_status || cmd=failure - $cmd "$PROC startup" - echo - ;; - - stop) - echo -n "shutting down $PROC: " - check_status && kill -TERM -`cat $pidfile` && sleep 1 - cmd=failure - check_status || cmd=success && rm -f $pidfile - $cmd "$PROC shutdown" - RETVAL=0 - echo - ;; - - restart|reload) - $0 stop - $0 start - RETVAL=$? - ;; - - status) - check_status && echo 'running' && exit 0 || \ - echo 'not running' && exit 1 - ;; - - *) - echo "Usage: $0 {start|stop|restart|status}" - exit 1 -esac - -exit $RETVAL diff --git a/pl_mom.pl b/pl_mom.pl deleted file mode 100755 index e16f24c..0000000 --- a/pl_mom.pl +++ /dev/null @@ -1,532 +0,0 @@ -#!/usr/bin/perl -w - -use POSIX qw(setsid); -use Sys::Syslog; -use Sys::Hostname; -#use LWP::Simple; - -$debug = 0; -$proc = "pl_mom"; -$alias_addr = "pl-mom\@planet-lab.org"; -$from_addr = "support\@planet-lab.org"; - -if (! $debug) { - $kill_thresh = 90; - $reboot_thresh = 95; - $log_thresh = 85; - $change_thresh = 5; - $min_thresh = 10; - #$bwcap_default = "off"; - $bwcap_default = "1.5Mbit"; - $cutoff_default = "16200000000"; # 16GB, for 1.5Mbit cap - $bwmon_sleep = 900; - - $sendmail = "/usr/sbin/sendmail -t -f$from_addr"; - $vservers = "/etc/vservers"; - $pidfile = "/var/run/$proc.pid"; - $rebootfile = "/var/lib/misc/pl_mom.reboot"; - $daily_log = "/var/lib/misc/pl_mom.daily"; - $daily_stamp = "/var/lib/misc/pl_mom.stamp"; - $configfile = "/etc/planetlab/pl_mom.conf"; - $capfile = "/var/lib/misc/pl_mom.oldcaps"; -} else { - $kill_thresh = 2; - $reboot_thresh = 20; - $log_thresh = 2; - $change_thresh = 5; - $min_thresh = 2; - $bwcap_default = "1Kbit"; - $cutoff_default = "10800"; - $bwmon_sleep = 10; - - $sendmail = "cat"; - $vservers = "./debug"; - $pidfile = "./$proc.pid"; - $rebootfile = "./debug/pl_mom.reboot"; - $daily_log = "./debug/pl_mom.daily"; - $daily_stamp = "./debug/pl_mom.stamp"; - $configfile = "./debug/pl_mom.conf"; - $capfile = "./debug/pl_mom.oldcaps"; -} - -$sleep = 30; - -# daemonize the program -if (! $debug) { - &daemonize; -} - -system("echo $$ > $pidfile"); - -read_config_file(); - -# Check to see whether pl_mom rebooted the node -if (-e $rebootfile) { - unlink($rebootfile); - syslog ("warning", "pl_mom: Sending shutdown mail"); - shutdown_mail(); -} - -my $pid = fork(); -if (! $pid) { - syslog ("info", "pl_mom: Launching reboot kicker"); - reboot_kicker(); - die (0); -} -$pid = fork(); -if (! $pid) { - syslog ("info", "pl_mom: Launching bandwidth monitor"); - if ($bwcap_default =~ /off/) { - syslog("info", "pl_mom: Max rate unlimited by default"); - } - reset_bandwidth_caps(); - bandwidth_monitor(); - die (0); -} - -while (1) { - $used = int(swap_used()); - - if (defined($old_used)) { - if ($used >= $old_used + $change_thresh) { - syslog ("info", "pl_mom: %d%% swap consumed in last %d seconds", - $used - $old_used, $sleep); - } - } - - if ($used >= $log_thresh) { - if (! defined($old_used) || $used != $old_used) { - syslog ("info", "pl_mom: Swap used: %d%%", $used); - } - get_slice_info(); - my $hog = memory_hog(); - if ($hog) { - if ($used < $kill_thresh) { - if (! defined($Warning{$hog})) { - $Warning{$hog} = "sent"; - syslog ("warning", "pl_mom: Slice $hog is ". - "using $Slice{$hog}{mem_pct}%% of memory"); - #slice_warning_mail($hog); - } - } else { - my $id = `id -u $hog`; - chomp($id); - my $top = `/usr/sbin/chcontext --ctx $id /usr/bin/top -b -n 1`; - syslog ("warning", "pl_mom: Resetting slice $hog"); - if (! $debug) { - slice_reset($hog); - } - syslog ("warning", "pl_mom: Sending mail to slice $hog"); - slice_reset_mail($hog, $top); - } - } - } - - sleep ($sleep); - - $old_used = $used; -} - -sub reboot_kicker { - while (1) { - $used = swap_used(); - - if ($used >= $reboot_thresh) { - syslog ("warning", "pl_mom: Rebooting node"); - - system("touch $rebootfile"); - if (! $debug) { - #system("shutdown -r now"); - system("/bin/sync; /sbin/reboot -f"); - } - die (0); - } - - sleep (1); - } -} - -sub bandwidth_monitor { - while (1) { - # See if a new day has started for bandwidth monitoring - chomp($now = `date -u +%D`); - if (-e $daily_stamp) { - chomp($stamp = `cat $daily_stamp`); - } - if (! defined($stamp) || !($stamp =~ $now)) { - open (STAMP, ">$daily_stamp") || - die "Can't open file $daily_stamp for writing: $!\n"; - print STAMP "$now\n"; - close STAMP; - unlink ($daily_log); - - # Could save the list of capped slices in a file in order to - # avoid re-sending mails if the daemon restarts. - # Also may want a list of slices that are exempt from capping. - if (defined(%Start)) { undef %Start; } - if (defined(%Now)) { undef %Now; } - if (defined(%Cap)) { undef %Cap; } - - reset_bandwidth_caps(); - - syslog("info", "pl_mom: Beginning bandwidth monitoring for $now"); - } - - get_slice_names(); - get_baseline_counts(); - get_slice_limits(); - - foreach $slice ( sort (keys %Start) ) { - if (defined $Now{$slice}) { - $today = $Now{$slice} - $Start{$slice}; - if (! (defined ($Cutoff{$slice})||$bwcap_default =~ /off/)) { - $Cutoff{$slice} = $cutoff_default; - $Maxrate{$slice} = $bwcap_default; - } - if ($debug) { - if ($today) { - $cutoff = defined($Cutoff{$slice}) - ? $Cutoff{$slice} : ""; - print "Slice $slice sent $today bytes; ". - "cutoff $cutoff\n"; - } - } - if (defined ($Cutoff{$slice}) && - $today >= $Cutoff{$slice} && - ! defined($Cap{$slice})) { - $Cap{$slice} = "sent"; - bw_cap_mail($slice); - log_bandwidth_cap($slice, $Maxrate{$slice}); - cap_bandwidth($slice, $Maxrate{$slice}); - } - } else { - # Token bucket for this slice is gone! - } - } - - sleep($bwmon_sleep); - } -} - -sub read_config_file { - if (-e $configfile) { - open (CONFIG, "<$configfile") || - print "Cannot open $configfile; $!\n"; - while () { - if (m/^(.*)=(.*)$/) { - ${$1} = $2; - if ($debug) { - print "read_config_file: $1 = ${$1}\n"; - } - } - } - close CONFIG; - } -} - -sub get_slice_names { - # Read slice names from /etc/passwd - if (defined (%Name)) { undef %Name; } - open (PASSWD, ") { - my ($slicename, $passwd, $sliceid) = split(/:/); - $Name{$sliceid} = $slicename; - } - close PASSWD; -} - -sub get_baseline_counts { - `touch $daily_log`; - open (BASE, "+<$daily_log") || - print "Cannot open $daily_log; $!\n"; - while () { - my ($slice, $bytecount) = split(/ /); - $Start{$slice} = $bytecount; - } - - my $status = `tc -s -d qdisc show`; - my $sliceid = 0xffff; - @Lines = split(/\n/, $status); - foreach $line ( @Lines ) { - if ($line =~ /qdisc pfifo (.*): dev/) { - $sliceid = hex($1); - # "Capped" buckets all begin with 0x1000. Ignore the root - # (0x1000) and default (0x1fff) buckets, as well as - # "exempt" buckets that begin with 0x2000 (or anything - # other than 0x1000). - if (($sliceid & 0xf000) == 0x1000 && - $sliceid != 0x1000 && $sliceid != 0x1fff) { - $sliceid = $sliceid & 0x0fff; - } else { - $sliceid = 0xffff; - } - } else { - if ($line =~ /Sent (.*) bytes/) { - my $bytes = $1; - if ($sliceid != 0xffff) { - my $slice = $Name{$sliceid}; - if ($debug && $bytes) { - print "Slice: $slice ($sliceid), bytes $bytes\n"; - } - if (! defined($Start{$slice})) { - print BASE "$slice $bytes\n"; - $Start{$slice} = $bytes; - } - $Now{$slice} = $bytes; - } - } - } - } - close (BASE); -} - -sub get_slice_limits { - if (defined %Maxrate) { undef %Maxrate; } - if (defined %Cutoff) { undef %Cutoff; } - if (-e $vservers) { - my $result = `grep -H "^BWAVGRATE" $vservers/*.conf`; - chomp ($result); - my @Lines = split(/\n/,$result); - foreach $line ( @Lines ) { - if ($line =~ /\/([^\/]*).conf:BWAVGRATE=(.*)[Mm]bit/) { - $slice = $1; - $limit = $2."Mbit"; - $cutoff = ($2 * 1000000 * 86400)/8; - } else { - if ($line =~ /\/([^\/]*).conf:BWAVGRATE=(.*)[Kk]bit/) { - $slice = $1; - $limit = $2."Kbit"; - $cutoff = ($2 * 1000 * 86400)/8; - } else { - die "Could not parse line $line"; - } - } - $Maxrate{$slice} = $limit; - $Cutoff{$slice} = $cutoff; - } - } -} - -sub reset_bandwidth_caps { - if (-e $capfile) { - open(CAP, "<$capfile") or die "Cannot open $capfile: $!"; - while () { - chomp(); - ($slicename, $oldcap) = split(/ /); - syslog("info", "pl_mom: Restoring bandwidth cap of $oldcap ". - "to $slicename"); - cap_bandwidth ($slicename, $oldcap); - } - close CAP; - unlink($capfile); - } -} - -sub log_bandwidth_cap { - ($slicename, $cap) = @_; - syslog("warning", "pl_mom: Capping bandwidth of slice ". - "$slicename at $cap until midnight GMT."); - # Save current cap to $capfile - system("echo $slicename `bwlimit getcap $slicename` >> $capfile"); -} - -sub send_mail { - # Arg 0: recipient addresses, comma-separated string - # Arg 1: subject line - # Arg 2: body of message - my $to = "To: $_[0]\n"; - my $from = "From: $from_addr\n"; - my $subject = "Subject: $_[1]\n"; - my $msg = $_[2]; - - if ($debug) { - print $to; - print $subject; - } else { - open(SENDMAIL, "|$sendmail") or die "Cannot open $sendmail: $!"; - print SENDMAIL $to; - print SENDMAIL $from; - print SENDMAIL $subject; - print SENDMAIL "Content-type: text/plain\n\n"; - print SENDMAIL $msg; - close(SENDMAIL); - } -} - -sub cap_bandwidth { - ($slicename, $cap) = @_; - system("bwlimit setcap $slicename $cap"); - system("bwlimit on $slicename"); -} - -sub get_date { - my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdat) - = localtime(time); - my $date = sprintf("%4d-%02d-%02d %02d:%02d:%02d", - $year+1900, $mon+1, $mday, $hour, $min, $sec); - return $date; -} - -sub bw_cap_mail { - my ($slicename) = @_; - my $hostname = hostname(); - my $date = get_date(); - my $sent = int($Cutoff{$slicename}/(1024*1024)); - my $bwcap = $Maxrate{$slicename}; - - send_mail("$alias_addr, $slicename\@slices.planet-lab.org", - "$proc capped bandwidth of slice $slicename on $hostname", - "Slice $slicename has transmitted more than ${sent}MB today". - " on $hostname. ". - "Its bandwidth will be capped at $bwcap until midnight GMT.". - "\n\n$date $hostname bwcap $slicename\n"); -} - -sub shutdown_mail { - my $hostname = hostname(); - my $date = get_date(); - send_mail($alias_addr, - "$proc rebooted $hostname", - "Swap space was exhausted on $hostname and so $proc rebooted ". - "it.\n\nAs of $date, the node has successfully come back ". - "online.\n\n$date $hostname reboot\n"); -} - -sub slice_reset_mail { - my $hog = $_[0]; - my $top = $_[1]; - my $hog_mem = sprintf ("%0.f", $Slice{$hog}{pmem}/1000); - my $hog_pct = $Slice{$hog}{mem_pct}; - my $hostname = hostname(); - my $date = get_date(); - send_mail("$alias_addr, $hog\@slices.planet-lab.org", - "$proc reset slice $hog on $hostname", - "As of $date, swap space is nearly exhausted on $hostname.\n\n". - "Slice $hog is being reset since it is the largest consumer ". - "of physical memory at ${hog_mem}MB ($hog_pct%).\n\n". - "Please reply to this message explaining the nature of your ". - "experiment, and what you are doing to address the problem.\n". - "\nOutput of 'top -b -n 1' in your slice prior to reset:\n". - "$top\n\n$date $hostname reset $hog\n"); -} - -sub slice_warning_mail { - my $hog = $_[0]; - my $hog_mem = sprintf ("%0.f", $Slice{$hog}{pmem}/1000); - my $hog_pct = $Slice{$hog}{mem_pct}; - my $hostname = hostname(); - my $date = get_date(); - - if ($hog =~ /^root$/) { - $to = $alias_addr; - } else { - $to = "$alias_addr, $hog\@slices.planet-lab.org"; - } - - send_mail($to, - "$proc may reset slice $hog on $hostname", - "As of $date, swap space is over $log_thresh% full on ". - "$hostname.\n\nSlice $hog is the largest consumer ". - "of physical memory at ${hog_mem}MB ($hog_pct%).\n". - "Please check the memory usage of your slice to avoid a ". - "reset.\n\n$date $hostname warning $hog\n"); -} - -sub unkillable_alarm_mail { - my $hog = $_[0]; - my $hog_mem = sprintf ("%0.f", $Slice{$hog}{pmem}/1000); - my $hog_pct = $Slice{$hog}{mem_pct}; - my $hostname = hostname(); - my $date = get_date(); - - if ($hog =~ /^root$/) { - $to = $alias_addr; - } else { - $to = "$alias_addr, $hog\@slices.planet-lab.org"; - } - - send_mail($to, - "$proc: alarm for slice $hog on $hostname", - "As of $date, swap space is over $log_thresh% full on ". - "$hostname.\n\nSlice $hog is the largest consumer ". - "of physical memory at ${hog_mem}MB ($hog_pct%).\n". - "The slice will not be reset, but please verify its behavior.\n". - "\n$date $hostname alarm $hog\n"); -} - -sub slice_reset { - my $slice = $_[0]; - my $sliceid = $Slice{$slice}{ctx}; - system("chcontext --ctx $sliceid sudo kill -9 -1"); - system("/etc/init.d/vserver-init start $slice"); -} - -sub swap_used { - open (SWAP, "; - $line = ; - $line =~ s/[\t ]+/ /g; - - my ($filename, $type, $size, $used, $priority) = split(/ /, $line); - close SWAP; - - return 100*($used/$size); -} - -sub get_slice_info { - if (! $debug) { - $content = `curl -s http://127.0.0.1:3100/slicestat`; - } else { - #$content = `cat ../pl_mom-deploy/slicestat` - $content = `curl -s http://127.0.0.1:3100/slicestat`; - } - my @lines = split(/\n/, $content); - %Slice = (); - foreach $line (@lines) { - my ($slice, $ctx, $cpu_pct, $mem_pct, $pmem, $vmem, $ntasks) - = split(/,/,$line); - $Slice{$slice}{ctx} = $ctx; - $Slice{$slice}{cpu_pct} = $cpu_pct; - $Slice{$slice}{mem_pct} = $mem_pct; - $Slice{$slice}{pmem} = $pmem; - $Slice{$slice}{vmem} = $vmem; - $Slice{$slice}{ntasks} = $ntasks; - } -} - -sub memory_hog { - @keys = sort { $Slice{$b}{mem_pct} <=> $Slice{$a}{mem_pct} } (keys %Slice); - foreach $key (@keys) { - if ($Slice{$key}{mem_pct} >= $min_thresh) { - if ($key =~ /^root$/ || $key =~ /slicestat/ || $key =~ /netflow/) { - if (! defined ($Warning{$key})) { - $Warning{$key} = "sent"; - syslog ("warning", "pl_mom: Sending alarm mail to ". - "unkillable slice $key, using ". - "$Slice{$key}{mem_pct}%% of memory"); - unkillable_alarm_mail($key); - } - } else { - return $key; - } - } else { - #syslog ("info", "pl_mom: No killable slice using > ". - # "$min_thresh%% memory"); - return; - } - } -} - -sub daemonize { - chdir '/' or die "Can't chdir to /: $!"; - open STDIN, '/dev/null' or die "Can't read /dev/null: $!"; - open STDOUT, '>>/dev/null' or die "Can't write to /dev/null: $!"; - open STDERR, '>>/dev/null' or die "Can't write to /dev/null: $!"; - defined(my $pid = fork) or die "Can't fork: $!"; - exit if $pid; - setsid or die "Can't start a new session: $!"; - umask 0; -} -- 2.43.0