From: Mark Huang Date: Tue, 9 May 2006 22:28:48 +0000 (+0000) Subject: merge to 0.4-1 (HEAD as of 2006-05-09 X-Git-Tag: planetlab-3_3-bootcd~6 X-Git-Url: http://git.onelab.eu/?a=commitdiff_plain;ds=sidebyside;h=8d2808e2607471b6a660d6f296fb2d334ddfa044;p=mom.git merge to 0.4-1 (HEAD as of 2006-05-09 --- diff --git a/Makefile b/Makefile index 06048a0..c1ee193 100644 --- a/Makefile +++ b/Makefile @@ -1,23 +1,20 @@ -# @COPYRIGHT@ # -# $Id: Makefile,v 1.2 2004/11/17 20:24:11 acb Exp $ - -BASE=pl_mom -VERSION=0.3 -TARFILE=$(BASE)-$(VERSION).tgz - -all: pl_mom.pl pl_mom +# pl_mom suite of node monitors +# +# Mark Huang +# Copyright (C) 2006 The Trustees of Princeton University +# +# $Id: Makefile,v 1.4 2006/05/01 18:28:22 mlhuang Exp $ +# -install: - install -D -m 755 pl_mom.pl /usr/local/planetlab/bin/pl_mom.pl - install -D -m 755 pl_mom /etc/init.d/pl_mom +ALL := leak -tarball: - rm -f $(TARFILE) - tar cvfz $(TARFILE) -C .. --exclude=CVS $(BASE)-$(VERSION) +CC := gcc +CFLAGS := -Wall -O2 -rpm:: tarball - sudo cp $(TARFILE) /usr/src/redhat/SOURCES - sudo rpmbuild -ba $(BASE).spec +all: $(ALL) +clean: + rm -f $(ALL) +.PHONY: all clean diff --git a/bwmon.py b/bwmon.py index b6f5697..8b89a40 100755 --- a/bwmon.py +++ b/bwmon.py @@ -14,7 +14,7 @@ # Andy Bavier # Copyright (C) 2004-2006 The Trustees of Princeton University # -# $Id: bwmon.py,v 1.2 2006/04/28 20:25:19 mlhuang Exp $ +# $Id: bwmon.py,v 1.3 2006/05/08 17:37:28 mlhuang Exp $ # import syslog @@ -273,11 +273,11 @@ def main(): (version, slices) = pickle.load(f) f.close() # Check version of data file - if version != "$Id: bwmon.py,v 1.2 2006/04/28 20:25:19 mlhuang Exp $": + if version != "$Id: bwmon.py,v 1.3 2006/05/08 17:37:28 mlhuang Exp $": print "Not using old version '%s' data file %s" % (version, datafile) raise Exception except Exception: - version = "$Id: bwmon.py,v 1.2 2006/04/28 20:25:19 mlhuang Exp $" + version = "$Id: bwmon.py,v 1.3 2006/05/08 17:37:28 mlhuang Exp $" slices = {} # Get special slice IDs diff --git a/leak.c b/leak.c index 271d41b..e373db8 100644 --- a/leak.c +++ b/leak.c @@ -4,7 +4,7 @@ * Mark Huang * Copyright (C) 2006 The Trustees of Princeton University * - * $Id$ + * $Id: leak.c,v 1.1 2006/05/01 18:28:22 mlhuang Exp $ */ #include diff --git a/pl_mom b/pl_mom deleted file mode 100755 index a41c481..0000000 --- a/pl_mom +++ /dev/null @@ -1,74 +0,0 @@ -#!/bin/sh -# -# chkconfig: 345 98 02 -# description: pl_mom (daemon of death) startup script -# - -CODE='/usr/local/planetlab/bin/pl_mom.pl' -PROC='pl_mom' - -. /etc/rc.d/init.d/functions - -RETVAL=0 - -pidfile=/var/run/$PROC.pid - -check_status() { - pid=`cat $pidfile 2>/dev/null` - # - # this eliminates a race condition between checking existence of pidfile - # and reading its value - # - [ -n "$pid" -a -d /proc/$pid ] -} - -case "$1" in - start) - echo -n "starting $PROC:" - [ -r $CODE ] || action "code missing" /bin/false || exit 1 - pid=`cat $pidfile 2>/dev/null` - if [ -n "$pid" ]; then - # check whether process really exists - # yes - don't try to start - [ -d /proc/$pid ] && action "already running" /bin/true && exit 1 - - # no - PID file is stale - rm -f $pidfile - fi - - $CODE - sleep 1 - - cmd=success - check_status || cmd=failure - $cmd "$PROC startup" - echo - ;; - - stop) - echo -n "shutting down $PROC: " - check_status && kill -TERM -`cat $pidfile` && sleep 1 - cmd=failure - check_status || cmd=success && rm -f $pidfile - $cmd "$PROC shutdown" - RETVAL=0 - echo - ;; - - restart|reload) - $0 stop - $0 start - RETVAL=$? - ;; - - status) - check_status && echo 'running' && exit 0 || \ - echo 'not running' && exit 1 - ;; - - *) - echo "Usage: $0 {start|stop|restart|status}" - exit 1 -esac - -exit $RETVAL diff --git a/pl_mom.cron b/pl_mom.cron index eb47892..52e0788 100644 --- a/pl_mom.cron +++ b/pl_mom.cron @@ -4,7 +4,7 @@ # Mark Huang # Copyright (C) 2005 The Trustees of Princeton University # -# $Id: pl_mop.cron,v 1.1 2005/10/11 17:34:57 mlhuang Exp $ +# $Id: pl_mom.cron,v 1.1 2006/04/28 19:12:09 mlhuang Exp $ # @M@ @H@ * * * root /usr/local/planetlab/bin/pl_mop.sh diff --git a/pl_mom.pl b/pl_mom.pl deleted file mode 100755 index e16f24c..0000000 --- a/pl_mom.pl +++ /dev/null @@ -1,532 +0,0 @@ -#!/usr/bin/perl -w - -use POSIX qw(setsid); -use Sys::Syslog; -use Sys::Hostname; -#use LWP::Simple; - -$debug = 0; -$proc = "pl_mom"; -$alias_addr = "pl-mom\@planet-lab.org"; -$from_addr = "support\@planet-lab.org"; - -if (! $debug) { - $kill_thresh = 90; - $reboot_thresh = 95; - $log_thresh = 85; - $change_thresh = 5; - $min_thresh = 10; - #$bwcap_default = "off"; - $bwcap_default = "1.5Mbit"; - $cutoff_default = "16200000000"; # 16GB, for 1.5Mbit cap - $bwmon_sleep = 900; - - $sendmail = "/usr/sbin/sendmail -t -f$from_addr"; - $vservers = "/etc/vservers"; - $pidfile = "/var/run/$proc.pid"; - $rebootfile = "/var/lib/misc/pl_mom.reboot"; - $daily_log = "/var/lib/misc/pl_mom.daily"; - $daily_stamp = "/var/lib/misc/pl_mom.stamp"; - $configfile = "/etc/planetlab/pl_mom.conf"; - $capfile = "/var/lib/misc/pl_mom.oldcaps"; -} else { - $kill_thresh = 2; - $reboot_thresh = 20; - $log_thresh = 2; - $change_thresh = 5; - $min_thresh = 2; - $bwcap_default = "1Kbit"; - $cutoff_default = "10800"; - $bwmon_sleep = 10; - - $sendmail = "cat"; - $vservers = "./debug"; - $pidfile = "./$proc.pid"; - $rebootfile = "./debug/pl_mom.reboot"; - $daily_log = "./debug/pl_mom.daily"; - $daily_stamp = "./debug/pl_mom.stamp"; - $configfile = "./debug/pl_mom.conf"; - $capfile = "./debug/pl_mom.oldcaps"; -} - -$sleep = 30; - -# daemonize the program -if (! $debug) { - &daemonize; -} - -system("echo $$ > $pidfile"); - -read_config_file(); - -# Check to see whether pl_mom rebooted the node -if (-e $rebootfile) { - unlink($rebootfile); - syslog ("warning", "pl_mom: Sending shutdown mail"); - shutdown_mail(); -} - -my $pid = fork(); -if (! $pid) { - syslog ("info", "pl_mom: Launching reboot kicker"); - reboot_kicker(); - die (0); -} -$pid = fork(); -if (! $pid) { - syslog ("info", "pl_mom: Launching bandwidth monitor"); - if ($bwcap_default =~ /off/) { - syslog("info", "pl_mom: Max rate unlimited by default"); - } - reset_bandwidth_caps(); - bandwidth_monitor(); - die (0); -} - -while (1) { - $used = int(swap_used()); - - if (defined($old_used)) { - if ($used >= $old_used + $change_thresh) { - syslog ("info", "pl_mom: %d%% swap consumed in last %d seconds", - $used - $old_used, $sleep); - } - } - - if ($used >= $log_thresh) { - if (! defined($old_used) || $used != $old_used) { - syslog ("info", "pl_mom: Swap used: %d%%", $used); - } - get_slice_info(); - my $hog = memory_hog(); - if ($hog) { - if ($used < $kill_thresh) { - if (! defined($Warning{$hog})) { - $Warning{$hog} = "sent"; - syslog ("warning", "pl_mom: Slice $hog is ". - "using $Slice{$hog}{mem_pct}%% of memory"); - #slice_warning_mail($hog); - } - } else { - my $id = `id -u $hog`; - chomp($id); - my $top = `/usr/sbin/chcontext --ctx $id /usr/bin/top -b -n 1`; - syslog ("warning", "pl_mom: Resetting slice $hog"); - if (! $debug) { - slice_reset($hog); - } - syslog ("warning", "pl_mom: Sending mail to slice $hog"); - slice_reset_mail($hog, $top); - } - } - } - - sleep ($sleep); - - $old_used = $used; -} - -sub reboot_kicker { - while (1) { - $used = swap_used(); - - if ($used >= $reboot_thresh) { - syslog ("warning", "pl_mom: Rebooting node"); - - system("touch $rebootfile"); - if (! $debug) { - #system("shutdown -r now"); - system("/bin/sync; /sbin/reboot -f"); - } - die (0); - } - - sleep (1); - } -} - -sub bandwidth_monitor { - while (1) { - # See if a new day has started for bandwidth monitoring - chomp($now = `date -u +%D`); - if (-e $daily_stamp) { - chomp($stamp = `cat $daily_stamp`); - } - if (! defined($stamp) || !($stamp =~ $now)) { - open (STAMP, ">$daily_stamp") || - die "Can't open file $daily_stamp for writing: $!\n"; - print STAMP "$now\n"; - close STAMP; - unlink ($daily_log); - - # Could save the list of capped slices in a file in order to - # avoid re-sending mails if the daemon restarts. - # Also may want a list of slices that are exempt from capping. - if (defined(%Start)) { undef %Start; } - if (defined(%Now)) { undef %Now; } - if (defined(%Cap)) { undef %Cap; } - - reset_bandwidth_caps(); - - syslog("info", "pl_mom: Beginning bandwidth monitoring for $now"); - } - - get_slice_names(); - get_baseline_counts(); - get_slice_limits(); - - foreach $slice ( sort (keys %Start) ) { - if (defined $Now{$slice}) { - $today = $Now{$slice} - $Start{$slice}; - if (! (defined ($Cutoff{$slice})||$bwcap_default =~ /off/)) { - $Cutoff{$slice} = $cutoff_default; - $Maxrate{$slice} = $bwcap_default; - } - if ($debug) { - if ($today) { - $cutoff = defined($Cutoff{$slice}) - ? $Cutoff{$slice} : ""; - print "Slice $slice sent $today bytes; ". - "cutoff $cutoff\n"; - } - } - if (defined ($Cutoff{$slice}) && - $today >= $Cutoff{$slice} && - ! defined($Cap{$slice})) { - $Cap{$slice} = "sent"; - bw_cap_mail($slice); - log_bandwidth_cap($slice, $Maxrate{$slice}); - cap_bandwidth($slice, $Maxrate{$slice}); - } - } else { - # Token bucket for this slice is gone! - } - } - - sleep($bwmon_sleep); - } -} - -sub read_config_file { - if (-e $configfile) { - open (CONFIG, "<$configfile") || - print "Cannot open $configfile; $!\n"; - while () { - if (m/^(.*)=(.*)$/) { - ${$1} = $2; - if ($debug) { - print "read_config_file: $1 = ${$1}\n"; - } - } - } - close CONFIG; - } -} - -sub get_slice_names { - # Read slice names from /etc/passwd - if (defined (%Name)) { undef %Name; } - open (PASSWD, ") { - my ($slicename, $passwd, $sliceid) = split(/:/); - $Name{$sliceid} = $slicename; - } - close PASSWD; -} - -sub get_baseline_counts { - `touch $daily_log`; - open (BASE, "+<$daily_log") || - print "Cannot open $daily_log; $!\n"; - while () { - my ($slice, $bytecount) = split(/ /); - $Start{$slice} = $bytecount; - } - - my $status = `tc -s -d qdisc show`; - my $sliceid = 0xffff; - @Lines = split(/\n/, $status); - foreach $line ( @Lines ) { - if ($line =~ /qdisc pfifo (.*): dev/) { - $sliceid = hex($1); - # "Capped" buckets all begin with 0x1000. Ignore the root - # (0x1000) and default (0x1fff) buckets, as well as - # "exempt" buckets that begin with 0x2000 (or anything - # other than 0x1000). - if (($sliceid & 0xf000) == 0x1000 && - $sliceid != 0x1000 && $sliceid != 0x1fff) { - $sliceid = $sliceid & 0x0fff; - } else { - $sliceid = 0xffff; - } - } else { - if ($line =~ /Sent (.*) bytes/) { - my $bytes = $1; - if ($sliceid != 0xffff) { - my $slice = $Name{$sliceid}; - if ($debug && $bytes) { - print "Slice: $slice ($sliceid), bytes $bytes\n"; - } - if (! defined($Start{$slice})) { - print BASE "$slice $bytes\n"; - $Start{$slice} = $bytes; - } - $Now{$slice} = $bytes; - } - } - } - } - close (BASE); -} - -sub get_slice_limits { - if (defined %Maxrate) { undef %Maxrate; } - if (defined %Cutoff) { undef %Cutoff; } - if (-e $vservers) { - my $result = `grep -H "^BWAVGRATE" $vservers/*.conf`; - chomp ($result); - my @Lines = split(/\n/,$result); - foreach $line ( @Lines ) { - if ($line =~ /\/([^\/]*).conf:BWAVGRATE=(.*)[Mm]bit/) { - $slice = $1; - $limit = $2."Mbit"; - $cutoff = ($2 * 1000000 * 86400)/8; - } else { - if ($line =~ /\/([^\/]*).conf:BWAVGRATE=(.*)[Kk]bit/) { - $slice = $1; - $limit = $2."Kbit"; - $cutoff = ($2 * 1000 * 86400)/8; - } else { - die "Could not parse line $line"; - } - } - $Maxrate{$slice} = $limit; - $Cutoff{$slice} = $cutoff; - } - } -} - -sub reset_bandwidth_caps { - if (-e $capfile) { - open(CAP, "<$capfile") or die "Cannot open $capfile: $!"; - while () { - chomp(); - ($slicename, $oldcap) = split(/ /); - syslog("info", "pl_mom: Restoring bandwidth cap of $oldcap ". - "to $slicename"); - cap_bandwidth ($slicename, $oldcap); - } - close CAP; - unlink($capfile); - } -} - -sub log_bandwidth_cap { - ($slicename, $cap) = @_; - syslog("warning", "pl_mom: Capping bandwidth of slice ". - "$slicename at $cap until midnight GMT."); - # Save current cap to $capfile - system("echo $slicename `bwlimit getcap $slicename` >> $capfile"); -} - -sub send_mail { - # Arg 0: recipient addresses, comma-separated string - # Arg 1: subject line - # Arg 2: body of message - my $to = "To: $_[0]\n"; - my $from = "From: $from_addr\n"; - my $subject = "Subject: $_[1]\n"; - my $msg = $_[2]; - - if ($debug) { - print $to; - print $subject; - } else { - open(SENDMAIL, "|$sendmail") or die "Cannot open $sendmail: $!"; - print SENDMAIL $to; - print SENDMAIL $from; - print SENDMAIL $subject; - print SENDMAIL "Content-type: text/plain\n\n"; - print SENDMAIL $msg; - close(SENDMAIL); - } -} - -sub cap_bandwidth { - ($slicename, $cap) = @_; - system("bwlimit setcap $slicename $cap"); - system("bwlimit on $slicename"); -} - -sub get_date { - my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdat) - = localtime(time); - my $date = sprintf("%4d-%02d-%02d %02d:%02d:%02d", - $year+1900, $mon+1, $mday, $hour, $min, $sec); - return $date; -} - -sub bw_cap_mail { - my ($slicename) = @_; - my $hostname = hostname(); - my $date = get_date(); - my $sent = int($Cutoff{$slicename}/(1024*1024)); - my $bwcap = $Maxrate{$slicename}; - - send_mail("$alias_addr, $slicename\@slices.planet-lab.org", - "$proc capped bandwidth of slice $slicename on $hostname", - "Slice $slicename has transmitted more than ${sent}MB today". - " on $hostname. ". - "Its bandwidth will be capped at $bwcap until midnight GMT.". - "\n\n$date $hostname bwcap $slicename\n"); -} - -sub shutdown_mail { - my $hostname = hostname(); - my $date = get_date(); - send_mail($alias_addr, - "$proc rebooted $hostname", - "Swap space was exhausted on $hostname and so $proc rebooted ". - "it.\n\nAs of $date, the node has successfully come back ". - "online.\n\n$date $hostname reboot\n"); -} - -sub slice_reset_mail { - my $hog = $_[0]; - my $top = $_[1]; - my $hog_mem = sprintf ("%0.f", $Slice{$hog}{pmem}/1000); - my $hog_pct = $Slice{$hog}{mem_pct}; - my $hostname = hostname(); - my $date = get_date(); - send_mail("$alias_addr, $hog\@slices.planet-lab.org", - "$proc reset slice $hog on $hostname", - "As of $date, swap space is nearly exhausted on $hostname.\n\n". - "Slice $hog is being reset since it is the largest consumer ". - "of physical memory at ${hog_mem}MB ($hog_pct%).\n\n". - "Please reply to this message explaining the nature of your ". - "experiment, and what you are doing to address the problem.\n". - "\nOutput of 'top -b -n 1' in your slice prior to reset:\n". - "$top\n\n$date $hostname reset $hog\n"); -} - -sub slice_warning_mail { - my $hog = $_[0]; - my $hog_mem = sprintf ("%0.f", $Slice{$hog}{pmem}/1000); - my $hog_pct = $Slice{$hog}{mem_pct}; - my $hostname = hostname(); - my $date = get_date(); - - if ($hog =~ /^root$/) { - $to = $alias_addr; - } else { - $to = "$alias_addr, $hog\@slices.planet-lab.org"; - } - - send_mail($to, - "$proc may reset slice $hog on $hostname", - "As of $date, swap space is over $log_thresh% full on ". - "$hostname.\n\nSlice $hog is the largest consumer ". - "of physical memory at ${hog_mem}MB ($hog_pct%).\n". - "Please check the memory usage of your slice to avoid a ". - "reset.\n\n$date $hostname warning $hog\n"); -} - -sub unkillable_alarm_mail { - my $hog = $_[0]; - my $hog_mem = sprintf ("%0.f", $Slice{$hog}{pmem}/1000); - my $hog_pct = $Slice{$hog}{mem_pct}; - my $hostname = hostname(); - my $date = get_date(); - - if ($hog =~ /^root$/) { - $to = $alias_addr; - } else { - $to = "$alias_addr, $hog\@slices.planet-lab.org"; - } - - send_mail($to, - "$proc: alarm for slice $hog on $hostname", - "As of $date, swap space is over $log_thresh% full on ". - "$hostname.\n\nSlice $hog is the largest consumer ". - "of physical memory at ${hog_mem}MB ($hog_pct%).\n". - "The slice will not be reset, but please verify its behavior.\n". - "\n$date $hostname alarm $hog\n"); -} - -sub slice_reset { - my $slice = $_[0]; - my $sliceid = $Slice{$slice}{ctx}; - system("chcontext --ctx $sliceid sudo kill -9 -1"); - system("/etc/init.d/vserver-init start $slice"); -} - -sub swap_used { - open (SWAP, "; - $line = ; - $line =~ s/[\t ]+/ /g; - - my ($filename, $type, $size, $used, $priority) = split(/ /, $line); - close SWAP; - - return 100*($used/$size); -} - -sub get_slice_info { - if (! $debug) { - $content = `curl -s http://127.0.0.1:3100/slicestat`; - } else { - #$content = `cat ../pl_mom-deploy/slicestat` - $content = `curl -s http://127.0.0.1:3100/slicestat`; - } - my @lines = split(/\n/, $content); - %Slice = (); - foreach $line (@lines) { - my ($slice, $ctx, $cpu_pct, $mem_pct, $pmem, $vmem, $ntasks) - = split(/,/,$line); - $Slice{$slice}{ctx} = $ctx; - $Slice{$slice}{cpu_pct} = $cpu_pct; - $Slice{$slice}{mem_pct} = $mem_pct; - $Slice{$slice}{pmem} = $pmem; - $Slice{$slice}{vmem} = $vmem; - $Slice{$slice}{ntasks} = $ntasks; - } -} - -sub memory_hog { - @keys = sort { $Slice{$b}{mem_pct} <=> $Slice{$a}{mem_pct} } (keys %Slice); - foreach $key (@keys) { - if ($Slice{$key}{mem_pct} >= $min_thresh) { - if ($key =~ /^root$/ || $key =~ /slicestat/ || $key =~ /netflow/) { - if (! defined ($Warning{$key})) { - $Warning{$key} = "sent"; - syslog ("warning", "pl_mom: Sending alarm mail to ". - "unkillable slice $key, using ". - "$Slice{$key}{mem_pct}%% of memory"); - unkillable_alarm_mail($key); - } - } else { - return $key; - } - } else { - #syslog ("info", "pl_mom: No killable slice using > ". - # "$min_thresh%% memory"); - return; - } - } -} - -sub daemonize { - chdir '/' or die "Can't chdir to /: $!"; - open STDIN, '/dev/null' or die "Can't read /dev/null: $!"; - open STDOUT, '>>/dev/null' or die "Can't write to /dev/null: $!"; - open STDERR, '>>/dev/null' or die "Can't write to /dev/null: $!"; - defined(my $pid = fork) or die "Can't fork: $!"; - exit if $pid; - setsid or die "Can't start a new session: $!"; - umask 0; -} diff --git a/pl_mom.py b/pl_mom.py index d889e88..36be37a 100644 --- a/pl_mom.py +++ b/pl_mom.py @@ -5,7 +5,7 @@ # Mark Huang # Copyright (C) 2006 The Trustees of Princeton University # -# $Id: pl_mom.py,v 1.2 2006/05/08 17:37:28 mlhuang Exp $ +# $Id: pl_mom.py,v 1.3 2006/05/09 03:22:25 mlhuang Exp $ # import os diff --git a/pl_mom.spec b/pl_mom.spec index 4d3fd08..2af45ec 100644 --- a/pl_mom.spec +++ b/pl_mom.spec @@ -1,12 +1,12 @@ %define name pl_mom -%define version 0.3 -%define release 15%{?pldistro:.%{pldistro}}%{?date:.%{date}} +%define version 0.4 +%define release 1%{?pldistro:.%{pldistro}}%{?date:.%{date}} -Summary: PlanetLab mom -- Cleans up your mess +Summary: PlanetLab node monitoring tools Name: %{name} Version: %{version} Release: %{release} -License: dontknow +License: GPL Group: System Environment/Kernel Source: %{name}-%{version}.tgz Vendor: PlanetLab @@ -15,73 +15,83 @@ Distribution: PlanetLab 3.0 URL: http://cvs.planet-lab.org/cvs/pl_mom BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id_u} -n) Requires: expect +# swapmon requires vps and bwlimit.py +Requires: util-vserver, util-vserver-python +# bwmon requires tc +Requires: iproute %description +pl_mom is a suite of PlanetLab node monitoring tools. -A small daemon that watches the consumed swap space. At 90% utilization, it -resets the slice that is the biggest memory hog. At 95% utilization, it -reboots the machine. +swapmon is a swap monitoring daemon. Every 30 seconds, it checks +process memory usage. At 90% utilization, resets the slice that is +consuming the most physical memory. At 95% utilization, it reboots the +machine to avoid a crash. -A cron job which "fixes" various common problems with nodes (dead -services, ext3 corruption, zombie SSH sessions) is also installed. +bwmon is a cron job that monitors the average bandwidth usage of each +slice and enforces a daily byte limit for each slice. -%prep +pl_mop is a cron job that "fixes" various common problems with nodes +(dead services, ext3 corruption, zombie SSH sessions, etc.). -%setup +%prep +%setup -q %build %install -mkdir -p $RPM_BUILD_ROOT/usr/local/planetlab/bin/ -mkdir -p $RPM_BUILD_ROOT/etc/init.d/ +rm -rf $RPM_BUILD_ROOT -cp pl_mom $RPM_BUILD_ROOT/etc/init.d/ -cp pl_mom.pl $RPM_BUILD_ROOT/usr/local/planetlab/bin/ -cp pl_mop.sh $RPM_BUILD_ROOT/usr/local/planetlab/bin/ +# Utility functions +install -D -m 644 pl_mom.py $RPM_BUILD_ROOT/%{_datadir}/%{name}/pl_mom.py -install -D -m 644 pl_mop.cron $RPM_BUILD_ROOT/etc/cron.d/pl_mop +# Bandwidth monitor (bwmon), run periodically +install -D -m 755 bwmon.py $RPM_BUILD_ROOT/%{_datadir}/%{name}/bwmon.py -%clean -[ "$RPM_BUILD_ROOT" != "/" ] && rm -rf $RPM_BUILD_ROOT +# Swap monitoring daemon (swapmon) +install -D -m 755 swapmon.py $RPM_BUILD_ROOT/%{_datadir}/%{name}/swapmon.py +install -D -m 755 swapmon.init $RPM_BUILD_ROOT/%{_initrddir}/swapmon -%files -%defattr(0755, root, root) -/etc/init.d/pl_mom -/usr/local/planetlab/bin/pl_mom.pl -/usr/local/planetlab/bin/pl_mop.sh -/etc/cron.d/pl_mop +# Cleanup script +install -D -m 755 pl_mop.sh $RPM_BUILD_ROOT/usr/local/planetlab/bin/pl_mop.sh -%pre +# Runs pl_mop and bwmon periodically +install -D -m 644 pl_mom.cron $RPM_BUILD_ROOT/%{_sysconfdir}/cron.d/pl_mom +%clean +rm -rf $RPM_BUILD_ROOT %post -if [ "$1" -ge 1 ]; then - - chkconfig --add pl_mom - chkconfig --level 3 pl_mom on +chkconfig --add swapmon +chkconfig swapmon on +if [ "$PL_BOOTCD" != "1" ] ; then + service swapmon restart +fi - if [[ "$PL_BOOTCD" != "1" ]]; then - /etc/init.d/pl_mom stop - /etc/init.d/pl_mom start - fi +# Randomize pl_mop run time +M=$((60 * $RANDOM / 32768)) +H=$((24 * $RANDOM / 32768)) +sed -i -e "s/@M@/$M/" -e "s/@H@/$H/" %{_sysconfdir}/cron.d/pl_mom - # Randomize pl_mop run time - M=$((60 * $RANDOM / 32768)) - H=$((24 * $RANDOM / 32768)) - sed -i -e "s/@M@/$M/" -e "s/@H@/$H/" /etc/cron.d/pl_mop -fi +exit 0 %preun +# 0 = erase, 1 = upgrade if [ "$1" -eq 0 ]; then - if [[ "$PL_BOOTCD" != "1" ]]; then - /etc/init.d/pl_mom stop - fi - - chkconfig --del pl_mom - chkconfig pl_mom off + if [ "$PL_BOOTCD" != "1" ] ; then + service swapmon stop + fi + chkconfig swapmon off + chkconfig --del swapmon fi +exit 0 -%postun - - +%files +%defattr(-, root, root, -) +%{_datadir}/%{name}/pl_mom.py +%{_datadir}/%{name}/bwmon.py +%{_datadir}/%{name}/swapmon.py +%{_initrddir}/swapmon +/usr/local/planetlab/bin/pl_mop.sh +%{_sysconfdir}/cron.d/pl_mom diff --git a/pl_mop.cron b/pl_mop.cron deleted file mode 100644 index 6f924f1..0000000 --- a/pl_mop.cron +++ /dev/null @@ -1,10 +0,0 @@ -# -# Runs once a day to "fix" nodes in various ways -# -# Mark Huang -# Copyright (C) 2005 The Trustees of Princeton University -# -# $Id: plb.cron,v 1.1 2005/09/27 23:55:02 mlh-pl_rock Exp $ -# - -@M@ @H@ * * * root /usr/local/planetlab/bin/pl_mop.sh diff --git a/pl_mop.sh b/pl_mop.sh index 5ac5546..2400d2a 100755 --- a/pl_mop.sh +++ b/pl_mop.sh @@ -5,11 +5,18 @@ # Mark Huang # Copyright (C) 2005 The Trustees of Princeton University # -# $Id: pl_mop.sh,v 1.4 2006/01/26 19:26:20 mlhuang Exp $ +# $Id: pl_mop.sh,v 1.7 2006/05/09 22:26:31 mlhuang Exp $ # PATH=/sbin:/usr/sbin:$PATH +# Parse PLC configuration +if [ -r /etc/planetlab/plc_config ] ; then + . /etc/planetlab/plc_config +else + PLC_SLICE_PREFIX="pl" +fi + PIDFILE=/var/run/pl_mop.pid # Record PID @@ -46,7 +53,7 @@ fix_etc_shadow() { # keep essential services running restart_services() { - for service in sshd pl_sshd pl_mom pl_nm proper ; do + for service in sshd pl_sshd swapmon pl_nm proper ; do echo "* Checking $service" status=$(service $service status) if [ $? -ne 0 ] || echo $status 2>&1 | grep -q stopped ; then @@ -79,6 +86,47 @@ restart_pl_conf() { fi } +# GPG keys are installed in /etc/pki/rpm-gpg by both the Boot Manager +# during initial installation, and by PlanetLabConf during daily +# updates. NodeUpdate imports the keys into the RPM database before +# running yum daily. vserver-reference copies and imports the keys +# into the reference images and system slices daily. The only parts of +# this process that are actually necessary, are the Boot Manager and +# vserver-reference. However, we do not want to force a re-install of +# all nodes, and we do not want to force an update of +# vserver-reference, so in the meantime, PlanetLabConf and NodeUpdate +# take care of getting the keys installed and imported in /, and this +# script takes care of getting them installed in the reference images +# and system slices, until we can get a new vserver-reference image +# pushed out. +update_vserver_reference() { + echo "* Updating VServer reference" + + shopt -s nullglob + + VROOTS="/vservers/vserver-reference /vservers/.vcache/* /vservers/${PLC_SLICE_PREFIX}_*" + + # Copy configuration files from host to slices + for file in \ + /etc/hosts /etc/resolv.conf /etc/yum.conf /etc/planetlab/node_id \ + /etc/planetlab/plc_config* /etc/planetlab/php/* \ + /etc/pki/rpm-gpg/* ; do + if [ -r $file ] ; then + for vroot in $VROOTS ; do + install -D -m 644 $file $vroot/$file + done + fi + done + + # (Re)install GPG signing keys + if [ -d /etc/pki/rpm-gpg ] ; then + for vroot in $VROOTS ; do + chroot $vroot rpm --allmatches -e gpg-pubkey || : + chroot $vroot rpm --import /etc/pki/rpm-gpg/* || : + done + fi +} + # kill all the processes running in slice contexts vkillall() { vps -A | awk '(int($2) > 1) { system("vkill -c " $2 " -s 9 " $1); }' @@ -181,3 +229,5 @@ run restart_pl_conf run restart_netflow run kill_duplicate_ssh + +run update_vserver_reference diff --git a/swapmon.init b/swapmon.init index d99530d..98597de 100755 --- a/swapmon.init +++ b/swapmon.init @@ -6,7 +6,7 @@ # # description: Resets memory hogs when swap is running low # -# $Id: host.init,v 1.6 2006/04/20 09:01:00 thierry Exp $ +# $Id: swapmon.init,v 1.1 2006/04/28 19:29:16 mlhuang Exp $ # PATH=/sbin:/bin:/usr/bin:/usr/sbin diff --git a/swapmon.py b/swapmon.py index 3f6304e..907f72f 100755 --- a/swapmon.py +++ b/swapmon.py @@ -9,7 +9,7 @@ # Andy Bavier # Copyright (C) 2004-2006 The Trustees of Princeton University # -# $Id: swapmon.py,v 1.4 2006/05/02 17:23:14 mlhuang Exp $ +# $Id: swapmon.py,v 1.5 2006/05/09 03:23:57 mlhuang Exp $ # import syslog @@ -349,7 +349,7 @@ def main(): (version, slices) = pickle.load(f) f.close() # Check version of data file - if version != "$Id: swapmon.py,v 1.4 2006/05/02 17:23:14 mlhuang Exp $": + if version != "$Id: swapmon.py,v 1.5 2006/05/09 03:23:57 mlhuang Exp $": print "Not using old version '%s' data file %s" % (version, datafile) raise Exception @@ -366,7 +366,7 @@ def main(): # Delete data file os.unlink(datafile) except Exception: - version = "$Id: swapmon.py,v 1.4 2006/05/02 17:23:14 mlhuang Exp $" + version = "$Id: swapmon.py,v 1.5 2006/05/09 03:23:57 mlhuang Exp $" slices = {} # Query process table every 30 seconds, or when a large change in