10 $alias_addr = "pl-mom\@planet-lab.org";
11 $from_addr = "support\@planet-lab.org";
21 $byte_cutoff = 16000000000; # 16GB
24 $sendmail = "/usr/sbin/sendmail -t -f$from_addr";
25 $pidfile = "/var/run/$proc.pid";
26 $rebootfile = "/var/lib/misc/pl_mom.reboot";
27 $daily_log = "/var/lib/misc/pl_mom.daily";
28 $daily_stamp = "/var/lib/misc/pl_mom.stamp";
29 $configfile = "/etc/planetlab/pl_mom.conf";
30 $capfile = "/var/lib/misc/pl_mom.oldcaps";
41 $pidfile = "./$proc.pid";
42 $rebootfile = "./pl_mom.reboot";
43 $daily_log = "./pl_mom.daily";
44 $daily_stamp = "./pl_mom.stamp";
45 $configfile = "./pl_mom.conf";
46 $capfile = "./pl_mom.oldcaps";
51 # daemonize the program
56 system("echo $$ > $pidfile");
60 # Check to see whether pl_mom rebooted the node
63 syslog ("warning", "pl_mom: Sending shutdown mail");
69 syslog ("info", "pl_mom: Launching reboot kicker");
75 syslog ("info", "pl_mom: Launching bandwidth monitor");
76 if ($bwcap =~ /off/) {
77 syslog("info", "pl_mom: Bandwidth capping is off");
79 reset_bandwidth_caps();
85 $used = int(swap_used());
87 if (defined($old_used)) {
88 if ($used >= $old_used + $change_thresh) {
89 syslog ("info", "pl_mom: %d%% swap consumed in last %d seconds",
90 $used - $old_used, $sleep);
94 if ($used >= $log_thresh) {
95 if (! defined($old_used) || $used != $old_used) {
96 syslog ("info", "pl_mom: Swap used: %d%%", $used);
99 my $hog = memory_hog();
101 if ($used < $kill_thresh) {
102 if (! defined($Warning{$hog})) {
103 $Warning{$hog} = "sent";
104 syslog ("warning", "pl_mom: Slice $hog is ".
105 "using $Slice{$hog}{mem_pct}%% of memory");
106 #slice_warning_mail($hog);
109 my $id = `id -u $hog`;
111 my $top = `/usr/sbin/chcontext --ctx $id /usr/bin/top -b -n 1`;
112 syslog ("warning", "pl_mom: Resetting slice $hog");
116 syslog ("warning", "pl_mom: Sending mail to slice $hog");
117 slice_reset_mail($hog, $top);
131 if ($used >= $reboot_thresh) {
132 syslog ("warning", "pl_mom: Rebooting node");
134 system("touch $rebootfile");
136 #system("shutdown -r now");
137 system("/bin/sync; /sbin/reboot -f");
146 sub bandwidth_monitor {
148 # See if a new day has started for bandwidth monitoring
149 chomp($now = `date -u +%D`);
150 if (-e $daily_stamp) {
151 chomp($stamp = `cat $daily_stamp`);
153 if (! defined($stamp) || !($stamp =~ $now)) {
154 open (STAMP, ">$daily_stamp") ||
155 die "Can't open file $daily_stamp for writing: $!\n";
156 print STAMP "$now\n";
160 # Could save the list of capped slices in a file in order to
161 # avoid re-sending mails if the daemon restarts.
162 # Also may want a list of slices that are exempt from capping.
163 if (defined(%Start)) { undef %Start; }
164 if (defined(%Now)) { undef %Now; }
165 if (defined(%Cap)) { undef %Cap; }
167 reset_bandwidth_caps();
169 syslog("info", "pl_mom: Beginning bandwidth monitoring for $now");
172 # Get baseline counts
174 open (BASE, "+<$daily_log") ||
175 print "Cannot open $daily_log; $!\n";
177 my ($sliceid, $bytecount) = split(/ /);
178 $Start{$sliceid} = $bytecount;
181 $status = `tc -s -d qdisc show`;
182 @lines = split(/\n/, $status);
183 for ($i = 0; $i < @lines; $i++) {
184 if ($lines[$i] =~ /qdisc pfifo/) {
185 $lines[$i] =~ s/^ +//;
186 @fields = split(/ /, $lines[$i]);
190 if ($slice != 9999) {
191 $lines[$i+1] =~ s/^ +//;
192 @fields = split(/ /, $lines[$i+1]);
194 #if ($bytes) {print "Slice $slice sent $bytes bytes\n";}
196 if (! defined($Start{$slice})) {
197 print BASE "$slice $bytes\n";
198 $Start{$slice} = $bytes;
200 $Now{$slice} = $bytes;
206 if (!($bwcap =~ /off/)) {
207 foreach $slice ( sort (keys %Start) ) {
208 if (defined $Now{$slice}) {
209 $today = $Now{$slice} - $Start{$slice};
210 if ($today >= $byte_cutoff && ! defined($Cap{$slice})) {
211 $Cap{$slice} = "sent";
212 $slicename = get_slice_name($slice);
214 bw_cap_mail($slicename);
215 log_bandwidth_cap($slicename, $bwcap);
216 cap_bandwidth($slicename, $bwcap);
218 syslog("warning", "pl_mom: Could not find slice ".
219 "name for slice ID $slice");
223 # Token bucket for this slice is gone!
232 sub read_config_file {
233 if (-e $configfile) {
234 open (CONFIG, "<$configfile") ||
235 print "Cannot open $configfile; $!\n";
237 if (m/^(.*)=(.*)$/) {
240 print "read_config_file: $1 = ${$1}\n";
248 sub reset_bandwidth_caps {
250 open(CAP, "<$capfile") or die "Cannot open $capfile: $!";
253 ($slicename, $oldcap) = split(/ /);
254 syslog("info", "pl_mom: Restoring bandwidth cap of $oldcap ".
256 cap_bandwidth ($slicename, $oldcap);
263 sub log_bandwidth_cap {
264 ($slicename, $cap) = @_;
265 syslog("warning", "pl_mom: Capping bandwidth of slice ".
266 "$slicename at $cap until midnight GMT.");
267 # Save current cap to $capfile
268 system("echo $slicename `bwlimit getcap $slicename` >> $capfile");
272 # Arg 0: recipient addresses, comma-separated string
273 # Arg 1: subject line
274 # Arg 2: body of message
275 my $to = "To: $_[0]\n";
276 my $from = "From: $from_addr\n";
277 my $subject = "Subject: $_[1]\n";
280 open(SENDMAIL, "|$sendmail") or die "Cannot open $sendmail: $!";
282 print SENDMAIL $from;
283 print SENDMAIL $subject;
284 print SENDMAIL "Content-type: text/plain\n\n";
290 ($slicename, $cap) = @_;
291 system("bwlimit setcap $slicename $cap");
292 system("bwlimit on $slicename");
296 my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdat)
298 my $date = sprintf("%4d-%02d-%02d %02d:%02d:%02d",
299 $year+1900, $mon+1, $mday, $hour, $min, $sec);
306 # Need to map slice id to slice name; is there a sensor?
307 # For now, get it from /etc/passwd
309 open (PASSWD, "</etc/passwd") ||
310 print "Cannot open /etc/passwd; $!\n";
312 my ($slicename, $passwd, $sliceid) = split(/:/);
313 if ($sliceid == $_[0]) {
322 my $hostname = hostname();
323 my $date = get_date();
324 my $sent = int($byte_cutoff/1000000000);
326 send_mail("$alias_addr, $slicename\@slices.planet-lab.org",
327 "$proc capped bandwidth of slice $slicename on $hostname",
328 "Slice $slicename has transmitted more than ${sent}GB today".
330 "Its bandwidth will be capped at $bwcap until midnight GMT.".
331 "\n\n$date $hostname bwcap $slicename\n");
335 my $hostname = hostname();
336 my $date = get_date();
337 send_mail($alias_addr,
338 "$proc rebooted $hostname",
339 "Swap space was exhausted on $hostname and so $proc rebooted ".
340 "it.\n\nAs of $date, the node has successfully come back ".
341 "online.\n\n$date $hostname reboot\n");
344 sub slice_reset_mail {
347 my $hog_mem = sprintf ("%0.f", $Slice{$hog}{pmem}/1000);
348 my $hog_pct = $Slice{$hog}{mem_pct};
349 my $hostname = hostname();
350 my $date = get_date();
351 send_mail("$alias_addr, $hog\@slices.planet-lab.org",
352 "$proc reset slice $hog on $hostname",
353 "As of $date, swap space is nearly exhausted on $hostname.\n\n".
354 "Slice $hog is being reset since it is the largest consumer ".
355 "of physical memory at ${hog_mem}MB ($hog_pct%).\n\n".
356 "Please reply to this message explaining the nature of your ".
357 "experiment, and what you are doing to address the problem.\n".
358 "\nOutput of 'top -b -n 1' in your slice prior to reset:\n".
359 "$top\n\n$date $hostname reset $hog\n");
362 sub slice_warning_mail {
364 my $hog_mem = sprintf ("%0.f", $Slice{$hog}{pmem}/1000);
365 my $hog_pct = $Slice{$hog}{mem_pct};
366 my $hostname = hostname();
367 my $date = get_date();
369 if ($hog =~ /^root$/) {
372 $to = "$alias_addr, $hog\@slices.planet-lab.org";
376 "$proc may reset slice $hog on $hostname",
377 "As of $date, swap space is over $log_thresh% full on ".
378 "$hostname.\n\nSlice $hog is the largest consumer ".
379 "of physical memory at ${hog_mem}MB ($hog_pct%).\n".
380 "Please check the memory usage of your slice to avoid a ".
381 "reset.\n\n$date $hostname warning $hog\n");
384 sub unkillable_alarm_mail {
386 my $hog_mem = sprintf ("%0.f", $Slice{$hog}{pmem}/1000);
387 my $hog_pct = $Slice{$hog}{mem_pct};
388 my $hostname = hostname();
389 my $date = get_date();
391 if ($hog =~ /^root$/) {
394 $to = "$alias_addr, $hog\@slices.planet-lab.org";
398 "$proc: alarm for slice $hog on $hostname",
399 "As of $date, swap space is over $log_thresh% full on ".
400 "$hostname.\n\nSlice $hog is the largest consumer ".
401 "of physical memory at ${hog_mem}MB ($hog_pct%).\n".
402 "The slice will not be reset, but please verify its behavior.\n".
403 "\n$date $hostname alarm $hog\n");
408 my $sliceid = $Slice{$slice}{ctx};
409 system("chcontext --ctx $sliceid sudo kill -9 -1");
410 system("/etc/init.d/vserver-init start $slice");
414 open (SWAP, "</proc/swaps") ||
415 die "Cannot open /proc/swaps; $!\n";
419 $line =~ s/[\t ]+/ /g;
421 my ($filename, $type, $size, $used, $priority) = split(/ /, $line);
424 return 100*($used/$size);
429 $content = `curl -s http://127.0.0.1:3100/slicestat`;
431 #$content = `cat ../pl_mom-deploy/slicestat`
432 $content = `curl -s http://127.0.0.1:3100/slicestat`;
434 my @lines = split(/\n/, $content);
436 foreach $line (@lines) {
437 my ($slice, $ctx, $cpu_pct, $mem_pct, $pmem, $vmem, $ntasks)
439 $Slice{$slice}{ctx} = $ctx;
440 $Slice{$slice}{cpu_pct} = $cpu_pct;
441 $Slice{$slice}{mem_pct} = $mem_pct;
442 $Slice{$slice}{pmem} = $pmem;
443 $Slice{$slice}{vmem} = $vmem;
444 $Slice{$slice}{ntasks} = $ntasks;
449 @keys = sort { $Slice{$b}{mem_pct} <=> $Slice{$a}{mem_pct} } (keys %Slice);
450 foreach $key (@keys) {
451 if ($Slice{$key}{mem_pct} >= $min_thresh) {
452 if ($key =~ /^root$/ || $key =~ /slicestat/ || $key =~ /netflow/) {
453 if (! defined ($Warning{$key})) {
454 $Warning{$key} = "sent";
455 syslog ("warning", "pl_mom: Sending alarm mail to ".
456 "unkillable slice $key, using ".
457 "$Slice{$key}{mem_pct}%% of memory");
458 unkillable_alarm_mail($key);
464 #syslog ("info", "pl_mom: No killable slice using > ".
465 # "$min_thresh%% memory");
472 chdir '/' or die "Can't chdir to /: $!";
473 open STDIN, '/dev/null' or die "Can't read /dev/null: $!";
474 open STDOUT, '>>/dev/null' or die "Can't write to /dev/null: $!";
475 open STDERR, '>>/dev/null' or die "Can't write to /dev/null: $!";
476 defined(my $pid = fork) or die "Can't fork: $!";
478 setsid or die "Can't start a new session: $!";