11 $alias_addr = "pl-mom\@planet-lab.org";
12 $from_addr = "support\@planet-lab.org";
20 #$bwcap_default = "off";
21 $bwcap_default = "1.5Mbit";
22 $cutoff_default = "16200000000"; # 16GB, for 1.5Mbit cap
25 $sendmail = "/usr/sbin/sendmail -t -f$from_addr";
26 $vservers = "/etc/vservers";
27 $pidfile = "/var/run/$proc.pid";
28 $rebootfile = "/var/lib/misc/pl_mom.reboot";
29 $daily_log = "/var/lib/misc/pl_mom.daily";
30 $daily_stamp = "/var/lib/misc/pl_mom.stamp";
31 $configfile = "/etc/planetlab/pl_mom.conf";
32 $capfile = "/var/lib/misc/pl_mom.oldcaps";
39 $bwcap_default = "1Kbit";
40 $cutoff_default = "10800";
44 $vservers = "./debug";
45 $pidfile = "./$proc.pid";
46 $rebootfile = "./debug/pl_mom.reboot";
47 $daily_log = "./debug/pl_mom.daily";
48 $daily_stamp = "./debug/pl_mom.stamp";
49 $configfile = "./debug/pl_mom.conf";
50 $capfile = "./debug/pl_mom.oldcaps";
55 # daemonize the program
60 system("echo $$ > $pidfile");
64 # Check to see whether pl_mom rebooted the node
67 syslog ("warning", "pl_mom: Sending shutdown mail");
73 syslog ("info", "pl_mom: Launching reboot kicker");
79 syslog ("info", "pl_mom: Launching bandwidth monitor");
80 if ($bwcap_default =~ /off/) {
81 syslog("info", "pl_mom: Max rate unlimited by default");
83 reset_bandwidth_caps();
89 $used = int(swap_used());
91 if (defined($old_used)) {
92 if ($used >= $old_used + $change_thresh) {
93 syslog ("info", "pl_mom: %d%% swap consumed in last %d seconds",
94 $used - $old_used, $sleep);
98 if ($used >= $log_thresh) {
99 if (! defined($old_used) || $used != $old_used) {
100 syslog ("info", "pl_mom: Swap used: %d%%", $used);
103 my $hog = memory_hog();
105 if ($used < $kill_thresh) {
106 if (! defined($Warning{$hog})) {
107 $Warning{$hog} = "sent";
108 syslog ("warning", "pl_mom: Slice $hog is ".
109 "using $Slice{$hog}{mem_pct}%% of memory");
110 #slice_warning_mail($hog);
113 my $id = `id -u $hog`;
115 my $top = `/usr/sbin/chcontext --ctx $id /usr/bin/top -b -n 1`;
116 syslog ("warning", "pl_mom: Resetting slice $hog");
120 syslog ("warning", "pl_mom: Sending mail to slice $hog");
121 slice_reset_mail($hog, $top);
135 if ($used >= $reboot_thresh) {
136 syslog ("warning", "pl_mom: Rebooting node");
138 system("touch $rebootfile");
140 #system("shutdown -r now");
141 system("/bin/sync; /sbin/reboot -f");
150 sub bandwidth_monitor {
152 # See if a new day has started for bandwidth monitoring
153 chomp($now = `date -u +%D`);
154 if (-e $daily_stamp) {
155 chomp($stamp = `cat $daily_stamp`);
157 if (! defined($stamp) || !($stamp =~ $now)) {
158 open (STAMP, ">$daily_stamp") ||
159 die "Can't open file $daily_stamp for writing: $!\n";
160 print STAMP "$now\n";
164 # Could save the list of capped slices in a file in order to
165 # avoid re-sending mails if the daemon restarts.
166 # Also may want a list of slices that are exempt from capping.
167 if (defined(%Start)) { undef %Start; }
168 if (defined(%Now)) { undef %Now; }
169 if (defined(%Cap)) { undef %Cap; }
171 reset_bandwidth_caps();
173 syslog("info", "pl_mom: Beginning bandwidth monitoring for $now");
177 get_baseline_counts();
180 foreach $slice ( sort (keys %Start) ) {
181 if (defined $Now{$slice}) {
182 $today = $Now{$slice} - $Start{$slice};
183 if (! (defined ($Cutoff{$slice})||$bwcap_default =~ /off/)) {
184 $Cutoff{$slice} = $cutoff_default;
185 $Maxrate{$slice} = $bwcap_default;
189 $cutoff = defined($Cutoff{$slice})
190 ? $Cutoff{$slice} : "<none>";
191 print "Slice $slice sent $today bytes; ".
195 if (defined ($Cutoff{$slice}) &&
196 $today >= $Cutoff{$slice} &&
197 ! defined($Cap{$slice})) {
198 $Cap{$slice} = "sent";
200 log_bandwidth_cap($slice, $Maxrate{$slice});
201 cap_bandwidth($slice, $Maxrate{$slice});
204 # Token bucket for this slice is gone!
212 sub read_config_file {
213 if (-e $configfile) {
214 open (CONFIG, "<$configfile") ||
215 print "Cannot open $configfile; $!\n";
217 if (m/^(.*)=(.*)$/) {
220 print "read_config_file: $1 = ${$1}\n";
228 sub get_slice_names {
229 # Read slice names from /etc/passwd
230 if (defined (%Name)) { undef %Name; }
231 open (PASSWD, "</etc/passwd") ||
232 print "Cannot open /etc/passwd; $!\n";
234 my ($slicename, $passwd, $sliceid) = split(/:/);
235 $Name{$sliceid} = $slicename;
240 sub get_baseline_counts {
242 open (BASE, "+<$daily_log") ||
243 print "Cannot open $daily_log; $!\n";
245 my ($slice, $bytecount) = split(/ /);
246 $Start{$slice} = $bytecount;
249 my $status = `tc -s -d qdisc show`;
250 my $sliceid = 0xffff;
251 @Lines = split(/\n/, $status);
252 foreach $line ( @Lines ) {
253 if ($line =~ /qdisc pfifo (.*): dev/) {
255 # "Capped" buckets all begin with 0x1000. Ignore the root
256 # (0x1000) and default (0x1fff) buckets, as well as
257 # "exempt" buckets that begin with 0x2000 (or anything
258 # other than 0x1000).
259 if (($sliceid & 0xf000) == 0x1000 &&
260 $sliceid != 0x1000 && $sliceid != 0x1fff) {
261 $sliceid = $sliceid & 0x0fff;
266 if ($line =~ /Sent (.*) bytes/) {
268 if ($sliceid != 0xffff) {
269 my $slice = $Name{$sliceid};
270 if ($debug && $bytes) {
271 print "Slice: $slice ($sliceid), bytes $bytes\n";
273 if (! defined($Start{$slice})) {
274 print BASE "$slice $bytes\n";
275 $Start{$slice} = $bytes;
277 $Now{$slice} = $bytes;
285 sub get_slice_limits {
286 if (defined %Maxrate) { undef %Maxrate; }
287 if (defined %Cutoff) { undef %Cutoff; }
289 my $result = `grep -H "^BWAVGRATE" $vservers/*.conf`;
291 my @Lines = split(/\n/,$result);
292 foreach $line ( @Lines ) {
293 if ($line =~ /\/([^\/]*).conf:BWAVGRATE=(.*)[Mm]bit/) {
296 $cutoff = ($2 * 1000000 * 86400)/8;
298 if ($line =~ /\/([^\/]*).conf:BWAVGRATE=(.*)[Kk]bit/) {
301 $cutoff = ($2 * 1000 * 86400)/8;
303 die "Could not parse line $line";
306 $Maxrate{$slice} = $limit;
307 $Cutoff{$slice} = $cutoff;
312 sub reset_bandwidth_caps {
314 open(CAP, "<$capfile") or die "Cannot open $capfile: $!";
317 ($slicename, $oldcap) = split(/ /);
318 syslog("info", "pl_mom: Restoring bandwidth cap of $oldcap ".
320 cap_bandwidth ($slicename, $oldcap);
327 sub log_bandwidth_cap {
328 ($slicename, $cap) = @_;
329 syslog("warning", "pl_mom: Capping bandwidth of slice ".
330 "$slicename at $cap until midnight GMT.");
331 # Save current cap to $capfile
332 system("echo $slicename `bwlimit getcap $slicename` >> $capfile");
336 # Arg 0: recipient addresses, comma-separated string
337 # Arg 1: subject line
338 # Arg 2: body of message
339 my $to = "To: $_[0]\n";
340 my $from = "From: $from_addr\n";
341 my $subject = "Subject: $_[1]\n";
348 open(SENDMAIL, "|$sendmail") or die "Cannot open $sendmail: $!";
350 print SENDMAIL $from;
351 print SENDMAIL $subject;
352 print SENDMAIL "Content-type: text/plain\n\n";
359 ($slicename, $cap) = @_;
360 system("bwlimit setcap $slicename $cap");
361 system("bwlimit on $slicename");
365 my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdat)
367 my $date = sprintf("%4d-%02d-%02d %02d:%02d:%02d",
368 $year+1900, $mon+1, $mday, $hour, $min, $sec);
373 my ($slicename) = @_;
374 my $hostname = hostname();
375 my $date = get_date();
376 my $sent = int($Cutoff{$slicename}/(1024*1024));
377 my $bwcap = $Maxrate{$slicename};
379 send_mail("$alias_addr, $slicename\@slices.planet-lab.org",
380 "$proc capped bandwidth of slice $slicename on $hostname",
381 "Slice $slicename has transmitted more than ${sent}MB today".
383 "Its bandwidth will be capped at $bwcap until midnight GMT.".
384 "\n\n$date $hostname bwcap $slicename\n");
388 my $hostname = hostname();
389 my $date = get_date();
390 send_mail($alias_addr,
391 "$proc rebooted $hostname",
392 "Swap space was exhausted on $hostname and so $proc rebooted ".
393 "it.\n\nAs of $date, the node has successfully come back ".
394 "online.\n\n$date $hostname reboot\n");
397 sub slice_reset_mail {
400 my $hog_mem = sprintf ("%0.f", $Slice{$hog}{pmem}/1000);
401 my $hog_pct = $Slice{$hog}{mem_pct};
402 my $hostname = hostname();
403 my $date = get_date();
404 send_mail("$alias_addr, $hog\@slices.planet-lab.org",
405 "$proc reset slice $hog on $hostname",
406 "As of $date, swap space is nearly exhausted on $hostname.\n\n".
407 "Slice $hog is being reset since it is the largest consumer ".
408 "of physical memory at ${hog_mem}MB ($hog_pct%).\n\n".
409 "Please reply to this message explaining the nature of your ".
410 "experiment, and what you are doing to address the problem.\n".
411 "\nOutput of 'top -b -n 1' in your slice prior to reset:\n".
412 "$top\n\n$date $hostname reset $hog\n");
415 sub slice_warning_mail {
417 my $hog_mem = sprintf ("%0.f", $Slice{$hog}{pmem}/1000);
418 my $hog_pct = $Slice{$hog}{mem_pct};
419 my $hostname = hostname();
420 my $date = get_date();
422 if ($hog =~ /^root$/) {
425 $to = "$alias_addr, $hog\@slices.planet-lab.org";
429 "$proc may reset slice $hog on $hostname",
430 "As of $date, swap space is over $log_thresh% full on ".
431 "$hostname.\n\nSlice $hog is the largest consumer ".
432 "of physical memory at ${hog_mem}MB ($hog_pct%).\n".
433 "Please check the memory usage of your slice to avoid a ".
434 "reset.\n\n$date $hostname warning $hog\n");
437 sub unkillable_alarm_mail {
439 my $hog_mem = sprintf ("%0.f", $Slice{$hog}{pmem}/1000);
440 my $hog_pct = $Slice{$hog}{mem_pct};
441 my $hostname = hostname();
442 my $date = get_date();
444 if ($hog =~ /^root$/) {
447 $to = "$alias_addr, $hog\@slices.planet-lab.org";
451 "$proc: alarm for slice $hog on $hostname",
452 "As of $date, swap space is over $log_thresh% full on ".
453 "$hostname.\n\nSlice $hog is the largest consumer ".
454 "of physical memory at ${hog_mem}MB ($hog_pct%).\n".
455 "The slice will not be reset, but please verify its behavior.\n".
456 "\n$date $hostname alarm $hog\n");
461 my $sliceid = $Slice{$slice}{ctx};
462 system("chcontext --ctx $sliceid sudo kill -9 -1");
463 system("/etc/init.d/vserver-init start $slice");
467 open (SWAP, "</proc/swaps") ||
468 die "Cannot open /proc/swaps; $!\n";
472 $line =~ s/[\t ]+/ /g;
474 my ($filename, $type, $size, $used, $priority) = split(/ /, $line);
477 return 100*($used/$size);
482 $content = `curl -s http://127.0.0.1:3100/slicestat`;
484 #$content = `cat ../pl_mom-deploy/slicestat`
485 $content = `curl -s http://127.0.0.1:3100/slicestat`;
487 my @lines = split(/\n/, $content);
489 foreach $line (@lines) {
490 my ($slice, $ctx, $cpu_pct, $mem_pct, $pmem, $vmem, $ntasks)
492 $Slice{$slice}{ctx} = $ctx;
493 $Slice{$slice}{cpu_pct} = $cpu_pct;
494 $Slice{$slice}{mem_pct} = $mem_pct;
495 $Slice{$slice}{pmem} = $pmem;
496 $Slice{$slice}{vmem} = $vmem;
497 $Slice{$slice}{ntasks} = $ntasks;
502 @keys = sort { $Slice{$b}{mem_pct} <=> $Slice{$a}{mem_pct} } (keys %Slice);
503 foreach $key (@keys) {
504 if ($Slice{$key}{mem_pct} >= $min_thresh) {
505 if ($key =~ /^root$/ || $key =~ /slicestat/ || $key =~ /netflow/) {
506 if (! defined ($Warning{$key})) {
507 $Warning{$key} = "sent";
508 syslog ("warning", "pl_mom: Sending alarm mail to ".
509 "unkillable slice $key, using ".
510 "$Slice{$key}{mem_pct}%% of memory");
511 unkillable_alarm_mail($key);
517 #syslog ("info", "pl_mom: No killable slice using > ".
518 # "$min_thresh%% memory");
525 chdir '/' or die "Can't chdir to /: $!";
526 open STDIN, '/dev/null' or die "Can't read /dev/null: $!";
527 open STDOUT, '>>/dev/null' or die "Can't write to /dev/null: $!";
528 open STDERR, '>>/dev/null' or die "Can't write to /dev/null: $!";
529 defined(my $pid = fork) or die "Can't fork: $!";
531 setsid or die "Can't start a new session: $!";