10 $alias_addr = "pl-mom\@planet-lab.org";
11 $from_addr = "support\@planet-lab.org";
19 #$bwcap_default = "off";
20 $bwcap_default = "1.5Mbit";
21 $cutoff_default = "16200000000"; # 16GB, for 1.5Mbit cap
24 $sendmail = "/usr/sbin/sendmail -t -f$from_addr";
25 $vservers = "/etc/vservers";
26 $pidfile = "/var/run/$proc.pid";
27 $rebootfile = "/var/lib/misc/pl_mom.reboot";
28 $daily_log = "/var/lib/misc/pl_mom.daily";
29 $daily_stamp = "/var/lib/misc/pl_mom.stamp";
30 $configfile = "/etc/planetlab/pl_mom.conf";
31 $capfile = "/var/lib/misc/pl_mom.oldcaps";
38 $bwcap_default = "1Kbit";
39 $cutoff_default = "10800";
43 $vservers = "./debug";
44 $pidfile = "./$proc.pid";
45 $rebootfile = "./debug/pl_mom.reboot";
46 $daily_log = "./debug/pl_mom.daily";
47 $daily_stamp = "./debug/pl_mom.stamp";
48 $configfile = "./debug/pl_mom.conf";
49 $capfile = "./debug/pl_mom.oldcaps";
54 # daemonize the program
59 system("echo $$ > $pidfile");
63 # Check to see whether pl_mom rebooted the node
66 syslog ("warning", "pl_mom: Sending shutdown mail");
72 syslog ("info", "pl_mom: Launching reboot kicker");
78 syslog ("info", "pl_mom: Launching bandwidth monitor");
79 if ($bwcap_default =~ /off/) {
80 syslog("info", "pl_mom: Max rate unlimited by default");
82 reset_bandwidth_caps();
88 $used = int(swap_used());
90 if (defined($old_used)) {
91 if ($used >= $old_used + $change_thresh) {
92 syslog ("info", "pl_mom: %d%% swap consumed in last %d seconds",
93 $used - $old_used, $sleep);
97 if ($used >= $log_thresh) {
98 if (! defined($old_used) || $used != $old_used) {
99 syslog ("info", "pl_mom: Swap used: %d%%", $used);
102 my $hog = memory_hog();
104 if ($used < $kill_thresh) {
105 if (! defined($Warning{$hog})) {
106 $Warning{$hog} = "sent";
107 syslog ("warning", "pl_mom: Slice $hog is ".
108 "using $Slice{$hog}{mem_pct}%% of memory");
109 #slice_warning_mail($hog);
112 my $id = `id -u $hog`;
114 my $top = `/usr/sbin/chcontext --ctx $id /usr/bin/top -b -n 1`;
115 syslog ("warning", "pl_mom: Resetting slice $hog");
119 syslog ("warning", "pl_mom: Sending mail to slice $hog");
120 slice_reset_mail($hog, $top);
134 if ($used >= $reboot_thresh) {
135 syslog ("warning", "pl_mom: Rebooting node");
137 system("touch $rebootfile");
139 #system("shutdown -r now");
140 system("/bin/sync; /sbin/reboot -f");
149 sub bandwidth_monitor {
151 # See if a new day has started for bandwidth monitoring
152 chomp($now = `date -u +%D`);
153 if (-e $daily_stamp) {
154 chomp($stamp = `cat $daily_stamp`);
156 if (! defined($stamp) || !($stamp =~ $now)) {
157 open (STAMP, ">$daily_stamp") ||
158 die "Can't open file $daily_stamp for writing: $!\n";
159 print STAMP "$now\n";
163 # Could save the list of capped slices in a file in order to
164 # avoid re-sending mails if the daemon restarts.
165 # Also may want a list of slices that are exempt from capping.
166 if (defined(%Start)) { undef %Start; }
167 if (defined(%Now)) { undef %Now; }
168 if (defined(%Cap)) { undef %Cap; }
170 reset_bandwidth_caps();
172 syslog("info", "pl_mom: Beginning bandwidth monitoring for $now");
176 get_baseline_counts();
179 foreach $slice ( sort (keys %Start) ) {
180 if (defined $Now{$slice}) {
181 $today = $Now{$slice} - $Start{$slice};
182 if (! (defined ($Cutoff{$slice})||$bwcap_default =~ /off/)) {
183 $Cutoff{$slice} = $cutoff_default;
184 $Maxrate{$slice} = $bwcap_default;
188 $cutoff = defined($Cutoff{$slice})
189 ? $Cutoff{$slice} : "<none>";
190 print "Slice $slice sent $today bytes; ".
194 if (defined ($Cutoff{$slice}) &&
195 $today >= $Cutoff{$slice} &&
196 ! defined($Cap{$slice})) {
197 $Cap{$slice} = "sent";
199 log_bandwidth_cap($slice, $Maxrate{$slice});
200 cap_bandwidth($slice, $Maxrate{$slice});
203 # Token bucket for this slice is gone!
211 sub read_config_file {
212 if (-e $configfile) {
213 open (CONFIG, "<$configfile") ||
214 print "Cannot open $configfile; $!\n";
216 if (m/^(.*)=(.*)$/) {
219 print "read_config_file: $1 = ${$1}\n";
227 sub get_slice_names {
228 # Read slice names from /etc/passwd
229 if (defined (%Name)) { undef %Name; }
230 open (PASSWD, "</etc/passwd") ||
231 print "Cannot open /etc/passwd; $!\n";
233 my ($slicename, $passwd, $sliceid) = split(/:/);
234 $Name{$sliceid} = $slicename;
239 sub get_baseline_counts {
241 open (BASE, "+<$daily_log") ||
242 print "Cannot open $daily_log; $!\n";
244 my ($slice, $bytecount) = split(/ /);
245 $Start{$slice} = $bytecount;
248 my $status = `tc -s -d qdisc show`;
249 my $sliceid = 0xffff;
250 @Lines = split(/\n/, $status);
251 foreach $line ( @Lines ) {
252 if ($line =~ /qdisc pfifo (.*): dev/) {
254 # "Capped" buckets all begin with 0x1000. Ignore the root
255 # (0x1000) and default (0x1fff) buckets, as well as
256 # "exempt" buckets that begin with 0x2000 (or anything
257 # other than 0x1000).
258 if (($sliceid & 0xf000) == 0x1000 &&
259 $sliceid != 0x1000 && $sliceid != 0x1fff) {
260 $sliceid = $sliceid & 0x0fff;
265 if ($line =~ /Sent (.*) bytes/) {
267 if ($sliceid != 0xffff) {
268 my $slice = $Name{$sliceid};
269 if ($debug && $bytes) {
270 print "Slice: $slice ($sliceid), bytes $bytes\n";
272 if (! defined($Start{$slice})) {
273 print BASE "$slice $bytes\n";
274 $Start{$slice} = $bytes;
276 $Now{$slice} = $bytes;
284 sub get_slice_limits {
285 if (defined %Maxrate) { undef %Maxrate; }
286 if (defined %Cutoff) { undef %Cutoff; }
288 my $result = `grep -H "^BWAVGRATE" $vservers/*.conf`;
290 my @Lines = split(/\n/,$result);
291 foreach $line ( @Lines ) {
292 if ($line =~ /\/([^\/]*).conf:BWAVGRATE=(.*)[Mm]bit/) {
295 $cutoff = ($2 * 1000000 * 86400)/8;
297 if ($line =~ /\/([^\/]*).conf:BWAVGRATE=(.*)[Kk]bit/) {
300 $cutoff = ($2 * 1000 * 86400)/8;
302 die "Could not parse line $line";
305 $Maxrate{$slice} = $limit;
306 $Cutoff{$slice} = $cutoff;
311 sub reset_bandwidth_caps {
313 open(CAP, "<$capfile") or die "Cannot open $capfile: $!";
316 ($slicename, $oldcap) = split(/ /);
317 syslog("info", "pl_mom: Restoring bandwidth cap of $oldcap ".
319 cap_bandwidth ($slicename, $oldcap);
326 sub log_bandwidth_cap {
327 ($slicename, $cap) = @_;
328 syslog("warning", "pl_mom: Capping bandwidth of slice ".
329 "$slicename at $cap until midnight GMT.");
330 # Save current cap to $capfile
331 system("echo $slicename `bwlimit getcap $slicename` >> $capfile");
335 # Arg 0: recipient addresses, comma-separated string
336 # Arg 1: subject line
337 # Arg 2: body of message
338 my $to = "To: $_[0]\n";
339 my $from = "From: $from_addr\n";
340 my $subject = "Subject: $_[1]\n";
347 open(SENDMAIL, "|$sendmail") or die "Cannot open $sendmail: $!";
349 print SENDMAIL $from;
350 print SENDMAIL $subject;
351 print SENDMAIL "Content-type: text/plain\n\n";
358 ($slicename, $cap) = @_;
359 system("bwlimit setcap $slicename $cap");
360 system("bwlimit on $slicename");
364 my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdat)
366 my $date = sprintf("%4d-%02d-%02d %02d:%02d:%02d",
367 $year+1900, $mon+1, $mday, $hour, $min, $sec);
372 my ($slicename) = @_;
373 my $hostname = hostname();
374 my $date = get_date();
375 my $sent = int($Cutoff{$slicename}/(1024*1024));
376 my $bwcap = $Maxrate{$slicename};
378 send_mail("$alias_addr, $slicename\@slices.planet-lab.org",
379 "$proc capped bandwidth of slice $slicename on $hostname",
380 "Slice $slicename has transmitted more than ${sent}MB today".
382 "Its bandwidth will be capped at $bwcap until midnight GMT.".
383 "\n\n$date $hostname bwcap $slicename\n");
387 my $hostname = hostname();
388 my $date = get_date();
389 send_mail($alias_addr,
390 "$proc rebooted $hostname",
391 "Swap space was exhausted on $hostname and so $proc rebooted ".
392 "it.\n\nAs of $date, the node has successfully come back ".
393 "online.\n\n$date $hostname reboot\n");
396 sub slice_reset_mail {
399 my $hog_mem = sprintf ("%0.f", $Slice{$hog}{pmem}/1000);
400 my $hog_pct = $Slice{$hog}{mem_pct};
401 my $hostname = hostname();
402 my $date = get_date();
403 send_mail("$alias_addr, $hog\@slices.planet-lab.org",
404 "$proc reset slice $hog on $hostname",
405 "As of $date, swap space is nearly exhausted on $hostname.\n\n".
406 "Slice $hog is being reset since it is the largest consumer ".
407 "of physical memory at ${hog_mem}MB ($hog_pct%).\n\n".
408 "Please reply to this message explaining the nature of your ".
409 "experiment, and what you are doing to address the problem.\n".
410 "\nOutput of 'top -b -n 1' in your slice prior to reset:\n".
411 "$top\n\n$date $hostname reset $hog\n");
414 sub slice_warning_mail {
416 my $hog_mem = sprintf ("%0.f", $Slice{$hog}{pmem}/1000);
417 my $hog_pct = $Slice{$hog}{mem_pct};
418 my $hostname = hostname();
419 my $date = get_date();
421 if ($hog =~ /^root$/) {
424 $to = "$alias_addr, $hog\@slices.planet-lab.org";
428 "$proc may reset slice $hog on $hostname",
429 "As of $date, swap space is over $log_thresh% full on ".
430 "$hostname.\n\nSlice $hog is the largest consumer ".
431 "of physical memory at ${hog_mem}MB ($hog_pct%).\n".
432 "Please check the memory usage of your slice to avoid a ".
433 "reset.\n\n$date $hostname warning $hog\n");
436 sub unkillable_alarm_mail {
438 my $hog_mem = sprintf ("%0.f", $Slice{$hog}{pmem}/1000);
439 my $hog_pct = $Slice{$hog}{mem_pct};
440 my $hostname = hostname();
441 my $date = get_date();
443 if ($hog =~ /^root$/) {
446 $to = "$alias_addr, $hog\@slices.planet-lab.org";
450 "$proc: alarm for slice $hog on $hostname",
451 "As of $date, swap space is over $log_thresh% full on ".
452 "$hostname.\n\nSlice $hog is the largest consumer ".
453 "of physical memory at ${hog_mem}MB ($hog_pct%).\n".
454 "The slice will not be reset, but please verify its behavior.\n".
455 "\n$date $hostname alarm $hog\n");
460 my $sliceid = $Slice{$slice}{ctx};
461 system("chcontext --ctx $sliceid sudo kill -9 -1");
462 system("/etc/init.d/vserver-init start $slice");
466 open (SWAP, "</proc/swaps") ||
467 die "Cannot open /proc/swaps; $!\n";
471 $line =~ s/[\t ]+/ /g;
473 my ($filename, $type, $size, $used, $priority) = split(/ /, $line);
476 return 100*($used/$size);
481 $content = `curl -s http://127.0.0.1:3100/slicestat`;
483 #$content = `cat ../pl_mom-deploy/slicestat`
484 $content = `curl -s http://127.0.0.1:3100/slicestat`;
486 my @lines = split(/\n/, $content);
488 foreach $line (@lines) {
489 my ($slice, $ctx, $cpu_pct, $mem_pct, $pmem, $vmem, $ntasks)
491 $Slice{$slice}{ctx} = $ctx;
492 $Slice{$slice}{cpu_pct} = $cpu_pct;
493 $Slice{$slice}{mem_pct} = $mem_pct;
494 $Slice{$slice}{pmem} = $pmem;
495 $Slice{$slice}{vmem} = $vmem;
496 $Slice{$slice}{ntasks} = $ntasks;
501 @keys = sort { $Slice{$b}{mem_pct} <=> $Slice{$a}{mem_pct} } (keys %Slice);
502 foreach $key (@keys) {
503 if ($Slice{$key}{mem_pct} >= $min_thresh) {
504 if ($key =~ /^root$/ || $key =~ /slicestat/ || $key =~ /netflow/) {
505 if (! defined ($Warning{$key})) {
506 $Warning{$key} = "sent";
507 syslog ("warning", "pl_mom: Sending alarm mail to ".
508 "unkillable slice $key, using ".
509 "$Slice{$key}{mem_pct}%% of memory");
510 unkillable_alarm_mail($key);
516 #syslog ("info", "pl_mom: No killable slice using > ".
517 # "$min_thresh%% memory");
524 chdir '/' or die "Can't chdir to /: $!";
525 open STDIN, '/dev/null' or die "Can't read /dev/null: $!";
526 open STDOUT, '>>/dev/null' or die "Can't write to /dev/null: $!";
527 open STDERR, '>>/dev/null' or die "Can't write to /dev/null: $!";
528 defined(my $pid = fork) or die "Can't fork: $!";
530 setsid or die "Can't start a new session: $!";