10 $alias_addr = "pl-mom\@planet-lab.org";
11 $from_addr = "support\@planet-lab.org";
19 #$bwcap_default = "off";
20 $bwcap_default = "1.5Mbit";
21 $cutoff_default = "16200000000"; # 16GB, for 1.5Mbit cap
24 $sendmail = "/usr/sbin/sendmail -t -f$from_addr";
25 $vservers = "/etc/vservers";
26 $pidfile = "/var/run/$proc.pid";
27 $rebootfile = "/var/lib/misc/pl_mom.reboot";
28 $daily_log = "/var/lib/misc/pl_mom.daily";
29 $daily_stamp = "/var/lib/misc/pl_mom.stamp";
30 $configfile = "/etc/planetlab/pl_mom.conf";
31 $capfile = "/var/lib/misc/pl_mom.oldcaps";
38 $bwcap_default = "1Kbit";
39 $cutoff_default = "10800";
43 $vservers = "./debug";
44 $pidfile = "./$proc.pid";
45 $rebootfile = "./debug/pl_mom.reboot";
46 $daily_log = "./debug/pl_mom.daily";
47 $daily_stamp = "./debug/pl_mom.stamp";
48 $configfile = "./debug/pl_mom.conf";
49 $capfile = "./debug/pl_mom.oldcaps";
54 # daemonize the program
59 system("echo $$ > $pidfile");
63 # Check to see whether pl_mom rebooted the node
66 syslog ("warning", "pl_mom: Sending shutdown mail");
72 syslog ("info", "pl_mom: Launching reboot kicker");
78 syslog ("info", "pl_mom: Launching bandwidth monitor");
79 if ($bwcap_default =~ /off/) {
80 syslog("info", "pl_mom: Max rate unlimited by default");
82 reset_bandwidth_caps();
88 $used = int(swap_used());
90 if (defined($old_used)) {
91 if ($used >= $old_used + $change_thresh) {
92 syslog ("info", "pl_mom: %d%% swap consumed in last %d seconds",
93 $used - $old_used, $sleep);
97 if ($used >= $log_thresh) {
98 if (! defined($old_used) || $used != $old_used) {
99 syslog ("info", "pl_mom: Swap used: %d%%", $used);
102 my $hog = memory_hog();
104 if ($used < $kill_thresh) {
105 if (! defined($Warning{$hog})) {
106 $Warning{$hog} = "sent";
107 syslog ("warning", "pl_mom: Slice $hog is ".
108 "using $Slice{$hog}{mem_pct}%% of memory");
109 #slice_warning_mail($hog);
112 my $id = `id -u $hog`;
114 my $top = `/usr/sbin/chcontext --ctx $id /usr/bin/top -b -n 1`;
115 syslog ("warning", "pl_mom: Resetting slice $hog");
119 syslog ("warning", "pl_mom: Sending mail to slice $hog");
120 slice_reset_mail($hog, $top);
134 if ($used >= $reboot_thresh) {
135 syslog ("warning", "pl_mom: Rebooting node");
137 system("touch $rebootfile");
139 #system("shutdown -r now");
140 system("/bin/sync; /sbin/reboot -f");
149 sub bandwidth_monitor {
151 # See if a new day has started for bandwidth monitoring
152 chomp($now = `date -u +%D`);
153 if (-e $daily_stamp) {
154 chomp($stamp = `cat $daily_stamp`);
156 if (! defined($stamp) || !($stamp =~ $now)) {
157 open (STAMP, ">$daily_stamp") ||
158 die "Can't open file $daily_stamp for writing: $!\n";
159 print STAMP "$now\n";
163 # Could save the list of capped slices in a file in order to
164 # avoid re-sending mails if the daemon restarts.
165 # Also may want a list of slices that are exempt from capping.
166 if (defined(%Start)) { undef %Start; }
167 if (defined(%Now)) { undef %Now; }
168 if (defined(%Cap)) { undef %Cap; }
170 reset_bandwidth_caps();
172 syslog("info", "pl_mom: Beginning bandwidth monitoring for $now");
176 get_baseline_counts();
179 foreach $slice ( sort (keys %Start) ) {
180 if (defined $Now{$slice}) {
181 $today = $Now{$slice} - $Start{$slice};
182 if (! (defined ($Cutoff{$slice})||$bwcap_default =~ /off/)) {
183 $Cutoff{$slice} = $cutoff_default;
184 $Maxrate{$slice} = $bwcap_default;
188 $cutoff = defined($Cutoff{$slice})
189 ? $Cutoff{$slice} : "<none>";
190 print "Slice $slice sent $today bytes; ".
194 if (defined ($Cutoff{$slice}) &&
195 $today >= $Cutoff{$slice} &&
196 ! defined($Cap{$slice})) {
197 $Cap{$slice} = "sent";
200 log_bandwidth_cap($slice, $Maxrate{$slice});
201 cap_bandwidth($slice, $Maxrate{$slice});
205 # Token bucket for this slice is gone!
213 sub read_config_file {
214 if (-e $configfile) {
215 open (CONFIG, "<$configfile") ||
216 print "Cannot open $configfile; $!\n";
218 if (m/^(.*)=(.*)$/) {
221 print "read_config_file: $1 = ${$1}\n";
229 sub get_slice_names {
230 # Read slice names from /etc/passwd
231 if (defined (%Name)) { undef %Name; }
232 open (PASSWD, "</etc/passwd") ||
233 print "Cannot open /etc/passwd; $!\n";
235 my ($slicename, $passwd, $sliceid) = split(/:/);
236 $Name{$sliceid} = $slicename;
241 sub get_baseline_counts {
243 open (BASE, "+<$daily_log") ||
244 print "Cannot open $daily_log; $!\n";
246 my ($slice, $bytecount) = split(/ /);
247 $Start{$slice} = $bytecount;
250 my $status = `tc -s -d qdisc show`;
251 my $sliceid = "9999";
252 @Lines = split(/\n/, $status);
253 foreach $line ( @Lines ) {
254 if ($line =~ /qdisc pfifo (.*): dev/) {
257 if ($line =~ /Sent (.*) bytes/) {
259 if ($sliceid != 9999) {
260 my $slice = $Name{$sliceid};
261 if ($debug && $bytes) {
262 print "Slice: $slice ($sliceid), bytes $bytes\n";
264 if (! defined($Start{$slice})) {
265 print BASE "$slice $bytes\n";
266 $Start{$slice} = $bytes;
268 $Now{$slice} = $bytes;
276 sub get_slice_limits {
277 if (defined %Maxrate) { undef %Maxrate; }
278 if (defined %Cutoff) { undef %Cutoff; }
280 my $result = `grep -H "^BWMAXRATE" $vservers/*.conf`;
282 my @Lines = split(/\n/,$result);
283 foreach $line ( @Lines ) {
284 if ($line =~ /\/([^\/]*).conf:BWMAXRATE=(.*)[Mm]bit/) {
287 $cutoff = ($2 * 1000000 * 86400)/8;
289 if ($line =~ /\/([^\/]*).conf:BWMAXRATE=(.*)[Kk]bit/) {
292 $cutoff = ($2 * 1000 * 86400)/8;
294 die "Could not parse line $line";
297 $Maxrate{$slice} = $limit;
298 $Cutoff{$slice} = $cutoff;
300 print "Slice $slice, maxrate $Maxrate{$slice}, ".
301 "cutoff $Cutoff{$slice}\n";
307 sub reset_bandwidth_caps {
309 open(CAP, "<$capfile") or die "Cannot open $capfile: $!";
312 ($slicename, $oldcap) = split(/ /);
313 syslog("info", "pl_mom: Restoring bandwidth cap of $oldcap ".
315 cap_bandwidth ($slicename, $oldcap);
322 sub log_bandwidth_cap {
323 ($slicename, $cap) = @_;
324 syslog("warning", "pl_mom: Capping bandwidth of slice ".
325 "$slicename at $cap until midnight GMT.");
326 # Save current cap to $capfile
327 system("echo $slicename `bwlimit getcap $slicename` >> $capfile");
331 # Arg 0: recipient addresses, comma-separated string
332 # Arg 1: subject line
333 # Arg 2: body of message
334 my $to = "To: $_[0]\n";
335 my $from = "From: $from_addr\n";
336 my $subject = "Subject: $_[1]\n";
339 open(SENDMAIL, "|$sendmail") or die "Cannot open $sendmail: $!";
341 print SENDMAIL $from;
342 print SENDMAIL $subject;
343 print SENDMAIL "Content-type: text/plain\n\n";
349 ($slicename, $cap) = @_;
350 system("bwlimit setcap $slicename $cap");
351 system("bwlimit on $slicename");
355 my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdat)
357 my $date = sprintf("%4d-%02d-%02d %02d:%02d:%02d",
358 $year+1900, $mon+1, $mday, $hour, $min, $sec);
363 my ($slicename) = @_;
364 my $hostname = hostname();
365 my $date = get_date();
366 my $sent = int($Cutoff{$slicename}/(1024*1024));
367 my $bwcap = $Maxrate{$slicename};
369 send_mail("$alias_addr, $slicename\@slices.planet-lab.org",
370 "$proc capped bandwidth of slice $slicename on $hostname",
371 "Slice $slicename has transmitted more than ${sent}MB today".
373 "Its bandwidth will be capped at $bwcap until midnight GMT.".
374 "\n\n$date $hostname bwcap $slicename\n");
378 my $hostname = hostname();
379 my $date = get_date();
380 send_mail($alias_addr,
381 "$proc rebooted $hostname",
382 "Swap space was exhausted on $hostname and so $proc rebooted ".
383 "it.\n\nAs of $date, the node has successfully come back ".
384 "online.\n\n$date $hostname reboot\n");
387 sub slice_reset_mail {
390 my $hog_mem = sprintf ("%0.f", $Slice{$hog}{pmem}/1000);
391 my $hog_pct = $Slice{$hog}{mem_pct};
392 my $hostname = hostname();
393 my $date = get_date();
394 send_mail("$alias_addr, $hog\@slices.planet-lab.org",
395 "$proc reset slice $hog on $hostname",
396 "As of $date, swap space is nearly exhausted on $hostname.\n\n".
397 "Slice $hog is being reset since it is the largest consumer ".
398 "of physical memory at ${hog_mem}MB ($hog_pct%).\n\n".
399 "Please reply to this message explaining the nature of your ".
400 "experiment, and what you are doing to address the problem.\n".
401 "\nOutput of 'top -b -n 1' in your slice prior to reset:\n".
402 "$top\n\n$date $hostname reset $hog\n");
405 sub slice_warning_mail {
407 my $hog_mem = sprintf ("%0.f", $Slice{$hog}{pmem}/1000);
408 my $hog_pct = $Slice{$hog}{mem_pct};
409 my $hostname = hostname();
410 my $date = get_date();
412 if ($hog =~ /^root$/) {
415 $to = "$alias_addr, $hog\@slices.planet-lab.org";
419 "$proc may reset slice $hog on $hostname",
420 "As of $date, swap space is over $log_thresh% full on ".
421 "$hostname.\n\nSlice $hog is the largest consumer ".
422 "of physical memory at ${hog_mem}MB ($hog_pct%).\n".
423 "Please check the memory usage of your slice to avoid a ".
424 "reset.\n\n$date $hostname warning $hog\n");
427 sub unkillable_alarm_mail {
429 my $hog_mem = sprintf ("%0.f", $Slice{$hog}{pmem}/1000);
430 my $hog_pct = $Slice{$hog}{mem_pct};
431 my $hostname = hostname();
432 my $date = get_date();
434 if ($hog =~ /^root$/) {
437 $to = "$alias_addr, $hog\@slices.planet-lab.org";
441 "$proc: alarm for slice $hog on $hostname",
442 "As of $date, swap space is over $log_thresh% full on ".
443 "$hostname.\n\nSlice $hog is the largest consumer ".
444 "of physical memory at ${hog_mem}MB ($hog_pct%).\n".
445 "The slice will not be reset, but please verify its behavior.\n".
446 "\n$date $hostname alarm $hog\n");
451 my $sliceid = $Slice{$slice}{ctx};
452 system("chcontext --ctx $sliceid sudo kill -9 -1");
453 system("/etc/init.d/vserver-init start $slice");
457 open (SWAP, "</proc/swaps") ||
458 die "Cannot open /proc/swaps; $!\n";
462 $line =~ s/[\t ]+/ /g;
464 my ($filename, $type, $size, $used, $priority) = split(/ /, $line);
467 return 100*($used/$size);
472 $content = `curl -s http://127.0.0.1:3100/slicestat`;
474 #$content = `cat ../pl_mom-deploy/slicestat`
475 $content = `curl -s http://127.0.0.1:3100/slicestat`;
477 my @lines = split(/\n/, $content);
479 foreach $line (@lines) {
480 my ($slice, $ctx, $cpu_pct, $mem_pct, $pmem, $vmem, $ntasks)
482 $Slice{$slice}{ctx} = $ctx;
483 $Slice{$slice}{cpu_pct} = $cpu_pct;
484 $Slice{$slice}{mem_pct} = $mem_pct;
485 $Slice{$slice}{pmem} = $pmem;
486 $Slice{$slice}{vmem} = $vmem;
487 $Slice{$slice}{ntasks} = $ntasks;
492 @keys = sort { $Slice{$b}{mem_pct} <=> $Slice{$a}{mem_pct} } (keys %Slice);
493 foreach $key (@keys) {
494 if ($Slice{$key}{mem_pct} >= $min_thresh) {
495 if ($key =~ /^root$/ || $key =~ /slicestat/ || $key =~ /netflow/) {
496 if (! defined ($Warning{$key})) {
497 $Warning{$key} = "sent";
498 syslog ("warning", "pl_mom: Sending alarm mail to ".
499 "unkillable slice $key, using ".
500 "$Slice{$key}{mem_pct}%% of memory");
501 unkillable_alarm_mail($key);
507 #syslog ("info", "pl_mom: No killable slice using > ".
508 # "$min_thresh%% memory");
515 chdir '/' or die "Can't chdir to /: $!";
516 open STDIN, '/dev/null' or die "Can't read /dev/null: $!";
517 open STDOUT, '>>/dev/null' or die "Can't write to /dev/null: $!";
518 open STDERR, '>>/dev/null' or die "Can't write to /dev/null: $!";
519 defined(my $pid = fork) or die "Can't fork: $!";
521 setsid or die "Can't start a new session: $!";