10 $alias_addr = "pl-mom\@planet-lab.org";
11 $from_addr = "support\@planet-lab.org";
20 $sendmail = "/usr/sbin/sendmail -t -f$from_addr";
21 $pidfile = "/var/run/$proc.pid";
22 $rebootfile = "/var/lib/misc/pl_mom.reboot";
31 $pidfile = "./$proc.pid";
32 $rebootfile = "./pl_mom.reboot";
37 # daemonize the program
42 system("echo $$ > $pidfile");
46 syslog ("warning", "pl_mom: Sending shutdown mail");
52 syslog ("info", "pl_mom: Launching reboot kicker");
58 $used = int(swap_used());
60 if (defined($old_used)) {
61 if ($used >= $old_used + $change_thresh) {
62 syslog ("info", "pl_mom: %d%% swap consumed in last %d seconds",
63 $used - $old_used, $sleep);
67 if ($used >= $log_thresh) {
68 if (! defined($old_used) || $used != $old_used) {
69 syslog ("info", "pl_mom: Swap used: %d%%", $used);
72 my $hog = memory_hog();
74 if ($used < $kill_thresh) {
75 if (! defined($Warning{$hog})) {
76 $Warning{$hog} = "sent";
77 syslog ("warning", "pl_mom: Slice $hog is ".
78 "using $Slice{$hog}{mem_pct}%% of memory");
79 #slice_warning_mail($hog);
82 my $id = `id -u $hog`;
84 my $top = `chcontext --ctx $id top -b -n 1`;
85 syslog ("warning", "pl_mom: Resetting slice $hog");
89 syslog ("warning", "pl_mom: Sending mail to slice $hog");
90 slice_reset_mail($hog, $top);
104 if ($used >= $reboot_thresh) {
105 syslog ("warning", "pl_mom: Rebooting node");
107 system("touch $rebootfile");
109 system("shutdown -r now");
119 my $hostname = hostname();
120 my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdat)
122 my $date = sprintf("%4d-%02d-%02d %02d:%02d:%02d",
123 $year+1900, $mon+1, $mday, $hour, $min, $sec);
124 my $to = "To: $alias_addr\n";
125 my $from = "From: $from_addr\n";
126 my $subject = "Subject: $proc rebooted $hostname\n";
127 my $msg ="Swap space was exhausted on $hostname and so $proc rebooted ".
128 "it.\n\nAs of $date, the node has successfully come back online.\n".
129 "\n$date $hostname reboot\n";
131 open(SENDMAIL, "|$sendmail") or die "Cannot open $sendmail: $!";
133 print SENDMAIL $from;
134 print SENDMAIL $subject;
135 print SENDMAIL "Content-type: text/plain\n\n";
140 sub slice_reset_mail {
143 my $hog_mem = sprintf ("%0.f", $Slice{$hog}{pmem}/1000);
144 my $hog_pct = $Slice{$hog}{mem_pct};
145 my $hostname = hostname();
146 my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdat)
148 my $date = sprintf("%4d-%02d-%02d %02d:%02d:%02d",
149 $year+1900, $mon+1, $mday, $hour, $min, $sec);
150 my $to = "To: $alias_addr, $hog\@slices.planet-lab.org\n";
151 my $from = "From: $from_addr\n";
152 my $subject = "Subject: $proc reset slice $hog on $hostname\n";
153 my $msg = "As of $date, swap space is nearly exhausted on $hostname.\n\n".
154 "Slice $hog is being reset since it is the largest consumer ".
155 "of physical memory at ${hog_mem}MB ($hog_pct%).\n\n".
156 "Please reply to this message explaining the nature of your ".
157 "experiment, and what you are doing to address the problem.\n".
158 "\nOutput of 'top -b -n 1' in your slice prior to reset:\n".
159 "$top\n\n$date $hostname reset $hog\n";
161 open(SENDMAIL, "|$sendmail") or die "Cannot open $sendmail: $!";
163 print SENDMAIL $from;
164 print SENDMAIL $subject;
165 print SENDMAIL "Content-type: text/plain\n\n";
170 sub slice_warning_mail {
172 my $hog_mem = sprintf ("%0.f", $Slice{$hog}{pmem}/1000);
173 my $hog_pct = $Slice{$hog}{mem_pct};
174 my $hostname = hostname();
175 my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdat)
177 my $date = sprintf("%4d-%02d-%02d %02d:%02d:%02d",
178 $year+1900, $mon+1, $mday, $hour, $min, $sec);
180 if ($hog =~ /^root$/) {
181 $to = "To: $alias_addr\n";
183 $to = "To: $alias_addr, $hog\@slices.planet-lab.org\n";
186 my $from = "From: $from_addr\n";
187 my $subject = "Subject: $proc may reset slice $hog on $hostname\n";
188 my $msg = "As of $date, swap space is over $log_thresh% full on ".
189 "$hostname.\n\nSlice $hog is the largest consumer ".
190 "of physical memory at ${hog_mem}MB ($hog_pct%).\n".
191 "Please check the memory usage of your slice to avoid a reset.\n".
192 "\n$date $hostname warning $hog\n";
194 open(SENDMAIL, "|$sendmail") or die "Cannot open $sendmail: $!";
196 print SENDMAIL $from;
197 print SENDMAIL $subject;
198 print SENDMAIL "Content-type: text/plain\n\n";
203 sub unkillable_alarm_mail {
205 my $hog_mem = sprintf ("%0.f", $Slice{$hog}{pmem}/1000);
206 my $hog_pct = $Slice{$hog}{mem_pct};
207 my $hostname = hostname();
208 my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdat)
210 my $date = sprintf("%4d-%02d-%02d %02d:%02d:%02d",
211 $year+1900, $mon+1, $mday, $hour, $min, $sec);
213 if ($hog =~ /^root$/) {
214 $to = "To: $alias_addr\n";
216 $to = "To: $alias_addr, $hog\@slices.planet-lab.org\n";
219 my $from = "From: $from_addr\n";
220 my $subject = "Subject: $proc: alarm for slice $hog on $hostname\n";
221 my $msg = "As of $date, swap space is over $log_thresh% full on ".
222 "$hostname.\n\nSlice $hog is the largest consumer ".
223 "of physical memory at ${hog_mem}MB ($hog_pct%).\n".
224 "The slice will not be reset, but please verify its behavior.\n".
225 "\n$date $hostname alarm $hog\n";
227 open(SENDMAIL, "|$sendmail") or die "Cannot open $sendmail: $!";
229 print SENDMAIL $from;
230 print SENDMAIL $subject;
231 print SENDMAIL "Content-type: text/plain\n\n";
238 my $sliceid = $Slice{$slice}{ctx};
239 system("chcontext --ctx $sliceid sudo kill -9 -1");
240 system("/etc/init.d/vserver-init start $slice");
244 open (SWAP, "</proc/swaps") ||
245 die "Cannot open /proc/swaps; $!\n";
249 $line =~ s/[\t ]+/ /g;
251 my ($filename, $type, $size, $used, $priority) = split(/ /, $line);
254 return 100*($used/$size);
259 #$content = get "http://127.0.0.1:3100/slicestat";
260 $content = `curl -s http://127.0.0.1:3100/slicestat`;
262 $content = `cat ../pl_mom-deploy/slicestat`
264 my @lines = split(/\n/, $content);
266 foreach $line (@lines) {
267 my ($slice, $ctx, $cpu_pct, $mem_pct, $pmem, $vmem, $ntasks)
269 $Slice{$slice}{ctx} = $ctx;
270 $Slice{$slice}{cpu_pct} = $cpu_pct;
271 $Slice{$slice}{mem_pct} = $mem_pct;
272 $Slice{$slice}{pmem} = $pmem;
273 $Slice{$slice}{vmem} = $vmem;
274 $Slice{$slice}{ntasks} = $ntasks;
279 @keys = sort { $Slice{$b}{mem_pct} <=> $Slice{$a}{mem_pct} } (keys %Slice);
280 foreach $key (@keys) {
281 if ($Slice{$key}{mem_pct} >= $min_thresh) {
282 if ($key =~ /^root$/ || $key =~ /slicestat/ || $key =~ /netflow/) {
283 if (! defined ($Warning{$key})) {
284 $Warning{$key} = "sent";
285 syslog ("warning", "pl_mom: Sending alarm mail to ".
286 "unkillable slice $key, using ".
287 "$Slice{$key}{mem_pct}%% of memory");
288 unkillable_alarm_mail($key);
294 #syslog ("info", "pl_mom: No killable slice using > ".
295 # "$min_thresh%% memory");
302 chdir '/' or die "Can't chdir to /: $!";
303 open STDIN, '/dev/null' or die "Can't read /dev/null: $!";
304 open STDOUT, '>>/dev/null' or die "Can't write to /dev/null: $!";
305 open STDERR, '>>/dev/null' or die "Can't write to /dev/null: $!";
306 defined(my $pid = fork) or die "Can't fork: $!";
308 setsid or die "Can't start a new session: $!";