19 $sendmail = "/usr/sbin/sendmail -t";
20 $pidfile = "/var/run/$proc.pid";
21 $rebootfile = "/var/lib/misc/pl_mom.reboot";
30 $pidfile = "./$proc.pid";
31 $rebootfile = "./pl_mom.reboot";
36 # daemonize the program
41 system("echo $$ > $pidfile");
45 syslog ("warning", "pl_mom: Sending shutdown mail");
51 syslog ("info", "pl_mom: Launching reboot kicker");
57 $used = int(swap_used());
59 if (defined($old_used)) {
60 if ($used >= $old_used + $change_thresh) {
61 syslog ("info", "pl_mom: %d%% swap consumed in last %d seconds",
62 $used - $old_used, $sleep);
66 if ($used >= $log_thresh) {
67 if (! defined($old_used) || $used != $old_used) {
68 syslog ("info", "pl_mom: Swap used: %d%%", $used);
71 my $hog = memory_hog();
73 if ($used < $kill_thresh) {
74 if (! defined($Warning{$hog})) {
75 $Warning{$hog} = "sent";
76 syslog ("warning", "pl_mom: Slice $hog is ".
77 "using $Slice{$hog}{mem_pct}%% of memory");
78 #slice_warning_mail($hog);
81 syslog ("warning", "pl_mom: Resetting slice $hog");
85 syslog ("warning", "pl_mom: Sending mail to slice $hog");
86 slice_reset_mail($hog);
100 if ($used >= $reboot_thresh) {
101 syslog ("warning", "pl_mom: Rebooting node");
103 system("touch $rebootfile");
105 system("shutdown -r now");
115 my $hostname = hostname();
116 my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdat)
118 my $date = sprintf("%4d-%02d-%02d %02d:%02d:%02d",
119 $year+1900, $mon+1, $mday, $hour, $min, $sec);
120 my $to = "To: $alias\@planet-lab.org\n";
121 my $from = "From: support\@planet-lab.org\n";
122 my $subject = "Subject: $proc rebooted $hostname\n";
123 my $msg ="Swap space was exhausted on $hostname and so $proc rebooted ".
124 "it.\n\nAs of $date, the node has successfully come back online.\n".
125 "\n$date $hostname reboot\n";
127 open(SENDMAIL, "|$sendmail") or die "Cannot open $sendmail: $!";
129 print SENDMAIL $from;
130 print SENDMAIL $subject;
131 print SENDMAIL "Content-type: text/plain\n\n";
136 sub slice_reset_mail {
138 my $hog_mem = sprintf ("%0.f", $Slice{$hog}{pmem}/1000);
139 my $hog_pct = $Slice{$hog}{mem_pct};
140 my $hostname = hostname();
141 my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdat)
143 my $date = sprintf("%4d-%02d-%02d %02d:%02d:%02d",
144 $year+1900, $mon+1, $mday, $hour, $min, $sec);
145 my $to = "To: $alias\@planet-lab.org, $hog\@slices.planet-lab.org\n";
146 my $from = "From: support\@planet-lab.org\n";
147 my $subject = "Subject: $proc resetting slice $hog on $hostname\n";
148 my $msg = "As of $date, swap space is nearly exhausted on $hostname.\n\n".
149 "Slice $hog is being reset since it is the largest consumer ".
150 "of physical memory at ${hog_mem}MB ($hog_pct%).\n\n".
151 "Please reply to this message explaining the nature of your ".
152 "experiment, and what you are doing to address the problem.\n".
153 "\n$date $hostname reset $hog\n";
155 open(SENDMAIL, "|$sendmail") or die "Cannot open $sendmail: $!";
157 print SENDMAIL $from;
158 print SENDMAIL $subject;
159 print SENDMAIL "Content-type: text/plain\n\n";
164 sub slice_warning_mail {
166 my $hog_mem = sprintf ("%0.f", $Slice{$hog}{pmem}/1000);
167 my $hog_pct = $Slice{$hog}{mem_pct};
168 my $hostname = hostname();
169 my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdat)
171 my $date = sprintf("%4d-%02d-%02d %02d:%02d:%02d",
172 $year+1900, $mon+1, $mday, $hour, $min, $sec);
174 if ($hog =~ /^root$/) {
175 $to = "To: $alias\@planet-lab.org\n";
177 $to = "To: $alias\@planet-lab.org, $hog\@slices.planet-lab.org\n";
180 my $from = "From: support\@planet-lab.org\n";
181 my $subject = "Subject: $proc may reset slice $hog on $hostname\n";
182 my $msg = "As of $date, swap space is over $log_thresh% full on ".
183 "$hostname.\n\nSlice $hog is the largest consumer ".
184 "of physical memory at ${hog_mem}MB ($hog_pct%).\n".
185 "Please check the memory usage of your slice to avoid a reset.\n".
186 "\n$date $hostname warning $hog\n";
188 open(SENDMAIL, "|$sendmail") or die "Cannot open $sendmail: $!";
190 print SENDMAIL $from;
191 print SENDMAIL $subject;
192 print SENDMAIL "Content-type: text/plain\n\n";
197 sub unkillable_alarm_mail {
199 my $hog_mem = sprintf ("%0.f", $Slice{$hog}{pmem}/1000);
200 my $hog_pct = $Slice{$hog}{mem_pct};
201 my $hostname = hostname();
202 my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdat)
204 my $date = sprintf("%4d-%02d-%02d %02d:%02d:%02d",
205 $year+1900, $mon+1, $mday, $hour, $min, $sec);
207 if ($hog =~ /^root$/) {
208 $to = "To: $alias\@planet-lab.org\n";
210 $to = "To: $alias\@planet-lab.org, $hog\@slices.planet-lab.org\n";
213 my $from = "From: support\@planet-lab.org\n";
214 my $subject = "Subject: $proc: alarm for slice $hog on $hostname\n";
215 my $msg = "As of $date, swap space is over $log_thresh% full on ".
216 "$hostname.\n\nSlice $hog is the largest consumer ".
217 "of physical memory at ${hog_mem}MB ($hog_pct%).\n".
218 "The slice will not be reset, but please verify its behavior.\n".
219 "\n$date $hostname alarm $hog\n";
221 open(SENDMAIL, "|$sendmail") or die "Cannot open $sendmail: $!";
223 print SENDMAIL $from;
224 print SENDMAIL $subject;
225 print SENDMAIL "Content-type: text/plain\n\n";
232 my $sliceid = $Slice{$slice}{ctx};
233 system("chcontext --ctx $sliceid sudo kill -9 -1");
234 system("/etc/init.d/vserver-init start $slice");
238 open (SWAP, "</proc/swaps") ||
239 die "Cannot open /proc/swaps; $!\n";
243 $line =~ s/[\t ]+/ /g;
245 my ($filename, $type, $size, $used, $priority) = split(/ /, $line);
248 return 100*($used/$size);
253 $content = get "http://127.0.0.1:3100/slicestat";
255 $content = `cat ../pl_mom-deploy/slicestat`
257 my @lines = split(/\n/, $content);
259 foreach $line (@lines) {
260 my ($slice, $ctx, $cpu_pct, $mem_pct, $pmem, $vmem, $ntasks)
262 $Slice{$slice}{ctx} = $ctx;
263 $Slice{$slice}{cpu_pct} = $cpu_pct;
264 $Slice{$slice}{mem_pct} = $mem_pct;
265 $Slice{$slice}{pmem} = $pmem;
266 $Slice{$slice}{vmem} = $vmem;
267 $Slice{$slice}{ntasks} = $ntasks;
272 @keys = sort { $Slice{$b}{mem_pct} <=> $Slice{$a}{mem_pct} } (keys %Slice);
273 foreach $key (@keys) {
274 if ($Slice{$key}{mem_pct} >= $min_thresh) {
275 if ($key =~ /^root$/ || $key =~ /slicestat/ || $key =~ /netflow/) {
276 if (! defined ($Warning{$key})) {
277 $Warning{$key} = "sent";
278 syslog ("warning", "pl_mom: Sending alarm mail to ".
279 "unkillable slice $key, using ".
280 "$Slice{$key}{mem_pct}%% of memory");
281 unkillable_alarm_mail($key);
287 #syslog ("info", "pl_mom: No killable slice using > ".
288 # "$min_thresh%% memory");
295 chdir '/' or die "Can't chdir to /: $!";
296 open STDIN, '/dev/null' or die "Can't read /dev/null: $!";
297 open STDOUT, '>>/dev/null' or die "Can't write to /dev/null: $!";
298 open STDERR, '>>/dev/null' or die "Can't write to /dev/null: $!";
299 defined(my $pid = fork) or die "Can't fork: $!";
301 setsid or die "Can't start a new session: $!";