fixed version
[mom.git] / swapmon.py
index 37743b1..da8f3a0 100755 (executable)
@@ -10,7 +10,7 @@
 # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
 # Copyright (C) 2004-2006 The Trustees of Princeton University
 #
 # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
 # Copyright (C) 2004-2006 The Trustees of Princeton University
 #
-# $Id: swapmon.py,v 1.10 2006/08/16 16:18:45 faiyaza Exp $
+# $Id$
 #
 
 import syslog
 #
 
 import syslog
@@ -51,7 +51,7 @@ reset_thresh = 80
 reboot_thresh = 95
 
 # Time to wait before checking slice again after reset
 reboot_thresh = 95
 
 # Time to wait before checking slice again after reset
-reset_timeout = 15
+reset_timeout = 25
 
 # Number of strikes before killing (strike, strike, kill)
 kill_thresh = 2
 
 # Number of strikes before killing (strike, strike, kill)
 kill_thresh = 2
@@ -97,6 +97,8 @@ physical memory at %(rss)s (%(percent)4.1f%%) (%(sz)s writable).
 Please reply to this message explaining the nature of your experiment,
 and what you are doing to address the problem.
 
 Please reply to this message explaining the nature of your experiment,
 and what you are doing to address the problem.
 
+http://summer.cs.princeton.edu/status/tabulator.cgi?table=slices/table_%(slice)s
+
 %(slice)s processes prior to reset:
 
 %(table)s
 %(slice)s processes prior to reset:
 
 %(table)s
@@ -146,107 +148,104 @@ and what you are doing to address the problem.
 
 
 class Reset:
 
 
 class Reset:
-       """
-       Keeps track of state information for resets and kills
-
-       resettimeleft - timeout before checking for next reset
-       resetcount - number of strikes 
-       killtimeleft - time out before removing from kill queue
-       {kill,reset}mail - Time of last email
-       kill - State of kill.  If slice is already being killed, wait before retry.
-       """
-
-       def __init__(self,name):
-               self.name = name
-               self.resettimeleft = reset_timeout
-               self.resetcount = 0 
-               self.resetmail = 0
-               self.killtimeleft = kill_timeout
-               self.killmail = 0
-
-       def __repr__(self):
-               return self.name
-       
-       def update(self):
-               # Count down for next check of reset slice.
-                       if self.resettimeleft > 0:
-                        self.resettimeleft -= 1
-                        if debug and verbose:
-                                       print "%s has %s seconds in probation" \
-                                       %(self.name, self.killtimeleft)
-               if self.killtimeleft > 0:
-                       # Count down kill probation timer (killtimeleft)
-                       self.killtimeleft -= 1
-                       if self.killtimeleft == 1:
-                               print "%s is out of probation" % self.name
-               else:
-                       # Once out of probation period (killtimeleft), remove strikes
-                       self.resetcount = 0
-
-
-       # Check to see if a slice needs to be killed.  If it has been killed more 
-       # than kill_thresh in the probation period (kill_timeout) send an email, kill the slice.
-       def checkkill(self,params):
-               if self.killtimeleft > 0 and self.resetcount >= kill_thresh:
-                       if debug:
-                                print kill_subject % params
-                                print kill_body % params
-                       try:
-                               pid = os.fork()
-                               if pid == 0:
-                                       print "Slice %s is being killed." % self.name   
-                                               vserver = VServer(self.name)
-                                               vserver.stop()
-                                       os._exit(0)
-                               else:
-                                       os.waitpid(pid,0)
-                       except Exception, err:
-                                       print "Warning: Exception received while killing slice %s: %s" \
-                                       % self.name, err
-                       if (time.time() - self.killmail) > email_timeout:
-                               slicemail(self.name, kill_subject % params, kill_body % params)
-                               print "Sending KILL email for slice %s" % self.name
-                               self.killmail = time.time() 
-                       return True
-               return False 
-
-       # Reset slice after checking to see if slice is out of timeout.
-       # Increment resetcount, check to see if larger than kill_thresh.
-       def reset(self, params):
-               # If its the first reset (came back after kill)
-               # or if its been reset before
-               # and we are out of the reset timeout.
-               if self.resetcount == 0 or self.resettimeleft == 0:
-                       # Do we need to kill this slice?  Check history first.
-                       if self.checkkill(params):
-                               return
-                       # Update counters
-                       self.resetcount += 1
-                       self.killtimeleft = kill_timeout
-                       self.resettimeleft = reset_timeout
-                       print "%s has %s seconds to die and has been reset %s times" \
-                               %(self.name, self.resettimeleft, self.resetcount)
-                       if debug:
-                               print reset_subject % params
-                               print reset_body % params
-                       try:
-                               pid = os.fork()
-                               if pid == 0:
-                                               print "Resetting slice " + self.name 
-                                               vserver = VServer(self.name)
-                                               vserver.stop()
-                                               vserver.start(wait = False)
-                                               os._exit(0)
-                               else:
-                                       os.waitpid(pid,0)
-                       except Exception, err:
-                                       print "Warning: Exception received while resetting slice %s:" \
-                                       % self.name, err
-                       if (time.time() - self.resetmail) > email_timeout:
-                                       slicemail(self.name, reset_subject % params, reset_body % params)
-                               print "Sending Reset email for slice %s" % self.name
-                               self.resetmail = time.time() 
+    """
+    Keeps track of state information for resets and kills
+
+    resettimeleft - timeout before checking for next reset
+    resetcount - number of strikes 
+    killtimeleft - time out before removing from kill queue
+    {kill,reset}mail - Time of last email
+    kill - State of kill.  If slice is already being killed, wait before retry.
+    """
+
+    def __init__(self, name):
+        """
+        Start tracking the slice called name with no strikes against it.
+        """
+        # Slice this record belongs to (also what __repr__ returns).
+        self.name = name
+        # Strike counter and the countdown until the next reset is allowed.
+        self.resetcount = 0 
+        self.resettimeleft = reset_timeout
+        # Kill probation countdown.
+        self.killtimeleft = kill_timeout
+        # time.time() of the last reset/kill email; 0 until one is sent.
+        self.resetmail = 0
+        self.killmail = 0
 
 
+    def __repr__(self):
+        # A Reset object prints as the name of the slice it tracks.
+        return self.name
+    
+    def update(self):
+        """
+        Advance both countdown timers by one tick.
+
+        Once the kill probation timer (killtimeleft) has run out, the
+        accumulated strikes (resetcount) are cleared.
+        """
+        # Count down for next check of reset slice.
+        if self.resettimeleft > 0:
+            self.resettimeleft -= 1
+            # NOTE(review): the value reported here is killtimeleft, not the
+            # resettimeleft that was just decremented -- confirm intent.
+            if debug and verbose:
+                print "%s has %s seconds in probation" \
+                    %(self.name, self.killtimeleft)
+
+        if self.killtimeleft <= 0:
+            # Once out of probation period (killtimeleft), remove strikes
+            self.resetcount = 0
+            return
+
+        # Count down kill probation timer (killtimeleft)
+        self.killtimeleft -= 1
+        if self.killtimeleft == 1:
+            print "%s is out of probation" % self.name
+
+
+    # Check to see if a slice needs to be killed.  If it has been reset at least
+    # kill_thresh times within the probation period (kill_timeout), kill the slice and send an email.
+    def checkkill(self,params):
+        """
+        Stop the slice if it has used up its strikes while on probation.
+
+        params - dictionary substituted into the kill email templates.
+
+        Returns True if a kill was attempted (callers should not reset the
+        slice as well), False if the slice is not due to be killed.
+        """
+        if self.killtimeleft > 0 and self.resetcount >= kill_thresh:
+            if debug:
+                print kill_subject % params
+                print kill_body % params
+            try:
+                pid = os.fork()
+                if pid == 0:
+                    # Child.  It must never fall out of this branch: if it
+                    # did, it would carry on as a second copy of the monitor.
+                    # Exit unconditionally, reporting failure via the status.
+                    status = 1
+                    try:
+                        try:
+                            print "Slice %s is being killed." % self.name   
+                            vserver = VServer(self.name)
+                            vserver.stop()
+                            status = 0
+                        except Exception, err:
+                            print "Warning: Exception received while killing slice %s: %s" \
+                               % (self.name, err)
+                    finally:
+                        os._exit(status)
+                else:
+                    os.waitpid(pid,0)
+            except Exception, err:
+                # Parent only: fork() or waitpid() failed.
+                print "Warning: Exception received while killing slice %s: %s" \
+                   % (self.name, err)
+            # Rate-limit kill emails to one per email_timeout seconds.
+            if (time.time() - self.killmail) > email_timeout:
+                slicemail(self.name, kill_subject % params, kill_body % params)
+                print "Sending KILL email for slice %s" % self.name
+                self.killmail = time.time() 
+            return True
+        return False 
+
+    # Reset slice after checking to see if slice is out of timeout.
+    # Increment resetcount, check to see if larger than kill_thresh.
+    def reset(self, params):
+        """
+        Restart the slice if it is due for a reset, escalating to a kill
+        (checkkill) once it has used up its strikes.
+
+        params - dictionary substituted into the reset/kill email templates.
+        """
+        # If its the first reset (came back after kill)
+        # or if its been reset before
+        # and we are out of the reset timeout.
+        if self.resetcount == 0 or self.resettimeleft == 0:
+            # Do we need to kill this slice?  Check history first.
+            if self.checkkill(params):  return
+            # Update counters
+            self.resetcount += 1
+            self.killtimeleft = kill_timeout
+            self.resettimeleft = reset_timeout
+            print "%s has %s seconds to die and has been reset %s times" \
+                %(self.name, self.resettimeleft, self.resetcount)
+            if debug:
+                print reset_subject % params
+                print reset_body % params
+            # The restart itself is NOT conditional on debug: it sits at the
+            # same level as the "if debug" above, as in checkkill.
+            try:
+                pid = os.fork()
+                if pid == 0:  
+                    # Child.  It must never fall out of this branch: if it
+                    # did, it would carry on as a second copy of the monitor.
+                    # Exit unconditionally, reporting failure via the status.
+                    status = 1
+                    try:
+                        try:
+                            print "Resetting slice " + self.name 
+                            vserver = VServer(self.name)
+                            vserver.stop()
+                            vserver.start(wait = False)
+                            status = 0
+                        except Exception, err:
+                            print "Warning: Exception received while resetting slice %s:" \
+                                % self.name, err
+                    finally:
+                        os._exit(status)
+                else:
+                    os.waitpid(pid,0)
+            except Exception, err:
+                # Parent only: fork() or waitpid() failed.
+                print "Warning: Exception received while resetting slice %s:" \
+                    % self.name, err
+            # Rate-limit reset emails to one per email_timeout seconds.
+            if (time.time() - self.resetmail) > email_timeout:
+                slicemail(self.name, reset_subject % params, reset_body % params)
+                print "Sending Reset email for slice %s" % self.name
+                self.resetmail = time.time() 
 
 def usage():
     print """
 
 def usage():
     print """
@@ -272,13 +271,13 @@ def slicestat(names = None):
     slices). vsize, sz, and rss are in KiB. Returns
 
     {xid: {'xid': slice_id,
     slices). vsize, sz, and rss are in KiB. Returns
 
     {xid: {'xid': slice_id,
-           'name': slice_name,
-           'procs': [{'pid': pid, 'xid': slice_id, 'user', username, 'cmd': command,
+            'name': slice_name,
+            'procs': [{'pid': pid, 'xid': slice_id, 'user', username, 'cmd': command,
                       'vsize': virtual_kib, 'sz': potential_kib, 'rss': physical_kib,
                       'pcpu': cpu_percent, 'pmem': mem_percent}]
                       'vsize': virtual_kib, 'sz': potential_kib, 'rss': physical_kib,
                       'pcpu': cpu_percent, 'pmem': mem_percent}]
-           'vsize': total_virtual_kib,
-          'sz': total_potential_kib,
-           'rss': total_physical_kib}}
+            'vsize': total_virtual_kib,
+            'sz': total_potential_kib,
+            'rss': total_physical_kib}}
     """
     
     # Mandatory fields. xid is a virtual field inserted by vps. Make
     """
     
     # Mandatory fields. xid is a virtual field inserted by vps. Make
@@ -362,7 +361,7 @@ def slicestat(names = None):
 
         slice['procs'].append(proc)
         slice['vsize'] += proc['vsize']
 
         slice['procs'].append(proc)
         slice['vsize'] += proc['vsize']
-       slice['sz'] += proc['sz']
+        slice['sz'] += proc['sz']
         slice['rss'] += proc['rss']
 
         slices[proc['xid']] = slice
         slice['rss'] += proc['rss']
 
         slices[proc['xid']] = slice
@@ -373,32 +372,27 @@ def memtotal():
     """
     Returns total physical and swap memory on the system in KiB.
     """
     """
     Returns total physical and swap memory on the system in KiB.
     """
-
     mem = 0
     swap = 0
     mem = 0
     swap = 0
-
     meminfo = open("/proc/meminfo", "r")
     for line in meminfo.readlines():
     meminfo = open("/proc/meminfo", "r")
     for line in meminfo.readlines():
-       try:
-           (name, value, kb) = line.split()
-       except:
-           continue
-       if name == "MemTotal:": 
-           mem = int(value)
-       elif name == "SwapTotal:":
-           swap = int(value)
+        try:
+            (name, value, kb) = line.split()
+        except:
+            continue
+        if name == "MemTotal:": 
+            mem = int(value)
+        elif name == "SwapTotal:":
+            swap = int(value)
     meminfo.close()
     meminfo.close()
-
     return (mem, swap)
 
 def swap_used():
     """
     Returns swap utilization on the system as a whole percentage (0-100).
     """
     return (mem, swap)
 
 def swap_used():
     """
     Returns swap utilization on the system as a whole percentage (0-100).
     """
-
     total_swap = 0
     total_used = 0
     total_swap = 0
     total_used = 0
-
     try:
         swaps = open("/proc/swaps", "r")
         # Eat header line
     try:
         swaps = open("/proc/swaps", "r")
         # Eat header line
@@ -412,21 +406,21 @@ def swap_used():
                 total_used += int(used)
             except ValueEror, err:
                 pass
                 total_used += int(used)
             except ValueEror, err:
                 pass
-    except (IOError, KeyError), err:
-        pass
+    except (IOError, KeyError), err:  pass
 
 
-    return 100 * total_used / total_swap
+    swapused = 100 * total_used / total_swap
+    if debug: print "%s percent swap used" % swapused
+    return swapused
 
 def summary(slices = None, total_mem = None, total_swap = None):
     """
     Return a summary of memory usage by slice.
     """
 
 def summary(slices = None, total_mem = None, total_swap = None):
     """
     Return a summary of memory usage by slice.
     """
-    if not slices:
-       slices = slicestat()
+    if not slices:  slices = slicestat()
     slicelist = slices.values()
     slicelist.sort(lambda a, b: b['sz'] - a['sz'])
     if total_mem is None or total_swap is None:
     slicelist = slices.values()
     slicelist.sort(lambda a, b: b['sz'] - a['sz'])
     if total_mem is None or total_swap is None:
-       (total_mem, total_swap) = memtotal()
+        (total_mem, total_swap) = memtotal()
 
     table = "%-20s%10s%24s%24s\n\n" % ("Slice", "Processes", "Memory Usage", "Potential Usage")
     for slice in slicelist:
 
     table = "%-20s%10s%24s%24s\n\n" % ("Slice", "Processes", "Memory Usage", "Potential Usage")
     for slice in slicelist:
@@ -436,10 +430,79 @@ def summary(slices = None, total_mem = None, total_swap = None):
                   100. * slice['rss'] / total_mem,
                   format_bytes(slice['sz'] * 1024, si = False),
                   100. * slice['sz'] / (total_mem + total_swap))
                   100. * slice['rss'] / total_mem,
                   format_bytes(slice['sz'] * 1024, si = False),
                   100. * slice['sz'] / (total_mem + total_swap))
-                 
-
     return table
 
     return table
 
+def formtable(slice, percent):
+    '''
+    Build the substitution dictionary for the email templates.
+
+    slice   - one slicestat() entry ('name', 'procs', 'rss', 'sz'; sizes in KiB)
+    percent - value stored unchanged under the 'percent' key
+
+    The 'table' entry is a ps-like listing of the slice's processes with
+    human readable sizes, in the order the processes appear in slice['procs'].
+    '''
+    def human(kib):
+        # Sizes arrive in KiB; format_bytes wants bytes.
+        return format_bytes(kib * 1024, si = False)
+
+    header = "%5s %10s %10s %10s %4s %4s %s\n\n" % \
+        ("PID", "VIRT", "SZ", "RES", '%CPU', '%MEM', 'COMMAND')
+    lines = [header]
+    for proc in slice['procs']:
+        row = (proc['pid'],
+               human(proc['vsize']),
+               human(proc['sz']),
+               human(proc['rss']),
+               proc['pcpu'],
+               proc['pmem'],
+               proc['cmd'])
+        lines.append("%5s %10s %10s %10s %4.1f %4.1f %s\n" % row)
+    table = "".join(lines)
+
+    return {'hostname': socket.gethostname(),
+            'date': time.asctime(time.gmtime()) + " GMT",
+            'table': table,
+            'slice': slice['name'],
+            'rss': human(slice['rss']),
+            'sz': human(slice['sz']),
+            'percent': percent}
+
+def readdat():
+    '''
+    Return the dictionary of slices (as produced by slicestat) to start
+    monitoring with.
+
+    If a data file was left behind before an OOM reboot, load it, send the
+    "rebooted" notice built from its contents (pl_mom@pl), delete the file
+    and return the saved slices.  If there is no usable data file, just
+    grab the latest dict from slicestat() and return that.
+
+    Also sets the module-level `version` tag that data files are stamped
+    with.
+    '''
+    global version
+    version = "$Id$"
+    try:
+        f = open(datafile, "r+")
+        try:
+            if verbose:
+                print "Loading %s" % datafile
+            # NOTE(review): pickle will execute whatever the file contains;
+            # this is only acceptable if datafile is writable by nobody but
+            # this daemon -- confirm its location and permissions.
+            (saved_version, slices) = pickle.load(f)
+        finally:
+            # Close the file even when the pickle is corrupt.
+            f.close()
+        # Check version of data file
+        if saved_version != version:
+            print "Not using old version '%s' data file %s" % (saved_version, datafile)
+            raise Exception("version mismatch")
+
+        # The totals are locals of main(), not globals, so look them up
+        # here; referring to main()'s names raised a NameError that the
+        # handler below silently swallowed.
+        (total_mem, total_swap) = memtotal()
+        params = {'hostname': socket.gethostname(),
+                  'date': time.asctime(time.gmtime()) + " GMT",
+                  'table': summary(slices, total_mem, total_swap)}
+        if debug:
+            print rebooted_subject % params
+            print rebooted_body % params
+        else:
+            slicemail(None, rebooted_subject % params, rebooted_body % params)
+
+        # Delete data file
+        os.unlink(datafile)
+    except Exception, err:
+        # A missing file is the normal case, so only mention it on request.
+        if verbose:
+            print "Not using data file %s: %s" % (datafile, err)
+        slices = slicestat()
+
+    return slices
+
+
+def writedat(slices):
+    """
+    Write (version, slices) to the pickled datfile so the slice state
+    survives the reboot and can be picked up by readdat().
+    """
+    if verbose:  print "Saving %s" % datafile
+    # The version tag is not guaranteed to exist as a module global (it was
+    # a local of main() before this code was split out), so fall back to
+    # the tag readdat() checks for instead of dying with a NameError right
+    # before the reboot.
+    stamp = globals().get("version", "$Id$")
+    f = open(datafile, "w")
+    try:
+        pickle.dump((stamp, slices), f)
+    finally:
+        # Make sure the data reaches the file even if pickling fails midway.
+        f.close()
+
+
 def main():
     # Defaults
     global debug, verbose, datafile
 def main():
     # Defaults
     global debug, verbose, datafile
@@ -497,33 +560,7 @@ def main():
 
     # Get total memory
     (total_mem, total_swap) = memtotal()
 
     # Get total memory
     (total_mem, total_swap) = memtotal()
-
-    try:
-        f = open(datafile, "r+")
-        if verbose:
-            print "Loading %s" % datafile
-        (version, slices) = pickle.load(f)
-        f.close()
-        # Check version of data file
-        if version != "$Id: swapmon.py,v 1.10 2006/08/16 16:18:45 faiyaza Exp $":
-            print "Not using old version '%s' data file %s" % (version, datafile)
-            raise Exception
-
-        params = {'hostname': socket.gethostname(),
-                  'date': time.asctime(time.gmtime()) + " GMT",
-                  'table': summary(slices, total_mem, total_swap)}
-
-        if debug:
-            print rebooted_subject % params
-            print rebooted_body % params
-        else:
-            slicemail(None, rebooted_subject % params, rebooted_body % params)
-
-        # Delete data file
-        os.unlink(datafile)
-    except Exception:
-        version = "$Id: swapmon.py,v 1.10 2006/08/16 16:18:45 faiyaza Exp $"
-        slices = {}
+    slices = readdat()
 
     # Query process table every 30 seconds, or when a large change in
     # swap utilization is detected.
 
     # Query process table every 30 seconds, or when a large change in
     # swap utilization is detected.
@@ -539,98 +576,72 @@ def main():
 
     while True:
         used = swap_used()
 
     while True:
         used = swap_used()
-
-       for resetslice in resetlist.keys():
-           resetlist[resetslice].update()
-       
-        if last_used is None:
-            last_used = used
-
-       if verbose:
-            print "%d%% swap consumed" % used
-
+        if last_used is None:  last_used = used
+        
+        # If we've reset you recently, update timers.
+        for resetslice in resetlist.keys(): 
+            resetlist[resetslice].update()
+            # If you've been good, remove you from our list.
+            if resetlist[resetslice].killtimeleft == 0 and \
+            resetlist[resetslice].resettimeleft == 0:
+                del resetlist[resetslice]
+
+        if verbose:  print "%d%% swap consumed" % used
+    
         if used >= reboot_thresh:
             # Dump slice state before rebooting
         if used >= reboot_thresh:
             # Dump slice state before rebooting
-            if verbose:
-                print "Saving %s" % datafile
-            f = open(datafile, "w")
-            pickle.dump((version, slices), f)
-            f.close()
-
+            writedat(slices)    
             # Goodbye, cruel world
             print "%d%% swap consumed, rebooting" % used
             # Goodbye, cruel world
             print "%d%% swap consumed, rebooting" % used
-            if not debug:
-                bwlimit.run("/bin/sync; /sbin/reboot -f")
-
+            if not debug:  bwlimit.run("/bin/sync; /sbin/reboot -f")
         elif used >= reset_thresh:
         elif used >= reset_thresh:
-           if debug:
-               print "Memory used = %s" %(used)
             # Try and find a hog
             slicelist = slices.values()
             # Try and find a hog
             slicelist = slices.values()
+            # Puts largest on top.
             slicelist.sort(lambda a, b: b['rss'] - a['rss'])
             for slice in slicelist:
                 percent = 100. * slice['rss'] / total_mem
             slicelist.sort(lambda a, b: b['rss'] - a['rss'])
             for slice in slicelist:
                 percent = 100. * slice['rss'] / total_mem
-
-                if slice['rss'] < rss_min:
-                    continue
-               
+                if slice['rss'] < rss_min: continue
                 print "%d%% swap consumed, slice %s is using %s (%d%%) of memory" % \
                 print "%d%% swap consumed, slice %s is using %s (%d%%) of memory" % \
-                      (used,
-                       slice['name'],
-                       format_bytes(slice['rss'] * 1024, si = False),
-                       percent)
-
+                    (used,
+                    slice['name'],
+                    format_bytes(slice['rss'] * 1024, si = False),
+                    percent)
                 slice['procs'].sort(lambda a, b: b['rss'] - a['rss'])
                 slice['procs'].sort(lambda a, b: b['rss'] - a['rss'])
-
-                table = "%5s %10s %10s %10s %4s %4s %s\n\n" % ("PID", "VIRT", "SZ", "RES", '%CPU', '%MEM', 'COMMAND')
-                for proc in slice['procs']:
-                    table += "%5s %10s %10s %10s %4.1f %4.1f %s\n" % \
-                             (proc['pid'],
-                              format_bytes(proc['vsize'] * 1024, si = False),
-                              format_bytes(proc['sz'] * 1024, si = False),
-                              format_bytes(proc['rss'] * 1024, si = False),
-                              proc['pcpu'], proc['pmem'], proc['cmd'])
-
-                params = {'hostname': socket.gethostname(),
-                          'date': time.asctime(time.gmtime()) + " GMT",
-                          'table': table,
-                          'slice': slice['name'],
-                          'rss': format_bytes(slice['rss'] * 1024, si = False),
-                         'sz': format_bytes(slice['sz'] * 1024, si = False),
-                          'percent': percent}
-
+                # Make a pretty table.
+                params = formtable(slice, percent)
                 # Match slice name against system slice patterns
                 # Match slice name against system slice patterns
-                is_system_slice = filter(None, [re.match(pattern, slice['name']) for pattern in system_slices])
-
+                is_system_slice = filter(None, 
+                    [re.match(pattern, slice['name']) for pattern in system_slices])
+    
+                # Do not reset system slices, just warn once
                 if is_system_slice: 
                 if is_system_slice: 
-                   # Do not reset system slices, just warn once
-                   if slice['name'] not in warned:
-                       warned.append(slice['name'])
-                       if debug:
-                           print alarm_subject % params
-                           print alarm_body % params
-                       else:
-                           print "Warning slice " + slice['name']
-                           slicemail(slice['name'], alarm_subject % params, 
-                                     alarm_body % params)
+                    if slice['name'] not in warned:
+                        warned.append(slice['name'])
+                        print "Warning slice " + slice['name']
+                        if debug:
+                            print alarm_subject % params
+                            print alarm_body % params
+                        else:
+                            slicemail(slice['name'], alarm_subject % params, 
+                              alarm_body % params)
                 else:
                 else:
-                   # Reset slice
-                   if not resetlist.has_key(slice['name']):
-                       resetlist[slice['name']] = Reset(slice['name'])
-                   resetlist[slice['name']].reset(params)
-                   slices = slicestat(names)
+                    # Reset slice
+                    if not resetlist.has_key(slice['name']):
+                        resetlist[slice['name']] = Reset(slice['name'])
+                    resetlist[slice['name']].reset(params)
 
 
+        # Wait period before recalculating swap.  If in danger, recalc.
         if timer <= 0 or used >= (last_used + change_thresh):
             if used >= (last_used + change_thresh):
                 print "%d%% swap consumed, %d%% in last %d seconds" % \
         if timer <= 0 or used >= (last_used + change_thresh):
             if used >= (last_used + change_thresh):
                 print "%d%% swap consumed, %d%% in last %d seconds" % \
-                      (used, used - last_used, period - timer)
+                    (used, used - last_used, period - timer)
             # Get slice state
             slices = slicestat(names)
             # Reset timer
             timer = period
             # Keep track of large changes in swap utilization
             last_used = used
             # Get slice state
             slices = slicestat(names)
             # Reset timer
             timer = period
             # Keep track of large changes in swap utilization
             last_used = used
-
         timer -= 1
         time.sleep(1)
 
         timer -= 1
         time.sleep(1)