#!/usr/bin/python
#
# Swap monitoring daemon. Every 30 seconds, checks process memory
# usage. At 90% utilization, resets the slice that is consuming the
# most physical memory. At 95% utilization, reboots the machine to
# avoid a crash.
#
# Mark Huang <mlhuang@cs.princeton.edu>
# Andy Bavier <acb@cs.princeton.edu>
# Faiyaz Ahmed <faiyaza@cs.princeton.edu>
# Copyright (C) 2004-2006 The Trustees of Princeton University
#
# $Id: swapmon.py,v 1.9 2006/07/19 19:40:55 faiyaza Exp $
#

import syslog
import os
import sys
import getopt
import re
import pickle
import socket
import time

# util-vserver/python/vserver.py allows us to control slices directly
# from Python
from vserver import VServer

# bwlimit exports a few useful functions like run(), get_xid(), and get_slice()
import bwlimit

# Utility functions
from pl_mom import *

# Defaults
debug = False
verbose = 0
datafile = "/var/lib/misc/swapmon.dat"

# Seconds between process analysis
period = 30

# Minimum change in swap utilization over 30 seconds that will trigger
# early process analysis.
change_thresh = 5

# Swap utilization at which the largest consumer of physical memory is reset
reset_thresh = 80

# Swap utilization at which the machine is rebooted
reboot_thresh = 95

# Time to wait before checking slice again after reset
reset_timeout = 15

# Number of strikes before killing (strike, strike, kill)
kill_thresh = 2

# Time to wait before removing slice from kill queue (probation) 
kill_timeout = 120 

# Don't email the same message more than once in the same emailtimeout interval
email_timeout = 1800

# Physical size threshold to be considered a consumer.  Rationale is if there are no procs
# with a size at least as large as this, then there is a slow leaker;  better to just reboot.
rss_min = 150 * 1024 

# System slices that should not be reset (regexps)
system_slices = ['root', PLC_SLICE_PREFIX + '_']

# Message sent after a critical reboot
rebooted_subject = "pl_mom rebooted %(hostname)s"
rebooted_body = \
"""
Sometime before %(date)s, swap space was
nearly exhausted on %(hostname)s, so pl_mom rebooted it.

Slices active prior to reboot are listed below. Memory usage
statistics are not entirely accurate due to threading.

%(table)s

%(date)s %(hostname)s reboot
""".lstrip()

# Message sent after a hog is reset
reset_subject = "pl_mom reset slice %(slice)s on %(hostname)s"
reset_body = \
"""
Sometime before %(date)s, swap space was
nearly exhausted on %(hostname)s.

Slice %(slice)s was reset since it was the largest consumer of
physical memory at %(rss)s (%(percent)4.1f%%).

Please reply to this message explaining the nature of your experiment,
and what you are doing to address the problem.

%(slice)s processes prior to reset:

%(table)s

%(date)s %(hostname)s reset %(slice)s
""".lstrip()

# Message sent to system slices that should not be reset
alarm_subject = "pl_mom alarm slice %(slice)s on %(hostname)s"
alarm_body = \
"""           
Sometime before %(date)s, swap space was
nearly exhausted on %(hostname)s.

System slice %(slice)s was the largest consumer of physical memory at
%(rss)s (%(percent)4.1f%%). It was not reset, but please verify its
behavior.

%(slice)s processes prior to alarm:

%(table)s

%(date)s %(hostname)s alarm %(slice)s
""".lstrip()

# Message sent after a slice has been killed
kill_subject = "pl_mom killed slice %(slice)s on %(hostname)s"
kill_body = \
"""
Sometime before %(date)s, swap space was
nearly exhausted on %(hostname)s.

Slice %(slice)s was killed since it was the largest consumer of
physical memory at %(rss)s (%(percent)4.1f%%) after repeated restarts.

Please reply to this message explaining the nature of your experiment,
and what you are doing to address the problem.

%(slice)s processes prior to reset:

%(table)s

%(date)s %(hostname)s reset %(slice)s
""".lstrip()


class Reset:
	"""
	Keeps track of state information for resets and kills

	resettimeleft - timeout before checking for next reset
	resetcount - number of strikes 
	killtimeleft - time out before removing from kill queue
	{kill,reset}mail - Time of last email
	kill - State of kill.  If slice is already being killed, wait before retry.
	"""

	def __init__(self,name):
		self.name = name
		self.resettimeleft = reset_timeout
		self.resetcount = 0 
		self.resetmail = 0
		self.killtimeleft = kill_timeout
		self.killmail = 0

	def __repr__(self):
		return self.name
	
	def update(self):
        	# Count down for next check of reset slice.
               	if self.resettimeleft > 0:
                        self.resettimeleft -= 1
                        if debug and verbose:
                               	print "%s has %s seconds in probation" \
					%(self.name, self.killtimeleft)
		if self.killtimeleft > 0:
			# Count down kill probation timer (killtimeleft)
			self.killtimeleft -= 1
			if self.killtimeleft == 1:
				print "%s is out of probation" % self.name
		else:
			# Once out of probation period (killtimeleft), remove strikes
			self.resetcount = 0


	# Check to see if a slice needs to be killed.  If it has been killed more 
	# than kill_thresh in the probation period (kill_timeout) send an email, kill the slice.
	def checkkill(self,params):
		if self.killtimeleft > 0 and self.resetcount >= kill_thresh:
			if debug:
                                print kill_subject % params
                                print kill_body % params
			try:
				pid = os.fork()
				if pid == 0:
					print "Slice %s is being killed." % self.name	
                       			vserver = VServer(self.name)
                       			vserver.stop()
					os._exit(0)
				else:
					os.waitpid(pid,0)
			except Exception, err:
                               	print "Warning: Exception received while killing slice %s: %s" \
					% self.name, err
			if (time.time() - self.killmail) > email_timeout:
				slicemail(self.name, kill_subject % params, kill_body % params)
				print "Sending KILL email for slice %s" % self.name
				self.killmail = time.time() 
			return True
		return False 

	# Reset slice after checking to see if slice is out of timeout.
	# Increment resetcount, check to see if larger than kill_thresh.
	def reset(self, params):
		# If its the first reset (came back after kill)
		# or if its been reset before
		# and we are out of the reset timeout.
		if self.resetcount == 0 or self.resettimeleft == 0:
			# Do we need to kill this slice?  Check history first.
			if self.checkkill(params):
				return
			# Update counters
			self.resetcount += 1
			self.killtimeleft = kill_timeout
			self.resettimeleft = reset_timeout
			print "%s has %s seconds to die and has been reset %s times" \
                        	%(self.name, self.resettimeleft, self.resetcount)
			if debug:
                		print reset_subject % params
                        	print reset_body % params
                	try:
                        	pid = os.fork()
                        	if pid == 0:
                               		print "Resetting slice " + self.name 
                               		vserver = VServer(self.name)
                               		vserver.stop()
                               		vserver.start(wait = False)
                               		os._exit(0)
				else:
					os.waitpid(pid,0)
                	except Exception, err:
                       		print "Warning: Exception received while resetting slice %s:" \
					% self.name, err
			if (time.time() - self.resetmail) > email_timeout:
               			slicemail(self.name, reset_subject % params, reset_body % params)
				print "Sending Reset email for slice %s" % self.name
				self.resetmail = time.time() 


def usage():
    print """
Usage: %s [OPTIONS]...

Options:
        -d, --debug             Enable debugging (default: %s)
        -v, --verbose           Increase verbosity level (default: %d)
        -f, --file=FILE         Data file (default: %s)
        -s, --slice=SLICE       Constrain monitoring to these slices (default: all)
        -p, --period=SECONDS    Seconds between normal process analysis (default: %s)
        --reset-thresh=PERCENT  Swap utilization at which slice reset is attempted
        --reboot-thresh=PERCENT Swap utilization at which the machine is rebooted
        --min-thresh=PERCENT    Minimum physical memory utilization to be considered a hog
        --system-slice=SLICE    System slice that should not be reset
        --status                Print memory usage statistics and exit
        -h, --help              This message
""".lstrip() % (sys.argv[0], debug, verbose, datafile, format_period(period))

def slicestat(names = None):
    """
    Get status of specified slices (if names is None or empty, all
    slices). vsize and rss are in KiB. Returns

    {xid: {'xid': slice_id,
           'name': slice_name,
           'procs': [{'pid': pid, 'xid': slice_id, 'user', username, 'cmd': command,
                      'vsize': virtual_kib, 'rss': physical_kib,
                      'pcpu': cpu_percent, 'pmem': mem_percent}]
           'vsize': total_virtual_kib,
           'rss': total_physical_kib}}
    """
    
    # Mandatory fields. xid is a virtual field inserted by vps. Make
    # sure cmd is last so that it does not get truncated
    # automatically.
    fields = ['pid', 'xid', 'user', 'vsize', 'rss', 'pcpu', 'pmem', 'cmd']

    # vps inserts xid after pid in the output, but ps doesn't know
    # what the field means.
    ps_fields = list(fields)
    ps_fields.remove('xid')

    slices = {}

    # Eat the header line. vps depends on the header to figure out
    # which column is the PID column, so we can't just tell ps not to
    # print it.
    for line in bwlimit.run("/usr/sbin/vps -e -o " + ",".join(ps_fields))[1:]:
        # Chomp newline
        line = line.strip()

        # Replace "0 MAIN" and "1 ALL_PROC" (the special monikers that
        # vps uses to denote the root context and the "all contexts"
        # context) with "0" so that we can just split() on whitespace.
        line = line.replace("0 MAIN", "0").replace("1 ALL_PROC", "0")

        # Represent process as a dict of fields
        values = line.split(None, len(fields) - 1)
        if len(values) != len(fields):
            continue
        proc = dict(zip(fields, values))

        # Convert ints and floats
        for field in proc:
            try:
                proc[field] = int(proc[field])
            except ValueError:
                try:
                    proc[field] = float(proc[field])
                except ValueError:
                    pass

        # vps sometimes prints ERR instead of a context ID if it
        # cannot identify the context of an orphaned (usually dying)
        # process. Skip these processes.
        if type(proc['xid']) != int:
            continue

        # Assign (pl_)sshd processes to slice instead of root
        m = re.search(r"sshd: ([a-zA-Z_]+)", proc['cmd'])
        if m is not None:
            xid = bwlimit.get_xid(m.group(1))
            if xid is not None:
                proc['xid'] = xid

        name = bwlimit.get_slice(proc['xid'])
        if name is None:
            # Orphaned (not associated with a slice) class
            name = "%d?" % proc['xid']

        # Monitor only the specified slices
        if names and name not in names:
            continue

        # Additional overhead calculations from slicestat

        # Include 12 KiB of process overhead =
        # 4 KiB top-level page table +
        # 4 KiB kernel structure +
        # 4 KiB basic page table
        proc['rss'] += 12

        # Include additional page table overhead
        if proc['vsize'] > 4096:
            proc['rss'] += 4 * ((proc['vsize'] - 1) / 4096)

        if slices.has_key(proc['xid']):
            slice = slices[proc['xid']]
        else:
            slice = {'xid': proc['xid'], 'name': name, 'procs': [], 'vsize': 0, 'rss': 0}

        slice['procs'].append(proc)
        slice['vsize'] += proc['vsize']
        slice['rss'] += proc['rss']

        slices[proc['xid']] = slice

    return slices

def memtotal():
    """
    Returns total physical memory on the system in KiB.
    """

    meminfo = open("/proc/meminfo", "r")
    line = meminfo.readline()
    meminfo.close()
    if line[0:8] == "MemTotal":
        # MemTotal: 255396 kB
        (name, value, kb) = line.split()
        return int(value)

    return 0

def swap_used():
    """
    Returns swap utilization on the system as a whole percentage (0-100).
    """

    total_swap = 0
    total_used = 0

    try:
        swaps = open("/proc/swaps", "r")
        # Eat header line
        lines = swaps.readlines()[1:]
        swaps.close()
        for line in lines:
            # /dev/mapper/planetlab-swap partition 1048568 3740 -1
            (filename, type, size, used, priority) = line.strip().split()
            try:
                total_swap += int(size)
                total_used += int(used)
            except ValueEror, err:
                pass
    except (IOError, KeyError), err:
        pass

    return 100 * total_used / total_swap

def summary(names = None, total_rss = memtotal()):
    """
    Return a summary of memory usage by slice.
    """
    slicelist = slicestat(names).values()
    slicelist.sort(lambda a, b: b['rss'] - a['rss'])

    table = "%-20s%10s%24s\n\n" % ("Slice", "Processes", "Memory Usage")
    for slice in slicelist:
        table += "%-20s%10d%16s (%4.1f%%)\n" % \
                 (slice['name'], len(slice['procs']),
                  format_bytes(slice['rss'] * 1024, si = False),
                  100. * slice['rss'] / total_rss)

    return table

def main():
    # Defaults
    global debug, verbose, datafile
    global period, change_thresh, reset_thresh, reboot_thresh, rss_min, system_slices
    # All slices
    names = []

    try:
        longopts = ["debug", "verbose", "file=", "slice=", "status", "help"]
        longopts += ["period=", "reset-thresh=", "reboot-thresh=", "min-thresh=", "system-slice="]
        (opts, argv) = getopt.getopt(sys.argv[1:], "dvf:s:ph", longopts)
    except getopt.GetoptError, err:
        print "Error: " + err.msg
        usage()
        sys.exit(1)

    for (opt, optval) in opts:
        if opt == "-d" or opt == "--debug":
            debug = True
        elif opt == "-v" or opt == "--verbose":
            verbose += 1
        elif opt == "-f" or opt == "--file":
            datafile = optval
        elif opt == "-s" or opt == "--slice":
            names.append(optval)
        elif opt == "-p" or opt == "--period":
            period = int(optval)
        elif opt == "--change-thresh":
            change_thresh = int(optval)
        elif opt == "--reset-thresh":
            reset_thresh = int(optval)
        elif opt == "--reboot-thresh":
            reboot_thresh = int(optval)
        elif opt == "--system-slice":
            system_slices.append(optval)
        elif opt == "--status":
            print summary(names)
            sys.exit(0)
        else:
            usage()
            sys.exit(0)

    # Check if we are already running
    writepid("swapmon")

    if not debug:
        daemonize()
        # Rewrite PID file
        writepid("swapmon")
        # Redirect stdout and stderr to syslog
        syslog.openlog("swapmon")
        sys.stdout = sys.stderr = Logger()

    # Get total physical memory
    total_rss = memtotal()

    try:
        f = open(datafile, "r+")
        if verbose:
            print "Loading %s" % datafile
        (version, slices) = pickle.load(f)
        f.close()
        # Check version of data file
        if version != "$Id: swapmon.py,v 1.9 2006/07/19 19:40:55 faiyaza Exp $":
            print "Not using old version '%s' data file %s" % (version, datafile)
            raise Exception

        params = {'hostname': socket.gethostname(),
                  'date': time.asctime(time.gmtime()) + " GMT",
                  'table': summary(total_rss)}

        if debug:
            print rebooted_subject % params
            print rebooted_body % params
        else:
            slicemail(None, rebooted_subject % params, rebooted_body % params)

        # Delete data file
        os.unlink(datafile)
    except Exception:
        version = "$Id: swapmon.py,v 1.9 2006/07/19 19:40:55 faiyaza Exp $"
        slices = {}

    # Query process table every 30 seconds, or when a large change in
    # swap utilization is detected.
    timer = period
    last_used = None
    used = None

    # System slices that we have warned but could not reset
    warned = []

    # Slices that were reset
    resetlist = {}

    while True:
        used = swap_used()

	for resetslice in resetlist.keys():
		resetlist[resetslice].update()
	
        if last_used is None:
            last_used = used

	if verbose:
            print "%d%% swap consumed" % used

        if used >= reboot_thresh:
            # Dump slice state before rebooting
            if verbose:
                print "Saving %s" % datafile
            f = open(datafile, "w")
            pickle.dump((version, slices), f)
            f.close()

            # Goodbye, cruel world
            print "%d%% swap consumed, rebooting" % used
            if not debug:
                bwlimit.run("/bin/sync; /sbin/reboot -f")

        elif used >= reset_thresh:
	    if debug:
		print "Memory used = %s" %(used)
            # Try and find a hog
            slicelist = slices.values()
            slicelist.sort(lambda a, b: b['rss'] - a['rss'])
            for slice in slicelist:
                percent = 100. * slice['rss'] / total_rss

                if slice['rss'] < rss_min:
                    continue
		
                print "%d%% swap consumed, slice %s is using %s (%d%%) of memory" % \
                      (used,
                       slice['name'],
                       format_bytes(slice['rss'] * 1024, si = False),
                       percent)

                slice['procs'].sort(lambda a, b: b['rss'] - a['rss'])

                table = "%5s %10s %10s %4s %4s %s\n\n" % ("PID", "VIRT", "RES", '%CPU', '%MEM', 'COMMAND')
                for proc in slice['procs']:
                    table += "%5s %10s %10s %4.1f %4.1f %s\n" % \
                             (proc['pid'],
                              format_bytes(proc['vsize'] * 1024, si = False),
                              format_bytes(proc['rss'] * 1024, si = False),
                              proc['pcpu'], proc['pmem'], proc['cmd'])

                params = {'hostname': socket.gethostname(),
                          'date': time.asctime(time.gmtime()) + " GMT",
                          'table': table,
                          'slice': slice['name'],
                          'rss': format_bytes(slice['rss'] * 1024, si = False),
                          'percent': percent}

                # Match slice name against system slice patterns
                is_system_slice = filter(None, [re.match(pattern, slice['name']) for pattern in system_slices])

                if is_system_slice: 
                    	if slice['name'] not in warned:
                       		warned.append(slice['name'])
                       		if debug:
                       			print alarm_subject % params
                       			print alarm_body % params
                       	else:
                       		print "Warning slice " + slice['name']
                       		slicemail(slice['name'], alarm_subject % params, 
					alarm_body % params)
                else:
			# Reset slice
			if not resetlist.has_key(slice['name']):
				resetlist[slice['name']] = Reset(slice['name'])
                        resetlist[slice['name']].reset(params)
			slices = slicestat(names)

        if timer <= 0 or used >= (last_used + change_thresh):
            if used >= (last_used + change_thresh):
                print "%d%% swap consumed, %d%% in last %d seconds" % \
                      (used, used - last_used, period - timer)
            # Get slice state
            slices = slicestat(names)
            # Reset timer
            timer = period
            # Keep track of large changes in swap utilization
            last_used = used

        timer -= 1
        time.sleep(1)

    removepid("swapmon")

if __name__ == '__main__':
    main()