#!/usr/bin/python
#
# Swap monitoring daemon. Every 30 seconds, checks process memory
# usage. At 80% swap utilization (reset_thresh), resets the slice that
# is consuming the most physical memory. At 95% swap utilization
# (reboot_thresh), reboots the machine to avoid a crash.
#
# Mark Huang <mlhuang@cs.princeton.edu>
# Andy Bavier <acb@cs.princeton.edu>
# Faiyaz Ahmed <faiyaza@cs.princeton.edu>
# Copyright (C) 2004-2006 The Trustees of Princeton University
#

import syslog
import os
import sys
import getopt
import re
import pickle
import socket
import time

# bwlimit exports a few useful functions like run(), get_xid(), and get_slice()
import plnode.bwlimit as bwlimit

# Utility functions
from pl_mom import *

# Defaults (most can be overridden from the command line in main())

# Debug mode: main() stays in the foreground, prints messages instead
# of mailing them, and does not actually reboot.
debug = False
# Verbosity level; incremented once per -v
verbose = 0
# Pickle file written by writedat() just before a reboot and consumed
# by readdat() at the next start-up
DATAFILE = "/var/lib/misc/swapmon.dat"
# Stored in DATAFILE and compared on load; mismatching files are ignored
# xxx fixme - this is broken under git
VERSION = "$Id$"
# Seconds between process analysis
period = 30

# Minimum increase in swap utilization (percentage points) since the
# last process analysis that will trigger early process analysis.
change_thresh = 5

# Swap utilization (percent) at which the largest consumer of physical memory is reset
reset_thresh = 80

# Swap utilization (percent) at which the machine is rebooted
reboot_thresh = 95

# Don't email the same message more than once in the same email_timeout interval (seconds)
email_timeout = 1800

# Physical size threshold to be considered a consumer.  Rationale is if there are no procs
# with a size at least as large as this, then there is a slow leaker;  better to just reboot.
# Compared against per-slice RSS totals, which slicestat() reports in KiB.
rss_min = 150 * 1024  # KiB, i.e. 150 MiB

# System slices that should not be reset (regexps, applied with
# re.match, so each pattern is anchored at the start of the slice name)
system_slices = ['root', PLC_SLICE_PREFIX + '_']

# Message sent after a critical reboot.
# Filled in by readdat() with the keys hostname, date and table.
# .lstrip() removes the blank first line of the literal.
rebooted_subject = "pl_mom rebooted %(hostname)s"
rebooted_body = \
"""
Sometime before %(date)s, swap space was
nearly exhausted on %(hostname)s, so pl_mom rebooted it.

Slices active prior to reboot are listed below. Memory usage
statistics are not entirely accurate due to threading.

%(table)s

%(date)s %(hostname)s reboot
""".lstrip()

# Message sent to system slices that should not be reset.
# Filled in with the dictionary built by formtable(): hostname, date,
# table, slice, rss, sz and percent.
alarm_subject = "pl_mom alarm slice %(slice)s on %(hostname)s"
alarm_body = \
"""           
Sometime before %(date)s, swap space was
nearly exhausted on %(hostname)s.

System slice %(slice)s was the largest consumer of physical memory at
%(rss)s (%(percent)4.1f%%) (%(sz)s writable). It was not reset,
but please verify its behavior.

%(slice)s processes prior to alarm:

%(table)s

%(date)s %(hostname)s alarm %(slice)s
""".lstrip()

# Message sent after a slice has been killed.
# Filled in with the dictionary built by formtable(): hostname, date,
# table, slice, rss, sz and percent.
kill_subject = "pl_mom killed slice %(slice)s on %(hostname)s"
kill_body = \
"""
Sometime before %(date)s, swap space was
nearly exhausted on %(hostname)s.

Slice %(slice)s was killed since it was the largest consumer of
physical memory at %(rss)s (%(percent)4.1f%%) (%(sz)s writable)
after repeated restarts.

Please reply to this message explaining the nature of your experiment,
and what you are doing to address the problem.

%(slice)s processes prior to reset:

%(table)s

%(date)s %(hostname)s reset %(slice)s
""".lstrip()

def killsliverprocs(xid):
    """
    Run "vkill -s 9 -c <xid> 0" through bwlimit.run().

    xid is the context ID of the slice; presumably pid 0 makes vkill
    signal every process in that context (confirm against vkill(8)).
    """
    command = "/usr/sbin/vkill -s 9 -c %s 0" % xid
    bwlimit.run(command)


def usage():
    """
    Print the command-line help, including the current default values.

    --min-thresh is documented in KiB because its value is stored in
    rss_min, which main() compares against per-slice RSS totals in KiB
    (it is not a percentage).
    """
    text = """
Usage: %s [OPTIONS]...

Options:
        -d, --debug             Enable debugging (default: %s)
        -v, --verbose           Increase verbosity level (default: %d)
        -f, --file=FILE         Data file (default: %s)
        -s, --slice=SLICE       Constrain monitoring to these slices (default: all)
        -p, --period=SECONDS    Seconds between normal process analysis (default: %s)
        --reset-thresh=PERCENT  Swap utilization at which slice reset is attempted (default: %d)
        --reboot-thresh=PERCENT Swap utilization at which the machine is rebooted (default: %d)
        --min-thresh=KIB        Minimum physical memory (RSS, in KiB) to be considered a hog (default: %d)
        --system-slice=SLICE    System slice that should not be reset
        --status                Print memory usage statistics and exit
        --memstatus             Print total memory, total swap, and swap used
        -h, --help              This message
""".lstrip() % (sys.argv[0], debug, verbose, DATAFILE, format_period(period),
                reset_thresh, reboot_thresh, rss_min)
    print(text)

def slicestat(names = None):
    """
    Get status of specified slices (if names is None or empty, all
    slices) by parsing the output of vps. vsize, sz, and rss are in
    KiB. Returns a dictionary keyed by context ID:

    {xid: {'xid': slice_id,
            'name': slice_name,
            'procs': [{'pid': pid, 'xid': slice_id, 'vsname': vps_name,
                      'cmd': command, 'vsize': virtual_kib,
                      'sz': potential_kib, 'rss': physical_kib,
                      'pmem': mem_percent}],
            'vsize': total_virtual_kib,
            'sz': total_potential_kib,
            'rss': total_physical_kib}}

    Each process's rss includes an estimate of kernel and page table
    overhead. Lines that cannot be parsed are reported on stdout and
    skipped, except for lines containing "ERR", which are skipped
    silently.
    """

    # Mandatory fields. xid is a virtual field inserted by vps. Make
    # sure cmd is last so that it does not get truncated
    # automatically.
    fields = ['pid', 'xid', 'vsname', 'vsize', 'sz', 'rss', 'pmem', 'cmd']

    # vps inserts xid (and vsname) after pid in the output, but ps
    # doesn't know what these fields mean.
    ps_fields = [field for field in fields if field not in ('xid', 'vsname')]

    # NOTE(review): this pattern stops at the first digit, so a slice
    # name containing digits would not be matched in full - confirm
    # against the slice naming rules.
    sshd_re = re.compile(r"sshd: ([a-zA-Z_]+)")

    slices = {}

    # Eat the header line. vps depends on the header to figure out
    # which column is the PID column, so we can't just tell ps not to
    # print it.
    for line in bwlimit.run("/usr/sbin/vps -e -o " + ":16,".join(ps_fields))[1:]:
        # Chomp newline
        line = line.strip()

        proc = _parse_vps_line(line, fields)
        if proc is None:
            # vps sometimes prints ERR or the name of the slice
            # instead of a context ID if it cannot identify the
            # context of an orphaned (usually dying) process. Skip
            # these processes, ignoring spurious ERR messages quietly.
            if "ERR" not in line:
                print("slicestat: failed to parse line: " + line)
            continue

        # Assign (pl_)sshd processes to slice instead of root
        m = sshd_re.search(proc['cmd'])
        if m is not None:
            xid = bwlimit.get_xid(m.group(1))
            if xid is not None:
                proc['xid'] = xid

        name = bwlimit.get_slice(proc['xid'])
        if name is None:
            # Orphaned (not associated with a slice) class
            name = "%d?" % proc['xid']

        # Monitor only the specified slices
        if names and name not in names:
            continue

        # Additional overhead calculations from slicestat

        # Include 12 KiB of process overhead =
        # 4 KiB top-level page table +
        # 4 KiB kernel structure +
        # 4 KiB basic page table
        proc['rss'] += 12

        # Include additional page table overhead: 4 KiB for every
        # 4096 KiB of virtual size (floor division, as before).
        if proc['vsize'] > 4096:
            proc['rss'] += 4 * ((proc['vsize'] - 1) // 4096)

        entry = slices.get(proc['xid'])
        if entry is None:
            entry = {'xid': proc['xid'], 'name': name, 'procs': [],
                     'vsize': 0, 'sz': 0, 'rss': 0}
            slices[proc['xid']] = entry

        entry['procs'].append(proc)
        entry['vsize'] += proc['vsize']
        entry['sz'] += proc['sz']
        entry['rss'] += proc['rss']

    return slices

def _parse_vps_line(line, fields):
    """
    Turn one stripped line of vps output into a process dictionary
    keyed by fields, or return None if the line cannot be parsed.

    Only the numeric columns are converted. The command and vsname
    stay strings even when they look like numbers (e.g. a command
    called "nan" or "1"), so the caller can always run a regexp on
    'cmd'.
    """
    values = line.split(None, len(fields) - 1)
    if len(values) != len(fields):
        return None
    proc = dict(zip(fields, values))

    # These fields take part in arithmetic; a line without them is useless.
    try:
        for field in ('xid', 'vsize', 'sz', 'rss'):
            proc[field] = int(proc[field])
    except ValueError:
        return None

    # pid is only displayed, so an odd value is kept as a string.
    try:
        proc['pid'] = int(proc['pid'])
    except ValueError:
        pass

    # pmem is only displayed (with a float format); fall back to 0.0
    # rather than dropping the process from the accounting.
    try:
        proc['pmem'] = float(proc['pmem'])
    except ValueError:
        proc['pmem'] = 0.0

    return proc

def memtotal():
    """
    Returns total physical and swap memory on the system in KiB.

    Reads the MemTotal: and SwapTotal: lines of /proc/meminfo and
    returns them as the tuple (mem, swap). A value that is not found
    is reported as 0. Raises IOError if /proc/meminfo cannot be opened.
    """
    mem = 0
    swap = 0
    meminfo = open("/proc/meminfo", "r")
    try:
        for line in meminfo:
            parts = line.split()
            # Only "Name: value unit" lines are of interest; anything
            # else (e.g. entries without a unit) is skipped.
            if len(parts) != 3:
                continue
            (name, value, unit) = parts
            if name == "MemTotal:":
                mem = int(value)
            elif name == "SwapTotal:":
                swap = int(value)
    finally:
        # Close the file even if a value fails to parse
        meminfo.close()
    return (mem, swap)

def swap_used():
    """
    Returns swap utilization on the system as a whole percentage (0-100).

    Sums the Size and Used columns of every entry in /proc/swaps.
    Returns 0 if /proc/swaps cannot be read or no swap space is
    configured, instead of failing with a division by zero.
    """
    total_swap = 0
    total_used = 0

    try:
        swaps = open("/proc/swaps", "r")
        try:
            # Eat header line
            lines = swaps.readlines()[1:]
        finally:
            swaps.close()
    except IOError:
        lines = []

    for line in lines:
        # /dev/mapper/planetlab-swap partition 1048568 3740 -1
        columns = line.split()
        if len(columns) != 5:
            # Not a "filename type size used priority" line
            continue
        try:
            size = int(columns[2])
            used = int(columns[3])
        except ValueError:
            continue
        total_swap += size
        total_used += used

    if total_swap > 0:
        # Floor division keeps the result a whole percentage
        swapused = 100 * total_used // total_swap
    else:
        swapused = 0

    if debug:
        print("%s percent swap used" % swapused)
    return swapused

def summary(slices = None, total_mem = None, total_swap = None):
    """
    Build a text table of memory usage, one row per slice, ordered by
    potential usage ('sz'), largest first.

    slices defaults to a fresh slicestat() when missing or empty;
    total_mem and total_swap (KiB) default to memtotal() unless both
    are supplied. Returns the table as a single string.
    """
    if not slices:
        slices = slicestat()
    if total_mem is None or total_swap is None:
        (total_mem, total_swap) = memtotal()

    # Largest potential consumer first
    by_potential = sorted(slices.values(),
                          key = lambda entry: entry['sz'],
                          reverse = True)

    rows = ["%-20s%10s%24s%24s\n\n" % ("Slice", "Processes", "Memory Usage", "Potential Usage")]
    for entry in by_potential:
        rss_percent = 100. * entry['rss'] / total_mem
        sz_percent = 100. * entry['sz'] / (total_mem + total_swap)
        rows.append("%-20s%10d%16s (%4.1f%%)%16s (%4.1f%%)\n" %
                    (entry['name'], len(entry['procs']),
                     format_bytes(entry['rss'] * 1024, si = False),
                     rss_percent,
                     format_bytes(entry['sz'] * 1024, si = False),
                     sz_percent))
    return "".join(rows)

def formtable(slice, percent):
    '''
    Makes pretty message to email with human readable ps values.

    slice is one entry of the dictionary returned by slicestat();
    percent is the share of physical memory the slice is using.
    Returns the dictionary used to fill in the alarm/kill message
    templates: hostname, date, table, slice, rss, sz and percent.
    '''
    # One header column per row column. Rows carry no CPU figure
    # (slicestat() does not collect one), so there is no %CPU heading.
    rows = ["%5s %10s %10s %10s %4s %s\n\n" %
            ("PID", "VIRT", "SZ", "RES", '%MEM', 'COMMAND')]
    for proc in slice['procs']:
        rows.append("%5s %10s %10s %10s %4.1f %s\n" %
                    (proc['pid'],
                     format_bytes(proc['vsize'] * 1024, si = False),
                     format_bytes(proc['sz'] * 1024, si = False),
                     format_bytes(proc['rss'] * 1024, si = False),
                     proc['pmem'],
                     proc['cmd']))

    prettytable = {'hostname': socket.gethostname(),
                   'date': time.asctime(time.gmtime()) + " GMT",
                   'table': "".join(rows),
                   'slice': slice['name'],
                   'rss': format_bytes(slice['rss'] * 1024, si = False),
                   'sz': format_bytes(slice['sz'] * 1024, si = False),
                   'percent': percent}
    return prettytable

def readdat():
    '''
    Return dictionary of vps (slicestat) from datfile left behind by
    writedat() before rebooting.  If there is no such file, just grab
    the latest dict (slicestat) and return that.  If a dat file is
    found, it means we rebooted: send the "rebooted" email (or print it
    in debug mode) and delete the file.

    A data file written by a different VERSION is reported and ignored.
    Any other failure while processing the file is reported and the
    current slicestat() is returned instead.
    '''
    try:
        f = open(DATAFILE, "r+")
    except IOError:
        # No data file left behind: ordinary start-up
        return slicestat()

    try:
        if verbose:
            print("Loading %s" % DATAFILE)
        try:
            # NOTE(review): unpickling runs code embedded in the file,
            # so DATAFILE must only be writable by trusted users.
            (v, slices) = pickle.load(f)
        finally:
            f.close()

        # Check version of data file
        if v != VERSION:
            print("Not using old version '%s' data file %s" % (v, DATAFILE))
            return slicestat()

        # summary() looks up the memory totals itself. The totals
        # computed in main() are local to main() and not visible here.
        params = {'hostname': socket.gethostname(),
                  'date': time.asctime(time.gmtime()) + " GMT",
                  'table': summary(slices)}
        if debug:
            print(rebooted_subject % params)
            print(rebooted_body % params)
        else:
            slicemail(None, rebooted_subject % params, rebooted_body % params)

        # Delete data file
        os.unlink(DATAFILE)
    except Exception:
        # Best effort: a broken data file must not stop the daemon,
        # but say what went wrong instead of failing silently.
        print("Failed to process %s: %s" % (DATAFILE, sys.exc_info()[1]))
        slices = slicestat()

    return slices


def writedat(slices):
    """
    Write (VERSION, slices) to the pickled datfile DATAFILE, replacing
    any previous contents. readdat() reads it back at the next start-up.
    """
    if verbose:
        print("Saving %s" % DATAFILE)
    f = open(DATAFILE, "w")
    try:
        pickle.dump((VERSION, slices), f)
    finally:
        # Close the file even if pickling fails
        f.close()


def main():
    """
    Parse the command line, daemonize (unless debugging) and run the
    monitoring loop.

    Once a second the loop reads swap utilization. At reboot_thresh
    the slice state is saved and the machine rebooted. At reset_thresh
    every slice using at least rss_min of physical memory is handled,
    largest first: system slices get a one-time warning email, any
    other slice is killed and its owners are emailed at most once per
    email_timeout seconds. The process table is re-read every period
    seconds, or early when swap utilization has grown by at least
    change_thresh since the last reading.
    """
    # Defaults
    global debug, verbose, DATAFILE, VERSION
    global period, change_thresh, reset_thresh, reboot_thresh, rss_min, system_slices
    # All slices
    names = []
    last_used = None
    # System slices that have already been warned
    warned = []
    # Slice name -> time the last kill notice was mailed
    emailed = {}

    # Options whose value must be an integer
    int_opts = ("-p", "--period", "--change-thresh", "--reset-thresh",
                "--reboot-thresh", "--min-thresh")

    try:
        longopts = ["debug", "verbose", "file=", "slice=", "status", "memstatus", "help"]
        longopts += ["period=", "change-thresh=", "reset-thresh=", "reboot-thresh=",
                     "min-thresh=", "system-slice="]
        # "p:" - the period option takes an argument
        (opts, argv) = getopt.getopt(sys.argv[1:], "dvf:s:p:h", longopts)
    except getopt.GetoptError:
        err = sys.exc_info()[1]
        print("Error: " + err.msg)
        usage()
        sys.exit(1)

    for (opt, optval) in opts:
        number = None
        if opt in int_opts:
            try:
                number = int(optval)
            except ValueError:
                print("Error: option %s requires an integer, not '%s'" % (opt, optval))
                usage()
                sys.exit(1)

        if opt == "-d" or opt == "--debug":
            debug = True
        elif opt == "-v" or opt == "--verbose":
            verbose += 1
        elif opt == "-f" or opt == "--file":
            DATAFILE = optval
        elif opt == "-s" or opt == "--slice":
            names.append(optval)
        elif opt == "-p" or opt == "--period":
            period = number
        elif opt == "--change-thresh":
            change_thresh = number
        elif opt == "--reset-thresh":
            reset_thresh = number
        elif opt == "--reboot-thresh":
            reboot_thresh = number
        elif opt == "--min-thresh":
            rss_min = number
        elif opt == "--system-slice":
            system_slices.append(optval)
        elif opt == "--status":
            print(summary(slicestat(names)))
            sys.exit(0)
        elif opt == "--memstatus":
            (mem, swap) = memtotal()
            swap_pct = swap_used()
            print("memory total: %s" % mem)
            print("swap total: %s" % swap)
            print("swap used: %s" % swap_pct)
            sys.exit(0)
        else:
            usage()
            sys.exit(0)

    # Check if we are already running
    writepid("swapmon")

    if not debug:
        daemonize()
        # Rewrite PID file
        writepid("swapmon")
        # Redirect stdout and stderr to syslog
        syslog.openlog("swapmon")
        sys.stdout = sys.stderr = Logger()

    try:
        # Get total memory
        (total_mem, total_swap) = memtotal()
        slices = readdat()

        # Start the countdown from the period actually requested
        timer = period

        # Query process table every period seconds, or when a large
        # change in swap utilization is detected.
        while True:
            used = swap_used()
            if last_used is None:
                last_used = used

            if used >= reboot_thresh:
                # Dump slice state before rebooting
                writedat(slices)
                # Goodbye, cruel world
                print("%d%% swap consumed, rebooting" % used)
                if not debug:
                    bwlimit.run("/bin/sync; /sbin/reboot -f")
            elif used >= reset_thresh:
                # Try and find a hog. Puts largest on top.
                slicelist = sorted(slices.values(),
                                   key = lambda entry: entry['rss'],
                                   reverse = True)
                for entry in slicelist:
                    percent = 100. * entry['rss'] / total_mem
                    if entry['rss'] < rss_min:
                        continue
                    print("%d%% swap consumed, slice %s is using %s (%d%%) of memory" %
                          (used,
                           entry['name'],
                           format_bytes(entry['rss'] * 1024, si = False),
                           percent))
                    entry['procs'].sort(key = lambda proc: proc['rss'], reverse = True)
                    # Make a pretty table.
                    params = formtable(entry, percent)
                    # Match slice name against system slice patterns
                    is_system_slice = [pattern for pattern in system_slices
                                       if re.match(pattern, entry['name'])]

                    # Do not reset system slices, just warn once
                    if is_system_slice:
                        if entry['name'] not in warned:
                            warned.append(entry['name'])
                            print("Warning slice " + entry['name'])
                            if debug:
                                print(alarm_subject % params)
                                print(alarm_body % params)
                            else:
                                slicemail(entry['name'], alarm_subject % params,
                                          alarm_body % params)
                    else:
                        # Reset slice
                        if not debug:
                            # Mail the first time, and again only once
                            # email_timeout seconds have passed since
                            # the previous mail for this slice.
                            now = time.time()
                            last_sent = emailed.get(entry['name'])
                            if last_sent is None or now - last_sent >= email_timeout:
                                slicemail(entry['name'], kill_subject % params,
                                          kill_body % params)
                                emailed[entry['name']] = now
                        else:
                            print(kill_subject % params)
                            print(kill_body % params)
                        print("Killing procs in %s" % entry['name'])
                        killsliverprocs(entry['xid'])

            # wait period before recalculating swap.  If in danger, recalc.
            if timer <= 0 or used >= (last_used + change_thresh):
                if used >= (last_used + change_thresh):
                    print("%d%% swap consumed, %d%% in last %d seconds" %
                          (used, used - last_used, period - timer))
                # Get slice state
                slices = slicestat(names)
                # Reset timer
                timer = period
                # Keep track of large changes in swap utilization
                last_used = used
            timer -= 1
            time.sleep(1)
    finally:
        # The loop only ends through an exception (including
        # KeyboardInterrupt); clean up the PID file on the way out.
        removepid("swapmon")

# Run the daemon only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()