used as a metric
- memtotal: return SwapTotal as well
- summary: completely broken when used in the emergency reboot case, fix
- parse --min-thresh
- just warn system slices once (again)
# Faiyaz Ahmed <faiyaza@cs.princeton.edu>
# Copyright (C) 2004-2006 The Trustees of Princeton University
#
# Faiyaz Ahmed <faiyaza@cs.princeton.edu>
# Copyright (C) 2004-2006 The Trustees of Princeton University
#
-# $Id: swapmon.py,v 1.9 2006/07/19 19:40:55 faiyaza Exp $
+# $Id: swapmon.py,v 1.10 2006/08/16 16:18:45 faiyaza Exp $
nearly exhausted on %(hostname)s.
Slice %(slice)s was reset since it was the largest consumer of
nearly exhausted on %(hostname)s.
Slice %(slice)s was reset since it was the largest consumer of
-physical memory at %(rss)s (%(percent)4.1f%%).
+physical memory at %(rss)s (%(percent)4.1f%%) (%(sz)s writable).
Please reply to this message explaining the nature of your experiment,
and what you are doing to address the problem.
Please reply to this message explaining the nature of your experiment,
and what you are doing to address the problem.
nearly exhausted on %(hostname)s.
System slice %(slice)s was the largest consumer of physical memory at
nearly exhausted on %(hostname)s.
System slice %(slice)s was the largest consumer of physical memory at
-%(rss)s (%(percent)4.1f%%). It was not reset, but please verify its
-behavior.
+%(rss)s (%(percent)4.1f%%) (%(sz)s writable). It was not reset,
+but please verify its behavior.
%(slice)s processes prior to alarm:
%(slice)s processes prior to alarm:
nearly exhausted on %(hostname)s.
Slice %(slice)s was killed since it was the largest consumer of
nearly exhausted on %(hostname)s.
Slice %(slice)s was killed since it was the largest consumer of
-physical memory at %(rss)s (%(percent)4.1f%%) after repeated restarts.
+physical memory at %(rss)s (%(percent)4.1f%%) (%(sz)s writable)
+after repeated restarts.
Please reply to this message explaining the nature of your experiment,
and what you are doing to address the problem.
Please reply to this message explaining the nature of your experiment,
and what you are doing to address the problem.
def slicestat(names = None):
"""
Get status of specified slices (if names is None or empty, all
def slicestat(names = None):
"""
Get status of specified slices (if names is None or empty, all
- slices). vsize and rss are in KiB. Returns
+ slices). vsize, sz, and rss are in KiB. Returns
{xid: {'xid': slice_id,
'name': slice_name,
'procs': [{'pid': pid, 'xid': slice_id, 'user': username, 'cmd': command,
{xid: {'xid': slice_id,
'name': slice_name,
'procs': [{'pid': pid, 'xid': slice_id, 'user': username, 'cmd': command,
- 'vsize': virtual_kib, 'rss': physical_kib,
+ 'vsize': virtual_kib, 'sz': potential_kib, 'rss': physical_kib,
'pcpu': cpu_percent, 'pmem': mem_percent}]
'vsize': total_virtual_kib,
'pcpu': cpu_percent, 'pmem': mem_percent}]
'vsize': total_virtual_kib,
+ 'sz': total_potential_kib,
'rss': total_physical_kib}}
"""
# Mandatory fields. xid is a virtual field inserted by vps. Make
# sure cmd is last so that it does not get truncated
# automatically.
'rss': total_physical_kib}}
"""
# Mandatory fields. xid is a virtual field inserted by vps. Make
# sure cmd is last so that it does not get truncated
# automatically.
- fields = ['pid', 'xid', 'user', 'vsize', 'rss', 'pcpu', 'pmem', 'cmd']
+ fields = ['pid', 'xid', 'user', 'vsize', 'sz', 'rss', 'pcpu', 'pmem', 'cmd']
# vps inserts xid after pid in the output, but ps doesn't know
# what the field means.
# vps inserts xid after pid in the output, but ps doesn't know
# what the field means.
if slices.has_key(proc['xid']):
slice = slices[proc['xid']]
else:
if slices.has_key(proc['xid']):
slice = slices[proc['xid']]
else:
- slice = {'xid': proc['xid'], 'name': name, 'procs': [], 'vsize': 0, 'rss': 0}
+ slice = {'xid': proc['xid'], 'name': name, 'procs': [], 'vsize': 0, 'sz': 0, 'rss': 0}
slice['procs'].append(proc)
slice['vsize'] += proc['vsize']
slice['procs'].append(proc)
slice['vsize'] += proc['vsize']
+ slice['sz'] += proc['sz']
slice['rss'] += proc['rss']
slices[proc['xid']] = slice
slice['rss'] += proc['rss']
slices[proc['xid']] = slice
- Returns total physical memory on the system in KiB.
+ Returns total physical and swap memory on the system in KiB, as a (mem, swap) tuple.
meminfo = open("/proc/meminfo", "r")
meminfo = open("/proc/meminfo", "r")
- line = meminfo.readline()
+ for line in meminfo.readlines():
+ try:
+ (name, value, kb) = line.split()
+ except:
+ continue
+ if name == "MemTotal:":
+ mem = int(value)
+ elif name == "SwapTotal:":
+ swap = int(value)
- if line[0:8] == "MemTotal":
- # MemTotal: 255396 kB
- (name, value, kb) = line.split()
- return int(value)
return 100 * total_used / total_swap
return 100 * total_used / total_swap
-def summary(names = None, total_rss = memtotal()):
+def summary(slices = None, total_mem = None, total_swap = None):
"""
Return a summary of memory usage by slice.
"""
"""
Return a summary of memory usage by slice.
"""
- slicelist = slicestat(names).values()
- slicelist.sort(lambda a, b: b['rss'] - a['rss'])
-
- table = "%-20s%10s%24s\n\n" % ("Slice", "Processes", "Memory Usage")
+ if not slices:
+ slices = slicestat()
+ slicelist = slices.values()
+ slicelist.sort(lambda a, b: b['sz'] - a['sz'])
+ if total_mem is None or total_swap is None:
+ (total_mem, total_swap) = memtotal()
+
+ table = "%-20s%10s%24s%24s\n\n" % ("Slice", "Processes", "Memory Usage", "Potential Usage")
- table += "%-20s%10d%16s (%4.1f%%)\n" % \
+ table += "%-20s%10d%16s (%4.1f%%)%16s (%4.1f%%)\n" % \
(slice['name'], len(slice['procs']),
format_bytes(slice['rss'] * 1024, si = False),
(slice['name'], len(slice['procs']),
format_bytes(slice['rss'] * 1024, si = False),
- 100. * slice['rss'] / total_rss)
+ 100. * slice['rss'] / total_mem,
+ format_bytes(slice['sz'] * 1024, si = False),
+ 100. * slice['sz'] / (total_mem + total_swap))
+
reset_thresh = int(optval)
elif opt == "--reboot-thresh":
reboot_thresh = int(optval)
reset_thresh = int(optval)
elif opt == "--reboot-thresh":
reboot_thresh = int(optval)
+ elif opt == "--min-thresh":
+ rss_min = int(optval)
elif opt == "--system-slice":
system_slices.append(optval)
elif opt == "--status":
elif opt == "--system-slice":
system_slices.append(optval)
elif opt == "--status":
+ print summary(slicestat(names))
sys.exit(0)
else:
usage()
sys.exit(0)
else:
usage()
syslog.openlog("swapmon")
sys.stdout = sys.stderr = Logger()
syslog.openlog("swapmon")
sys.stdout = sys.stderr = Logger()
- # Get total physical memory
- total_rss = memtotal()
+ # Get total memory
+ (total_mem, total_swap) = memtotal()
try:
f = open(datafile, "r+")
try:
f = open(datafile, "r+")
(version, slices) = pickle.load(f)
f.close()
# Check version of data file
(version, slices) = pickle.load(f)
f.close()
# Check version of data file
- if version != "$Id: swapmon.py,v 1.9 2006/07/19 19:40:55 faiyaza Exp $":
+ if version != "$Id: swapmon.py,v 1.10 2006/08/16 16:18:45 faiyaza Exp $":
print "Not using old version '%s' data file %s" % (version, datafile)
raise Exception
params = {'hostname': socket.gethostname(),
'date': time.asctime(time.gmtime()) + " GMT",
print "Not using old version '%s' data file %s" % (version, datafile)
raise Exception
params = {'hostname': socket.gethostname(),
'date': time.asctime(time.gmtime()) + " GMT",
- 'table': summary(total_rss)}
+ 'table': summary(slices, total_mem, total_swap)}
if debug:
print rebooted_subject % params
if debug:
print rebooted_subject % params
# Delete data file
os.unlink(datafile)
except Exception:
# Delete data file
os.unlink(datafile)
except Exception:
- version = "$Id: swapmon.py,v 1.9 2006/07/19 19:40:55 faiyaza Exp $"
+ version = "$Id: swapmon.py,v 1.10 2006/08/16 16:18:45 faiyaza Exp $"
slices = {}
# Query process table every 30 seconds, or when a large change in
slices = {}
# Query process table every 30 seconds, or when a large change in
used = swap_used()
for resetslice in resetlist.keys():
used = swap_used()
for resetslice in resetlist.keys():
- resetlist[resetslice].update()
+ resetlist[resetslice].update()
if last_used is None:
last_used = used
if last_used is None:
last_used = used
slicelist = slices.values()
slicelist.sort(lambda a, b: b['rss'] - a['rss'])
for slice in slicelist:
slicelist = slices.values()
slicelist.sort(lambda a, b: b['rss'] - a['rss'])
for slice in slicelist:
- percent = 100. * slice['rss'] / total_rss
+ percent = 100. * slice['rss'] / total_mem
if slice['rss'] < rss_min:
continue
if slice['rss'] < rss_min:
continue
slice['procs'].sort(lambda a, b: b['rss'] - a['rss'])
slice['procs'].sort(lambda a, b: b['rss'] - a['rss'])
- table = "%5s %10s %10s %4s %4s %s\n\n" % ("PID", "VIRT", "RES", '%CPU', '%MEM', 'COMMAND')
+ table = "%5s %10s %10s %10s %4s %4s %s\n\n" % ("PID", "VIRT", "SZ", "RES", '%CPU', '%MEM', 'COMMAND')
for proc in slice['procs']:
for proc in slice['procs']:
- table += "%5s %10s %10s %4.1f %4.1f %s\n" % \
+ table += "%5s %10s %10s %10s %4.1f %4.1f %s\n" % \
(proc['pid'],
format_bytes(proc['vsize'] * 1024, si = False),
(proc['pid'],
format_bytes(proc['vsize'] * 1024, si = False),
+ format_bytes(proc['sz'] * 1024, si = False),
format_bytes(proc['rss'] * 1024, si = False),
proc['pcpu'], proc['pmem'], proc['cmd'])
format_bytes(proc['rss'] * 1024, si = False),
proc['pcpu'], proc['pmem'], proc['cmd'])
'table': table,
'slice': slice['name'],
'rss': format_bytes(slice['rss'] * 1024, si = False),
'table': table,
'slice': slice['name'],
'rss': format_bytes(slice['rss'] * 1024, si = False),
+ 'sz': format_bytes(slice['sz'] * 1024, si = False),
'percent': percent}
# Match slice name against system slice patterns
is_system_slice = filter(None, [re.match(pattern, slice['name']) for pattern in system_slices])
if is_system_slice:
'percent': percent}
# Match slice name against system slice patterns
is_system_slice = filter(None, [re.match(pattern, slice['name']) for pattern in system_slices])
if is_system_slice:
- if slice['name'] not in warned:
- warned.append(slice['name'])
- if debug:
- print alarm_subject % params
- print alarm_body % params
- else:
- print "Warning slice " + slice['name']
- slicemail(slice['name'], alarm_subject % params,
- alarm_body % params)
+ # Do not reset system slices, just warn once
+ if slice['name'] not in warned:
+ warned.append(slice['name'])
+ if debug:
+ print alarm_subject % params
+ print alarm_body % params
+ else:
+ print "Warning slice " + slice['name']
+ slicemail(slice['name'], alarm_subject % params,
+ alarm_body % params)
- # Reset slice
- if not resetlist.has_key(slice['name']):
- resetlist[slice['name']] = Reset(slice['name'])
- resetlist[slice['name']].reset(params)
- slices = slicestat(names)
+ # Reset slice
+ if not resetlist.has_key(slice['name']):
+ resetlist[slice['name']] = Reset(slice['name'])
+ resetlist[slice['name']].reset(params)
+ slices = slicestat(names)
if timer <= 0 or used >= (last_used + change_thresh):
if used >= (last_used + change_thresh):
if timer <= 0 or used >= (last_used + change_thresh):
if used >= (last_used + change_thresh):