Add bw, dns, and uptime checks.
[myops.git] / web / collect / client / check_dns.py
diff --git a/web/collect/client/check_dns.py b/web/collect/client/check_dns.py
new file mode 100755 (executable)
index 0000000..ffd7359
--- /dev/null
@@ -0,0 +1,192 @@
+#!/usr/bin/python
+
+# NOTE: CoMon can't be probed directly from the node; the equivalent query is kept here for reference.
+# http://comon.cs.princeton.edu/status/tabulator.cgi?table=table_nodeviewshort&select='dns1udp>80 && dns2udp>80&&name=="planetlab-01.cs.princeton.edu"'&format=formatcsv&dumpcols='dns1udp,dns1tcp,dns2udp,dns2tcp'
+
+import commands
+import os
+import re
+import socket
+import struct
+import DNS
+import time
+#import ctypes
+# TODO: maybe when there's more time; for better readability.
+#class History(Structure):
+#    _fields_ = [ ("version", c_int),
+#                 ("index", c_int),
+#                 ("history", c_float * HISTORY_LENGTH), ]
+
+# allocate fixed space on disk to save persistent state.
+# what to store in this file?
+# slice_history : x,x,x,x,x,...
+# root_history : y,y,y,y,y,y...
+
+# Number of samples kept in each history file.
+HISTORY_LENGTH = 24*30  # 30 days, if checked once an hour
+# struct formats of a history file: [0] is the header (two ints: format
+# version and the index of the next slot to write), [1] is the body
+# (HISTORY_LENGTH floats, one per sample).
+HISTORY_fmt = ('ii', 'f'*HISTORY_LENGTH )
+# Format version written into the header; read_safe_history() resets any
+# file whose stored version differs from this one.
+HISTORY_version = 1
+
+def read_safe_history(filename):
+    """
+        Load the ring-buffer history stored in `filename`.
+
+        Returns a tuple (i, history): `i` is the index of the next slot to
+        write and `history` is a list of HISTORY_LENGTH samples.
+
+        This function guarantees that space is preserved: if the file is
+        missing, too short, written with another HISTORY_version, or holds
+        an out-of-range index, it is (re)created with an all-zero history.
+        If one of the file operations fail, it will throw an exception.
+    """
+    if os.path.exists(filename):
+        # read existing data; the descriptor is released even if read fails
+        fd = os.open(filename, os.O_RDONLY)
+        try:
+            a = os.read(fd, os.path.getsize(filename))
+        finally:
+            os.close(fd)
+
+        try:
+            (version, i) = struct.unpack_from(HISTORY_fmt[0], a, 0)
+            # explicit checks instead of assert, so they survive `python -O`
+            if version != HISTORY_version:
+                raise ValueError("unsupported history version: %r" % (version,))
+            # an index past the end would make add_to_history() raise
+            # IndexError on every subsequent run
+            if not 0 <= i < HISTORY_LENGTH:
+                raise ValueError("history index out of range: %r" % (i,))
+            history = list(struct.unpack_from(HISTORY_fmt[1], a, struct.calcsize(HISTORY_fmt[0])))
+            return (i, history)
+        except (struct.error, ValueError):
+            # TODO: in the future a more clever version migration might be nice.
+            os.remove(filename) # just nuke the old version
+
+    # create for the first time (or after a reset), with empty data
+    (i, history) = (0, [0]*HISTORY_LENGTH)
+    write_safe_history(filename, (i, history), False)
+    return (i, history)
+
+def write_safe_history(filename, (i, history), check_for_file=True):
+    """
+        Write the ring buffer (i, history) to `filename`, overwriting in place.
+
+        filename       -- path of the history file
+        (i, history)   -- next-slot index and list of HISTORY_LENGTH samples
+        check_for_file -- when true, require that the file already exists
+
+        Returns the number of bytes written.  Raises AssertionError when the
+        preconditions do not hold, struct.error when the data cannot be
+        packed, and OSError when a file operation fails.
+    """
+    # length should match, and the file should already exist
+    assert len(history) == HISTORY_LENGTH
+    if check_for_file:
+        assert os.path.exists(filename)
+
+    # pack everything before touching the file, so unpackable data cannot
+    # leave a new header in front of an old body
+    payload  = struct.pack(HISTORY_fmt[0], HISTORY_version, i)
+    payload += struct.pack(HISTORY_fmt[1], *history)
+
+    # open without TRUNC nor APPEND, then seek to beginning to preserve space on disk
+    # mode 0644: this is a plain data file; without an explicit mode
+    # os.open() creates it with 0777 (before umask)
+    fd = os.open(filename, os.O_WRONLY|os.O_CREAT, 0644)
+    try:
+        os.lseek(fd, 0, 0)
+        return os.write(fd, payload)
+    finally:
+        os.close(fd)
+
+def add_to_history((i, history), data):
+    """
+        Store `data` in slot `i` of `history` (the list is modified in place)
+        and return (next_i, history), where next_i wraps around to 0 once it
+        reaches HISTORY_LENGTH.
+    """
+    history[i] = data
+    return ((i + 1) % HISTORY_LENGTH, history)
+
+def record_status_record(filename, status):
+    """
+        Record one sample in the history file `filename`: load the ring
+        buffer, store `status` in its current slot and write it back.
+        Returns the result of write_safe_history().
+    """
+    (slot, samples) = read_safe_history(filename)
+    updated = add_to_history((slot, samples), status)
+    return write_safe_history(filename, updated)
+
+def get_success_ratio(filename):
+    """
+        Fraction of the samples recorded in `filename` that are positive.
+
+        Slots equal to zero are ignored (a freshly created history is all
+        zeros).  Returns the int 0 when no slot has been measured, otherwise
+        a float: positive samples divided by non-zero samples.
+    """
+    (idx, samples) = read_safe_history(filename)
+    # view of the ring buffer starting at the next slot to be overwritten
+    ordered = samples[idx:] + samples[:idx]
+    measured = [ s for s in ordered if s != 0 ]
+    if not measured:
+        return 0
+
+    successes = [ s for s in measured if s > 0 ]
+    return float(len(successes))/float(len(measured))
+
+def timed(method):
+    """
+        Decorator that times a call.
+
+        The decorated function returns (result, elapsed) instead of just
+        `result`, where `elapsed` is the wall-clock duration of the call in
+        seconds as measured with time.time().
+    """
+    # imported here so the decorator does not depend on the file header
+    import functools
+
+    # keep the wrapped function's __name__ and __doc__ on the wrapper
+    @functools.wraps(method)
+    def timeit(*args, **kw):
+        ts = time.time()
+        result = method(*args, **kw)
+        te = time.time()
+
+        #print '%r (%r, %r) %2.2f sec' % \
+        #      (method.__name__, args, kw, te-ts)
+        return (result, te-ts)
+
+    return timeit
+
+@timed
+def check_dns(ip, protocol='udp', hostname="www.yahoo.com"):
+    """
+        Look up the A record of `hostname` through the nameserver at `ip`.
+
+        ip       -- address of the nameserver to query
+        protocol -- transport handed to DNS.Request.req() ('udp' or 'tcp'
+                    in this script)
+        hostname -- name to resolve
+
+        Returns "OK" when the request completes and "Error: <reason>" when
+        it fails.  Because of @timed, callers receive (status, elapsed).
+    """
+    try:
+        #ip = ip[:-1] + "0"
+        ro = DNS.Request(name=hostname, qtype="A", server=ip)
+        # only success or failure matters; the answer itself is not used
+        ro.req(protocol=protocol)
+        r = "OK"
+    except (DNS.Base.DNSError, socket.error), e:
+        # NOTE(review): socket-level failures (e.g. a refused TCP connection)
+        # may surface as socket.error rather than DNSError; report them as
+        # errors too so one dead server does not abort the remaining checks.
+        r = "Error: %s" % e
+    return r
+        
+def get_nameserver_ips(filename):
+    """
+        Collect the IPv4-looking addresses listed in a resolver config file.
+
+        Files whose name contains 'resolv' are scanned field by field
+        (whitespace separated); files whose name contains 'ifcfg' are
+        scanned for lines mentioning DNS, split on '='.  Comment lines are
+        skipped so that commented-out servers are not reported.
+
+        Returns a dict mapping each address string to 0.  The dict is empty
+        when the file does not exist or its name matches neither kind.
+    """
+    ip_re = re.compile(r"\d+\.\d+\.\d+\.\d+")
+    ret = {}
+    if not os.path.exists(filename):
+        return ret
+
+    f = open(filename, 'r')
+    try:
+        if 'resolv' in filename:
+            for l in f:
+                l = l.strip()
+                # resolv.conf comments start with '#' or ';'
+                if l.startswith('#') or l.startswith(';'):
+                    continue
+                for field in l.split():
+                    if ip_re.match(field) and field not in ret:
+                        ret[field] = 0
+
+        if 'ifcfg' in filename:
+            for l in f:
+                l = l.strip()
+                if 'DNS' not in l or l.startswith('#'):
+                    continue
+                for field in l.split('='):
+                    field = field.replace('"', '')
+                    field = field.replace("'", '')
+                    if ip_re.match(field) and field not in ret:
+                        ret[field] = 0
+    finally:
+        # always release the file handle, even if reading fails
+        f.close()
+    return ret
+
+def _probe_and_record(ips, udp_template, tcp_template):
+    # Probe every nameserver in `ips` over UDP and then TCP, and record the
+    # elapsed time (or -1 on error) in the history file numbered by the
+    # server's position in ips.keys().
+    for (n, ip) in enumerate(ips.keys()):
+        for (proto, template) in (('udp', udp_template), ('tcp', tcp_template)):
+            (status, elapsed) = check_dns(ip, proto)
+            if "Error" in status:
+                elapsed = -1
+            record_status_record(template % n, elapsed)
+
+def main():
+    """
+        Probe the nameservers configured for the root context and for the
+        princeton_comon slice, record the timings in the dns_history_*.dat
+        files, and print a one-line summary: whether both configurations
+        list the same servers, the UDP success ratios of the first two root
+        and slice servers, and the DNSFail line (if any) fetched from
+        http://localhost:3121.
+    """
+    root_ips  = get_nameserver_ips('/etc/resolv.conf')
+    slice_ips = get_nameserver_ips( '/vservers/princeton_comon/etc/resolv.conf')
+
+    _probe_and_record(root_ips, "dns_history_root_udp%s.dat", "dns_history_root_tcp%s.dat")
+    _probe_and_record(slice_ips, "dns_history_slice_udp%s.dat", "dns_history_slice_tcp%s.dat")
+
+    if set(root_ips.keys()) != set(slice_ips.keys()):
+        print "CONF-ROOT_SLICE-MISMATCH",
+        #if set(root_ips.keys()) != set(slice_ips.keys()):
+        #if set(root_ips.keys()) != set(ifcfg_ips.keys()) and len(set(ifcfg_ips.keys())) > 0:
+        #    print "CONF-IFCFG_ROOT-MISMATCH",
+    else:
+        print "CONF-ROOT_SLICE-MATCH",
+
+    for history_file in ('dns_history_root_udp0.dat',
+                         'dns_history_root_udp1.dat',
+                         'dns_history_slice_udp0.dat',
+                         'dns_history_slice_udp1.dat'):
+        print get_success_ratio(history_file),
+
+    c_dns = os.popen("curl -s http://localhost:3121 | grep -a DNSFail").read().strip()
+    # keep only what follows the first 9 characters of the matching line
+    if len(c_dns) > 9 and "DNS" in c_dns:
+        summary = "cm " + c_dns[9:]
+    else:
+        summary = ""
+    print summary,
+
+    print ""
+
+
+# Run the checks only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
+
+
+# TODO: comon?
+#url = """http://comon.cs.princeton.edu/status/tabulator.cgi?table=table_nodeviewshort&select='dns1udp>80 && dns2udp>80&&name=="%s"'&format=formatcsv&dumpcols='dns1udp,dns1tcp,dns2udp,dns2tcp'""" % os.popen("hostname").read().strip()