creation
author build <build@41d37cc5-eb28-0410-a9bf-d37491348ade>
Sun, 18 Feb 2007 15:37:23 +0000 (15:37 +0000)
committer build <build@41d37cc5-eb28-0410-a9bf-d37491348ade>
Sun, 18 Feb 2007 15:37:23 +0000 (15:37 +0000)
nagios/plugin/check_planetlab.py [new file with mode: 0755]
nagios/plugin/comon_query.py [new file with mode: 0755]
nagios/plugin/comon_sensor.py [new file with mode: 0755]
nagios/plugin/nagios.py [new symlink]

diff --git a/nagios/plugin/check_planetlab.py b/nagios/plugin/check_planetlab.py
new file mode 100755 (executable)
index 0000000..02fd473
--- /dev/null
@@ -0,0 +1,97 @@
+#!/usr/bin/env python
+
+#
+# This script is a nagios plugin that checks a (planetlab) host
+#
+
+import sys
+import getopt
+
+import nagios
+import comon_query
+import comon_sensor
+
+# how we were invoked, and our revision as maintained by the version control
+command=sys.argv[0]
+revision="$Revision: 1.3 $"
+
+#nagios_plugins_dir="/usr/lib/nagios/plugins"
+
+
+# short and long options, in getopt syntax
+options = 'vhnk:t:'
+long_opts = [ 'version' , 'help', 'no-comon' , 'key=', 'time-out=' ]
+
+
+usage_string="""Usage : %s [-v] [-h] [-n] [-k private_key] [-t timeout] nodename
+Revision %s
+This nagios plugin checks for a given (planetlab) host
+The regular approach is to
+* First try to reach the comon query interface.
+  default host is %s
+  this can be overridden with the --host option (NIY)
+TODO : prevent this stage if a 'none' host is provided
+* If we cannot conclude from this, we then try and reach
+  the comon sensor on the node itself on port %d,
+  with the connection timeout (in seconds) given by the -t option;
+  this stage is skipped with the -n option
+* Then, if an ssh private key is provided with the -k option,
+  we try to enter the node as root and check that
+  the pl_conf slice is up and running on the node
+TODO : do this only if the -k option is provided
+* if none of this is conclusive we just check for the ssh server on the node
+  with the standard ssh plugin
+TODO : probably this should be left to the nagios config
+"""%(command,revision,comon_query.SERVER,comon_sensor.PORT)
+
+def usage ():
+    """Print the usage message and exit with status 1."""
+    print usage_string
+    sys.exit(1)
+
+####################
+def main ():
+    """Parse the command line, run the checks in turn, return a nagios status.
+
+    The comon query interface is tried first ; if it cannot conclude
+    (nagios.UNKNOWN) and -n/--no-comon was not given, the comon sensor on
+    the node is tried, with the -t/--time-out value as its timeout.
+    Exits through usage() on a malformed command line.
+    """
+
+    try:
+        opts,args = getopt.getopt(sys.argv[1:], options, long_opts)
+    except getopt.GetoptError:
+        print "Unknown option"
+        usage()
+
+    opt_comon = True
+    # parsed but not used yet : the ssh-based stage is not implemented
+    opt_key = None
+    opt_timeout = 10
+
+    for o,a in opts:
+        if o in ['-v','--version']:
+            print command,'--',revision
+            sys.exit(3)
+        elif o in ['-h','--help']:
+            usage()
+        elif o in ['-n','--no-comon']:
+            opt_comon = False
+        elif o in ['-k','--key']:
+            opt_key=a
+        elif o in ['-t','--time-out']:
+            # a non-numeric value is a usage error, not a traceback
+            try:
+                opt_timeout=int(a)
+            except ValueError:
+                print "Invalid timeout",a
+                usage()
+        else:
+            print "Unknown option",o
+            usage()
+
+    if len(args) != 1:
+        usage()
+    nodename = args[0]
+
+    status = comon_query.check(nodename)
+    if status != nagios.UNKNOWN:
+        return status
+
+    if opt_comon:
+        status = comon_sensor.check(nodename,opt_timeout)
+
+    return status
+
+
+if __name__=='__main__':
+    sys.exit(main())
+        
+            
diff --git a/nagios/plugin/comon_query.py b/nagios/plugin/comon_query.py
new file mode 100755 (executable)
index 0000000..9aa2685
--- /dev/null
@@ -0,0 +1,188 @@
+#!/usr/bin/env python
+
+#
+# This module checks for a planetlab node by
+# (*) connecting to the comon central query interface
+# (*) retrieving the latest info for a given node
+#
+
+import sys
+import socket
+import re
+import string
+import urllib2
+
+import nagios
+
+# default server
+SERVER='summer.cs.princeton.edu'
+
+# raised by query_node when comon has no entry for the node ;
+# a real exception class, since string exceptions cannot be raised
+# any more starting with python 2.6
+class NodeUnknownException (Exception):
+    def __str__ (self):
+        return "NodeUnknownByComon"
+
+####################
+def check (node,server=None):
+    """Return the nagios status of node, as derived from the comon server.
+
+    node is the node hostname ; server defaults to SERVER.
+    Any failure on the way, including a node unknown to comon,
+    results in nagios.UNKNOWN.
+    """
+    comon_server = server
+    if comon_server == None:
+        comon_server = SERVER
+
+    try:
+        # the query below selects the node by its integer IP address
+        address = compute_address (node)
+        # attributes of interest, as a dictionary
+        attributes = query_node (comon_server,address)
+        for name in attributes.keys():
+            print name,attributes[name]
+        return interpret (attributes)
+    except NodeUnknownException:
+        return nagios.UNKNOWN
+    except Exception,e:
+        print "comon_query.check got exception",e
+        return nagios.UNKNOWN
+
+##########
+# regular expression for a dotted-quad IPv4 address, one group per byte
+re_dec="([0-9]{1,3})"
+re_ipsep="\."
+re_ip=re_ipsep.join([re_dec]*4)
+ma_ip = re.compile (re_ip)
+
+def compute_address (nodename):
+    """Resolve nodename and return its IPv4 address as a single integer.
+
+    The four bytes are combined most significant first,
+    e.g. 127.0.0.1 gives 2130706433.
+    """
+    dotted=socket.gethostbyname(nodename)
+    address=0
+    for byte in ma_ip.match(dotted).groups():
+        address = address*256 + int(byte)
+    return address
+
+##########
+URL_FORMAT="http://%s/status/tabulator.cgi"
+ARGS_FORMAT="table=table_nodeviewshort&select='%s'"
+FILTER_FORMAT="address==%d"
+CSV_FORMAT="&format=formatcsv"
+
+def filter_address (address):
+    """Return the comon select clause matching an integer address."""
+    return FILTER_FORMAT%address
+
+def filter_node (nodename):
+    """Return the comon select clause matching the node named nodename."""
+    address=compute_address(nodename)
+    return FILTER_FORMAT%address
+
+def full_url (server, filter):
+    """Return the tabulator URL on server for the given select clause."""
+    location=URL_FORMAT%server
+    query=ARGS_FORMAT%filter
+    return "%s?%s"%(location,query)
+
+def full_url_csv (server,filter):
+    """Same as full_url, asking for the answer in csv format."""
+    return full_url(server,filter)+CSV_FORMAT
+
+
+
+# mention field here means we'll parse it and keep it
+# see query_node below ; the value names the conversion applied
+FIELDS_FOCUS={
+    'resptime': 'float',
+    'sshstatus':'int',
+    'bootstate':'string',
+    }
+
+# what comon answers, instead of values, when no node matches the query
+NOTHING_MATCHED='nothing matched select statement'
+##########
+def query_node (server,address):
+    """Ask the comon server about the node with this integer address.
+
+    Returns a dictionary with the fields listed in FIELDS_FOCUS,
+    converted according to their declared format.
+    Raises NodeUnknownException if comon has no entry for the address ;
+    network errors and unparsable values are left to the caller.
+    """
+    select=filter_address(address)
+    url=full_url_csv(server,select)
+    req = urllib2.urlopen(url)
+    try:
+        # let's parse this manually : first line holds the column names,
+        # second line the values for our node
+        headers=[header.strip() for header in req.readline().split(',')]
+        values=req.readline()
+    finally:
+        # do not leak the connection, whatever happens
+        req.close()
+
+    # handle the case where the node is unknown to comon
+    if values.find(NOTHING_MATCHED) != -1:
+        raise NodeUnknownException
+
+    values=values.split(',')
+
+    # NOTE(review): these look like debugging traces - confirm they are
+    # wanted in the plugin output
+    print 'h',headers
+    print 'v', values
+
+    converters={'float':float, 'int':int, 'string':string.strip}
+    result={}
+    # zip stops at the shorter list, so a truncated answer cannot
+    # feed a missing value to the converters
+    for key,val in zip(headers,values):
+        format = FIELDS_FOCUS.get(key)
+        if format in converters:
+            result[key]=converters[format](val)
+
+    return result
+
+##########
+# function for decomposing a number along units
+# units are tried in turn, so they are meant to come in decreasing order
+# e.g.
+#   1223456, (10000, 500, 10)  -> (122, 6, 45)
+# coz 1223456 = 122*10000 + 6*500 + 45*10 + 6
+def split_number (n,tuple):
+    """Return, for each unit in tuple, how many times it fits in what is left of n.
+
+    The remainder below the last unit is dropped.
+    """
+    result=()
+    for base in tuple:
+        # divmod always performs a floor division, whatever '/' means
+        count,n=divmod(n,base)
+        result+=(count,)
+    return result
+
+###
+# durations in seconds ; a month is taken as 30 days
+MINUTE=60
+HOUR=60*MINUTE
+DAY=24*HOUR
+WEEK=7*DAY
+MONTH=30*DAY
+
+def seconds_printable (seconds):
+    """Return a human-readable string for a delay given in seconds.
+
+    Only the most significant units are shown.
+    """
+    units=(MONTH,WEEK,DAY,HOUR,MINUTE)
+    month,week,day,hour,minute = split_number(seconds,units)
+    if month != 0:
+        return "%d month(s), %d weeks, %d day(s)"%(month,week,day)
+    if week != 0:
+        return "%d weeks, %d day(s), %d hour(s)"%(week,day,hour)
+    if day != 0:
+        return "%d day(s), %d hour(s)"%(day,hour)
+    if hour != 0:
+        return "%d hour(s), %d minute(s)"%(hour,minute)
+    return "%d minute(s)"%(minute)
+
+##########
+def interpret (dict):
+    """Turn the fields retrieved from comon into a nagios status.
+
+    A non-null sshstatus, handled as a delay in seconds, gives
+    nagios.WARNING, or nagios.KO from 10 minutes on.
+    Otherwise resptime decides : nagios.KO from 10 s on,
+    nagios.WARNING from 5 s on, nagios.OK below.
+    """
+    ssh_delay=dict['sshstatus']
+    if ssh_delay != 0:
+        print 'No response to comon/ssh for %s'%seconds_printable(ssh_delay)
+        if ssh_delay >= 10*MINUTE:
+            return nagios.KO
+        return nagios.WARNING
+
+    # sshstatus is null, let's focus on resptime
+    delay = dict['resptime']
+    print "Response time as measured by comon = %.2f s"%delay
+    if delay >= 10.0:
+        return nagios.KO
+    if delay >= 5.0:
+        return nagios.WARNING
+    return nagios.OK
+
+#################### quick test
+def usage():
+    """Print the command-line syntax of the quick test and exit with status 1."""
+    print "Usage comon_query.py node"
+    sys.exit(1)
+
+if __name__=='__main__':
+    arguments=sys.argv[1:]
+    if len(arguments) != 1:
+        usage()
+    status=check(arguments[0])
+    print 'comon_query.check would return %d'%status
+#     print 'get >%s<',seconds_printable(int(sys.argv[1]))
diff --git a/nagios/plugin/comon_sensor.py b/nagios/plugin/comon_sensor.py
new file mode 100755 (executable)
index 0000000..610f32a
--- /dev/null
@@ -0,0 +1,142 @@
+#!/usr/bin/env python
+import re
+import nagios
+import socket
+import signal
+import string
+
+import sys
+
+# default port number
+PORT=3121
+# default timeout
+TIMEOUT=10
+
+# exception raised when timeout occurs ;
+# a real exception class, since string exceptions cannot be raised
+# any more starting with python 2.6
+class TimeoutException (Exception):
+    def __str__ (self):
+        return "Timeout"
+
+#################### comon-based acquisition
+def check (hostname, timeout=None):
+    """Return a nagios status computed from the comon sensor on hostname.
+
+    timeout, in seconds, defaults to TIMEOUT.
+    Returns nagios.UNKNOWN when the sensor cannot be reached or
+    when its output cannot be used.
+    """
+
+    if timeout is None:
+        timeout=TIMEOUT
+
+    try:
+        # connect to comon and read data
+        page = read_data (hostname,timeout)
+        # parse lines and store in dict
+        dict = parse_data (page)
+        # keep only relevant stuff and refine parsing
+        dict = filter (dict)
+        # make decision
+        return interpret (dict)
+
+    except TimeoutException:
+        print "While connecting to comon sensor : timeout expired %d s"%timeout
+        ### XXX - in some cases this is a KO, but in general
+        # maybe comon does not run on these nodes
+        return nagios.UNKNOWN
+    except Exception,e:
+        # connection refused, unresolvable name, missing or garbled fields :
+        # report a status rather than die on a traceback
+        print "comon_sensor.check got exception",e
+        return nagios.UNKNOWN
+
+### implement timeout as an alarm signal
+def alarm_handler(s,closure):
+    """Signal handler that turns SIGALRM into a TimeoutException.
+
+    s is the signal number ; closure is the second argument that the
+    signal module passes to handlers, and is not used.
+    """
+    if s == signal.SIGALRM:
+        raise TimeoutException
+    else:
+        # report which signal we got, the message used to print a literal 's'
+        print "unexpected signal %d in alarm_handler"%s
+
+###
+# returns a list of lines
+# dont use httplib nor urllib2
+# the server side replies its data even before you send a GET request
+# with urllib2 you basically get a 'Connection reset by peer' error
+def read_data (hostname,timeout,port=None):
+    """Connect to the comon sensor on hostname and return its output.
+
+    The output is returned as a list of lines, without their newline ;
+    a trailing piece that is not newline-terminated is dropped.
+    timeout, in seconds, bounds the connection and, when positive,
+    each read as well ; TimeoutException is raised when it expires.
+    port defaults to PORT.
+    """
+
+    if port is None:
+        port=PORT
+
+    s =socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+    try:
+        signal.signal(signal.SIGALRM,alarm_handler)
+        signal.alarm(timeout)
+        try:
+            s.connect((hostname,port))
+        finally:
+            # never leave the alarm pending, even if connect failed
+            signal.alarm(0)
+
+        # a sensor that stops sending must not block us forever ;
+        # a null timeout would mean non-blocking, so leave it alone
+        if timeout > 0:
+            s.settimeout(timeout)
+        chunks=[]
+        try:
+            while 1:
+                chunk=s.recv(4096)
+                if not chunk:
+                    break
+                chunks.append(chunk)
+        except socket.timeout:
+            raise TimeoutException
+    finally:
+        s.close()
+
+    # the last item is what follows the last newline : not a full line
+    return ''.join(chunks).split('\n')[:-1]
+
+### pattern for interpreting sensor output
+re_line="^([a-zA-Z0-9_]+): (.*)$"
+ma_line=re.compile(re_line)
+
+def parse_data (page):
+    """Return a dictionary of the 'key: value' lines found in page.
+
+    page is a list of lines ; lines that do not look like 'key: value'
+    are ignored, and the last occurrence of a key wins.
+    """
+    matches=[ma_line.match(line) for line in page]
+    return dict([matched.groups() for matched in matches if matched])
+        
+# the fields we keep from the sensor output, and how their value is parsed
+FIELDS_FOCUS = {
+    'Loads':'floats',
+    'VMStat':'ints',
+    'CPUUse':'ints',
+    'MemInfo':'floats',
+    'Date':'floats',
+    'DfDot':'percent-floats',
+    'LastSsh':'ints',
+    }
+
+
+def filter (dict):
+    """Keep the fields listed in FIELDS_FOCUS and convert their values.
+
+    dict maps field names to raw strings.
+    Values are split on whitespace and converted to numbers ; a field
+    holding a single number is stored as that number, otherwise as a list.
+    Fields with an empty value are left out.
+    Raises ValueError on a value that is not a number.
+    """
+
+    filtered = {}
+    for key,raw in dict.items():
+        format = FIELDS_FOCUS.get(key)
+        if format is None:
+            continue
+        # split() copes with repeated or trailing blanks, split(' ') does not
+        fields = raw.split()
+        if not fields:
+            continue
+        if format == 'ints':
+            numbers = [int(field) for field in fields]
+        elif format == 'floats':
+            numbers = [float(field) for field in fields]
+        elif format == 'percent-floats':
+            # the first field may carry a percent sign
+            fields[0] = fields[0].replace('%','')
+            numbers = [float(field) for field in fields]
+        else:
+            continue
+        # simpler access to single-fields
+        if len(numbers)==1:
+            filtered[key]=numbers[0]
+        else:
+            filtered[key]=numbers
+    return filtered
+
+def interpret (dict):
+    """Turn the fields filtered from the sensor output into a nagios status.
+
+    For now this only prints the difference between the Date and
+    LastSsh fields, and always returns nagios.OK.
+    """
+
+    ### check ssh status
+    date,last_ssh = dict['Date'],dict['LastSsh']
+    ssh_delay = date - last_ssh
+    print ssh_delay
+
+    ###
+    return nagios.OK
+
+###
+def usage():
+    """Print the command-line syntax of the quick test and exit with status 1."""
+    print "Usage comon_sensor.py node timeout"
+    sys.exit(1)
+
+if __name__=='__main__':
+    arguments=sys.argv[1:]
+    if len(arguments) != 2:
+        usage()
+    node,timeout=arguments
+    check(node,int(timeout))
diff --git a/nagios/plugin/nagios.py b/nagios/plugin/nagios.py
new file mode 120000 (symlink)
index 0000000..8433a01
--- /dev/null
@@ -0,0 +1 @@
+../configurator/nagios.py
\ No newline at end of file