creation
[infrastructure.git] / nagios / plugin / comon_sensor.py
diff --git a/nagios/plugin/comon_sensor.py b/nagios/plugin/comon_sensor.py
new file mode 100755 (executable)
index 0000000..610f32a
--- /dev/null
@@ -0,0 +1,142 @@
+#!/usr/bin/env python
+import re
+import nagios
+import socket
+import signal
+import string
+
+import sys
+
+# default TCP port used to reach the comon sensor
+PORT=3121
+# default timeout, in seconds (it is handed to signal.alarm)
+TIMEOUT=10
+
+# exception raised when timeout occurs
+# this used to be a plain string ("Timeout") : string exceptions cannot be
+# raised any more (TypeError since python 2.6), hence a real exception class
+class TimeoutException(Exception):
+    """Raised from the SIGALRM handler when the timeout expires."""
+    pass
+
+#################### comon-based acquisition
+def check (hostname, timeout=None):
+    """Probe the comon sensor running on hostname and return a nagios status.
+
+    hostname -- node to connect to
+    timeout  -- connection timeout in seconds; TIMEOUT is used when None
+
+    Returns whatever interpret() decides from the parsed sensor data, or
+    nagios.UNKNOWN when the connection timed out.  Only the timeout is
+    handled here : any other error (socket errors, unexpected sensor
+    output) propagates to the caller.
+    """
+
+    # identity test is the proper way to detect the 'not provided' default
+    if timeout is None:
+        timeout=TIMEOUT
+
+    try:
+        # connect to comon and read data
+        lines = read_data (hostname,timeout)
+        # parse lines and store them in a dictionary
+        raw_fields = parse_data (lines)
+        # keep only relevant stuff and refine parsing
+        focus_fields = filter (raw_fields)
+        # make decision
+        return interpret (focus_fields)
+
+    except TimeoutException:
+        print("While connecting to comon sensor : timeout expired %d s"%timeout)
+        ### XXX - in some cases this is a KO, but in general
+        # maybe comon does not run on these nodes
+        return nagios.UNKNOWN
+
+### implement timeout as an alarm signal
+def alarm_handler(s,closure):
+    """Signal handler implementing the timeout.
+
+    s       -- number of the signal being delivered
+    closure -- second argument passed by the signal module (the current
+               stack frame); unused
+
+    Raises TimeoutException when s is SIGALRM; any other signal is only
+    reported on stdout.
+    """
+    if s == signal.SIGALRM:
+        raise TimeoutException
+    else:
+        # report the actual signal number : the message used to contain
+        # a literal 's' instead of the value
+        print("unexpected signal %s in alarm_handler"%s)
+
+###
+# returns a list of lines
+# don't use httplib or urllib2 :
+# the server side sends its data even before you send a GET request,
+# so with urllib2 you basically get a 'Connection reset by peer' error
+def read_data (hostname,timeout,port=None):
+    """Connect to the comon sensor and return its output as a list of lines.
+
+    hostname -- node to connect to
+    timeout  -- seconds allowed for establishing the connection
+    port     -- TCP port of the sensor; PORT is used when None
+
+    Returns the newline-terminated lines sent by the peer, without their
+    trailing newline; a final piece of data that is not newline-terminated
+    is discarded.
+
+    Raises TimeoutException when the connection cannot be established
+    within timeout seconds; socket errors propagate unchanged.
+    """
+
+    if port is None:
+        port=PORT
+
+    signal.signal(signal.SIGALRM,alarm_handler)
+    s =socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+    try:
+        signal.alarm(timeout)
+        try:
+            s.connect((hostname,port))
+        finally:
+            # always disarm the alarm : a failed connect would otherwise
+            # leave it pending, and TimeoutException would fire later on
+            signal.alarm(0)
+
+        # NOTE: the timeout only guards the connection phase, reading
+        # itself is not bounded
+        # read by chunks until the peer closes the connection
+        chunks=[]
+        while 1:
+            chunk=s.recv(4096)
+            if not chunk:
+                break
+            chunks.append(chunk)
+    finally:
+        # do not leak the socket, whatever happens
+        s.close()
+
+    # what follows the last newline is an incomplete line : drop it
+    return "".join(chunks).split('\n')[:-1]
+
+### sensor output is made of 'key: value' lines
+re_line="^([a-zA-Z0-9_]+): (.*)$"
+ma_line=re.compile(re_line)
+
+def parse_data (page):
+    """Build a {key: value} dictionary from a list of sensor lines.
+
+    page -- iterable of lines (strings)
+
+    Lines that do not match the 'key: value' pattern are ignored, and
+    values are kept as raw strings.  When a key shows up several times,
+    the last occurrence wins.
+    """
+    matches = [ma_line.match(line) for line in page]
+    return dict([found.groups() for found in matches if found])
+        
+# fields of interest in the sensor output, and how their value is parsed
+#   'ints'           : whitespace-separated integers
+#   'floats'         : whitespace-separated floats
+#   'percent-floats' : like 'floats', with a '%' sign in the first field
+FIELDS_FOCUS = {
+    'Loads':'floats',
+    'VMStat':'ints',
+    'CPUUse':'ints',
+    'MemInfo':'floats',
+    'Date':'floats',
+    'DfDot':'percent-floats',
+    'LastSsh':'ints',
+    }
+
+
+def filter (dict):
+    """Keep the fields listed in FIELDS_FOCUS and convert their values.
+
+    dict -- raw {key: string} dictionary
+
+    Returns a new dictionary holding only the keys found in FIELDS_FOCUS.
+    Each value is a list of numbers, or the number itself when the field
+    holds a single value (an empty 'ints' or 'floats' value gives an
+    empty list).
+
+    Raises ValueError when a field cannot be converted to a number, and
+    IndexError on an empty 'percent-floats' value.
+    """
+
+    filtered = {}
+    for key in dict:
+        if key not in FIELDS_FOCUS:
+            continue
+        kind=FIELDS_FOCUS[key]
+        # split on any run of whitespace for all formats, so that repeated,
+        # leading or trailing separators do not produce empty fields
+        fields=dict[key].split()
+        if kind == 'ints':
+            numbers=[int(field) for field in fields]
+        elif kind == 'floats':
+            numbers=[float(field) for field in fields]
+        elif kind == 'percent-floats':
+            # only the first field carries the '%' sign
+            fields[0]=fields[0].replace('%','')
+            numbers=[float(field) for field in fields]
+        else:
+            raise ValueError("unknown format %s for field %s"%(kind,key))
+        # simpler access to single-fields
+        if len(numbers)==1:
+            filtered[key]=numbers[0]
+        else:
+            filtered[key]=numbers
+    return filtered
+
+def interpret (dict):
+    """Decide on a nagios status from the filtered sensor data.
+
+    dict -- dictionary holding at least the 'Date' and 'LastSsh' entries
+            (KeyError otherwise)
+
+    For now this only prints the difference between 'Date' and 'LastSsh',
+    and always returns nagios.OK.
+    """
+
+    ### check ssh status
+    date = dict['Date']
+    last_ssh = dict['LastSsh']
+    print(date - last_ssh)
+
+    ###
+    return nagios.OK
+
+###
+def usage():
+    """Print the command-line synopsis on stdout and exit with status 1."""
+    print("Usage comon_sensor.py node timeout")
+    raise SystemExit(1)
+
+if __name__=='__main__':
+    # exactly two arguments are expected : node timeout
+    arguments = sys.argv[1:]
+    if len(arguments) != 2:
+        usage()
+    node, timeout = arguments
+    # NOTE(review): the status returned by check() is dropped, so the exit
+    # code of the script does not reflect it - confirm this is intended
+    check(node, int(timeout))