--- /dev/null
+#!/usr/bin/env python
+
+#
+# This script is a nagios plugin that checks the status of a host
+#
+
+import sys
+import getopt
+
+import nagios
+import comon_query
+import comon_sensor
+
+command=sys.argv[0]
+revision="$Revision: 1.3 $"
+
+#nagios_plugins_dir="/usr/lib/nagios/plugins"
+
+
+options = 'vhnk:t:'
+long_opts = [ 'version' , 'help', 'no-comon' , 'key=', 'time-out=' ]
+
+
+usage_string="""Usage : %s [-n] [-k private_key] nodename
+Revision %s
+This nagios plugin checks for a given (planetlab) host
+The regular approach is to
+* First try to reach the comon query interface.
+ default host is %s
+ this can be overridden with the --host option (NIY)
+TODO : prevent this stage if a 'none' host is provided
+* If we cannot conclude from this, we then try and reach
+ the comon sensor on the node itself on port %d,
+TODO : skip on -n option
+* Then, if an ssh private key is provided with the -k option,
+ we try to enter the node as root and check that
+ the pl_conf slice is up and running on the node
+TODO : do this only of the -k option is provided
+* if none of this is conclusive we just check for the ssh server on the node
+ with the standard ssh plugin
+TODO : probably this should be left to the nagios config
+"""%(command,revision,comon_query.SERVER,comon_sensor.PORT)
+
+def usage ():
+ print usage_string
+ sys.exit(1)
+
+####################
+def main ():
+
+ try:
+ opts,args = getopt.getopt(sys.argv[1:], options, long_opts)
+ except getopt.GetoptError:
+ print "Unknown option"
+ usage()
+
+ opt_comon = True
+ opt_key = None
+ opt_timeout = 10
+
+ for o,a in opts:
+ if o in ['-v','--version']:
+ print command,'--',revision
+ sys.exit(3)
+ elif o in ['-h','--help']:
+ usage()
+ elif o in ['-n','--no-comon']:
+ opt_comon = False
+ elif o in ['-k','--key']:
+ opt_key=a
+ elif o in ['-t','--time-out']:
+ opt_timeout=int(a)
+ else:
+ print "Unknown option",o
+ usage()
+
+ if not len(args) == 1:
+ usage()
+ nodename = args[0]
+
+ status = nagios.UNKNOWN
+
+ status = comon_query.check(nodename)
+ if status != nagios.UNKNOWN:
+ return status
+
+ if opt_comon:
+ status = comon_sensor.check(nodename,opt_timeout)
+
+# print "status",status
+ return status
+
+
+if __name__=='__main__':
+ sys.exit(main())
+
+
--- /dev/null
+#!/usr/bin/env python
+
+#
+# This module checks for a planetlab node by
+# (*) connecting to the comon central query interface
+# (*) retrieving the latest info for a given node
+#
+
+import sys
+import socket
+import re
+import string
+import urllib2
+
+import nagios
+
+# default server
+SERVER='summer.cs.princeton.edu'
+
+NodeUnknownException="NodeUnknownByComon"
+
+####################
+def check (node,server=None):
+
+ if (server == None):
+ server = SERVER
+
+ try:
+
+ # compute node IP number
+ node_address = compute_address (node)
+# print 'int address',node_address
+ # build URL, connect and return attributes dict
+ node_dict = query_node (server,node_address)
+ for key in node_dict.keys():
+ print key,node_dict[key]
+ # interpret
+ return interpret (node_dict)
+
+ except NodeUnknownException:
+ return nagios.UNKNOWN
+ except Exception,e:
+ print "comon_query.check got exception",e
+ return nagios.UNKNOWN
+
+##########
+re_dec="([0-9]{1,3})"
+re_ipsep="\."
+re_ip=(re_dec+re_ipsep)*3+re_dec
+ma_ip = re.compile (re_ip)
+
+def compute_address (nodename):
+ ip=socket.gethostbyname(nodename)
+ ints=map(int,ma_ip.match(ip).groups())
+ res=ints[0]
+ for i in range(1,4):
+ res = (res*256)+ints[i]
+ return res
+
+##########
+URL_FORMAT="http://%s/status/tabulator.cgi"
+ARGS_FORMAT="table=table_nodeviewshort&select='%s'"
+FILTER_FORMAT="address==%d"
+CSV_FORMAT="&format=formatcsv"
+
+def filter_address (address):
+ return FILTER_FORMAT%address
+def filter_node (nodename):
+ return filter_address(compute_address(nodename))
+
+def full_url (server, filter):
+ return (URL_FORMAT%server
+ + '?'
+ + ARGS_FORMAT%filter)
+
+def full_url_csv (server,filter):
+ return full_url(server,filter)+CSV_FORMAT
+
+
+# mention field here means we'll parse it and keep it
+# see store_dict below
+FIELDS_FOCUS={
+ 'resptime': 'float',
+ 'sshstatus':'int',
+ 'bootstate':'string',
+ }
+
+NOTHING_MATCHED='nothing matched select statement'
+##########
+def query_node (server,address):
+ filter=filter_address(address)
+ full_url=full_url_csv(server,filter)
+ req = urllib2.urlopen(full_url)
+ # let's parse this manually
+ headers=map(string.strip,req.readline().split(','))
+ # handle the case where the node is unknown to comon
+ values=req.readline()
+ if values.find(NOTHING_MATCHED) != -1:
+ raise NodeUnknownException
+
+ values=values.split(',')
+
+ print 'h',headers
+ print 'v', values
+
+ dict={}
+# store_dict=lambda key,val: dict[key]=val
+# -> SyntaxError: can't assign to lambda
+ def store_dict (key,val):
+ if FIELDS_FOCUS.has_key(key):
+ format = FIELDS_FOCUS[key]
+ if format == 'float':
+ dict[key]=float(val)
+ elif format == 'int':
+ dict[key]=int(val)
+ elif format == 'string':
+ dict[key]=string.strip(val)
+ map (store_dict, headers, values)
+
+ return dict
+
+##########
+# function for decomposing a number along units
+# tuple [n] must be a multiple of tuple[n+1]
+# e.g.
+# 1223456, (10000, 500, 10) -> (122, 6, 45)
+# coz 1223456 = 122*10000 + 6*500 + 45*10 + 6
+def split_number (n,tuple):
+ result=()
+ for i in range(0,len(tuple)):
+ base=tuple[i]
+ result+=(n/base,)
+ n=n%base
+ return result
+
+###
+MINUTE=60
+HOUR=60*MINUTE
+DAY=HOUR*24
+WEEK=DAY*7
+MONTH=DAY*30
+
+# from a delay in seconds, returns a human-readable string
+def seconds_printable (seconds):
+ month,week,day,hour,minute = split_number(seconds,
+ (MONTH,WEEK,DAY,HOUR,MINUTE))
+ if month != 0:
+ return "%d month(s), %d weeks, %d day(s)"%(month,week,day)
+ elif week !=0:
+ return "%d weeks, %d day(s), %d hour(s)"%(week,day,hour)
+ elif day != 0:
+ return "%d day(s), %d hour(s)"%(day,hour)
+ elif hour != 0:
+ return "%d hour(s), %d minute(s)"%(hour,minute)
+ else:
+ return "%d minute(s)"%(minute)
+
+##########
+def interpret (dict):
+ # check sshstatus is null
+ sshstatus=dict['sshstatus']
+ if sshstatus != 0:
+ print 'No response to comon/ssh for %s'%seconds_printable(sshstatus)
+ if sshstatus >= 10*MINUTE:
+ return nagios.KO
+ else:
+ return nagios.WARNING
+ else:
+ # let's focus on resptime
+ resptime = dict['resptime']
+ print "Response time as measured by comon = %.2f s"%resptime
+ if resptime >= 10.0:
+ return nagios.KO
+ elif resptime >= 5.0:
+ return nagios.WARNING
+ else:
+ return nagios.OK
+
+#################### quick test
+def usage():
+ print "Usage comon_query.py node"
+ sys.exit(1)
+
+if __name__=='__main__':
+ if len(sys.argv) != 2:
+ usage()
+ print 'comon_query.check would return %d'%check(sys.argv[1])
+# print 'get >%s<',seconds_printable(int(sys.argv[1]))
--- /dev/null
+#!/usr/bin/env python
+import re
+import nagios
+import socket
+import signal
+import string
+
+import sys
+
+# default port number
+PORT=3121
+# default timeout
+TIMEOUT=10
+
+# exception raised when timeout occurs
+TimeoutException = "Timeout"
+
+#################### comon-based acquisition
+def check (hostname, timeout=None):
+
+# print "Entering comon_sensor::check",hostname
+
+ if (timeout == None):
+ timeout=TIMEOUT
+
+ try:
+ # connect to comon and read data
+ page = read_data (hostname,timeout)
+ # parse lines and store in dict
+ dict = parse_data (page)
+ # keep only relevant stuff and refine parsing
+ dict = filter (dict)
+ # make decision
+ return interpret (dict)
+
+ except TimeoutException:
+ print "While connecting to comon sensor : timeout expired %d s"%timeout
+ ### XXX - in some cases this is a KO, but in general
+ # maybe comon does not run on these nodes
+ return nagios.UNKNOWN
+
+### implement timeout as an alarm signal
+def alarm_handler(s,closure):
+ if s == signal.SIGALRM:
+ raise TimeoutException
+ else:
+ print "unexpected signal s in alarm_handler"
+
+###
+# returns a list of lines
+# dont use httplib nor urllib2
+# the server side replies its data even before you send a GET request
+# with urllib2 you basically get a 'Connection reset by peer' error
+def read_data (hostname,timeout,port=None):
+
+ if (port ==None):
+ port=PORT
+
+ signal.signal(signal.SIGALRM,alarm_handler)
+ signal.alarm(timeout)
+ s =socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+ s.connect((hostname,port))
+ signal.alarm(0)
+
+ lines=[]
+ line=""
+ while 1:
+ char=s.recv(1)
+ if not char:
+ break
+ elif char == '\n':
+ lines += [line]
+ line=""
+ else:
+ line+=char
+ return lines
+
+### pattern for interpreting sensor output
+re_line="^([a-zA-Z0-9_]+): (.*)$"
+ma_line=re.compile(re_line)
+
+def parse_data (page):
+ dict={}
+ for line in page:
+ matched=ma_line.match(line)
+ if matched:
+ key,val=matched.groups()
+ dict[key]=val
+ return dict
+
+FIELDS_FOCUS = {
+ 'Loads':'floats',
+ 'VMStat':'ints',
+ 'CPUUse':'ints',
+ 'MemInfo':'floats',
+ 'Date':'floats',
+ 'DfDot':'percent-floats',
+ 'LastSsh':'ints',
+ }
+
+
+def filter (dict):
+
+ filtered = {}
+ for key in dict.keys():
+ if key in FIELDS_FOCUS.keys():
+ format=FIELDS_FOCUS[key]
+ value=dict[key].rstrip()
+ if format == 'ints':
+ filtered[key]=map(int,value.split(' '))
+ elif format == 'floats':
+ filtered[key]=map(float,value.split(' '))
+ elif format == 'percent-floats':
+ fields=value.split()
+ pval=string.replace(fields[0],'%','')
+ filtered[key]=map(float,[pval]+fields[1:])
+ # simpler access to single-fields
+ value = filtered[key]
+ if len(value)==1:
+ filtered[key]=value[0]
+ return filtered
+
+def interpret (dict):
+
+ status = nagios.OK
+
+ ### check ssh status
+ ssh_delay = dict['Date']- dict['LastSsh']
+ print ssh_delay
+
+ ###
+ return status
+
+###
+def usage():
+ print "Usage comon_sensor.py node timeout"
+ sys.exit(1)
+
+if __name__=='__main__':
+ if len(sys.argv) != 3:
+ usage()
+ check(sys.argv[1],int(sys.argv[2]))
--- /dev/null
+../configurator/nagios.py
\ No newline at end of file