#
# This script is a nagios plugin that checks a (planetlab) host
#

# name this script was invoked under, shown in the usage and version messages
command = sys.argv[0]
revision = "$Revision: 1.3 $"

#nagios_plugins_dir="/usr/lib/nagios/plugins"

# getopt specifications : -k/--key and -t/--time-out expect a value
options = 'vhnk:t:'
long_opts = ['version', 'help', 'no-comon', 'key=', 'time-out=']


usage_string = """Usage : %s [-n] [-k private_key] [-t timeout] nodename
Revision %s
This nagios plugin checks for a given (planetlab) host
The regular approach is to
* First try to reach the comon query interface.
  default host is %s
  this can be overridden with the --host option (NIY)
TODO : prevent this stage if a 'none' host is provided
* If we cannot conclude from this, we then try and reach
  the comon sensor on the node itself on port %d,
TODO : skip on -n option
* Then, if an ssh private key is provided with the -k option,
  we try to enter the node as root and check that
  the pl_conf slice is up and running on the node
TODO : do this only if the -k option is provided
* if none of this is conclusive we just check for the ssh server on the node
  with the standard ssh plugin
TODO : probably this should be left to the nagios config
""" % (command, revision, comon_query.SERVER, comon_sensor.PORT)


def usage():
    """Print the usage message and terminate the process with exit code 1."""
    print(usage_string)
    sys.exit(1)


####################
def main():
    """Parse the command line, run the checks and return the resulting status.

    The central comon query interface is always asked first.  When it cannot
    conclude (nagios.UNKNOWN) and -n/--no-comon was not given, the comon
    sensor on the node itself is tried, using the -t/--time-out value.

    Returns the status produced by comon_query.check or comon_sensor.check.
    Exits through usage() on a malformed command line.
    """
    try:
        opts, args = getopt.getopt(sys.argv[1:], options, long_opts)
    except getopt.GetoptError:
        print("Unknown option")
        usage()

    opt_comon = True
    # parsed but not used yet, see the TODO about -k in usage_string
    opt_key = None
    opt_timeout = 10

    for o, a in opts:
        if o in ('-v', '--version'):
            print("%s -- %s" % (command, revision))
            # NOTE(review): 3 is presumably the nagios UNKNOWN code - confirm
            sys.exit(3)
        elif o in ('-h', '--help'):
            usage()
        elif o in ('-n', '--no-comon'):
            opt_comon = False
        elif o in ('-k', '--key'):
            opt_key = a
        elif o in ('-t', '--time-out'):
            # a non-numeric value is a usage error, not a reason to crash
            # with a traceback
            try:
                opt_timeout = int(a)
            except ValueError:
                print("Invalid timeout %r, expected a number of seconds" % a)
                usage()
        else:
            print("Unknown option %s" % o)
            usage()

    if len(args) != 1:
        usage()
    nodename = args[0]

    status = comon_query.check(nodename)
    if status != nagios.UNKNOWN:
        return status

    if opt_comon:
        status = comon_sensor.check(nodename, opt_timeout)

    return status


if __name__ == '__main__':
    sys.exit(main())
# default server
SERVER = 'summer.cs.princeton.edu'


class NodeUnknownException(Exception):
    """Raised by query_node when comon has no entry for the queried address.

    This used to be a plain string; string exceptions can no longer be
    raised or caught, hence the class.
    """


####################
def check(node, server=None):
    """Check a node through the comon central query interface.

    node   -- host name of the node to check
    server -- comon server to query, defaults to SERVER

    Returns the status computed by interpret(), or nagios.UNKNOWN when the
    node is unknown to comon or when anything goes wrong on the way.
    """
    if server is None:
        server = SERVER

    try:
        # compute node IP number
        node_address = compute_address(node)
        # build URL, connect and return attributes dict
        node_dict = query_node(server, node_address)
        # interpret first : interpret() prints the human-readable verdict,
        # which this way stays the first line of output
        status = interpret(node_dict)
        for key in node_dict:
            print("%s %s" % (key, node_dict[key]))
        return status

    except NodeUnknownException:
        return nagios.UNKNOWN
    except Exception as exc:
        print("comon_query.check got exception %s" % exc)
        return nagios.UNKNOWN


##########
re_dec = "([0-9]{1,3})"
re_ipsep = r"\."
re_ip = (re_dec + re_ipsep) * 3 + re_dec
ma_ip = re.compile(re_ip)


def compute_address(nodename):
    """Resolve nodename and return its IPv4 address as a single integer.

    The four dotted-quad bytes a.b.c.d are folded, most significant first,
    into ((a*256 + b)*256 + c)*256 + d.

    Raises ValueError if the resolved address is not a dotted quad; errors
    raised by socket.gethostbyname are propagated unchanged.
    """
    ip = socket.gethostbyname(nodename)
    matched = ma_ip.match(ip)
    if matched is None:
        raise ValueError("unexpected address %r for node %s" % (ip, nodename))
    address = 0
    for byte in matched.groups():
        address = address * 256 + int(byte)
    return address


##########
URL_FORMAT = "http://%s/status/tabulator.cgi"
ARGS_FORMAT = "table=table_nodeviewshort&select='%s'"
FILTER_FORMAT = "address==%d"
CSV_FORMAT = "&format=formatcsv"


def filter_address(address):
    """Return the comon select expression matching an integer address."""
    return FILTER_FORMAT % address


def filter_node(nodename):
    """Return the comon select expression matching a node given by name."""
    return filter_address(compute_address(nodename))


def full_url(server, filter):
    """Return the tabulator URL on server for the given select expression."""
    return URL_FORMAT % server + '?' + ARGS_FORMAT % filter


def full_url_csv(server, filter):
    """Same as full_url, but asking for the reply in CSV format."""
    return full_url(server, filter) + CSV_FORMAT


# mention field here means we'll parse it and keep it
# see _parse_reply below
FIELDS_FOCUS = {
    'resptime': 'float',
    'sshstatus': 'int',
    'bootstate': 'string',
    }

NOTHING_MATCHED = 'nothing matched select statement'


##########
def _parse_reply(header_line, values_line):
    """Pair CSV column names with their values, keeping FIELDS_FOCUS only.

    Each kept value is converted as FIELDS_FOCUS says; a value that cannot
    be converted raises ValueError.  Columns without a value are ignored.
    """
    headers = [name.strip() for name in header_line.split(',')]
    values = values_line.split(',')

    fields = {}
    for key, val in zip(headers, values):
        kind = FIELDS_FOCUS.get(key)
        if kind == 'float':
            fields[key] = float(val)
        elif kind == 'int':
            fields[key] = int(val)
        elif kind == 'string':
            fields[key] = val.strip()
    return fields


def query_node(server, address):
    """Ask the comon server about one address and return the parsed fields.

    server  -- comon server host name
    address -- integer address as computed by compute_address

    Returns a dict holding the FIELDS_FOCUS columns found in the reply.
    Raises NodeUnknownException when comon answers that nothing matched.
    """
    url = full_url_csv(server, filter_address(address))
    req = urllib2.urlopen(url)
    try:
        # let's parse this manually : first line is the column names,
        # second line the values for the selected node
        header_line = req.readline()
        values_line = req.readline()
    finally:
        # do not leak the connection, whatever happens while reading
        req.close()

    # handle the case where the node is unknown to comon
    if NOTHING_MATCHED in values_line:
        raise NodeUnknownException("NodeUnknownByComon")

    return _parse_reply(header_line, values_line)
##########
def split_number(n, tuple):
    """Decompose a number along a sequence of units.

    Each unit in turn takes as many whole multiples as fit in what is left
    of n; whatever remains after the last unit is dropped.  Units are
    expected in decreasing order.

    e.g. 1223456, (10000, 500, 10) -> (122, 6, 45)
    coz 1223456 = 122*10000 + 6*500 + 45*10 + 6

    Returns a tuple with one count per unit.
    """
    result = ()
    for base in tuple:
        # divmod keeps the counts integral whatever the division semantics
        # in force for '/'
        count, n = divmod(n, base)
        result += (count,)
    return result


### durations, in seconds
MINUTE = 60
HOUR = 60 * MINUTE
DAY = HOUR * 24
WEEK = DAY * 7
MONTH = DAY * 30


def seconds_printable(seconds):
    """From a delay in seconds, return a human-readable string.

    Only the most significant units are shown, e.g. 3725 gives
    "1 hour(s), 2 minute(s)"; anything below one minute gives "0 minute(s)".
    """
    month, week, day, hour, minute = split_number(
        seconds, (MONTH, WEEK, DAY, HOUR, MINUTE))
    if month != 0:
        return "%d month(s), %d weeks, %d day(s)" % (month, week, day)
    if week != 0:
        return "%d weeks, %d day(s), %d hour(s)" % (week, day, hour)
    if day != 0:
        return "%d day(s), %d hour(s)" % (day, hour)
    if hour != 0:
        return "%d hour(s), %d minute(s)" % (hour, minute)
    return "%d minute(s)" % minute


##########
def interpret(dict):
    """Turn the fields retrieved from comon into a nagios status.

    dict -- fields as returned by query_node; 'sshstatus' is always read,
            'resptime' only when sshstatus is 0

    A non-zero sshstatus is printed as a duration : nagios.KO from ten
    minutes on, nagios.WARNING below.  Otherwise the response time decides :
    nagios.KO from 10 s, nagios.WARNING from 5 s, nagios.OK below.
    Raises KeyError when a needed field is missing.
    """
    sshstatus = dict['sshstatus']
    if sshstatus != 0:
        print('No response to comon/ssh for %s' % seconds_printable(sshstatus))
        if sshstatus >= 10 * MINUTE:
            return nagios.KO
        return nagios.WARNING

    # ssh is fine, let's focus on resptime
    resptime = dict['resptime']
    print("Response time as measured by comon = %.2f s" % resptime)
    if resptime >= 10.0:
        return nagios.KO
    if resptime >= 5.0:
        return nagios.WARNING
    return nagios.OK


#################### quick test
def usage():
    """Print the usage of the quick test and exit with code 1."""
    print("Usage comon_query.py node")
    sys.exit(1)


if __name__ == '__main__':
    if len(sys.argv) != 2:
        usage()
    print('comon_query.check would return %d' % check(sys.argv[1]))
# default port number
PORT = 3121
# default timeout
TIMEOUT = 10


# exception raised when timeout occurs
class TimeoutException(Exception):
    """Raised by alarm_handler when the alarm set by read_data goes off.

    This used to be a plain string; string exceptions can no longer be
    raised or caught, hence the class.
    """


#################### comon-based acquisition
def check(hostname, timeout=None):
    """Check a node by reading the comon sensor that runs on it.

    hostname -- node to connect to
    timeout  -- seconds allowed for talking to the sensor, defaults to TIMEOUT

    Returns the status computed by interpret(), or nagios.UNKNOWN when the
    timeout expires or when connecting, parsing or interpreting fails.
    """
    if timeout is None:
        timeout = TIMEOUT

    try:
        # connect to comon and read data
        page = read_data(hostname, timeout)
        # parse lines, keep only relevant stuff and refine parsing
        fields = filter(parse_data(page))
        # make decision
        return interpret(fields)

    except TimeoutException:
        print("While connecting to comon sensor : timeout expired %d s" % timeout)
        ### XXX - in some cases this is a KO, but in general
        # maybe comon does not run on these nodes
        return nagios.UNKNOWN
    except Exception as exc:
        # same policy as comon_query.check : report and give up, rather
        # than die with a traceback
        print("comon_sensor.check got exception %s" % exc)
        return nagios.UNKNOWN


### implement timeout as an alarm signal
def alarm_handler(s, closure):
    """Signal handler : raise TimeoutException when SIGALRM is received."""
    if s == signal.SIGALRM:
        raise TimeoutException("Timeout")
    else:
        print("unexpected signal %d in alarm_handler" % s)


###
# dont use httplib nor urllib2
# the server side replies its data even before you send a GET request
# with urllib2 you basically get a 'Connection reset by peer' error
def read_data(hostname, timeout, port=None):
    """Connect to the sensor and return everything it sends, as a list of lines.

    hostname -- node to connect to
    timeout  -- seconds allowed for connecting and reading altogether
    port     -- sensor port, defaults to PORT

    Lines are returned without their trailing newline; data after the last
    newline, if any, is returned as a last line.
    Raises TimeoutException when the alarm goes off before the peer closes
    the connection; socket errors are propagated unchanged.
    """
    if port is None:
        port = PORT

    signal.signal(signal.SIGALRM, alarm_handler)
    signal.alarm(timeout)
    s = None
    chunks = []
    try:
        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        s.connect((hostname, port))
        # the alarm stays armed while reading, so that a peer that accepts
        # the connection and then stalls cannot block us forever
        while True:
            chunk = s.recv(4096)
            if not chunk:
                break
            chunks.append(chunk)
    finally:
        signal.alarm(0)
        if s is not None:
            s.close()

    data = b"".join(chunks)
    if not isinstance(data, str):
        data = data.decode('ascii', 'replace')
    lines = data.split('\n')
    # a reply ending with a newline leaves an empty last item behind
    if lines and lines[-1] == '':
        lines.pop()
    return lines


### pattern for interpreting sensor output
re_line = r"^([a-zA-Z0-9_]+): (.*)$"
ma_line = re.compile(re_line)


def parse_data(page):
    """Store the 'key: value' lines of page in a dict, as raw strings.

    Lines that do not match the pattern are ignored; when a key shows up
    several times the last value wins.
    """
    parsed = {}
    for line in page:
        matched = ma_line.match(line)
        if matched:
            key, val = matched.groups()
            parsed[key] = val
    return parsed


FIELDS_FOCUS = {
    'Loads': 'floats',
    'VMStat': 'ints',
    'CPUUse': 'ints',
    'MemInfo': 'floats',
    'Date': 'floats',
    'DfDot': 'percent-floats',
    'LastSsh': 'ints',
    }


def filter(dict):
    """Keep the FIELDS_FOCUS entries of dict and convert them to numbers.

    dict -- raw strings as returned by parse_data

    Values are split on whitespace and converted as FIELDS_FOCUS says; for
    'percent-floats' the first item loses its '%' sign first.  A value
    holding a single number is returned as that number, anything else as a
    list.  Raises ValueError on an item that is not a number.
    """
    filtered = {}
    for key, raw in dict.items():
        kind = FIELDS_FOCUS.get(key)
        if kind is None:
            continue
        # split() rather than split(' ') : repeated or leading blanks must
        # not produce empty items
        items = raw.split()
        if kind == 'ints':
            numbers = [int(item) for item in items]
        elif kind == 'floats':
            numbers = [float(item) for item in items]
        elif kind == 'percent-floats':
            if items:
                items[0] = items[0].replace('%', '')
            numbers = [float(item) for item in items]
        else:
            continue
        # simpler access to single-fields
        if len(numbers) == 1:
            filtered[key] = numbers[0]
        else:
            filtered[key] = numbers
    return filtered


def interpret(dict):
    """Make a decision from the filtered sensor fields.

    Prints the difference between the 'Date' and 'LastSsh' fields and
    returns nagios.OK; no threshold is applied to that delay so far.
    Raises KeyError when one of the two fields is missing.
    """
    status = nagios.OK

    ### check ssh status
    ssh_delay = dict['Date'] - dict['LastSsh']
    print(ssh_delay)

    ###
    return status


###
def usage():
    """Print the usage of the quick test and exit with code 1."""
    print("Usage comon_sensor.py node timeout")
    sys.exit(1)


if __name__ == '__main__':
    if len(sys.argv) != 3:
        usage()
    check(sys.argv[1], int(sys.argv[2]))