X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=comon.py;h=8d96e1697665920b38ffc1ea79405e1192113b1a;hb=refs%2Fheads%2F1.0;hp=ad4035fe354346be918b92f6feed009dc72dd1e7;hpb=55ec8f3eb860029ecf552d8c0a3cc6dbd2f66b68;p=monitor.git diff --git a/comon.py b/comon.py index ad4035f..8d96e16 100755 --- a/comon.py +++ b/comon.py @@ -3,7 +3,7 @@ # # Faiyaz Ahmed # -# $Id: $ +# $Id: comon.py,v 1.7 2007/07/03 19:59:02 soltesz Exp $ # # Get CoMon data, unsorted, in CSV, and create a huge hash. # @@ -14,6 +14,7 @@ import httplib import time import Queue import logging +import pickle from threading import * #httplib.HTTPConnection.debuglevel = 1 @@ -25,61 +26,148 @@ COSLEEP=1200 # CoMon COMONURL = "http://summer.cs.princeton.edu/status/tabulator.cgi?table=table_nodeview" +# node type: +# null == +# 0 == +# 1 == Prod +# 2 == alpha +# 3 == beta + +# boot state: +# 0 == new +# 1 == boot +# 2 == dbg +# 3 == rins +# 4 == ins + +def _tohash(rawdata): + # First line Comon returns is list of keys with respect to index + try: + keys = rawdata.readline().rstrip().split(", ") + l_host = [] + hash = {} + i_ignored = 0 + for line in rawdata.readlines(): + l_host = line.rstrip().split(", ") # split the line on ', ' + hostname = l_host[0] + hash[hostname] = {} + for i in range(1,len(keys)): + hash[hostname][keys[i]]=l_host[i] + + except Exception, err: + logger.debug("No hosts retrieved") + return {} + return hash + +def comonget(url): + rawdata = None + print "Getting: %s" % url + try: + coserv = urllib2.Request(url) + coserv.add_header('User-Agent', 'PL_Monitor +http://monitor.planet-lab.org/') + opener = urllib2.build_opener() + # Initial web get from summer.cs in CSV + rawdata = opener.open(coserv) + except urllib2.URLError, (err): + print "Attempting %s" %COMONURL + print "URL error (%s)" % (err) + rawdata = None + return _tohash(rawdata) + class Comon(Thread): """ + cdb is the comon database (dictionary) + all buckets is a queue of all problem nodes. This gets sent to rt to find + tickets open for host. """ - def __init__(self, cdb, allbuckets): + def __init__(self, cdb=None, d_allplc_nodes=None, q_allbuckets=None): + + self.accept_all_nodes = False + + if cdb == None: + cdb = {} + if d_allplc_nodes == None: + self.accept_all_nodes = True # TODO :get from plc. + self.codata = cdb + self.d_allplc_nodes = d_allplc_nodes self.updated = time.time() - self.allbuckets = allbuckets - self.comonbkts = {"ssh": "sshstatus%20%3E%202h", - "clock_drift": "drift%20%3E%201m", - "dns": "dns1udp%20%3E%2080%20&&%20dns2udp%20%3E%2080", - "disk": "resptime%20%3E%200%20&&%20gbfree%20%3C%205", - "filerw": "filerw%3E0"} + self.q_allbuckets = q_allbuckets + #self.comon_buckets = {"down" : "resptime%20==%200%20&&%20keyok==null", + # "ssh": "sshstatus%20%3E%202h", + # "clock_drift": "drift%20%3E%201m", + # "dns": "dns1udp%20%3E%2080%20&&%20dns2udp%20%3E%2080", + # "filerw": "filerw%3E0", + # "dbg" : "keyok==0"} + self.comon_buckets = { + #"down" : "resptime==0&&keyok==null", + #"ssh": "sshstatus > 2h", + #"clock_drift": "drift > 1m", + #"dns": "dns1udp>80 && dns2udp>80", + #"filerw": "filerw > 0", + #"all" : "" + "dbg" : "keyok==0", + } Thread.__init__(self) def __tohash(self,rawdata): # First line Comon returns is list of keys with respect to index keys = rawdata.readline().rstrip().split(", ") - host = [] + l_host = [] hash = {} try: + i_ignored = 0 for line in rawdata.readlines(): - host = line.rstrip().split(", ") - tmp = {} - for i in range(1,len(keys)): - tmp[keys[i]]=host[i] - hash[host[0]]=tmp + l_host = line.rstrip().split(", ") # split the line on ', ' + hostname = l_host[0] + add = False + if self.accept_all_nodes: + add=True + else: + if hostname in self.d_allplc_nodes: # then we'll track it + add = True + + if add: + hash[hostname] = {} + for i in range(1,len(keys)): + hash[hostname][keys[i]]=l_host[i] + else: + i_ignored += 1 + + print "Retrieved %s hosts" % len(hash.keys()) + print "Ignoring %d hosts" % i_ignored + logger.debug("Retrieved %s hosts" % len(hash.keys())) + logger.debug("Ignoring %d hosts" % i_ignored) except Exception, err: logger.debug("No hosts retrieved") return {} return hash # Update individual buckekts. Hostnames only. - def updatebkts(self): - for (bkt,url) in self.comonbkts.items(): + def updatebuckets(self): + for (bucket,url) in self.comon_buckets.items(): + logger.debug("COMON: Updating bucket %s" % bucket) tmp = self.coget(COMONURL + "&format=formatcsv&select='" + url + "'").keys() - setattr(self, bkt, tmp) + setattr(self, bucket, tmp) # Update ALL node information def updatedb(self): # Get time of update self.updated = time.time() # Make a Hash, put in self. - self.codata = self.coget(COMONURL + "&format=formatcsv") + self.codata.update(self.coget(COMONURL + "&format=formatcsv")) def coget(self,url): rawdata = None + print "Getting: %s" % url try: - logger.debug("Trying - " + url) coserv = urllib2.Request(url) coserv.add_header('User-Agent', - 'PL_Monitor +http://monitor.planet-lab.org/') + 'PL_Monitor +http://monitor.planet-lab.org/') opener = urllib2.build_opener() - # Initial web get from summer.cs in CSV + # Initial web get from summer.cs in CSV rawdata = opener.open(coserv) except urllib2.URLError, (err): print "Attempting %s" %COMONURL @@ -87,48 +175,90 @@ class Comon(Thread): rawdata = None return self.__tohash(rawdata) - # Push nodes that are bad (in *a* bucket) into q(allbuckets) + # Push nodes that are bad (in *a* bucket) into q(q_allbuckets) def push(self): - for bucket in self.comonbkts.keys(): + #buckets_per_node = [] + #for bucket in self.comon.comon_buckets.keys(): + # if (hostname in getattr(self.comon, bucket)): + # buckets_per_node.append(bucket) + + #loginbase = self.plcdb_hn2lb[hostname] # plc.siteId(node) + + #if not loginbase in self.sickdb: + # self.sickdb[loginbase] = [{hostname: buckets_per_node}] + #else: + # self.sickdb[loginbase].append({hostname: buckets_per_node}) + + + print "calling Comon.push()" + for bucket in self.comon_buckets.keys(): + #print "bucket: %s" % bucket for host in getattr(self,bucket): - self.allbuckets.put(host) + diag_node = {} + diag_node['nodename'] = host + diag_node['message'] = None + diag_node['bucket'] = [bucket] + diag_node['stage'] = "" + #diag_node['ticket_id'] = "" + diag_node['args'] = None + diag_node['info'] = None + diag_node['time'] = time.time() + #print "host: %s" % host + self.q_allbuckets.put(diag_node) def run(self): - while 1: - self.updatedb() - self.updatebkts() - self.push() - time.sleep(COSLEEP) + self.updatedb() + self.updatebuckets() + self.push() + # insert signal that this is the final host + self.q_allbuckets.put("None") def __repr__(self): return self def main(): - logger.setLevel(logging.DEBUG) - ch = logging.StreamHandler() - ch.setLevel(logging.DEBUG) - formatter = logging.Formatter('%(message)s') - ch.setFormatter(formatter) - logger.addHandler(ch) + logger.setLevel(logging.DEBUG) + ch = logging.StreamHandler() + ch.setLevel(logging.DEBUG) + formatter = logging.Formatter('%(message)s') + ch.setFormatter(formatter) + logger.addHandler(ch) t = Queue.Queue() cdb = {} a = Comon(cdb,t) + #for i in a.comon_buckets: print "%s : %s" % ( i, a.comon_buckets[i]) a.start() - time.sleep(3) - print a.ssh + + time.sleep(5) + #for i in a.down: print i + + time.sleep(5) + #print cdb + for host in cdb.keys(): + #if cdb[host]['keyok'] == "0": + # null implies that it may not be in PL DB. + if cdb[host]['bootstate'] != "null" and \ + cdb[host]['bootstate'] == "2" and \ + cdb[host]['keyok'] == "0": + print("%-40s \t Bootstate %s nodetype %s kernver %s keyok %s" % ( + host, cdb[host]['bootstate'], cdb[host]['nodetype'], + cdb[host]['kernver'], cdb[host]['keyok'])) + # else: + # print("key mismatch at: %s" % host) + #print a.codata['michelangelo.ani.univie.ac.at'] #time.sleep(3) #a.push() #print a.filerw - print a.coget(COMONURL + "&format=formatcsv&select='" + a.comonbkts['filerw']) + #print a.coget(COMONURL + "&format=formatcsv&select='" + a.comon_buckets['filerw']) - os._exit(0) + #os._exit(0) if __name__ == '__main__': import os - try: - main() - except KeyboardInterrupt: - print "Killed. Exitting." - logger.info('Monitor Killed') - os._exit(0) + try: + main() + except KeyboardInterrupt: + print "Killed. Exitting." + logger.info('Monitor Killed') + os._exit(0)