+ monitor.py -- modified the following three to use record-based events,
[monitor.git] / comon.py
1 #
2 # Copyright (c) 2004  The Trustees of Princeton University (Trustees).
3 #
4 # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
5 #
6 # $Id: comon.py,v 1.5 2007/05/16 01:53:46 faiyaza Exp $
7 #
8 # Get CoMon data, unsorted, in CSV, and create a huge hash.
9 #
10
11
12 import urllib2
13 import httplib
14 import time
15 import Queue 
16 import logging
17 import pickle
18 from threading import *
19 #httplib.HTTPConnection.debuglevel = 1  
20
21 logger = logging.getLogger("monitor")
22
23 # Time between comon refresh
24 COSLEEP=1200
25
26 # CoMon
27 COMONURL = "http://summer.cs.princeton.edu/status/tabulator.cgi?table=table_nodeview"
28
29 # node type:
30 # null == <not in DB?>
31 #        0 == 
32 #        1 == Prod
33 #        2 == alpha
34 #        3 == beta
35
36 # boot state:
37 #       0 == new
38 #       1 == boot
39 #       2 == dbg
40 #       3 == rins
41 #       4 == ins
42
43
class Comon(Thread):
        """
        Worker thread that fetches CoMon node data (CSV) and queues problem nodes.

        cdb is the comon database (dictionary); updatedb() updates it in place
        with one entry per tracked host.
        d_allplc_nodes is a container of hostnames; only hosts found in it are
        kept when CoMon output is parsed.
        q_allbuckets is a queue of all problem nodes.  Per the original author's
        note, this gets sent to rt to find tickets open for host.
        """

        def __init__(self, cdb, d_allplc_nodes, q_allbuckets):
                Thread.__init__(self)
                self.codata = cdb
                self.d_allplc_nodes = d_allplc_nodes
                self.updated = time.time()
                self.q_allbuckets = q_allbuckets
                # Bucket name -> CoMon "select" expression.  Only "dbg" is
                # enabled; the others are kept here for reference.
                # NOTE(review): the disabled expressions contain spaces and
                # would presumably need URL-encoding before being enabled.
                self.comon_buckets = {
                        #"down" : "resptime==0 && keyok==null",
                        #"ssh": "sshstatus > 2h",
                        #"clock_drift": "drift > 1m",
                        #"dns": "dns1udp>80 && dns2udp>80",
                        #"filerw": "filerw > 0",
                        "dbg" : "keyok==0"
                        }

        def __tohash(self, rawdata):
                """
                Convert a CoMon CSV response into {hostname: {column: value}}.

                rawdata is a file-like object offering readline()/readlines(),
                or None.  The first line CoMon returns is the list of keys with
                respect to index; every following line is one host, with fields
                separated by ', '.  Hosts not found in self.d_allplc_nodes are
                counted and skipped.

                Returns {} when rawdata is None or when reading/parsing fails
                for any reason (for example a tracked row shorter than the
                header).
                """
                if rawdata is None:
                        logger.debug("No hosts retrieved")
                        return {}

                hosts = {}
                i_ignored = 0
                try:
                        keys = rawdata.readline().rstrip().split(", ")
                        for line in rawdata.readlines():
                                l_host = line.rstrip().split(", ")      # split the line on ', '
                                hostname = l_host[0]
                                if hostname not in self.d_allplc_nodes:
                                        i_ignored += 1
                                        continue
                                # Column 0 is the hostname itself, so start at 1.
                                # Indexing (rather than zip) keeps a short row an
                                # error instead of silently truncating it.
                                hosts[hostname] = dict(
                                        (keys[i], l_host[i]) for i in range(1, len(keys)))
                except Exception:
                        # Best effort: a bad response discards the whole update.
                        logger.debug("No hosts retrieved")
                        return {}

                print("Retrieved %s hosts" % len(hosts))
                print("Ignoring %d hosts" % i_ignored)

                logger.debug("Retrieved %s hosts" % len(hosts))
                logger.debug("Ignoring %d hosts" % i_ignored)
                return hosts

        # Update individual buckets.  Hostnames only.
        def updatebuckets(self):
                """
                Query CoMon once per bucket and store the matching hostnames
                as an attribute named after the bucket (e.g. self.dbg).
                """
                for (bucket, query) in self.comon_buckets.items():
                        logger.debug("COMON:  Updating bucket %s" % bucket)
                        matches = self.coget(COMONURL + "&format=formatcsv&select='" + query + "'")
                        setattr(self, bucket, list(matches.keys()))

        # Update ALL node information
        def updatedb(self):
                """Fetch the full CoMon table and merge it into self.codata."""
                # Get time of update
                self.updated = time.time()
                # Make a Hash, put in self.
                self.codata.update(self.coget(COMONURL + "&format=formatcsv"))

        def coget(self, url):
                """
                Fetch url and return the parsed result of __tohash().

                Returns {} if the request fails with urllib2.URLError.  The
                response object is always closed once it has been parsed.
                """
                print("Getting: %s" % url)
                try:
                        coserv = urllib2.Request(url)
                        coserv.add_header('User-Agent',
                                'PL_Monitor +http://monitor.planet-lab.org/')
                        opener = urllib2.build_opener()
                        # Initial web get from summer.cs in CSV
                        rawdata = opener.open(coserv)
                except urllib2.URLError as err:
                        print("Attempting %s" % COMONURL)
                        print("URL error (%s)" % (err))
                        # Nothing to parse; hand back an empty result instead
                        # of passing None on to the parser.
                        logger.debug("No hosts retrieved")
                        return {}
                try:
                        return self.__tohash(rawdata)
                finally:
                        rawdata.close()

        # Push nodes that are bad (in *a* bucket) into q(q_allbuckets)
        def push(self):
                """
                Put one record (dict) on q_allbuckets for every host in every
                bucket.  A bucket that has not been populated yet is treated
                as empty.
                """
                print("calling Comon.push()")
                for bucket in self.comon_buckets.keys():
                        for host in getattr(self, bucket, []):
                                diag_node = {
                                        'nodename': host,
                                        'message': None,
                                        'bucket': [bucket],
                                        'stage': "",
                                        'args': None,
                                        'info': None,
                                        'time': time.time(),
                                }
                                self.q_allbuckets.put(diag_node)

        def run(self):
                """Thread body: refresh the database and buckets, then queue problem nodes."""
                self.updatedb()
                self.updatebuckets()
                self.push()
                # insert signal that this is the final host
                # (the marker is the string "None", not the None object)
                self.q_allbuckets.put("None")

        def __repr__(self):
                """Return a short description; __repr__ must return a string."""
                return "<%s buckets=%r updated=%r>" % (
                        self.__class__.__name__,
                        sorted(self.comon_buckets),
                        self.updated)
167
def main():
        """
        Manual test driver: run one Comon pass and print nodes stuck in debug.

        Logs to the console at DEBUG level, starts a Comon thread, waits up to
        10 seconds for it, then prints every host whose CoMon record has
        bootstate "2" (dbg) and keyok "0".
        """
        logger.setLevel(logging.DEBUG)
        ch = logging.StreamHandler()
        ch.setLevel(logging.DEBUG)
        ch.setFormatter(logging.Formatter('%(message)s'))
        logger.addHandler(ch)

        class _AllHosts(object):
                """Stand-in node set for this driver: reports every hostname as known."""
                def __contains__(self, hostname):
                        return True

        t = Queue.Queue()
        cdb = {}
        # Comon takes (cdb, d_allplc_nodes, q_allbuckets).  There is no PLC node
        # list here, so track every host CoMon reports.
        a = Comon(cdb, _AllHosts(), t)
        a.start()

        # Wait for the fetch to finish, but no longer than 10 seconds.
        a.join(10)

        for host in cdb.keys():
                info = cdb[host]
                # bootstate "null" implies that it may not be in PL DB; such
                # hosts never match "2", so no separate check is needed.
                if info['bootstate'] == "2" and info['keyok'] == "0":
                        print("%-40s \t Bootstate %s nodetype %s kernver %s keyok %s" % (
                                host, info['bootstate'], info['nodetype'],
                                info['kernver'], info['keyok']))
# Script entry point: run the manual test driver.
if __name__ == '__main__':
        import os
        try:
                main()
        except KeyboardInterrupt:
                # Ctrl-C: report it, then leave immediately via os._exit.
                print("Killed.  Exitting.")
                logger.info('Monitor Killed')
                os._exit(0)