Massive commit. Just put all local changes into svn.
[monitor.git] / findbad.py
1 #!/usr/bin/python
2
3 import os
4 import sys
5 import string
6 import time
7
8
# CoMon query that covers all nodes and dumps the listed columns as CSV.
COMON_COTOPURL = (
        "http://summer.cs.princeton.edu/status/tabulator.cgi?"
        "table=table_nodeview&"
        "dumpcols='name,resptime,sshstatus,uptime,lastcotop'&"
        "formatcsv"
)
# Alternative ending kept for reference: "formatcsv&" followed by
# "select='lastcotop!=0'".

import threading

# Held around every PLC query made by collectPingAndSSH().
plc_lock = threading.Lock()
# Initial round number (NOTE: this name shadows the builtin round()).
round = 1
# Per-node records plus the current global round; replaced in main() by the
# cached copy when one exists.
externalState = {'round': round, 'nodes': {}}
# Running number of nodes reported on stdout.
count = 0
22
23
24 import soltesz
25 import comon
26 import threadpool
27 import syncplcdb
28
29 import plc
30 import auth
# PLC API handle built from auth.auth and auth.plc; main() uses it to look up
# node groups and their nodes.
api = plc.PLC(auth.auth, auth.plc)
32
def _classify_ssh_output(output):
        """Classify a node from the non-empty output of the SSH probe.

        output is what 'echo `uname -a ; ls /tmp/bm.log`' printed on the node.
        Returns a (category, state, probe_bootcd) tuple, where probe_bootcd
        says whether the caller should go on to read the BootCD ID.
        """
        # "bm.log" appearing in the output is taken to mean /tmp/bm.log exists.
        if "bm.log" in output:
                state = 'DEBUG'
        else:
                state = 'BOOT'

        if "2.6.17" in output or "2.6.2" in output:
                return ('ALPHA', state, True)
        if "2.6.12" in output or "2.6.10" in output:
                return ('PROD', state, True)
        if "2.4" in output:
                # 2.4 kernels are always reported as an old BootCD in DEBUG
                # state, and the BootCD ID is not read for them.
                return ('OLDBOOTCD', 'DEBUG', False)
        return ('UNKNOWN', state, True)


def _locked_plc_call(method_name, *args):
        """Call plc.<method_name>(*args) while holding plc_lock.

        Returns (True, result) on success.  If the call raises an Exception the
        traceback is printed and (False, None) is returned.  The lock is
        released on every path, including exceptions that are not caught here.
        """
        plc_lock.acquire()
        try:
                try:
                        return (True, getattr(plc, method_name)(*args))
                except Exception:
                        import traceback
                        traceback.print_exc()
                        return (False, None)
        finally:
                plc_lock.release()


def collectPingAndSSH(nodename, cohash):
        """Probe one node with ping and ssh and attach its CoMon and PLC records.

        nodename -- hostname of the node to probe.
        cohash   -- mapping indexed by hostname; the entry for nodename (if
                    any) is stored unchanged under 'comonstats'.

        Returns (nodename, values), where values is a dict with the keys
        'ping', 'ssh', 'category', 'state', 'kernel', 'bootcd', 'comonstats',
        'pcu', 'plcnode', 'plcsite' and 'checked'.  Returns (None, None) when
        either PLC query raises.
        """
        # Hosts that are not re-categorized as OLDBOOTCD for a 'v2' BootCD.
        v2_bootcd_exempt = ("planetlab1.cs.unc.edu", "planetlab2.cs.unc.edu")

        values = {}

        ### RUN PING ######################
        # NOTE(review): nodename is interpolated into a shell command line; it
        # is assumed to be a trusted hostname -- confirm against the callers.
        ping = soltesz.CMD()
        (ping_out, _ping_err) = ping.run_noexcept("ping -c 1 -q %s | grep rtt" % nodename)
        if ping_out == "":
                # No rtt summary line came back.
                values['ping'] = "NOPING"
        else:
                values['ping'] = "PING"

        ### RUN SSH ######################
        ssh = soltesz.SSH('root', nodename)
        (ssh_out, ssh_err) = ssh.run_noexcept('echo `uname -a ; ls /tmp/bm.log`')
        if ssh_out != "":
                (category, state, probe_bootcd) = _classify_ssh_output(ssh_out)
                values['ssh'] = 'SSH'
                values['category'] = category
                values['state'] = state
                values['kernel'] = ssh_out
        else:
                # An error occurred; record the error text instead of a kernel.
                probe_bootcd = False
                values['ssh'] = 'NOSSH'
                values['category'] = 'ERROR'
                values['state'] = 'DOWN'
                values['kernel'] = ssh_err.strip()

        ### GET BOOTCD ID ######################
        # Only tried for nodes that are neither 2.4 nor inaccessible.
        values['bootcd'] = ""
        if probe_bootcd:
                (bootcd_id, _bootcd_err) = ssh.run_noexcept('cat /mnt/cdrom/bootme/ID')
                if "BootCD" in bootcd_id:
                        values['bootcd'] = bootcd_id
                        # Compare hostnames by value; an identity test against
                        # a string literal would never recognise the exemptions.
                        if "v2" in bootcd_id and nodename not in v2_bootcd_exempt:
                                values['category'] = 'OLDBOOTCD'

        # TODO: get bm.log for debug nodes.
        # 'zcat /tmp/bm.log'

        ### GET COMON STATS ######################
        if nodename in cohash:
                values['comonstats'] = cohash[nodename]
        else:
                values['comonstats'] = {'resptime':  '-1',
                                        'uptime':    '-1',
                                        'sshstatus': '-1',
                                        'lastcotop': '-1'}

        ### GET PLC NODE ######################
        (ok, d_node) = _locked_plc_call('getNodes', {'hostname': nodename},
                        ['pcu_ids', 'site_id', 'last_contact', 'boot_state', 'nodegroup_ids'])
        if not ok:
                return (None, None)

        # Stays -1 when PLC returned no record; the site query is still made.
        site_id = -1
        if d_node and len(d_node) > 0:
                node = d_node[0]
                pcu = node['pcu_ids']
                if len(pcu) > 0:
                        values['pcu'] = "PCU"
                else:
                        values['pcu'] = "NOPCU"
                site_id = node['site_id']
                values['plcnode'] = {'status' : 'SUCCESS',
                                     'pcu_ids': pcu,
                                     'boot_state' : node['boot_state'],
                                     'site_id': site_id,
                                     'nodegroups' : node['nodegroup_ids'],
                                     'last_contact': node['last_contact']}
        else:
                values['pcu']     = "UNKNOWN"
                values['plcnode'] = {'status' : "GN_FAILED"}

        ### GET PLC SITE ######################
        (ok, d_site) = _locked_plc_call('getSites', {'site_id': site_id},
                        ['max_slices', 'slice_ids', 'node_ids', 'login_base'])
        if not ok:
                return (None, None)

        if d_site and len(d_site) > 0:
                site = d_site[0]
                values['plcsite'] = {'num_nodes' : len(site['node_ids']),
                                     'max_slices' : site['max_slices'],
                                     'num_slices' : len(site['slice_ids']),
                                     'login_base' : site['login_base'],
                                     'status'     : 'SUCCESS'}
        else:
                values['plcsite'] = {'status' : "GS_FAILED"}

        # Time at which this record was completed.
        values['checked'] = time.time()

        return (nodename, values)
186
def recordPingAndSSH(request, result):
        """Store the outcome of one collectPingAndSSH() call and save the state.

        request -- the work request that produced the result (not used here).
        result  -- the (nodename, values) pair returned by collectPingAndSSH().

        When values is None nothing is recorded.  Otherwise the node's record
        in externalState gets the new values and the current global round, a
        progress line is printed, and externalState is dumped to config.dbname.
        """
        global externalState
        global count
        (nodename, values) = result

        if values is None:
                return

        current_round = externalState['round']
        record = externalState['nodes'][nodename]
        record['values'] = values
        record['round'] = current_round

        count += 1
        print("%d %s %s" % (count, nodename, record['values']))
        soltesz.dbDump(config.dbname, externalState)
200
# Exception hook handed to every work request: called when an exception
# occurs within a worker thread.
def handle_exception(request, result):
        """Print the failing request's ID followed by each item of result."""
        print("Exception occured in request %s" % request.requestID)
        for item in result:
                print("Result: %s" % item)
206
207
def checkAndRecordState(l_nodes, cohash):
        """Refresh every node whose record is older than the global round.

        l_nodes -- iterable of hostnames.
        cohash  -- passed through unchanged to collectPingAndSSH().

        Stale nodes are queued on a 20-worker thread pool; nodes already at the
        current round are only counted and printed.  The function then polls
        the pool once a second until no results are pending or the user
        interrupts.
        """
        global externalState
        global count
        current_round = externalState['round']

        pool = threadpool.ThreadPool(20)

        # CREATE all the work requests
        for hostname in l_nodes:
                # First sighting of a node: start it at round 0 so it is probed.
                record = externalState['nodes'].setdefault(
                                hostname, {'round': 0, 'values': []})

                if record['round'] < current_round:
                        # recreate node stats when refreshed
                        work = threadpool.WorkRequest(collectPingAndSSH,
                                        [hostname, cohash], {}, None,
                                        recordPingAndSSH, handle_exception)
                        pool.putRequest(work)
                        continue

                # We just skip it, since it's "up to date"
                count += 1
                print("%d %s %s" % (count, hostname, externalState['nodes'][hostname]['values']))

        # WAIT while all the work requests are processed.
        while True:
                try:
                        time.sleep(1)
                        pool.poll()
                except KeyboardInterrupt:
                        print("Interrupted!")
                        break
                except threadpool.NoResultsPending:
                        print("All results collected.")
                        break
244
245
246
def main():
        """Load the saved state, pick the nodes to check, and check them.

        The node list comes from syncplcdb.create_plcdb() and is narrowed by
        the first of config.filename, config.node or config.nodegroup that is
        set.  Always returns 0.
        """
        global externalState

        # Use the cached state when there is one, else the module default.
        externalState = soltesz.if_cached_else(1, config.dbname, lambda : externalState)

        if config.increment:
                # update global round number to force refreshes across all nodes
                externalState['round'] += 1

        # lastcotop measures whether cotop is actually running.  this is a better
        # metric than sshstatus, or other values from CoMon
        cotop = comon.Comon()

        # history information for all nodes
        cohash = cotop.coget(COMON_COTOPURL)
        nodes = syncplcdb.create_plcdb()

        if config.filename or config.node:
                # A node-list file takes precedence over a single --node.
                if config.filename:
                        wanted = config.getListFromFile(config.filename)
                else:
                        wanted = [config.node]
                nodes = [entry for entry in nodes if entry['hostname'] in wanted]
        elif config.nodegroup:
                group = api.GetNodeGroups({'name' : config.nodegroup})
                nodes = api.GetNodes(group[0]['node_ids'])

        hostnames = [entry['hostname'] for entry in nodes]

        print("fetching %s hosts" % len(hostnames))

        checkAndRecordState(hostnames, cohash)

        return 0
281
282
# Script entry point: parse the command line into the module-level `config`
# object read by main() and recordPingAndSSH(), then run main().
if __name__ == '__main__':
        from config import config
        from optparse import OptionParser

        parser = OptionParser()
        parser.set_defaults(filename=None, node=None, nodegroup=None, increment=False, dbname="findbadnodes", cachenodes=False)
        parser.add_option("", "--node", dest="node", metavar="hostname",
                                                help="Provide a single node to operate on")
        parser.add_option("-f", "--nodelist", dest="filename", metavar="FILE",
                                                help="Provide the input file for the node list")
        parser.add_option("", "--nodegroup", dest="nodegroup", metavar="FILE",
                                                help="Provide the nodegroup for the list of nodes.")

        parser.add_option("", "--cachenodes", action="store_true",
                                                help="Cache node lookup from PLC")
        parser.add_option("", "--dbname", dest="dbname", metavar="FILE",
                                                help="Specify the name of the database to which the information is saved")
        parser.add_option("-i", "--increment", action="store_true", dest="increment",
                                                help="Increment round number to force refresh or retry")
        # Rebinds the imported class name to the instance the functions read.
        config = config(parser)
        config.parse_args()

        try:
                main()
        except Exception:
                import traceback
                # Fetch the exception this way so the handler parses on every
                # Python version.
                err = sys.exc_info()[1]
                # print_exc() writes the traceback itself and returns None, so
                # its return value must not be printed.
                traceback.print_exc()
                print("Exception: %s" % err)
                print("Saving data... exitting.")
                # Keep whatever was collected before the failure.
                soltesz.dbDump(config.dbname, externalState)
                # NOTE(review): the exit status stays 0 as before, so callers
                # cannot detect the failure from it -- confirm this is intended.
                sys.exit(0)