Read nodes from a given file, for batch updates when using nodequery and
[monitor.git] / findbad.py
1 #!/usr/bin/python
2
3 import os
4 import sys
5 import string
6 import time
7
8 from config import config
9 from optparse import OptionParser
# Command-line interface.  The parsed settings are read later in this file as
# config.filename, config.increment and config.dbname.
parser = OptionParser()
parser.set_defaults(
    filename=None,
    increment=False,
    dbname="findbadnodes",
    cachenodes=False)
parser.add_option(
    "-f", "--nodelist",
    dest="filename", metavar="FILE",
    help="Provide the input file for the node list")
parser.add_option(
    "", "--cachenodes",
    action="store_true",
    help="Cache node lookup from PLC")
parser.add_option(
    "", "--dbname",
    dest="dbname", metavar="FILE",
    help="Specify the name of the database to which the information is saved")
parser.add_option(
    "-i", "--increment",
    action="store_true", dest="increment",
    help="Increment round number to force refresh or retry")

# NOTE: this rebinds the imported name `config` to the object built from the
# parser; from here on `config` is that object, not the imported callable.
config = config(parser)
config.parse_args()
22
# CoMon query covering all nodes: dumps the name, resptime, sshstatus, uptime
# and lastcotop columns of the node view in CSV form.
COMON_COTOPURL = (
    "http://summer.cs.princeton.edu/status/tabulator.cgi?"
    "table=table_nodeview&"
    "dumpcols='name,resptime,sshstatus,uptime,lastcotop'&"
    "formatcsv"
)
# Disabled variant of the tail of the query, kept for reference:
#   "formatcsv&" + "select='lastcotop!=0'"

import threading

# Held around every PLC API call made while probing a node.
plc_lock = threading.Lock()

# Current round number; a node whose stored round is lower gets re-probed.
round = 1
# Persistent state: the global round plus one record per node hostname.
externalState = {'round': round, 'nodes': {}}
# Running number of nodes reported so far (probed or skipped as up to date).
count = 0
36
37
38 import soltesz
39 import plc
40 import comon
41 import threadpool
42 import syncplcdb
43
def _locked_plc_call(query, *args):
    """Run one PLC API call while holding plc_lock.

    Returns (True, result) when the call succeeds.  If it raises, the
    traceback is printed and (False, None) is returned.  The lock is released
    on every path.
    """
    import traceback

    plc_lock.acquire()
    try:
        try:
            return (True, query(*args))
        except Exception:
            traceback.print_exc()
            return (False, None)
    finally:
        plc_lock.release()


def collectPingAndSSH(nodename, cohash):
    """Probe one node and collect its ping, SSH, BootCD, CoMon and PLC status.

    Args:
        nodename: hostname of the node to probe.
        cohash: mapping of hostname -> CoMon statistics.  A hostname missing
            from it gets a placeholder record whose fields are all '-1'.

    Returns:
        (nodename, values) on success, where values is a dict with the keys
        'ping', 'ssh', 'category', 'state', 'kernel', 'bootcd', 'comonstats',
        'pcu', 'plcnode', 'plcsite' and 'checked' (time.time() at completion).
        (None, None) if either PLC query raised an exception.
    """
    values = {}

    ### RUN PING ######################
    # NOTE(review): nodename is interpolated into a command line containing a
    # pipe; presumably hostnames come from PLC and are trusted -- confirm.
    ping = soltesz.CMD()
    (oval, errval) = ping.run_noexcept("ping -c 1 -q %s | grep rtt" % nodename)
    # The grep stage only produces output when ping printed an "rtt" line.
    if oval == "":
        values['ping'] = "NOPING"
    else:
        values['ping'] = "PING"

    ### RUN SSH ######################
    b_getbootcd_id = True
    ssh = soltesz.SSH('root', nodename)
    (oval, errval) = ssh.run_noexcept('echo `uname -a ; ls /tmp/bm.log`')
    kernel = oval

    if oval == "":
        # No output at all: record the node as unreachable and keep the
        # stripped error output in place of the kernel string.
        b_getbootcd_id = False
        values['ssh'] = 'NOSSH'
        values['category'] = 'ERROR'
        values['state'] = 'DOWN'
        kernel = errval.strip()
    else:
        values['ssh'] = 'SSH'
        # A listed /tmp/bm.log marks the node as being in debug state.
        if "bm.log" in oval:
            values['state'] = 'DEBUG'
        else:
            values['state'] = 'BOOT'

        if "2.6.17" in oval or "2.6.2" in oval:
            values['category'] = 'ALPHA'
        elif "2.6.12" in oval or "2.6.10" in oval:
            values['category'] = 'PROD'
        elif "2.4" in oval:
            # 2.4 kernels are always reported as DEBUG and never asked for a
            # BootCD id, whether or not bm.log is present.
            b_getbootcd_id = False
            values['category'] = 'OLDBOOTCD'
            values['state'] = 'DEBUG'
        else:
            values['category'] = 'UNKNOWN'

    values['kernel'] = kernel

    ### GET BOOTCD ID ######################
    values['bootcd'] = ""
    if b_getbootcd_id:
        # Only for nodes that are neither 2.4 nor inaccessible.
        (oval, errval) = ssh.run_noexcept('cat /mnt/cdrom/bootme/ID')
        if "BootCD" in oval:
            values['bootcd'] = oval
            # The two UNC nodes are exempt from the v2 -> OLDBOOTCD rule.
            # Compare by value: an identity test against a string literal
            # does not reliably match an equal hostname.
            if "v2" in oval and nodename not in ("planetlab1.cs.unc.edu",
                                                 "planetlab2.cs.unc.edu"):
                values['category'] = 'OLDBOOTCD'

    # TODO: get bm.log for debug nodes.
    # 'zcat /tmp/bm.log'

    ### COMON STATS ######################
    if nodename in cohash:
        values['comonstats'] = cohash[nodename]
    else:
        values['comonstats'] = {'resptime':  '-1',
                                'uptime':    '-1',
                                'sshstatus': '-1',
                                'lastcotop': '-1'}

    ### GET PLC NODE ######################
    (ok, d_node) = _locked_plc_call(
        plc.getNodes,
        {'hostname': nodename},
        ['pcu_ids', 'site_id', 'last_contact', 'boot_state', 'nodegroup_ids'])
    if not ok:
        return (None, None)

    # Stays -1 when PLC returned no record for this hostname.
    site_id = -1
    if d_node:
        node = d_node[0]
        pcu = node['pcu_ids']
        if len(pcu) > 0:
            values['pcu'] = "PCU"
        else:
            values['pcu'] = "NOPCU"
        site_id = node['site_id']
        values['plcnode'] = {'status': 'SUCCESS',
                             'pcu_ids': pcu,
                             'boot_state': node['boot_state'],
                             'site_id': site_id,
                             'nodegroups': node['nodegroup_ids'],
                             'last_contact': node['last_contact']}
    else:
        values['pcu'] = "UNKNOWN"
        values['plcnode'] = {'status': "GN_FAILED"}

    ### GET PLC SITE ######################
    (ok, d_site) = _locked_plc_call(
        plc.getSites,
        {'site_id': site_id},
        ['max_slices', 'slice_ids', 'node_ids', 'login_base'])
    if not ok:
        return (None, None)

    if d_site:
        site = d_site[0]
        values['plcsite'] = {'num_nodes': len(site['node_ids']),
                             'max_slices': site['max_slices'],
                             'num_slices': len(site['slice_ids']),
                             'login_base': site['login_base'],
                             'status': 'SUCCESS'}
    else:
        values['plcsite'] = {'status': "GS_FAILED"}

    values['checked'] = time.time()

    return (nodename, values)
197
def recordPingAndSSH(request, result):
    """Store one node's probe result in externalState and save the database.

    Args:
        request: the work request that produced the result (unused here).
        result: a (nodename, values) pair.  When values is None nothing is
            recorded, counted, printed or saved.
    """
    global externalState
    global count
    (nodename, values) = result

    if values is None:
        return

    current_round = externalState['round']
    node_entry = externalState['nodes'][nodename]
    node_entry['values'] = values
    # Stamp the node with the current round so it counts as up to date.
    node_entry['round'] = current_round

    count += 1
    print("%d %s %s" % (count, nodename, node_entry['values']))
    # Save after every recorded node.
    soltesz.dbDump(config.dbname, externalState)
211
# Invoked when an exception occurs within a worker thread.
def handle_exception(request, result):
    """Print the id of the failed request, then each item of its result."""
    print("Exception occured in request %s" % request.requestID)
    for item in result:
        print("Result: %s" % item)
217
218
def checkAndRecordState(l_nodes, cohash):
    """Probe every out-of-date node in l_nodes using a pool of 20 threads.

    Args:
        l_nodes: iterable of node hostnames.
        cohash: CoMon statistics, passed through to collectPingAndSSH.

    A node whose stored round is already at or above the global round is only
    counted and printed.  Every other node is queued for collectPingAndSSH,
    with recordPingAndSSH as the result callback and handle_exception as the
    exception callback.  The function then polls the pool once a second until
    no results are pending or the user interrupts.
    """
    global externalState
    global count

    current_round = externalState['round']
    known_nodes = externalState['nodes']
    pool = threadpool.ThreadPool(20)

    # Queue a work request for each node that needs refreshing.
    for nodename in l_nodes:
        if nodename not in known_nodes:
            known_nodes[nodename] = {'round': 0, 'values': []}
        entry = known_nodes[nodename]

        if entry['round'] >= current_round:
            # Already up to date for this round: report it and move on.
            count += 1
            print("%d %s %s" % (count, nodename, entry['values']))
            continue

        work = threadpool.WorkRequest(collectPingAndSSH, [nodename, cohash], {},
                                      None, recordPingAndSSH, handle_exception)
        pool.putRequest(work)

    # Wait until all queued requests have been processed.
    while True:
        try:
            time.sleep(1)
            pool.poll()
        except KeyboardInterrupt:
            print("Interrupted!")
            break
        except threadpool.NoResultsPending:
            print("All results collected.")
            break
255
256
257
def main():
    """Load saved state, build the host list and probe the hosts.

    Reads config.dbname, config.increment and config.filename.  Always
    returns 0.
    """
    global externalState

    # The lambda supplies the current in-memory state as the fallback value.
    externalState = soltesz.if_cached_else(1, config.dbname, lambda : externalState)

    if config.increment:
        # Bumping the global round forces a refresh across all nodes.
        externalState['round'] += 1

    # lastcotop measures whether cotop is actually running.  This is a better
    # metric than sshstatus or the other values from CoMon.
    cotop = comon.Comon()

    # History information for all nodes.
    cohash = cotop.coget(COMON_COTOPURL)

    plc_nodes = syncplcdb.create_plcdb()
    if config.filename:
        # Restrict the run to the hostnames listed in the given file.
        wanted = config.getListFromFile(config.filename)
        plc_nodes = [node for node in plc_nodes if node['hostname'] in wanted]

    hostnames = [node['hostname'] for node in plc_nodes]

    print("fetching %s hosts" % len(hostnames))

    checkAndRecordState(hostnames, cohash)

    return 0
286
287
# Script entry point: run main(); on any Exception report it, save the state
# collected so far and exit.
if __name__ == '__main__':
    try:
        main()
    except Exception:
        import traceback
        # Fetch the active exception without version-specific except syntax.
        err = sys.exc_info()[1]
        # print_exc() writes the traceback itself and returns None, so it is
        # called directly; wrapping it in print emitted a stray "None" line.
        traceback.print_exc()
        print("Exception: %s" % err)
        print("Saving data... exitting.")
        soltesz.dbDump(config.dbname, externalState)
        # NOTE(review): the exit status stays 0 even though the run failed;
        # confirm whether callers depend on that before changing it.
        sys.exit(0)