+ findbad.py: this actively probes all machines in the PLC db, using ping,
[monitor.git] / findbad.py
1 #!/usr/bin/python
2
3 import os
4 import sys
5 import string
6 import time
7 import soltesz
8 import plc
9 import comon
10 import threadpool
11
12 from config import config
13 from optparse import OptionParser
# Command-line interface.  Each option carries its own default instead of a
# single bulk set_defaults() call; the parser ends up with the same defaults:
# filename="", dbname="findbadnodes", increment=False.
parser = OptionParser()
parser.add_option("-f", "--nodes", dest="filename", metavar="FILE",
                  default="",
                  help="Provide the input file for the node list")
parser.add_option("--dbname", dest="dbname", metavar="FILE",
                  default="findbadnodes",
                  help="Specify the name of the database to which the information is saved")
parser.add_option("-i", "--increment", dest="increment", action="store_true",
                  default=False,
                  help="Increment round number to force refresh or retry")

# NOTE: this rebinds the imported name `config` to the object built around
# the parser; the rest of the file reads options from it (config.dbname,
# config.filename, config.increment).
config = config(parser)
config.parse_args()
24
25 # QUERY all nodes.
# QUERY all nodes: a CSV dump of the per-node columns named in `dumpcols`.
# A "select='lastcotop!=0'" filter was left disabled by the original author,
# so no filter is sent.
COMON_COTOPURL = "".join([
    "http://summer.cs.princeton.edu/status/tabulator.cgi?",
    "table=table_nodeview&",
    "dumpcols='name,resptime,sshstatus,uptime,lastcotop'&",
    "formatcsv",
])

# Module-level state shared by the functions below.
#   count         -- running number of nodes reported so far.
#   round         -- initial global round number (shadows the builtin round()).
#   externalState -- the structure handed to soltesz.dbDump(): the global
#                    round plus one entry per node under 'nodes'.
count = 0
round = 1
externalState = {'round': round, 'nodes': {}}
36
def _classifySSHOutput(output):
    """Classify the non-empty output of the uname/bm.log SSH probe.

    Returns a (category, state, wants_bootcd) triple.  All checks are plain
    substring matches and their order matters: the specific 2.6.x strings
    are tested before the broader "2.4" match.
    """
    # The probe also lists /tmp/bm.log; seeing that name in the output is
    # what marks a node as DEBUG rather than BOOT.
    if "bm.log" in output:
        state = 'DEBUG'
    else:
        state = 'BOOT'

    if "2.6.17" in output or "2.6.20" in output:
        return ('ALPHA', state, True)
    if "2.6.12" in output or "2.6.10" in output:
        return ('PROD', state, True)
    if "2.4" in output:
        # 2.4 nodes are always reported as DEBUG and never asked for a
        # BootCD ID.
        return ('OLDBOOTCD', 'DEBUG', False)
    return ('UNKNOWN', state, True)


def _plcNodeStatus(nodename):
    """Look the node up in PLC.

    Returns (pcu_flag, plcnode_record, site_id); site_id is None when the
    lookup returned nothing.
    """
    d_node = plc.getNodes({'hostname': nodename})
    if not d_node:
        return ("UNKNOWN", {'status': "GN_FAILED"}, None)

    pcu_ids = d_node[0]['pcu_ids']
    site_id = d_node[0]['site_id']
    if len(pcu_ids) > 0:
        pcu_flag = "PCU"
    else:
        pcu_flag = "NOPCU"
    record = {'status': 'SUCCESS', 'pcu_ids': pcu_ids, 'site_id': site_id}
    return (pcu_flag, record, site_id)


def _plcSiteStatus(site_id):
    """Build the 'plcsite' record for site_id (None means "site unknown")."""
    if site_id is None:
        # The node lookup failed, so there is no site to ask PLC about;
        # skip the remote call instead of querying a sentinel id.
        return {'status': "GS_FAILED"}

    d_site = plc.getSites({'site_id': site_id})
    if not d_site:
        return {'status': "GS_FAILED"}

    site = d_site[0]
    return {'num_nodes': len(site['node_ids']),
            'max_slices': site['max_slices'],
            'num_slices': len(site['slice_ids']),
            'login_base': site['login_base'],
            'status': 'SUCCESS'}


def collectPingAndSSH(nodename, cohash):
    """Probe one node and gather everything known about it.

    nodename -- hostname to probe.
    cohash   -- per-node CoMon records, indexed by hostname.

    Returns (nodename, values) where values holds the keys 'ping', 'ssh',
    'category', 'state', 'kernel', 'bootcd', 'comonstats', 'pcu',
    'plcnode' and 'plcsite'.
    """
    values = {}

    ### RUN PING ######################
    # NOTE(review): nodename is interpolated unquoted into what looks like a
    # shell pipeline; this assumes hostnames come from a trusted source.
    ping = soltesz.CMD()
    (ping_out, ping_err) = ping.run_noexcept("ping -c 1 -q %s | grep rtt" % nodename)
    if ping_out == "":
        # grep found no "rtt" summary line: treat as no reply.
        values['ping'] = "NOPING"
    else:
        values['ping'] = "PING"

    ### RUN SSH ######################
    ssh = soltesz.SSH('root', nodename)
    (ssh_out, ssh_err) = ssh.run_noexcept('echo `uname -a ; ls /tmp/bm.log`')
    if ssh_out == "":
        # An error occurred: record the second element of the result pair
        # (presumably stderr) in place of the kernel string.
        wants_bootcd = False
        values['ssh'] = 'NOSSH'
        values['category'] = 'ERROR'
        values['state'] = 'DOWN'
        values['kernel'] = ssh_err.strip()
    else:
        (category, state, wants_bootcd) = _classifySSHOutput(ssh_out)
        values['ssh'] = 'SSH'
        values['category'] = category
        values['state'] = state
        values['kernel'] = ssh_out

    ### GET BOOTCD ID ######################
    # Only for nodes that are neither 2.4 nor inaccessible.
    values['bootcd'] = ""
    if wants_bootcd:
        (id_out, id_err) = ssh.run_noexcept('cat /mnt/cdrom/bootme/ID')
        if "BootCD" in id_out:
            values['bootcd'] = id_out
            if "v2" in id_out:
                values['category'] = 'OLDBOOTCD'

    # TODO: get bm.log for debug nodes.
    # 'zcat /tmp/bm.log'

    ### COMON STATS ######################
    try:
        values['comonstats'] = cohash[nodename]
    except KeyError:
        # Nodes read from a node-list file need not appear in the CoMon
        # dump.  Record a placeholder rather than raising, which would throw
        # away the ping/ssh results gathered above.
        # NOTE(review): keys mirror the columns requested in the CoMon
        # query; the '-1' string sentinel is an assumption -- confirm that
        # consumers of 'comonstats' accept it.
        values['comonstats'] = {'resptime': '-1',
                                'sshstatus': '-1',
                                'uptime': '-1',
                                'lastcotop': '-1'}

    ### GET PLC NODE ######################
    (values['pcu'], values['plcnode'], site_id) = _plcNodeStatus(nodename)

    ### GET PLC SITE ######################
    values['plcsite'] = _plcSiteStatus(site_id)

    return (nodename, values)
144
def recordPingAndSSH(request, result):
    """Store one node's probe result and checkpoint the shared state.

    Registered with each work request alongside collectPingAndSSH, whose
    (nodename, values) return value arrives here as `result`.  `request`
    is not used.
    """
    global count

    current_round = externalState['round']
    nodename, values = result

    # Stamp the entry with the current round so the node counts as fresh.
    entry = externalState['nodes'][nodename]
    entry['values'] = values
    entry['round'] = current_round

    count += 1
    print("%d %s %s" % (count, nodename, entry['values']))

    # Dump after every single result.
    soltesz.dbDump(config.dbname, externalState)
157
158 # this will be called when an exception occurs within a thread
def handle_exception(request, result):
    """Report a failed work request on stdout.

    request -- the work request; only its requestID attribute is read.
    result  -- iterable describing the failure; each item is printed on
               its own line.
    """
    print("Exception occured in request %s" % request.requestID)
    for detail in result:
        print("Result: %s" % detail)
163
164
def checkAndRecordState(l_nodes, cohash):
    """Probe every node whose record is older than the current round.

    l_nodes -- hostnames to consider.
    cohash  -- CoMon records, passed through to collectPingAndSSH.

    Nodes already stamped with the current round are only echoed; the rest
    are queued on a 20-worker thread pool.  Returns once the pool reports no
    pending results or the user interrupts.
    """
    global count

    current_round = externalState['round']
    known_nodes = externalState['nodes']
    pool = threadpool.ThreadPool(20)

    # CREATE all the work requests
    for nodename in l_nodes:
        # First sighting: round 0 guarantees the node gets probed.
        record = known_nodes.setdefault(nodename, {'round': 0, 'values': []})

        if record['round'] >= current_round:
            # Up to date: just report what is already stored.
            count += 1
            print("%d %s %s" % (count, nodename, record['values']))
            continue

        # Stale: recreate the node stats.
        pool.putRequest(
            threadpool.WorkRequest(collectPingAndSSH, [nodename, cohash], {},
                                   None, recordPingAndSSH, handle_exception))

    # WAIT while all the work requests are processed.
    while True:
        try:
            time.sleep(1)
            pool.poll()
        except KeyboardInterrupt:
            print("Interrupted!")
            break
        except threadpool.NoResultsPending:
            print("All results collected.")
            break
201
202
203
def main():
    """Load saved state, fetch the CoMon table and probe the chosen nodes.

    Returns 0.
    """
    global externalState

    def current_state():
        # Evaluated lazily, so it yields whatever externalState is bound to
        # at the moment soltesz calls it.
        return externalState

    externalState = soltesz.if_cached_else(1, config.dbname, current_state)

    if config.increment:
        # update global round number to force refreshes across all nodes
        externalState['round'] += 1

    # lastcotop measures whether cotop is actually running.  this is a better
    # metric than sshstatus, or other values from CoMon
    cohash = comon.Comon().coget(COMON_COTOPURL)

    if config.filename != "":
        l_nodes = config.getListFromFile(config.filename)
    else:
        # No node file given: probe everything CoMon reported.
        l_nodes = cohash.keys()

    checkAndRecordState(l_nodes, cohash)
    return 0
228
229
if __name__ == '__main__':
    # Run the probe; on any error, save whatever state was collected and
    # exit with a non-zero status so callers (cron, wrapper scripts) can
    # tell the run failed.  Previously the script exited 0 after an error.
    try:
        status = main()
    except Exception:
        # sys.exc_info() is used instead of "except Exception, err" or
        # "except Exception as err" so this parses on old and new
        # interpreters alike.
        err = sys.exc_info()[1]
        print("Exception: %s" % err)
        print("Saving data... exitting.")
        soltesz.dbDump(config.dbname, externalState)
        status = 1
    sys.exit(status)