completed updates to the info model.
[monitor.git] / findbadpcu.py
1 #!/usr/bin/python
2
3 import os
4 import sys
5 import string
6 import time
7 import socket
8 import sets
9 import signal
10 import traceback
11 from datetime import datetime,timedelta
12 import threadpool
13 import threading
14
15 import monitor
16 from monitor.pcu import reboot
17 from monitor import config
18 from monitor.database.info.model import FindbadPCURecordSync, FindbadPCURecord, session
19 from monitor import util 
20 from monitor.wrapper import plc, plccache
21 from nodequery import pcu_select
22
# Held around every PLC API / plccache lookup in get_pcu, get_nodes and
# get_plc_site_values so that only one such lookup runs at a time.
plc_lock = threading.Lock()
# Current collection round.  main() and recordPingAndSSH() overwrite it with
# the round stored in the sync record that has plc_pcuid=0.
global_round = 1
# Error details per PCU, keyed "id_<pcu id>"; filled in by recordPingAndSSH().
errorState = {}
# Running number of PCUs recorded or skipped; only used in progress output.
count = 0
27
def nmap_portstatus(status):
        """
                Parse the "Host:" line(s) of nmap grepable (-oG) output.

                status is the raw text.  The first four whitespace-separated
                tokens are skipped; every remaining token of the form
                "port/state/..." is recorded.  Tokens without a '/' (for
                example a trailing "Ignored State:" field, or the words of a
                second "Host: ... Status: Up" line) are not port entries and
                are skipped instead of raising IndexError.

                Returns (ps, continue_probe):
                        ps              dict mapping port (string) to state (string)
                        continue_probe  True if at least one port is "open"
                Empty input yields ({}, False).
        """
        ps = {}
        continue_probe = False
        for token in status.split()[4:]:
                fields = token.split('/')
                if len(fields) < 2:
                        # Not a "port/state/..." entry; nothing to record.
                        continue
                port, state = fields[0], fields[1]
                ps[port] = state
                if state == "open":
                        continue_probe = True
        return (ps, continue_probe)
40
def get_pcu(pcuname):
        """
                Look up a single PCU record by its pcu_id.

                PLC is asked first (plc.GetPCUs).  If that call raises, the
                cached list plccache.l_pcus is scanned instead.

                Returns the PCU record, or None when neither source yields a
                matching entry (including an empty answer from PLC).
        """
        l_pcu = None
        plc_lock.acquire()
        try:
                try:
                        found = plc.GetPCUs({'pcu_id' : pcuname})
                        # An empty answer means "no such PCU": report None rather
                        # than handing the caller an empty list to index into.
                        if len(found) > 0:
                                l_pcu = found[0]
                except Exception:
                        try:
                                for cached in plccache.l_pcus:
                                        if cached['pcu_id'] == pcuname:
                                                l_pcu = cached
                        except Exception:
                                traceback.print_exc()
                                l_pcu = None
        finally:
                # Release the lock no matter how the lookup ended.
                plc_lock.release()
        return l_pcu
62
def get_nodes(node_ids):
        """
                Fetch the node records for the given node ids.

                plc.getNodes is tried first; if it raises, the cached list
                plccache.l_plcnodes is filtered by node_id instead.

                Returns the records found, or None when the result is an empty
                list or when the cache lookup failed as well.
        """
        wanted_fields = ['hostname', 'last_contact', 'node_id', 'ports']
        found = []
        plc_lock.acquire()
        try:
                found = plc.getNodes(node_ids, wanted_fields)
        except:
                try:
                        for cached in plccache.l_plcnodes:
                                if cached['node_id'] in node_ids:
                                        found.append(cached)
                except:
                        traceback.print_exc()
                        found = None
        plc_lock.release()

        if found == []:
                return None
        return found
82         
83
def get_plc_pcu_values(pcuname):
        """
                Try to contact PLC to get the PCU info.
                If that fails, try a backup copy from the last run.
                If that fails, return None

                On success the result is a dict containing every field of the
                PCU record and, when node records are available:
                        <hostname>   first entry of that node's 'ports' list
                        'nodenames'  list of the hostnames of the PCU's nodes
                        'node_id'    node_id of the first node record
                Fields of the PCU record win over the entries above on a key
                clash because they are merged in last.
        """
        l_pcu = get_pcu(pcuname)
        # Treat an empty lookup result the same as None: there is no record
        # to read 'node_ids' from.
        if not l_pcu:
                return None

        values = {}
        l_node = get_nodes(l_pcu['node_ids'])
        if l_node:
                for node in l_node:
                        # NOTE(review): assumes every node record carries a
                        # non-empty 'ports' list -- confirm against PLC data.
                        values[node['hostname']] = node['ports'][0]

                values['nodenames'] = [node['hostname'] for node in l_node]

                # NOTE: this is for a dry run later. It doesn't matter which node.
                values['node_id'] = l_node[0]['node_id']

        values.update(l_pcu)
        return values
113
def get_plc_site_values(site_id):
        """
                Summarise one PLC site.

                plc.getSites is tried first; if it raises, the cached list
                plccache.l_plcsites is searched for a matching site_id.

                Returns {'plcsite': {...}} with the keys num_nodes, max_slices,
                num_slices, login_base and status ('SUCCESS'), or None when no
                site record could be obtained (including an empty answer from
                PLC).
        """
        ### GET PLC SITE ######################
        d_site = None
        plc_lock.acquire()
        try:
                try:
                        sites = plc.getSites({'site_id': site_id},
                                             ['max_slices', 'slice_ids', 'node_ids', 'login_base'])
                        # Only a non-empty answer holds a record; otherwise keep
                        # d_site as None instead of an empty list.
                        if len(sites) > 0:
                                d_site = sites[0]
                except Exception:
                        try:
                                for site in plccache.l_plcsites:
                                        if site['site_id'] == site_id:
                                                d_site = site
                                                break
                        except Exception:
                                traceback.print_exc()
                                d_site = None
        finally:
                # Release the lock no matter how the lookup ended.
                plc_lock.release()

        if d_site is None:
                return None

        return {'plcsite': {'num_nodes'  : len(d_site['node_ids']),
                            'max_slices' : d_site['max_slices'],
                            'num_slices' : len(d_site['slice_ids']),
                            'login_base' : d_site['login_base'],
                            'status'     : 'SUCCESS'}}
152
153
def _is_blank(value):
        """Return True when a PLC field is unset: None or an empty string."""
        # Compare by value: `is ""` tests identity and misses e.g. u"".
        return value is None or value == ""

def collectPingAndSSH(pcuname, cohash):
        """
                Collect the status of one PCU.

                Steps, in order: load the PCU record, note which required
                fields are missing ('complete_entry'), compare DNS with the
                recorded IP ('dnsmatch'), scan a fixed set of ports with nmap
                ('portstatus'), and call reboot.reboot_test ('reboot').
                continue_probe is cleared by any step that makes the later
                probes pointless and is passed on to reboot.reboot_test.

                pcuname  PCU id handed to get_plc_pcu_values
                cohash   unused here; kept for the caller's signature

                Returns (pcuname, values, errors).  errors is None unless an
                exception interrupted the collection, in which case it is the
                same dict as values with a 'traceback' entry added.
                Returns (None, None, None) when the PCU record cannot be loaded.
        """
        continue_probe = True
        errors = None
        values = {'reboot' : 'novalue'}
        try:
                ### GET PCU ######################
                stats = None
                try:
                        stats = get_plc_pcu_values(pcuname)
                        # Check for a missing record before touching its fields.
                        if stats is not None:
                                if stats['hostname'] is not None:
                                        stats['hostname'] = stats['hostname'].strip()
                                if stats['ip'] is not None:
                                        stats['ip'] = stats['ip'].strip()
                except Exception:
                        traceback.print_exc()
                        stats = None

                if stats is None:
                        return (None, None, None)
                values['plc_pcu_stats'] = stats

                #### COMPLETE ENTRY   #######################
                values['complete_entry'] = []
                if _is_blank(stats['model']):
                        values['complete_entry'] += ["model"]
                        # Cannot continue due to this condition
                        continue_probe = False

                if _is_blank(stats['password']):
                        values['complete_entry'] += ["password"]
                        # Cannot continue due to this condition
                        continue_probe = False

                # Missing hostname / ip are recorded but do not stop the probe
                # here; the DNS step below decides.
                if _is_blank(stats['hostname']):
                        values['complete_entry'] += ["hostname"]
                if _is_blank(stats['ip']):
                        values['complete_entry'] += ["ip"]

                # If there are no nodes associated with this PCU, then we cannot continue.
                if len(stats['node_ids']) == 0:
                        continue_probe = False
                        values['complete_entry'] += ['NoNodeIds']

                #### DNS and IP MATCH #######################
                have_hostname = not _is_blank(stats['hostname'])
                have_ip = not _is_blank(stats['ip'])
                if have_hostname and have_ip:
                        try:
                                ipaddr = socket.gethostbyname(stats['hostname'])
                                if ipaddr == stats['ip']:
                                        values['dnsmatch'] = "DNS-OK"
                                else:
                                        values['dnsmatch'] = "DNS-MISMATCH"
                                        continue_probe = False
                        except Exception:
                                # Name did not resolve: fall back to the IP.
                                values['dnsmatch'] = "DNS-NOENTRY"
                                stats['hostname'] = stats['ip']
                elif have_ip:
                        values['dnsmatch'] = "NOHOSTNAME"
                        stats['hostname'] = stats['ip']
                else:
                        values['dnsmatch'] = "NO-DNS-OR-IP"
                        stats['hostname'] = "No_entry_in_DB"
                        continue_probe = False

                #### RUN NMAP ###############################
                if continue_probe:
                        nmap = util.command.CMD()
                        # NOTE(review): the target comes from the PCU record and is
                        # interpolated into a shell pipeline -- confirm that
                        # reboot.pcu_name() can only yield a hostname or IP.
                        (oval, errval) = nmap.run_noexcept("nmap -oG - -P0 -p22,23,80,443,5869,9100,16992 %s | grep Host:" % reboot.pcu_name(stats))
                        # NOTE: an empty / error value for oval, will still work.
                        (values['portstatus'], continue_probe) = nmap_portstatus(oval)
                else:
                        values['portstatus'] = None

                ######  DRY RUN  ############################
                if 'node_ids' in stats and len(stats['node_ids']) > 0:
                        # 'nodenames' is only present when node records were found;
                        # a missing key ends up in the error path below.
                        rb_ret = reboot.reboot_test(stats['nodenames'][0], values, continue_probe, 1, True)
                else:
                        rb_ret = "Not_Run" # No nodes to test"

                values['reboot'] = rb_ret

        except Exception:
                print("____________________________________")
                print(values)
                # errors aliases values on purpose: the traceback is stored
                # alongside whatever was collected before the failure.
                errors = values
                print("____________________________________")
                errors['traceback'] = traceback.format_exc()
                print(errors['traceback'])

        values['date_checked'] = time.time()
        return (pcuname, values, errors)
260
def recordPingAndSSH(request, result):
        """
                Result callback for the collectPingAndSSH work requests.

                request  the work request that produced result (not used)
                result   the (nodename, values, errors) tuple returned by
                         collectPingAndSSH

                When values is complete, a FindbadPCURecord is created, the
                PCU's sync record is moved to the current round, and one
                progress line is printed.  When a failed probe left values
                incomplete, no record is written, so that the error details
                below are still stored.  When errors is set, it is added to
                errorState under "id_<nodename>" and errorState is dumped as
                "findbadpcu_errors".
        """
        global errorState
        global count
        global global_round
        (nodename, values, errors) = result

        if values is not None:
                required = ('date_checked', 'plc_pcu_stats', 'dnsmatch',
                            'portstatus', 'complete_entry', 'reboot')
                missing = [key for key in required if key not in values]
                if missing:
                        # A KeyError here would propagate out of the callback
                        # before the errors are stored.
                        print("Skipping incomplete record for %s; missing: %s" % (nodename, " ".join(missing)))
                else:
                        pcu_id = int(nodename)
                        # The record with plc_pcuid=0 holds the global round.
                        fbsync = FindbadPCURecordSync.findby_or_create(plc_pcuid=0,
                                                                       if_new_set={'round': global_round})
                        global_round = fbsync.round
                        fbnodesync = FindbadPCURecordSync.findby_or_create(plc_pcuid=pcu_id,
                                                                           if_new_set={'round' : global_round})

                        fbrec = FindbadPCURecord(
                                                date_checked=datetime.fromtimestamp(values['date_checked']),
                                                round=fbsync.round,
                                                plc_pcuid=pcu_id,
                                                plc_pcu_stats=values['plc_pcu_stats'],
                                                dns_status=values['dnsmatch'],
                                                port_status=values['portstatus'],
                                                entry_complete=" ".join(values['complete_entry']),
                                                reboot_trial_status="%s" % values['reboot'],
                                        )
                        fbnodesync.round = global_round

                        fbnodesync.flush()
                        fbsync.flush()
                        fbrec.flush()

                        count += 1
                        print("%d %s %s" % (count, nodename, values))

        if errors is not None:
                pcu_id = "id_%s" % nodename
                errorState[pcu_id] = errors
                # The bare name `database` is never imported in this file, so
                # go through the imported `monitor` package instead.
                # NOTE(review): assumes dbDump is exposed by monitor.database -- confirm.
                monitor.database.dbDump("findbadpcu_errors", errorState)
298
299 # this will be called when an exception occurs within a thread
def handle_exception(request, result):
        """
                Report a failed work request: print the request's requestID,
                then one "Result:" line for every item of result.
        """
        print("Exception occured in request %s" % request.requestID)
        for item in result:
                print("Result: %s" % item)
304
305
def checkAndRecordState(l_pcus, cohash):
        """
                Probe every PCU in l_pcus whose stored round is behind
                global_round, using a pool of 10 worker threads.

                l_pcus  iterable of PCU ids (anything int() accepts)
                cohash  passed through unchanged to collectPingAndSSH

                PCUs that are already at the current round are only counted
                and printed.  The function then polls the pool once a second
                until all results are in, the user interrupts, or one hour
                has passed (in which case the process exits with status 1).
        """
        global global_round
        global count

        pool = threadpool.ThreadPool(10)

        # Queue one work request per out-of-date PCU.
        for pcuname in l_pcus:
                pcu_id = int(pcuname)
                sync = FindbadPCURecordSync.findby_or_create(plc_pcuid=pcu_id, if_new_set={'round' : 0})
                sync.flush()

                node_round = sync.round
                if not (node_round < global_round):
                        # Already refreshed in this round; just report it.
                        count += 1
                        print("%d %s %s" % (count, pcu_id, node_round))
                        continue

                work = threadpool.WorkRequest(collectPingAndSSH, [pcuname, cohash], {},
                                              None, recordPingAndSSH, handle_exception)
                pool.putRequest(work)

        # Poll until every queued request has delivered its result.
        started = time.time()
        while True:
                try:
                        time.sleep(1)
                        pool.poll()
                        # Give up once the run has lasted more than one hour.
                        if time.time() - started > (60*60*1):
                                print("findbadpcus.py has run out of time!!!!!!")
                                os._exit(1)
                except KeyboardInterrupt:
                        print("Interrupted!")
                        break
                except threadpool.NoResultsPending:
                        print("All results collected.")
                        break

        print(FindbadPCURecordSync.query.count())
        print(FindbadPCURecord.query.count())
        session.flush()
350
351
def main():
        """
                Work out which PCUs to check and hand them to
                checkAndRecordState.

                The global round is read from the sync record with
                plc_pcuid=0 and bumped by one when config.increment is set.
                The PCU list is chosen by the first option that is set, in
                this order: config.site, config.pcuselect, config.nodelist,
                config.pcuid; with none of them set, every PCU in
                plccache.l_pcus is used.

                Returns 0.
        """
        global global_round

        #  monitor.database.if_cached_else_refresh(1, config.refresh, "pculist", lambda : plc.GetPCUs())
        cached_pcus = plccache.l_pcus
        cohash = {}

        fbsync = FindbadPCURecordSync.findby_or_create(plc_pcuid=0, if_new_set={'round' : global_round})
        global_round = fbsync.round

        if config.increment:
                # A higher global round makes every PCU look out of date.
                global_round += 1
                fbsync.round = global_round

        fbsync.flush()

        if config.site is not None:
                api = plc.getAuthAPI()
                site = api.GetSites(config.site)
                pcus = []
                for node in api.GetNodes(site[0]['node_ids'], ['pcu_ids']):
                        pcus.extend(node['pcu_ids'])
                # Going through a set drops duplicate ids.
                l_pcus = list(sets.Set(pcus))
        elif config.pcuselect is not None:
                unused, pcus = pcu_select(config.pcuselect)
                # Going through a set drops duplicate ids.
                l_pcus = list(sets.Set(pcus))
        elif config.nodelist is not None:
                l_pcus = [int(pcu) for pcu in util.file.getListFromFile(config.nodelist)]
        elif config.pcuid is not None:
                l_pcus = [int(config.pcuid)]
        else:
                print("Calling API GetPCUs() : refresh(%s)" % config.refresh)
                l_pcus = [pcu['pcu_id'] for pcu in cached_pcus]

        checkAndRecordState(l_pcus, cohash)

        return 0
397
398
# Script entry point: set up logging, parse the command line, run main().
if __name__ == '__main__':
        import logging
        # Everything logged to the "monitor" logger at DEBUG and above is
        # appended to monitor.log in the current directory.
        logger = logging.getLogger("monitor")
        logger.setLevel(logging.DEBUG)
        fh = logging.FileHandler("monitor.log", mode = 'a')
        fh.setLevel(logging.DEBUG)
        formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
        fh.setFormatter(formatter)
        logger.addHandler(fh)
        from monitor import parser as parsermodule
        parser = parsermodule.getParser()
        # Defaults for the options registered below; main() relies on
        # nodelist, pcuid, pcuselect and site being None when not given.
        parser.set_defaults(nodelist=None, 
                                                increment=False, 
                                                pcuid=None,
                                                pcuselect=None,
                                                site=None,
                                                dbname="findbadpcus", 
                                                cachenodes=False,
                                                refresh=False,
                                                )
        parser.add_option("-f", "--nodelist", dest="nodelist", metavar="FILE", 
                                                help="Provide the input file for the node list")
        parser.add_option("", "--site", dest="site", metavar="FILE", 
                                                help="Get all pcus associated with the given site's nodes")
        parser.add_option("", "--pcuselect", dest="pcuselect", metavar="FILE", 
                                                help="Query string to apply to the findbad pcus")
        parser.add_option("", "--pcuid", dest="pcuid", metavar="id", 
                                                help="Provide the id for a single pcu")

        parser.add_option("", "--cachenodes", action="store_true",
                                                help="Cache node lookup from PLC")
        parser.add_option("", "--dbname", dest="dbname", metavar="FILE", 
                                                help="Specify the name of the database to which the information is saved")
        parser.add_option("", "--refresh", action="store_true", dest="refresh",
                                                help="Refresh the cached values")
        parser.add_option("-i", "--increment", action="store_true", dest="increment", 
                                                help="Increment round number to force refresh or retry")
        # NOTE(review): presumably layers the shared 'defaults' option group on
        # top of the parser built above -- confirm in monitor.parser.
        parser = parsermodule.getParser(['defaults'], parser)
        # Rebinds the module-level name `config` (imported from monitor at the
        # top of the file) to the parsed options; main() reads this global.
        config = parsermodule.parse_args(parser)
        try:
                # NOTE: evidently, there is a bizarre interaction between iLO and ssh
                # when LANG is set... Do not know why.  Unsetting LANG, fixes the problem.
                if 'LANG' in os.environ:
                        del os.environ['LANG']
                main()
                time.sleep(1)
        except Exception, err:
                # The failure is printed, but the process still exits with
                # status 0.
                traceback.print_exc()
                print "Exception: %s" % err
                print "Saving data... exitting."
                sys.exit(0)