468107daa6f20827824d19c6934543e5f35f25b4
[monitor.git] / findbadpcu.py
1 #!/usr/bin/python
2
3 import os
4 import sys
5 import string
6 import time
7 import socket
8 import sets
9 import signal
10 import traceback
11 from datetime import datetime,timedelta
12 import threadpool
13 import threading
14
15 import monitor
16 from pcucontrol  import reboot
17 from monitor import config
18 from monitor.database.info.model import FindbadPCURecordSync, FindbadPCURecord, session
19 from monitor import database
20 from monitor import util 
21 from monitor.wrapper import plc, plccache
22 from nodequery import pcu_select
23
# Held around every PLC / plccache lookup in get_pcu(), get_nodes() and
# get_plc_site_values(), so only one worker thread performs a lookup at a time.
plc_lock = threading.Lock()
# Current probing round.  main() and recordPingAndSSH() overwrite it with the
# 'round' stored in the FindbadPCURecordSync row that has plc_pcuid=0.
global_round = 1
# Maps "id_<pcu id>" to the error dict returned by collectPingAndSSH();
# recordPingAndSSH() dumps it with database.dbDump("findbadpcu_errors", ...).
errorState = {}
# Progress counter printed for every PCU that is recorded or skipped.
count = 0
28
def nmap_port_status(status):
        """
                Parse nmap "grepable" (-oG) host output into per-port states.

                status -- text printed by nmap for one host, for example
                        "Host: 10.0.0.1 () Ports: 22/open/tcp//ssh///, 80/closed/tcp//http///"
                        An empty string is accepted and yields no ports.

                Returns (ps, continue_probe):
                        ps             -- dict mapping port (string) to state (string)
                        continue_probe -- True if at least one port state is "open"
        """
        ps = {}
        continue_probe = False

        # The first four whitespace-separated tokens are skipped; they are
        # presumably the "Host: <addr> (<name>) Ports:" header of the line.
        for token in status.split()[4:]:
                fields = token.split('/')
                if len(fields) < 2:
                        # Not a "port/state/..." entry.  Extra tokens such as
                        # "Status:", "Up" or "Ignored State: closed (3)" can
                        # follow the header; they used to raise IndexError.
                        continue
                ps[fields[0]] = fields[1]
                if fields[1] == "open":
                        continue_probe = True
        return (ps, continue_probe)
41
def get_pcu(pcuname):
        """
                Return the PCU record whose 'pcu_id' equals pcuname.

                PLC is asked first via plc.GetPCUs().  Only if that call raises is
                the cached list plccache.l_pcus searched instead.

                Returns the PCU dict, or None when PLC returns no match, the cache
                has no match, or the cache lookup itself fails (its traceback is
                printed).
        """
        # Start from None so every path has a defined result: previously an
        # empty PLC answer returned [] and a cache miss left l_pcu unbound.
        l_pcu = None

        plc_lock.acquire()
        try:
                try:
                        matches = plc.GetPCUs({'pcu_id' : pcuname})
                        if len(matches) > 0:
                                l_pcu = matches[0]
                except Exception:
                        try:
                                for cached in plccache.l_pcus:
                                        if cached['pcu_id'] == pcuname:
                                                l_pcu = cached
                        except Exception:
                                traceback.print_exc()
                                l_pcu = None
        finally:
                # Release even if the fallback handler itself fails, so other
                # worker threads are never left waiting on the lock.
                plc_lock.release()

        return l_pcu
63
def get_nodes(node_ids):
        """
                Return the node records for the given node ids.

                PLC is asked first via plc.getNodes() for the fields 'hostname',
                'last_contact', 'node_id' and 'ports'.  Only if that call raises
                is the cached list plccache.l_plcnodes filtered by 'node_id'.

                Returns a non-empty list of node dicts, or None when nothing was
                found or the cache lookup failed (its traceback is printed).
        """
        l_node = []

        plc_lock.acquire()
        try:
                try:
                        l_node = plc.getNodes(node_ids, ['hostname', 'last_contact', 'node_id', 'ports'])
                except Exception:
                        try:
                                for cached in plccache.l_plcnodes:
                                        if cached['node_id'] in node_ids:
                                                l_node.append(cached)
                        except Exception:
                                traceback.print_exc()
                                l_node = None
        finally:
                # Release even if the fallback handler itself fails.
                plc_lock.release()

        # Callers test for None, so an empty result is reported as None.
        if not l_node:
                return None
        return l_node
83         
84
def get_plc_pcu_values(pcuname):
        """
                Build the combined PCU / node description for one PCU.

                The PCU record comes from get_pcu() and its nodes from get_nodes(),
                each of which tries PLC first and falls back to the cached copy.

                Returns None when no PCU record is available.  Otherwise returns a
                dict holding every field of the PCU record plus, when nodes were
                found:
                        <hostname>  -- first entry of that node's 'ports' list
                        'nodenames' -- list of the node hostnames
                        'node_id'   -- node_id of the first node
        """
        l_pcu = get_pcu(pcuname)

        # get_pcu() may hand back None or an empty list; neither is usable.
        if not l_pcu:
                return None

        values = {}
        l_node = get_nodes(l_pcu['node_ids'])

        if l_node is not None:
                for node in l_node:
                        values[node['hostname']] = node['ports'][0]

                values['nodenames'] = [node['hostname'] for node in l_node]

                # NOTE: this is for a dry run later. It doesn't matter which node.
                values['node_id'] = l_node[0]['node_id']

        # PCU fields are applied last, so they win over any clashing key above.
        values.update(l_pcu)
        return values
114
def get_plc_site_values(site_id):
        """
                Summarise one site as {'plcsite': {...}}.

                PLC is asked first via plc.getSites().  Only if that call raises
                is the cached list plccache.l_plcsites searched instead.

                Returns None when the site is not found or the cache lookup fails
                (its traceback is printed).  Otherwise returns a dict whose
                'plcsite' entry holds 'num_nodes', 'max_slices', 'num_slices',
                'login_base' and 'status' (always 'SUCCESS').
        """
        d_site = None

        plc_lock.acquire()
        try:
                try:
                        l_sites = plc.getSites({'site_id': site_id}, ['max_slices', 'slice_ids', 'node_ids', 'login_base'])
                        # An empty answer means "not found"; it used to be kept
                        # as [] and then indexed like a dict below.
                        if len(l_sites) > 0:
                                d_site = l_sites[0]
                except Exception:
                        try:
                                for cached in plccache.l_plcsites:
                                        if cached['site_id'] == site_id:
                                                d_site = cached
                                                break
                        except Exception:
                                traceback.print_exc()
        finally:
                # Release even if the fallback handler itself fails.
                plc_lock.release()

        if d_site is None:
                return None

        return {'plcsite' : {'num_nodes'  : len(d_site['node_ids']),
                             'max_slices' : d_site['max_slices'],
                             'num_slices' : len(d_site['slice_ids']),
                             'login_base' : d_site['login_base'],
                             'status'     : 'SUCCESS'}}
153
154
def collectPingAndSSH(pcuname, cohash):
        """
                Probe one PCU and collect everything recordPingAndSSH() stores.

                pcuname -- id of the PCU to probe
                cohash  -- not used by this function

                Steps: fetch the PCU description, check that the entry is complete,
                compare DNS with the registered IP, scan the management ports with
                nmap and finally run a dry-run reboot test.

                Returns (None, None, None) when the PCU description cannot be
                obtained.  Otherwise returns (pcuname, values, errors) where
                values is the collected dict and errors is None, or the same dict
                as values (with a 'traceback' entry) when probing raised.
        """

        def _blank(text):
                # Equality, not identity: 'is ""' only worked by interning accident.
                return text is None or text == ""

        errors = None
        values = {'reboot' : 'novalue'}

        ### GET PCU ######################
        try:
                v = get_plc_pcu_values(pcuname)
                # Check for None before touching the fields.
                if v is not None:
                        for key in ('hostname', 'ip'):
                                if v[key] is not None:
                                        v[key] = v[key].strip()
        except Exception:
                traceback.print_exc()
                return (None, None, None)

        if v is None:
                return (None, None, None)

        values['plc_pcu_stats'] = v

        try:
                pcu = values['plc_pcu_stats']
                continue_probe = True

                #### COMPLETE ENTRY   #######################
                values['entry_complete'] = []

                # Without a model or a password the probe cannot continue.
                for field in ('model', 'password'):
                        if _blank(pcu[field]):
                                values['entry_complete'] += [field]
                                continue_probe = False

                # A missing hostname or ip is recorded but handled below.
                for field in ('hostname', 'ip'):
                        if _blank(pcu[field]):
                                values['entry_complete'] += [field]

                # If there are no nodes associated with this PCU, then we cannot continue.
                if len(pcu['node_ids']) == 0:
                        continue_probe = False
                        values['entry_complete'] += ['NoNodeIds']

                #### DNS and IP MATCH #######################
                if not _blank(pcu['hostname']) and not _blank(pcu['ip']):
                        try:
                                ipaddr = socket.gethostbyname(pcu['hostname'])
                                if ipaddr == pcu['ip']:
                                        values['dns_status'] = "DNS-OK"
                                else:
                                        values['dns_status'] = "DNS-MISMATCH"
                                        continue_probe = False
                        except Exception:
                                # Name does not resolve: fall back to the IP.
                                values['dns_status'] = "DNS-NOENTRY"
                                pcu['hostname'] = pcu['ip']
                elif not _blank(pcu['ip']):
                        values['dns_status'] = "NOHOSTNAME"
                        pcu['hostname'] = pcu['ip']
                else:
                        values['dns_status'] = "NO-DNS-OR-IP"
                        pcu['hostname'] = "No_entry_in_DB"
                        continue_probe = False

                #### RUN NMAP ###############################
                if continue_probe:
                        nmap = util.command.CMD()
                        # NOTE(review): the PCU name is interpolated into a shell
                        # command line; confirm PLC data cannot carry shell
                        # metacharacters.
                        (oval, errval) = nmap.run_noexcept("nmap -oG - -P0 -p22,23,80,443,5869,9100,16992 %s | grep Host:" % reboot.pcu_name(pcu))
                        # NOTE: an empty / error value for oval, will still work.
                        (values['port_status'], continue_probe) = nmap_port_status(oval)
                else:
                        values['port_status'] = None

                ######  DRY RUN  ############################
                if 'node_ids' in pcu and len(pcu['node_ids']) > 0:
                        rb_ret = reboot.reboot_test_new(pcu['nodenames'][0], values, continue_probe, 1, True)
                else:
                        rb_ret = "Not_Run" # No nodes to test"

                values['reboot'] = rb_ret

        except Exception:
                print("____________________________________")
                print(values)
                # errors aliases values on purpose: the traceback ends up in both.
                errors = values
                print("____________________________________")
                errors['traceback'] = traceback.format_exc()
                print(errors['traceback'])
                values['reboot'] = errors['traceback']

        values['date_checked'] = time.time()
        return (pcuname, values, errors)
262
def recordPingAndSSH(request, result):
        """
                Thread-pool result callback: persist what collectPingAndSSH() returned.

                request -- the work request (not used here)
                result  -- the (nodename, values, errors) tuple from collectPingAndSSH()

                When errors is set it is stored in errorState under "id_<nodename>"
                and dumped with database.dbDump().  When values is set a
                FindbadPCURecord is created, the PCU's sync row is moved to the
                current round, and a progress line is printed.
        """
        global errorState
        global count
        global global_round
        (nodename, values, errors) = result

        # Dump the error state first, so it is saved even if building the
        # database record below fails on a partial result.
        if errors is not None:
                errorState["id_%s" % nodename] = errors
                database.dbDump("findbadpcu_errors", errorState)

        if values is None:
                return

        pcu_id = int(nodename)
        fbsync = FindbadPCURecordSync.findby_or_create(plc_pcuid=0,
                                        if_new_set={'round': global_round})
        global_round = fbsync.round
        fbnodesync = FindbadPCURecordSync.findby_or_create(plc_pcuid=pcu_id,
                                        if_new_set={'round' : global_round})

        # A probe that raised part-way through may not have set 'dns_status'
        # or 'port_status'; store None for those instead of raising KeyError.
        fbrec = FindbadPCURecord(
                        date_checked=datetime.fromtimestamp(values['date_checked']),
                        round=fbsync.round,
                        plc_pcuid=pcu_id,
                        plc_pcu_stats=values['plc_pcu_stats'],
                        dns_status=values.get('dns_status'),
                        port_status=values.get('port_status'),
                        entry_complete=" ".join(values.get('entry_complete', [])),
                        reboot_trial_status="%s" % values['reboot'],
                )
        fbnodesync.round = global_round

        fbnodesync.flush()
        fbsync.flush()
        fbrec.flush()

        count += 1
        print("%d %s %s" % (count, nodename, values))
300
301 # this will be called when an exception occurs within a thread
def handle_exception(request, result):
        """
                Report a failed work request on stdout.

                request -- the work request; its requestID is printed
                result  -- iterable describing the failure; each item is
                           printed on a line of its own
        """
        report = ["Exception occured in request %s" % request.requestID]
        report.extend("Result: %s" % item for item in result)
        print("\n".join(report))
306
307
def checkAndRecordState(l_pcus, cohash):
        """
                Probe every PCU in l_pcus that is behind the current round.

                l_pcus -- iterable of PCU ids (anything int() accepts)
                cohash -- passed through unchanged to collectPingAndSSH()

                A PCU whose sync round is already at global_round is skipped
                unless config.force is set.  The others are queued on a pool of
                10 worker threads running collectPingAndSSH(), with results
                handed to recordPingAndSSH().  The process exits with status 1
                if the pool has not drained after one hour.
        """
        global global_round
        global count

        max_runtime = 60*60*1   # seconds, i.e. one hour
        pool = threadpool.ThreadPool(10)

        # Queue one work request per PCU that still needs this round.
        for pcuname in l_pcus:
                pcu_id = int(pcuname)
                sync_rec = FindbadPCURecordSync.findby_or_create(plc_pcuid=pcu_id, if_new_set={'round' : 0})
                sync_rec.flush()

                node_round = sync_rec.round
                if not (node_round < global_round or config.force):
                        # Already up to date for this round: just report it.
                        count += 1
                        print("%d %s %s" % (count, pcu_id, node_round))
                        continue

                pool.putRequest(threadpool.WorkRequest(collectPingAndSSH, [pcuname, cohash], {},
                                                       None, recordPingAndSSH, handle_exception))

        # Poll once a second until every request has reported back.
        started = time.time()
        while True:
                try:
                        time.sleep(1)
                        pool.poll()
                        if time.time() - started > max_runtime:
                                print("findbadpcus.py has run out of time!!!!!!")
                                os._exit(1)
                except KeyboardInterrupt:
                        print("Interrupted!")
                        break
                except threadpool.NoResultsPending:
                        print("All results collected.")
                        break

        print(FindbadPCURecordSync.query.count())
        print(FindbadPCURecord.query.count())
        session.flush()
352
353
def main():
        """
                Choose the PCUs to probe from the command-line options and run
                checkAndRecordState() on them.

                Selection, first match wins:
                        config.site      -- PCUs attached to the nodes of that site
                        config.pcuselect -- PCUs returned by pcu_select()
                        neither nodelist nor pcuid -- every PCU in plccache.l_pcus
                        config.nodelist  -- ids read from that file
                        config.pcuid     -- that single id

                With config.increment the global round is bumped first, so every
                PCU counts as out of date.  Returns 0.
        """
        global global_round

        #  monitor.database.if_cached_else_refresh(1, config.refresh, "pculist", lambda : plc.GetPCUs())
        l_pcus = plccache.l_pcus
        cohash = {}

        fbsync = FindbadPCURecordSync.findby_or_create(plc_pcuid=0, if_new_set={'round' : global_round})

        global_round = fbsync.round

        if config.site is not None:
                api = plc.getAuthAPI()
                site = api.GetSites(config.site)
                l_nodes = api.GetNodes(site[0]['node_ids'], ['pcu_ids'])
                pcus = []
                for node in l_nodes:
                        pcus += node['pcu_ids']
                # clear out dups.
                l_pcus = list(set(pcus))
        elif config.pcuselect is not None:
                n, pcus = pcu_select(config.pcuselect)
                print(pcus)
                # clear out dups.
                l_pcus = list(set(pcus))
        elif config.nodelist is None and config.pcuid is None:
                print("Calling API GetPCUs() : refresh(%s)" % config.refresh)
                l_pcus = [pcu['pcu_id'] for pcu in l_pcus]
        elif config.nodelist is not None:
                l_pcus = [int(pcu) for pcu in util.file.getListFromFile(config.nodelist)]
        elif config.pcuid is not None:
                l_pcus = [int(config.pcuid)]

        if config.increment:
                # update global round number to force refreshes across all nodes
                global_round += 1
                fbsync.round = global_round
        fbsync.flush()

        checkAndRecordState(l_pcus, cohash)

        return 0
400
401
# NOTE(review): this print sits outside the __main__ guard, so it also runs
# whenever the module is imported.
print "main"
if __name__ == '__main__':
        # Send DEBUG-and-above records of the "monitor" logger to monitor.log
        # (append mode).
        import logging
        logger = logging.getLogger("monitor")
        logger.setLevel(logging.DEBUG)
        fh = logging.FileHandler("monitor.log", mode = 'a')
        fh.setLevel(logging.DEBUG)
        formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
        fh.setFormatter(formatter)
        logger.addHandler(fh)

        # Build the option parser: defaults first, then this script's options.
        from monitor import parser as parsermodule
        parser = parsermodule.getParser()
        parser.set_defaults(nodelist=None, 
                                                increment=False, 
                                                pcuid=None,
                                                pcuselect=None,
                                                site=None,
                                                dbname="findbadpcus", 
                                                cachenodes=False,
                                                cachecalls=True,
                                                force=False,
                                                )
        parser.add_option("-f", "--nodelist", dest="nodelist", metavar="FILE", 
                                                help="Provide the input file for the node list")
        # NOTE(review): metavar="FILE" on --site and --pcuselect does not match
        # their help text (a site and a query string) - confirm and rename.
        parser.add_option("", "--site", dest="site", metavar="FILE", 
                                                help="Get all pcus associated with the given site's nodes")
        parser.add_option("", "--pcuselect", dest="pcuselect", metavar="FILE", 
                                                help="Query string to apply to the findbad pcus")
        parser.add_option("", "--pcuid", dest="pcuid", metavar="id", 
                                                help="Provide the id for a single pcu")

        parser.add_option("", "--cachenodes", action="store_true",
                                                help="Cache node lookup from PLC")
        parser.add_option("", "--dbname", dest="dbname", metavar="FILE", 
                                                help="Specify the name of the database to which the information is saved")
        parser.add_option("", "--nocachecalls", action="store_false", dest="cachecalls",
                                                help="Refresh the cached values")
        parser.add_option("-i", "--increment", action="store_true", dest="increment", 
                                                help="Increment round number to force refresh or retry")
        parser.add_option("", "--force", action="store_true", dest="force", 
                                                help="Force probe without incrementing global 'round'.")
        parser = parsermodule.getParser(['defaults'], parser)
        # Rebinds the module-level name 'config' (imported from monitor at the
        # top of the file); main() and checkAndRecordState() read it as a global.
        config = parsermodule.parse_args(parser)
        if hasattr(config, 'cachecalls') and not config.cachecalls:
                # NOTE: if explicitly asked, refresh cached values.
                print "Reloading PLCCache"
                plccache.init()
        try:
                # NOTE: evidently, there is a bizarre interaction between iLO and ssh
                # when LANG is set... Do not know why.  Unsetting LANG, fixes the problem.
                if 'LANG' in os.environ:
                        del os.environ['LANG']
                main()
                time.sleep(1)
        except Exception, err:
                traceback.print_exc()
                print "Exception: %s" % err
                print "Saving data... exitting."
                # NOTE(review): exit status is 0 even though the run failed -
                # confirm whether callers rely on this.
                sys.exit(0)