modified *list templates with abbreviated information
[monitor.git] / findbadpcu.py
1 #!/usr/bin/python
2
3 import os
4 import sys
5 import string
6 import time
7 import socket
8 import sets
9 import signal
10 import traceback
11 from datetime import datetime,timedelta
12 import threadpool
13 import threading
14
15 import monitor
16 from pcucontrol  import reboot
17 from monitor import config
18 from monitor.database.info.model import FindbadPCURecordSync, FindbadPCURecord, session
19 from monitor import database
20 from monitor import util 
21 from monitor.wrapper import plc, plccache
22 from nodequery import pcu_select
23 from nodecommon import nmap_port_status
24
# Held around every call into the PLC API (and its cache fallback) by the
# lookup helpers below, which run inside the probe worker threads.
plc_lock = threading.Lock()

# Round number new records are stamped with; main() overwrites it with the
# stored value and optionally increments it.
global_round = 1

# Running tally of PCUs handled so far, printed as a progress counter.
count = 0

# Error reports keyed by "id_<pcu>", filled in by recordPingAndSSH.
errorState = dict()
29
def get_pcu(pcuname):
    """
    Look up a single PCU record by its pcu_id.

    PLC is asked first (plc.GetPCUs).  If that call raises, the list in
    plccache.l_pcus is searched instead.

    Returns the PCU record, or None when PLC answered with an empty list,
    when the cache holds no matching entry, or when the cache lookup
    itself failed (the traceback is printed in that case).
    """
    # Start from "not found" so every path below returns a defined value.
    l_pcu = None

    plc_lock.acquire()
    try:
        try:
            found = plc.GetPCUs({'pcu_id' : pcuname})
            # An empty reply means "no such PCU": report None instead of
            # handing the caller an empty list it would index like a dict.
            if len(found) > 0:
                l_pcu = found[0]
        except:
            try:
                for cached in plccache.l_pcus:
                    if cached['pcu_id'] == pcuname:
                        l_pcu = cached
            except:
                traceback.print_exc()
                l_pcu = None
    finally:
        # Always hand the lock back, whatever happened above.
        plc_lock.release()

    return l_pcu
51
def get_nodes(node_ids):
    """
    Fetch the node records for node_ids, preferring PLC over the cache.

    plc.getNodes is asked for hostname, last_contact, node_id and ports.
    If that call raises, the entries of plccache.l_plcnodes whose node_id
    is in node_ids are used instead.

    Returns the records, or None when the result is an empty list or the
    cache lookup itself failed (the traceback is printed in that case).
    """
    plc_lock.acquire()
    nodes = []
    wanted_fields = ['hostname', 'last_contact', 'node_id', 'ports']
    try:
        nodes = plc.getNodes(node_ids, wanted_fields)
    except:
        # The PLC call failed: filter the cached node list instead.
        try:
            nodes = [n for n in plccache.l_plcnodes if n['node_id'] in node_ids]
        except:
            traceback.print_exc()
            nodes = None
    plc_lock.release()

    # Callers test for None, so an empty result is reported the same way.
    if nodes == []:
        return None
    return nodes
71         
72
def get_plc_pcu_values(pcuname):
    """
    Collect what PLC knows about one PCU and the nodes attached to it.

    The PCU record comes from get_pcu() and the node records from
    get_nodes(); both try PLC first and fall back to the cached copy.

    Returns None when the PCU cannot be found.  Otherwise returns a dict
    holding every field of the PCU record plus, when node records were
    found:
      <hostname>  -> first entry of that node's 'ports' list
      'nodenames' -> list of the node hostnames
      'node_id'   -> node_id of the first node
    """
    pcu = get_pcu(pcuname)
    if pcu is None:
        return None

    result = {}

    # site_id is read but not used below; the lookup is kept so a record
    # without that key still fails at this point.
    site_id = pcu['site_id']
    nodes = get_nodes(pcu['node_ids'])

    if nodes is not None:
        hostnames = []
        for node in nodes:
            result[node['hostname']] = node['ports'][0]
            hostnames.append(node['hostname'])
        result['nodenames'] = hostnames

        # NOTE: this is for a dry run later. It doesn't matter which node.
        result['node_id'] = nodes[0]['node_id']

    # PCU fields are merged last, so they win over any same-named key.
    result.update(pcu)
    return result
102
def get_plc_site_values(site_id):
    """
    Summarise one PLC site.

    PLC is asked first (plc.getSites).  If that call raises, the list in
    plccache.l_plcsites is searched instead.

    Returns None when the site cannot be found (including an empty reply
    from PLC) or when the cache lookup itself failed.  Otherwise returns
      {'plcsite': {'num_nodes', 'max_slices', 'num_slices',
                   'login_base', 'status': 'SUCCESS'}}
    """
    d_site = None

    plc_lock.acquire()
    try:
        try:
            l_sites = plc.getSites({'site_id': site_id},
                                   ['max_slices', 'slice_ids', 'node_ids', 'login_base'])
            # An empty reply means "no such site"; leave d_site as None
            # rather than keeping a list that cannot be indexed by key.
            if len(l_sites) > 0:
                d_site = l_sites[0]
        except:
            try:
                for site in plccache.l_plcsites:
                    if site['site_id'] == site_id:
                        d_site = site
                        break
            except:
                traceback.print_exc()
                d_site = None
    finally:
        # Always hand the lock back, whatever happened above.
        plc_lock.release()

    if d_site is None:
        return None

    return {'plcsite' : {'num_nodes'  : len(d_site['node_ids']),
                         'max_slices' : d_site['max_slices'],
                         'num_slices' : len(d_site['slice_ids']),
                         'login_base' : d_site['login_base'],
                         'status'     : 'SUCCESS'}}
141
142
def _blank(value):
    """Return True when a PLC field is unset: None or the empty string."""
    return value is None or value == ""


def collectPingAndSSH(pcuname, cohash):
    """
    Probe one PCU and gather the values recordPingAndSSH stores.

    Steps: fetch the PLC record, scan the management ports with nmap,
    note which PLC fields are missing, compare DNS against the recorded
    IP, then run reboot.reboot_test_new against the first attached node.

    cohash is accepted for the caller's benefit and not used here.

    Returns (pcuname, values, errors):
      * (None, None, None) when the PLC record cannot be obtained.
      * Otherwise values holds 'plc_pcu_stats', 'port_status',
        'entry_complete', 'dns_status', 'reboot' and 'date_checked'
        (fewer keys if a later step raised).
      * errors is None on success.  If any step after the PLC lookup
        raised, errors is the very same dict as values, with the
        formatted traceback stored under 'traceback' and 'reboot'.
    """
    errors = None
    values = {'reboot' : 'novalue'}
    try:
        ### GET PCU ######################
        try:
            v = get_plc_pcu_values(pcuname)
            # Check for "not found" before touching any field.
            if v is not None:
                if v['hostname'] is not None: v['hostname'] = v['hostname'].strip()
                if v['ip'] is not None: v['ip'] = v['ip'].strip()
        except:
            traceback.print_exc()
            v = None

        if v is None:
            return (None, None, None)

        values['plc_pcu_stats'] = v
        stats = v

        #### RUN NMAP ###############################
        nmap = util.command.CMD()
        nmap_cmd = "nmap -oG - -P0 -p22,23,80,443,5869,9100,16992 %s | grep Host:" % reboot.pcu_name(stats)
        print(nmap_cmd)
        (oval, errval) = nmap.run_noexcept(nmap_cmd)
        # NOTE: an empty / error value for oval, will still work.
        (values['port_status'], continue_probe) = nmap_port_status(oval)

        # NOTE(review): continue_probe is maintained below but nothing in
        # this function reads it again; in particular the dry run is not
        # gated on it.  Confirm whether that is intended.

        #### COMPLETE ENTRY   #######################
        values['entry_complete'] = []

        # Without a model or a password the PCU cannot be driven.
        for field in ('model', 'password'):
            if _blank(stats[field]):
                values['entry_complete'] += [field]
                continue_probe = False

        # A missing hostname or ip is recorded, but is not fatal by itself.
        for field in ('hostname', 'ip'):
            if _blank(stats[field]):
                values['entry_complete'] += [field]

        # If there are no nodes associated with this PCU, then we cannot continue.
        if len(stats['node_ids']) == 0:
            continue_probe = False
            values['entry_complete'] += ['nodeids']

        #### DNS and IP MATCH #######################
        if not _blank(stats['hostname']) and not _blank(stats['ip']):
            try:
                ipaddr = socket.gethostbyname(stats['hostname'])
                if ipaddr == stats['ip']:
                    values['dns_status'] = "DNS-OK"
                else:
                    values['dns_status'] = "DNS-MISMATCH"
                    continue_probe = False
            except Exception:
                # The name does not resolve: use the recorded IP instead.
                values['dns_status'] = "DNS-NOENTRY"
                stats['hostname'] = stats['ip']
        elif not _blank(stats['ip']):
            values['dns_status'] = "NOHOSTNAME"
            stats['hostname'] = stats['ip']
        else:
            values['dns_status'] = "NO-DNS-OR-IP"
            stats['hostname'] = "No_entry_in_DB"
            continue_probe = False

        ######  DRY RUN  ############################
        if 'node_ids' in stats and len(stats['node_ids']) > 0:
            rb_ret = reboot.reboot_test_new(stats['nodenames'][0],
                                            values, 1, True)
        else:
            rb_ret = "Not_Run" # No nodes to test

        values['reboot'] = rb_ret

    except:
        print("____________________________________")
        print(values)
        # errors aliases values on purpose: the report carries everything
        # collected up to the point of failure.
        errors = values
        print("____________________________________")
        errors['traceback'] = traceback.format_exc()
        print(errors['traceback'])
        values['reboot'] = errors['traceback']

    values['date_checked'] = time.time()
    return (pcuname, values, errors)
252
def recordPingAndSSH(request, result):
    """
    Thread-pool callback: persist the outcome of one collectPingAndSSH call.

    result is the (nodename, values, errors) tuple that function returns.

    * When errors is not None it is stored in errorState under
      "id_<nodename>" and errorState is dumped to "findbadpcu_errors".
      This happens first so the report survives a failure while recording.
    * When values is not None and holds every field of a record, a
      FindbadPCURecord is created for the current round, the PCU's sync
      row is moved to that round, and the progress counter is bumped.
    * When values is incomplete (the probe raised part-way through) the
      record is skipped with a message.  The sync row keeps its old
      round in that case.
    """
    global count
    (nodename, values, errors) = result

    if errors is not None:
        errorState["id_%s" % nodename] = errors
        database.dbDump("findbadpcu_errors", errorState)

    if values is None:
        return

    # A probe that raised early leaves some of these keys unset; indexing
    # them here would raise out of the thread-pool poll loop.
    required = ('date_checked', 'plc_pcu_stats', 'dns_status',
                'port_status', 'entry_complete', 'reboot')
    missing = [key for key in required if key not in values]
    if missing:
        print("%s: probe did not collect %s; record skipped" % (nodename, " ".join(missing)))
        return

    pcu_id = int(nodename)
    fbnodesync = FindbadPCURecordSync.findby_or_create(plc_pcuid=pcu_id,
                                                       if_new_set={'round' : global_round})

    fbrec = FindbadPCURecord(
                date_checked=datetime.fromtimestamp(values['date_checked']),
                round=global_round,
                plc_pcuid=pcu_id,
                plc_pcu_stats=values['plc_pcu_stats'],
                dns_status=values['dns_status'],
                port_status=values['port_status'],
                entry_complete=" ".join(values['entry_complete']),
                reboot_trial_status="%s" % values['reboot'],
            )
    fbnodesync.round = global_round

    fbnodesync.flush()
    fbrec.flush()

    count += 1
    print("%d %s %s" % (count, nodename, values))
290
291 # this will be called when an exception occurs within a thread
def handle_exception(request, result):
    """
    Report a failed work request on stdout.

    Prints the request's requestID, then one "Result:" line for every
    item obtained by iterating result.
    """
    print("Exception occured in request %s" % request.requestID)
    for item in result:
        print("Result: %s" % item)
296
297
def checkAndRecordState(l_pcus, cohash):
    """
    Probe every PCU in l_pcus that is behind the current round.

    For each PCU id the sync row is found or created (new rows start at
    round 0).  A PCU whose round is below global_round, or any PCU when
    config.force is set, is queued on a 10-thread pool to be probed by
    collectPingAndSSH and recorded by recordPingAndSSH.  The others are
    counted and printed as already up to date.

    The pool is then polled once a second until it reports that no
    results are pending or the user interrupts.  If polling lasts more
    than one hour the process is terminated with os._exit(1).
    Finally the row counts of both tables are printed and the session is
    flushed.
    """
    global count

    pool = threadpool.ThreadPool(10)

    # Queue one work request per PCU that still needs probing.
    for pcuname in l_pcus:
        pcu_id = int(pcuname)
        sync = FindbadPCURecordSync.findby_or_create(plc_pcuid=pcu_id, if_new_set={'round' : 0})
        sync.flush()

        last_round = sync.round
        if not (last_round < global_round or config.force):
            # Already probed in this round: just report it.
            count += 1
            print("%d %s %s" % (count, pcu_id, last_round))
            continue

        pool.putRequest(threadpool.WorkRequest(collectPingAndSSH, [pcuname, cohash], {},
                                               None, recordPingAndSSH, handle_exception))

    # Wait for the workers, giving up after one hour.
    time_limit = 60*60*1
    started = time.time()
    while True:
        try:
            time.sleep(1)
            pool.poll()
            if time.time() - started > time_limit:
                print("findbadpcus.py has run out of time!!!!!!")
                os._exit(1)
        except KeyboardInterrupt:
            print("Interrupted!")
            break
        except threadpool.NoResultsPending:
            print("All results collected.")
            break

    print(FindbadPCURecordSync.query.count())
    print(FindbadPCURecord.query.count())
    session.flush()
342
343
def main():
    """
    Choose the PCUs to probe from the command-line options and probe them.

    The current round is loaded from the sync row with plc_pcuid 0
    (created with the default round if absent).  The PCU list is taken,
    in order of precedence, from:
      --site       PCUs of all nodes at the given site (via the PLC API)
      --pcuselect  result of pcu_select() on the query string
      --nodelist   ids read from a file
      --pcuid      a single id
      otherwise    every pcu_id in plccache.l_pcus

    With --increment the round is bumped before probing and written back
    to the sync row afterwards.  Returns 0.
    """
    global global_round

    cohash = {}
    l_pcus = plccache.l_pcus

    fbsync = FindbadPCURecordSync.findby_or_create(plc_pcuid=0, if_new_set={'round' : global_round})
    global_round = fbsync.round

    if config.site is not None:
        api = plc.getAuthAPI()
        site = api.GetSites(config.site)
        l_nodes = api.GetNodes(site[0]['node_ids'], ['pcu_ids'])
        pcus = []
        for node in l_nodes:
            pcus += node['pcu_ids']
        # clear out dups.
        l_pcus = list(sets.Set(pcus))
    elif config.pcuselect is not None:
        n, pcus = pcu_select(config.pcuselect)
        print(pcus)
        # clear out dups.
        l_pcus = list(sets.Set(pcus))
    elif config.nodelist is not None:
        l_pcus = [int(pcu) for pcu in util.file.getListFromFile(config.nodelist)]
    elif config.pcuid is not None:
        l_pcus = [int(config.pcuid)]
    else:
        # Neither a list file nor a single id was given: take everything cached.
        print("Calling API GetPCUs() : cachecalls(%s)" % config.cachecalls)
        l_pcus = [pcu['pcu_id'] for pcu in l_pcus]

    if config.increment:
        # update global round number to force refreshes across all nodes
        global_round += 1

    checkAndRecordState(l_pcus, cohash)

    if config.increment:
        # persist the new round number now that the probes have run
        fbsync.round = global_round
        fbsync.flush()
        session.flush()

    return 0
394
395
print("main")
if __name__ == '__main__':
    import logging

    # Append everything logged under "monitor" (DEBUG and up) to monitor.log.
    logger = logging.getLogger("monitor")
    logger.setLevel(logging.DEBUG)
    fh = logging.FileHandler("monitor.log", mode = 'a')
    fh.setLevel(logging.DEBUG)
    formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
    fh.setFormatter(formatter)
    logger.addHandler(fh)

    from monitor import parser as parsermodule
    parser = parsermodule.getParser()
    parser.set_defaults(nodelist=None, increment=False, pcuid=None,
                        pcuselect=None, site=None, dbname="findbadpcus",
                        cachenodes=False, cachecalls=True, force=False)

    # Options that select which PCUs are probed.
    parser.add_option("-f", "--nodelist", dest="nodelist", metavar="FILE",
                      help="Provide the input file for the node list")
    parser.add_option("", "--site", dest="site", metavar="FILE",
                      help="Get all pcus associated with the given site's nodes")
    parser.add_option("", "--pcuselect", dest="pcuselect", metavar="FILE",
                      help="Query string to apply to the findbad pcus")
    parser.add_option("", "--pcuid", dest="pcuid", metavar="id",
                      help="Provide the id for a single pcu")

    # Options that control caching, storage and the round counter.
    parser.add_option("", "--cachenodes", action="store_true",
                      help="Cache node lookup from PLC")
    parser.add_option("", "--dbname", dest="dbname", metavar="FILE",
                      help="Specify the name of the database to which the information is saved")
    parser.add_option("", "--nocachecalls", action="store_false", dest="cachecalls",
                      help="Refresh the cached values")
    parser.add_option("-i", "--increment", action="store_true", dest="increment",
                      help="Increment round number to force refresh or retry")
    parser.add_option("", "--force", action="store_true", dest="force",
                      help="Force probe without incrementing global 'round'.")
    parser = parsermodule.getParser(['defaults'], parser)

    # Rebinds the module-level config that main() and its helpers read.
    config = parsermodule.parse_args(parser)
    if hasattr(config, 'cachecalls') and not config.cachecalls:
        # NOTE: if explicitly asked, refresh cached values.
        print("Reloading PLCCache")
        plccache.init()
    try:
        # NOTE: evidently, there is a bizarre interaction between iLO and ssh
        # when LANG is set... Do not know why.  Unsetting LANG, fixes the problem.
        if 'LANG' in os.environ:
            del os.environ['LANG']
        main()
        time.sleep(1)
    except Exception:
        err = sys.exc_info()[1]
        traceback.print_exc()
        print("Exception: %s" % err)
        print("Saving data... exitting.")
        sys.exit(0)