11 from datetime import datetime,timedelta
16 from monitor.pcu import reboot
17 from monitor import config
18 from monitor.database.info.model import FindbadPCURecordSync, FindbadPCURecord, session
19 from monitor import util
20 from monitor.wrapper import plc, plccache
21 from nodequery import pcu_select
# Module-level lock object shared by everything in this file.
# NOTE(review): presumably serialises calls into the PLC API made from the
# worker threads; the acquire/release sites are not visible here -- confirm.
23 plc_lock = threading.Lock()
def nmap_portstatus(status):
    """Parse nmap grepable (``-oG``) output into a port -> state mapping.

    ``status`` is the text of the ``Host:`` line(s), for example::

        Host: 10.0.0.1 ()<TAB>Ports: 22/open/tcp//ssh///, 23/closed/tcp//telnet///

    Returns a tuple ``(ps, continue_probe)``:

    * ``ps`` maps each port number (as a string) to the state string
      reported for it (``"open"``, ``"closed"``, ``"filtered"``, ...).
    * ``continue_probe`` is True when at least one port is ``"open"``.

    An empty ``status`` string yields ``({}, False)``.
    """
    ps = {}
    continue_probe = False
    # The first four whitespace-separated tokens are 'Host:', the address,
    # the '(hostname)' field and the 'Ports:' label; port entries follow.
    for port in status.split()[4:]:
        results = port.split('/')
        if len(results) < 2:
            # Not a 'port/state/...' entry (for instance the words of a
            # 'Status: Up' line or an 'Ignored State: ...' field); skip it
            # rather than fail with IndexError on results[1].
            continue
        ps[results[0]] = results[1]
        if results[1] == "open":
            continue_probe = True
    return (ps, continue_probe)
# NOTE(review): these lines look like the interior of the helper invoked as
# get_pcu(pcuname) further down; its def line and control flow are not
# visible here, so the relationship between the two lookups is unconfirmed.
44 #print "GetPCU from PLC %s" % pcuname
# Ask PLC for the PCU record(s) whose 'pcu_id' equals pcuname.
45 l_pcu = plc.GetPCUs({'pcu_id' : pcuname})
51 #print "GetPCU from file %s" % pcuname
# Take the PCU list held by plccache ...
52 l_pcus = plccache.l_pcus
# ... and test each entry's 'pcu_id' against the requested one.
54 if i['pcu_id'] == pcuname:
# Look up node records for the node ids in node_ids.
# Two sources appear below: a PLC query and the node list held by plccache.
# NOTE(review): presumably the cache is the fallback when the PLC query
# fails; the try/except lines are not visible here -- confirm.
63 def get_nodes(node_ids):
# Request only the hostname, last_contact, node_id and ports fields.
67 l_node = plc.getNodes(node_ids, ['hostname', 'last_contact', 'node_id', 'ports'])
70 plc_nodes = plccache.l_plcnodes
# Test whether a cached node is one of the requested ids.
72 if n['node_id'] in node_ids:
# Gather the PLC-side description of one PCU: the PCU record is obtained with
# get_pcu(pcuname) and the nodes listed in it are resolved with get_nodes().
# Per-node details are then stored in a dict named 'values'.
# NOTE(review): where 'values' is created and returned is not visible here.
84 def get_plc_pcu_values(pcuname):
86 Try to contact PLC to get the PCU info.
87 If that fails, try a backup copy from the last run.
88 If that fails, return None
92 l_pcu = get_pcu(pcuname)
# Read the 'site_id' and 'node_ids' fields of the PCU record.
95 site_id = l_pcu['site_id']
96 node_ids = l_pcu['node_ids']
97 l_node = get_nodes(node_ids)
# Node details are only added when get_nodes() produced a result.
99 if l_node is not None:
# Key: node hostname; value: the first element of that node's 'ports' list.
101 values[node['hostname']] = node['ports'][0]
# Hostnames of all returned nodes, in l_node order.
103 values['nodenames'] = [node['hostname'] for node in l_node]
105 # NOTE: this is for a dry run later. It doesn't matter which node.
# NOTE(review): l_node[0] raises IndexError when l_node is an empty list;
# whether an enclosing handler covers that is not visible here -- confirm.
106 values['node_id'] = l_node[0]['node_id']
# Collect summary values for the site identified by site_id and store them
# under values['plcsite'].
# NOTE(review): creation/return of 'values' and the try/except structure are
# not visible here.
114 def get_plc_site_values(site_id):
115 ### GET PLC SITE ######################
# Ask PLC for the site, requesting only the four fields used below.
121 d_site = plc.getSites({'site_id': site_id}, ['max_slices', 'slice_ids', 'node_ids', 'login_base'])
# Scan the site list held by plccache for the same site_id.
# NOTE(review): presumably the fallback when the PLC query fails -- confirm.
126 plc_sites = plccache.l_plcsites
127 for site in plc_sites:
128 if site['site_id'] == site_id:
132 traceback.print_exc()
# Summarise only when a site record was obtained.
137 if d_site is not None:
138 max_slices = d_site['max_slices']
# Slice and node counts are the lengths of the respective id lists.
139 num_slices = len(d_site['slice_ids'])
140 num_nodes = len(d_site['node_ids'])
141 loginbase = d_site['login_base']
142 values['plcsite'] = {'num_nodes' : num_nodes,
143 'max_slices' : max_slices,
144 'num_slices' : num_slices,
145 'login_base' : loginbase,
146 'status' : 'SUCCESS'}
# Probe a single PCU and collect the findings in the dict 'values'.
# Visible stages: fetch the PLC record, check that the entry is complete,
# compare DNS with the recorded IP, port-scan with nmap, then run a reboot
# dry run through reboot.reboot_test().
# Returns (None, None, None) when the PLC record could not be obtained,
# otherwise (pcuname, values, errors).
# This function is handed to threadpool.WorkRequest elsewhere in the file.
# NOTE(review): 'cohash' is not used on any line visible here.
154 def collectPingAndSSH(pcuname, cohash):
# continue_probe is cleared by every check below that makes further probing
# pointless; it is finally passed on to reboot.reboot_test().
156 continue_probe = True
158 values = {'reboot' : 'novalue'}
159 ### GET PCU ######################
163 v = get_plc_pcu_values(pcuname)
# Strip surrounding whitespace from the hostname and ip fields.
# NOTE(review): v is subscripted here before any visible None check, while
# get_plc_pcu_values documents a None result; presumably the enclosing
# handler (see traceback.print_exc below) absorbs the TypeError -- confirm.
164 if v['hostname'] is not None: v['hostname'] = v['hostname'].strip()
165 if v['ip'] is not None: v['ip'] = v['ip'].strip()
168 values['plc_pcu_stats'] = v
170 continue_probe = False
173 traceback.print_exc()
174 continue_probe = False
# Give up early: all three tuple members are None in that case.
176 if b_except or not continue_probe: return (None, None, None)
179 #### COMPLETE ENTRY #######################
# 'complete_entry' accumulates the names of missing or empty fields.
181 values['complete_entry'] = []
182 #if values['protocol'] is None or values['protocol'] is "":
183 # values['complete_entry'] += ["protocol"]
# NOTE(review): `is ""` / `is not ""` in this function compare object
# identity with a string literal, not equality; `== ""` / `!= ""` is the
# equality test. The result for an empty string depends on interning.
184 if values['plc_pcu_stats']['model'] is None or values['plc_pcu_stats']['model'] is "":
185 values['complete_entry'] += ["model"]
186 # Cannot continue due to this condition
187 continue_probe = False
189 if values['plc_pcu_stats']['password'] is None or values['plc_pcu_stats']['password'] is "":
190 values['complete_entry'] += ["password"]
191 # Cannot continue due to this condition
192 continue_probe = False
194 if len(values['complete_entry']) > 0:
195 continue_probe = False
# hostname and ip are recorded as missing after the length check above.
197 if values['plc_pcu_stats']['hostname'] is None or values['plc_pcu_stats']['hostname'] is "":
198 values['complete_entry'] += ["hostname"]
199 if values['plc_pcu_stats']['ip'] is None or values['plc_pcu_stats']['ip'] is "":
200 values['complete_entry'] += ["ip"]
202 # If there are no nodes associated with this PCU, then we cannot continue.
203 if len(values['plc_pcu_stats']['node_ids']) == 0:
204 continue_probe = False
205 values['complete_entry'] += ['NoNodeIds']
207 #### DNS and IP MATCH #######################
# values['dnsmatch'] ends up as one of: DNS-OK, DNS-MISMATCH, DNS-NOENTRY,
# NOHOSTNAME, NO-DNS-OR-IP.
208 if values['plc_pcu_stats']['hostname'] is not None and values['plc_pcu_stats']['hostname'] is not "" and \
209 values['plc_pcu_stats']['ip'] is not None and values['plc_pcu_stats']['ip'] is not "":
210 #print "Calling socket.gethostbyname(%s)" % values['hostname']
# Resolve the recorded hostname and compare it with the recorded ip.
212 ipaddr = socket.gethostbyname(values['plc_pcu_stats']['hostname'])
213 if ipaddr == values['plc_pcu_stats']['ip']:
214 values['dnsmatch'] = "DNS-OK"
216 values['dnsmatch'] = "DNS-MISMATCH"
217 continue_probe = False
# Failed lookup: from here on the ip is used in place of the hostname.
219 except Exception, err:
220 values['dnsmatch'] = "DNS-NOENTRY"
221 values['plc_pcu_stats']['hostname'] = values['plc_pcu_stats']['ip']
# Hostname absent but ip present: use the ip as the hostname.
224 if values['plc_pcu_stats']['ip'] is not None and values['plc_pcu_stats']['ip'] is not "":
225 values['dnsmatch'] = "NOHOSTNAME"
226 values['plc_pcu_stats']['hostname'] = values['plc_pcu_stats']['ip']
# Neither hostname nor ip: store a placeholder and stop probing.
228 values['dnsmatch'] = "NO-DNS-OR-IP"
229 values['plc_pcu_stats']['hostname'] = "No_entry_in_DB"
230 continue_probe = False
232 #### RUN NMAP ###############################
234 nmap = util.command.CMD()
# Scan a fixed port list and keep only the "Host:" lines of the grepable
# output. The name 'eval' below shadows the Python builtin of that name.
# NOTE(review): the target string is interpolated into a shell pipeline;
# presumably it comes from trusted PLC data -- confirm.
235 (oval,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,23,80,443,5869,9100,16992 %s | grep Host:" % reboot.pcu_name(values['plc_pcu_stats']))
236 # NOTE: an empty / error value for oval, will still work.
# continue_probe is overwritten here with nmap_portstatus()'s second result.
237 (values['portstatus'], continue_probe) = nmap_portstatus(oval)
239 values['portstatus'] = None
242 ###### DRY RUN ############################
# The dry run is attempted only when the PCU has at least one node id; the
# first entry of 'nodenames' is the node used.
243 if 'node_ids' in values['plc_pcu_stats'] and len(values['plc_pcu_stats']['node_ids']) > 0:
244 rb_ret = reboot.reboot_test(values['plc_pcu_stats']['nodenames'][0], values, continue_probe, 1, True)
246 rb_ret = "Not_Run" # No nodes to test
248 values['reboot'] = rb_ret
251 print "____________________________________"
254 print "____________________________________"
# Keep the formatted traceback in errors['traceback'] and print it too.
255 errors['traceback'] = traceback.format_exc()
256 print errors['traceback']
# Timestamp of this check, in seconds since the epoch.
258 values['date_checked'] = time.time()
259 return (pcuname, values, errors)
# Store the outcome of one collectPingAndSSH() run.
# 'result' is the 3-tuple produced by collectPingAndSSH; 'request' is not
# used on any line visible here. This function is passed to
# threadpool.WorkRequest elsewhere in the file, next to handle_exception.
261 def recordPingAndSSH(request, result):
265 (nodename, values, errors) = result
# Database records are written only when probe values were produced.
267 if values is not None:
268 pcu_id = int(nodename)
# The sync row with plc_pcuid=0 carries the global round number.
# NOTE(review): global_round is read and then assigned in this function, so
# a `global` declaration is presumably on a line not visible here -- confirm.
269 fbsync = FindbadPCURecordSync.findby_or_create(plc_pcuid=0,
270 if_new_set={'round': global_round})
271 global_round = fbsync.round
# Per-PCU sync row, created with the current round when it is new.
272 fbnodesync = FindbadPCURecordSync.findby_or_create(plc_pcuid=pcu_id,
273 if_new_set={'round' : global_round})
# New FindbadPCURecord built from the probe values.
275 fbrec = FindbadPCURecord(
276 date_checked=datetime.fromtimestamp(values['date_checked']),
279 plc_pcu_stats=values['plc_pcu_stats'],
280 dns_status=values['dnsmatch'],
281 port_status=values['portstatus'],
282 entry_complete=" ".join(values['complete_entry']),
283 reboot_trial_status="%s" % values['reboot'],
# Bring this PCU's sync row up to the current global round.
285 fbnodesync.round = global_round
292 print "%d %s %s" % (count, nodename, values)
# Errors are kept in errorState under the key "id_<nodename>" and dumped
# under the name "findbadpcu_errors".
294 if errors is not None:
295 pcu_id = "id_%s" % nodename
296 errorState[pcu_id] = errors
297 database.dbDump("findbadpcu_errors", errorState)
# this will be called when an exception occurs within a thread
def handle_exception(request, result):
    """Print a report for a work request that raised an exception.

    request -- the work request that failed; its ``requestID`` attribute
               is included in the first output line.
    result  -- iterable describing the failure; every element is printed
               on its own ``Result:`` line.

    Nothing is returned; the output goes to standard output.
    """
    # A single parenthesised argument keeps each line valid both as a
    # Python 2 print statement and as a Python 3 print() call.
    print("Exception occurred in request %s" % request.requestID)
    for i in result:
        print("Result: %s" % i)
# Queue one probe per PCU id in l_pcus on a thread pool, then wait for the
# results. A PCU is queued only when its stored round number is behind
# global_round; the record counts of both tables are printed at the end.
306 def checkAndRecordState(l_pcus, cohash):
# Thread pool constructed with the argument 10.
310 tp = threadpool.ThreadPool(10)
312 # CREATE all the work requests
313 for pcuname in l_pcus:
314 pcu_id = int(pcuname)
# Sync row for this PCU; a newly created row starts at round 0.
315 fbnodesync = FindbadPCURecordSync.findby_or_create(plc_pcuid=pcu_id, if_new_set={'round' : 0})
318 node_round = fbnodesync.round
319 if node_round < global_round:
320 # recreate node stats when refreshed
321 #print "%s" % nodename
# collectPingAndSSH does the work; recordPingAndSSH and handle_exception
# are supplied as the two trailing callbacks.
322 req = threadpool.WorkRequest(collectPingAndSSH, [pcuname, cohash], {},
323 None, recordPingAndSSH, handle_exception)
326 # We just skip it, since it's "up to date"
328 print "%d %s %s" % (count, pcu_id, node_round)
330 # WAIT while all the work requests are processed.
336 # if more than one hour (the limit below is 60*60*1 seconds)
337 if time.time() - begin > (60*60*1):
338 print "findbadpcus.py has run out of time!!!!!!"
340 except KeyboardInterrupt:
# threadpool.NoResultsPending signals that nothing is left to collect.
343 except threadpool.NoResultsPending:
344 print "All results collected."
# Final record counts of the sync table and the record table.
347 print FindbadPCURecordSync.query.count()
348 print FindbadPCURecord.query.count()
# NOTE(review): these lines look like the body of the script's main routine;
# its def line is not visible here. They choose the list of PCU ids to check
# from the command-line configuration and hand it to checkAndRecordState().
355 # monitor.database.if_cached_else_refresh(1, config.refresh, "pculist", lambda : plc.GetPCUs())
# Default source of PCU records: the list held by plccache.
356 l_pcus = plccache.l_pcus
# The sync row with plc_pcuid=0 stores the global round number.
359 fbsync = FindbadPCURecordSync.findby_or_create(plc_pcuid=0, if_new_set={'round' : global_round})
361 global_round = fbsync.round
364 # update global round number to force refreshes across all nodes
366 fbsync.round = global_round
# Selection by site: gather the pcu_ids of every node of the given site.
370 if config.site is not None:
371 api = plc.getAuthAPI()
372 site = api.GetSites(config.site)
373 l_nodes = api.GetNodes(site[0]['node_ids'], ['pcu_ids'])
376 pcus += node['pcu_ids']
# sets.Set removes duplicate ids; the resulting order is not defined.
# NOTE(review): the sets module is deprecated in favour of the built-in set.
378 l_pcus = [pcu for pcu in sets.Set(pcus)]
# Selection by query string, evaluated by pcu_select().
379 elif config.pcuselect is not None:
380 n, pcus = pcu_select(config.pcuselect)
382 l_pcus = [pcu for pcu in sets.Set(pcus)]
# No node list and no single id given: use every PCU in the plccache list.
# NOTE(review): `== None` works here, but `is None` is the usual test.
# The message mentions calling GetPCUs(), yet the call above is commented
# out and the ids come from plccache.l_pcus.
384 elif config.nodelist == None and config.pcuid == None:
385 print "Calling API GetPCUs() : refresh(%s)" % config.refresh
386 l_pcus = [pcu['pcu_id'] for pcu in l_pcus]
# Ids read from the file named by --nodelist, converted to int.
387 elif config.nodelist is not None:
388 l_pcus = util.file.getListFromFile(config.nodelist)
389 l_pcus = [int(pcu) for pcu in l_pcus]
# A single id given with --pcuid, converted to int.
390 elif config.pcuid is not None:
391 l_pcus = [ config.pcuid ]
392 l_pcus = [int(pcu) for pcu in l_pcus]
394 checkAndRecordState(l_pcus, cohash)
# Script entry point: set up file logging, build the option parser, parse
# the command line into 'config', adjust the environment and report any
# exception that escapes.
399 if __name__ == '__main__':
# Everything at DEBUG level and above from the "monitor" logger is appended
# to monitor.log using the "<time> <level> <message>" format.
401 logger = logging.getLogger("monitor")
402 logger.setLevel(logging.DEBUG)
403 fh = logging.FileHandler("monitor.log", mode = 'a')
404 fh.setLevel(logging.DEBUG)
405 formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
406 fh.setFormatter(formatter)
407 logger.addHandler(fh)
# The project's parser module supplies the base option parser.
408 from monitor import parser as parsermodule
409 parser = parsermodule.getParser()
410 parser.set_defaults(nodelist=None,
415 dbname="findbadpcus",
419 parser.add_option("-f", "--nodelist", dest="nodelist", metavar="FILE",
420 help="Provide the input file for the node list")
# NOTE(review): --site and --pcuselect reuse metavar="FILE" although their
# help texts describe a site and a query string -- looks like copy/paste.
421 parser.add_option("", "--site", dest="site", metavar="FILE",
422 help="Get all pcus associated with the given site's nodes")
423 parser.add_option("", "--pcuselect", dest="pcuselect", metavar="FILE",
424 help="Query string to apply to the findbad pcus")
425 parser.add_option("", "--pcuid", dest="pcuid", metavar="id",
426 help="Provide the id for a single pcu")
428 parser.add_option("", "--cachenodes", action="store_true",
429 help="Cache node lookup from PLC")
430 parser.add_option("", "--dbname", dest="dbname", metavar="FILE",
431 help="Specify the name of the database to which the information is saved")
432 parser.add_option("", "--refresh", action="store_true", dest="refresh",
433 help="Refresh the cached values")
434 parser.add_option("-i", "--increment", action="store_true", dest="increment",
435 help="Increment round number to force refresh or retry")
# The parser built above is passed back through getParser() together with
# the ['defaults'] list before the arguments are parsed into 'config'.
436 parser = parsermodule.getParser(['defaults'], parser)
437 config = parsermodule.parse_args(parser)
439 # NOTE: evidently, there is a bizarre interaction between iLO and ssh
440 # when LANG is set... Do not know why. Unsetting LANG, fixes the problem.
441 if 'LANG' in os.environ:
442 del os.environ['LANG']
# Any exception reaching this point is printed with its traceback.
445 except Exception, err:
446 traceback.print_exc()
447 print "Exception: %s" % err
448 print "Saving data... exitting."