11 from datetime import datetime,timedelta
16 from pcucontrol import reboot
17 from monitor import config
18 from monitor.database.info.model import FindbadPCURecordSync, FindbadPCURecord, session
19 from monitor import database
20 from monitor import util
21 from monitor.wrapper import plc, plccache
22 from nodequery import pcu_select
# Module-level lock serializing access to the PLC API from worker threads.
24 plc_lock = threading.Lock()
# Parse the "Host:" line of `nmap -oG` output into a {port: state} mapping.
# Returns (ps, continue_probe): continue_probe is set True only when at least
# one scanned port reports "open" (otherwise the PCU probe is abandoned).
# NOTE(review): several original lines are missing from this view (the ps
# initialization, the loop over ports, and the open-port branch body) —
# confirm details against the full file.
29 def nmap_port_status(status):
31 	l_nmap = status.split()
34 	continue_probe = False
# Each nmap port token looks like "22/open/tcp//ssh//"; split on '/' gives
# [port, state, ...].
36 	results = port.split('/')
37 	ps[results[0]] = results[1]
38 	if results[1] == "open":
40 	return (ps, continue_probe)
45 #print "GetPCU from PLC %s" % pcuname
46 l_pcu = plc.GetPCUs({'pcu_id' : pcuname})
52 #print "GetPCU from file %s" % pcuname
53 l_pcus = plccache.l_pcus
55 if i['pcu_id'] == pcuname:
# Fetch node records (hostname, last_contact, node_id, ports) for the given
# node_ids.  Tries the live PLC API first; on failure falls back to scanning
# the cached plccache.l_plcnodes list for matching node_ids.
# NOTE(review): the try/except scaffolding and the fallback loop header are
# missing from this view — fallback behavior inferred, confirm in full file.
64 def get_nodes(node_ids):
68 	l_node = plc.getNodes(node_ids, ['hostname', 'last_contact', 'node_id', 'ports'])
# Fallback path: filter the cached node list by node_id.
71 	plc_nodes = plccache.l_plcnodes
73 	if n['node_id'] in node_ids:
85 def get_plc_pcu_values(pcuname):
87 	Try to contact PLC to get the PCU info.
88 	If that fails, try a backup copy from the last run.
89 	If that fails, return None
93 	l_pcu = get_pcu(pcuname)
# Augment the raw PCU record with per-node information for its site.
96 	site_id = l_pcu['site_id']
97 	node_ids = l_pcu['node_ids']
98 	l_node = get_nodes(node_ids)
100 	if l_node is not None:
# Map each hostname to its first PCU port assignment.
# NOTE(review): the enclosing `for node in l_node:` line is not visible in
# this chunk — confirm against the full file.
102 	values[node['hostname']] = node['ports'][0]
104 	values['nodenames'] = [node['hostname'] for node in l_node]
106 	# NOTE: this is for a dry run later. It doesn't matter which node.
107 	values['node_id'] = l_node[0]['node_id']
# Build a {'plcsite': {...}} summary (node/slice counts, login_base) for a
# site.  Tries the live PLC API, then falls back to the plccache site list;
# exceptions during the fallback are printed and the lookup yields None.
115 def get_plc_site_values(site_id):
116 	### GET PLC SITE ######################
122 	d_site = plc.getSites({'site_id': site_id}, ['max_slices', 'slice_ids', 'node_ids', 'login_base'])
# Fallback: linear scan of the cached site list for a matching site_id.
127 	plc_sites = plccache.l_plcsites
128 	for site in plc_sites:
129 	if site['site_id'] == site_id:
133 	traceback.print_exc()
# Summarize only when one of the two lookups succeeded.
138 	if d_site is not None:
139 	max_slices = d_site['max_slices']
140 	num_slices = len(d_site['slice_ids'])
141 	num_nodes = len(d_site['node_ids'])
142 	loginbase = d_site['login_base']
143 	values['plcsite'] = {'num_nodes' : num_nodes,
144 	'max_slices' : max_slices,
145 	'num_slices' : num_slices,
146 	'login_base' : loginbase,
147 	'status' : 'SUCCESS'}
# Probe a single PCU end-to-end: fetch its PLC record, validate required
# fields, check DNS-vs-IP consistency, nmap the common management ports,
# and finally attempt a dry-run reboot test.  Returns (pcuname, values,
# errors); returns (None, None, None) when the PLC lookup fails or the
# record is unusable.
# NOTE(review): this chunk is a sampled view — several try/except and
# else lines are missing; control flow is partially inferred.
155 def collectPingAndSSH(pcuname, cohash):
157 	continue_probe = True
159 	values = {'reboot' : 'novalue'}
160 	### GET PCU ######################
164 	v = get_plc_pcu_values(pcuname)
# Normalize whitespace on hostname/ip before comparisons below.
165 	if v['hostname'] is not None: v['hostname'] = v['hostname'].strip()
166 	if v['ip'] is not None: v['ip'] = v['ip'].strip()
169 	values['plc_pcu_stats'] = v
171 	continue_probe = False
174 	traceback.print_exc()
175 	continue_probe = False
177 	if b_except or not continue_probe: return (None, None, None)
180 	#### COMPLETE ENTRY #######################
# Collect the names of required-but-missing fields; any entry here makes
# the record incomplete and stops the probe.
# NOTE(review): `x is ""` tests identity, not equality — it relies on
# CPython string interning and should be `== ""` (here and below).
182 	values['entry_complete'] = []
183 	#if values['protocol'] is None or values['protocol'] is "":
184 	#	values['entry_complete'] += ["protocol"]
185 	if values['plc_pcu_stats']['model'] is None or values['plc_pcu_stats']['model'] is "":
186 	values['entry_complete'] += ["model"]
187 	# Cannot continue due to this condition
188 	continue_probe = False
190 	if values['plc_pcu_stats']['password'] is None or values['plc_pcu_stats']['password'] is "":
191 	values['entry_complete'] += ["password"]
192 	# Cannot continue due to this condition
193 	continue_probe = False
195 	if len(values['entry_complete']) > 0:
196 	continue_probe = False
# hostname/ip are recorded as incomplete but (unlike model/password) do not
# by themselves clear continue_probe here.
198 	if values['plc_pcu_stats']['hostname'] is None or values['plc_pcu_stats']['hostname'] is "":
199 	values['entry_complete'] += ["hostname"]
200 	if values['plc_pcu_stats']['ip'] is None or values['plc_pcu_stats']['ip'] is "":
201 	values['entry_complete'] += ["ip"]
203 	# If there are no nodes associated with this PCU, then we cannot continue.
204 	if len(values['plc_pcu_stats']['node_ids']) == 0:
205 	continue_probe = False
206 	values['entry_complete'] += ['NoNodeIds']
208 	#### DNS and IP MATCH #######################
# Classify DNS health: DNS-OK (forward lookup matches recorded IP),
# DNS-MISMATCH, DNS-NOENTRY (lookup failed; fall back to the IP),
# NOHOSTNAME (IP only), or NO-DNS-OR-IP (record unusable).
209 	if values['plc_pcu_stats']['hostname'] is not None and values['plc_pcu_stats']['hostname'] is not "" and \
210 	values['plc_pcu_stats']['ip'] is not None and values['plc_pcu_stats']['ip'] is not "":
211 	#print "Calling socket.gethostbyname(%s)" % values['hostname']
213 	ipaddr = socket.gethostbyname(values['plc_pcu_stats']['hostname'])
214 	if ipaddr == values['plc_pcu_stats']['ip']:
215 	values['dns_status'] = "DNS-OK"
217 	values['dns_status'] = "DNS-MISMATCH"
218 	continue_probe = False
# NOTE(review): Python-2-only except syntax; would need `as err` for py3.
220 	except Exception, err:
221 	values['dns_status'] = "DNS-NOENTRY"
222 	values['plc_pcu_stats']['hostname'] = values['plc_pcu_stats']['ip']
225 	if values['plc_pcu_stats']['ip'] is not None and values['plc_pcu_stats']['ip'] is not "":
226 	values['dns_status'] = "NOHOSTNAME"
227 	values['plc_pcu_stats']['hostname'] = values['plc_pcu_stats']['ip']
229 	values['dns_status'] = "NO-DNS-OR-IP"
230 	values['plc_pcu_stats']['hostname'] = "No_entry_in_DB"
231 	continue_probe = False
233 	#### RUN NMAP ###############################
# Scan the common PCU management ports (ssh, telnet, http(s), various
# vendor ports); -P0 skips the ping probe so filtered hosts still scan.
235 	nmap = util.command.CMD()
236 	(oval,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,23,80,443,5869,9100,16992 %s | grep Host:" % reboot.pcu_name(values['plc_pcu_stats']))
237 	# NOTE: an empty / error value for oval, will still work.
238 	(values['port_status'], continue_probe) = nmap_port_status(oval)
240 	values['port_status'] = None
243 	###### DRY RUN ############################
# Attempt a dry-run reboot against any one associated node (flag True =
# dry run per the earlier "NOTE: this is for a dry run later" comment).
244 	if 'node_ids' in values['plc_pcu_stats'] and len(values['plc_pcu_stats']['node_ids']) > 0:
245 	rb_ret = reboot.reboot_test_new(values['plc_pcu_stats']['nodenames'][0], values, continue_probe, 1, True)
247 	rb_ret = "Not_Run" # No nodes to test"
249 	values['reboot'] = rb_ret
# Exception path: record the traceback both in errors and as the reboot
# result so it is visible in the findbad record.
252 	print "____________________________________"
255 	print "____________________________________"
256 	errors['traceback'] = traceback.format_exc()
257 	print errors['traceback']
258 	values['reboot'] = errors['traceback']
260 	values['date_checked'] = time.time()
261 	return (pcuname, values, errors)
# Threadpool result callback: persist one probe result as a FindbadPCURecord
# and bump the per-PCU sync round; probe errors are dumped to the
# "findbadpcu_errors" database.
# NOTE(review): sampled view — the `global` declarations, session commit,
# and some record fields are not visible here.
263 def recordPingAndSSH(request, result):
267 	(nodename, values, errors) = result
269 	if values is not None:
# nodename is actually the pcu_id passed through collectPingAndSSH.
270 	pcu_id = int(nodename)
# plc_pcuid=0 is the sentinel "global" sync row holding the current round.
271 	fbsync = FindbadPCURecordSync.findby_or_create(plc_pcuid=0,
272 	if_new_set={'round': global_round})
273 	global_round = fbsync.round
274 	fbnodesync = FindbadPCURecordSync.findby_or_create(plc_pcuid=pcu_id,
275 	if_new_set={'round' : global_round})
277 	fbrec = FindbadPCURecord(
278 	date_checked=datetime.fromtimestamp(values['date_checked']),
281 	plc_pcu_stats=values['plc_pcu_stats'],
282 	dns_status=values['dns_status'],
283 	port_status=values['port_status'],
284 	entry_complete=" ".join(values['entry_complete']),
285 	reboot_trial_status="%s" % values['reboot'],
# Mark this PCU as processed in the current round.
287 	fbnodesync.round = global_round
294 	print "%d %s %s" % (count, nodename, values)
296 	if errors is not None:
297 	pcu_id = "id_%s" % nodename
298 	errorState[pcu_id] = errors
299 	database.dbDump("findbadpcu_errors", errorState)
301 # this will be called when an exception occurs within a thread
302 def handle_exception(request, result):
303 	print "Exception occured in request %s" % request.requestID
# NOTE(review): the loop header over `result` (orig. line ~304) is not
# visible in this chunk.
305 	print "Result: %s" % i
# Fan the PCU probes out over a 10-worker thread pool.  A PCU is re-probed
# only when its recorded sync round is behind the global round (or when
# --force is given); the pool is then drained with a wall-clock deadline.
# NOTE(review): sampled view — request submission, the wait loop, and
# KeyboardInterrupt handling bodies are partially missing.
308 def checkAndRecordState(l_pcus, cohash):
312 	tp = threadpool.ThreadPool(10)
314 	# CREATE all the work requests
315 	for pcuname in l_pcus:
316 	pcu_id = int(pcuname)
317 	fbnodesync = FindbadPCURecordSync.findby_or_create(plc_pcuid=pcu_id, if_new_set={'round' : 0})
320 	node_round = fbnodesync.round
321 	if node_round < global_round or config.force:
322 	# recreate node stats when refreshed
323 	#print "%s" % nodename
# collectPingAndSSH runs in a worker; recordPingAndSSH is the result
# callback and handle_exception the error callback.
324 	req = threadpool.WorkRequest(collectPingAndSSH, [pcuname, cohash], {},
325 	None, recordPingAndSSH, handle_exception)
328 	# We just skip it, since it's "up to date"
330 	print "%d %s %s" % (count, pcu_id, node_round)
332 	# WAIT while all the work requests are processed.
338 	# give up if more than one hour has elapsed (60*60*1 seconds)
339 	if time.time() - begin > (60*60*1):
340 	print "findbadpcus.py has run out of time!!!!!!"
342 	except KeyboardInterrupt:
345 	except threadpool.NoResultsPending:
346 	print "All results collected."
349 	print FindbadPCURecordSync.query.count()
350 	print FindbadPCURecord.query.count()
357 # monitor.database.if_cached_else_refresh(1, config.refresh, "pculist", lambda : plc.GetPCUs())
358 l_pcus = plccache.l_pcus
361 fbsync = FindbadPCURecordSync.findby_or_create(plc_pcuid=0, if_new_set={'round' : global_round})
363 global_round = fbsync.round
366 if config.site is not None:
367 api = plc.getAuthAPI()
368 site = api.GetSites(config.site)
369 l_nodes = api.GetNodes(site[0]['node_ids'], ['pcu_ids'])
372 pcus += node['pcu_ids']
374 l_pcus = [pcu for pcu in sets.Set(pcus)]
375 elif config.pcuselect is not None:
376 n, pcus = pcu_select(config.pcuselect)
379 l_pcus = [pcu for pcu in sets.Set(pcus)]
381 elif config.nodelist == None and config.pcuid == None:
382 print "Calling API GetPCUs() : refresh(%s)" % config.refresh
383 l_pcus = [pcu['pcu_id'] for pcu in l_pcus]
384 elif config.nodelist is not None:
385 l_pcus = util.file.getListFromFile(config.nodelist)
386 l_pcus = [int(pcu) for pcu in l_pcus]
387 elif config.pcuid is not None:
388 l_pcus = [ config.pcuid ]
389 l_pcus = [int(pcu) for pcu in l_pcus]
392 # update global round number to force refreshes across all nodes
394 fbsync.round = global_round
397 checkAndRecordState(l_pcus, cohash)
# Script entry point: set up file logging, build the option parser, honor
# --nocachecalls, work around an iLO/ssh LANG interaction, then run main()
# (call site not visible in this chunk) with a catch-all error report.
# NOTE(review): sampled view — the `try:` matching the except at the bottom
# and the body after "Reloading PLCCache" are missing.
403 if __name__ == '__main__':
405 	logger = logging.getLogger("monitor")
406 	logger.setLevel(logging.DEBUG)
# Append-mode file handler so successive runs accumulate in monitor.log.
407 	fh = logging.FileHandler("monitor.log", mode = 'a')
408 	fh.setLevel(logging.DEBUG)
409 	formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
410 	fh.setFormatter(formatter)
411 	logger.addHandler(fh)
412 	from monitor import parser as parsermodule
413 	parser = parsermodule.getParser()
414 	parser.set_defaults(nodelist=None,
419 	dbname="findbadpcus",
424 	parser.add_option("-f", "--nodelist", dest="nodelist", metavar="FILE",
425 	help="Provide the input file for the node list")
426 	parser.add_option("", "--site", dest="site", metavar="FILE",
427 	help="Get all pcus associated with the given site's nodes")
428 	parser.add_option("", "--pcuselect", dest="pcuselect", metavar="FILE",
429 	help="Query string to apply to the findbad pcus")
430 	parser.add_option("", "--pcuid", dest="pcuid", metavar="id",
431 	help="Provide the id for a single pcu")
433 	parser.add_option("", "--cachenodes", action="store_true",
434 	help="Cache node lookup from PLC")
435 	parser.add_option("", "--dbname", dest="dbname", metavar="FILE",
436 	help="Specify the name of the database to which the information is saved")
437 	parser.add_option("", "--nocachecalls", action="store_false", dest="cachecalls",
438 	help="Refresh the cached values")
439 	parser.add_option("-i", "--increment", action="store_true", dest="increment",
440 	help="Increment round number to force refresh or retry")
441 	parser.add_option("", "--force", action="store_true", dest="force",
442 	help="Force probe without incrementing global 'round'.")
443 	parser = parsermodule.getParser(['defaults'], parser)
444 	config = parsermodule.parse_args(parser)
445 	if hasattr(config, 'cachecalls') and not config.cachecalls:
446 	# NOTE: if explicitly asked, refresh cached values.
447 	print "Reloading PLCCache"
450 	# NOTE: evidently, there is a bizarre interaction between iLO and ssh
451 	# when LANG is set... Do not know why. Unsetting LANG, fixes the problem.
452 	if 'LANG' in os.environ:
453 	del os.environ['LANG']
# Catch-all: report the failure and save collected state before exiting.
# NOTE(review): Python-2-only except syntax; would need `as err` for py3.
456 	except Exception, err:
457 	traceback.print_exc()
458 	print "Exception: %s" % err
459 	print "Saving data... exitting."