11 from datetime import datetime,timedelta
16 from pcucontrol import reboot
17 from monitor import config
18 from monitor.database.info.model import FindbadPCURecordSync, FindbadPCURecord, session
19 from monitor import database
20 from monitor import util
21 from monitor.wrapper import plc, plccache
22 from nodequery import pcu_select
23 from nodecommon import nmap_port_status
25 plc_lock = threading.Lock()
# plc_lock: global lock serializing worker threads around the shared PLC API.
# NOTE(review): the `def get_pcu(pcuname)` header and its try/except
# scaffolding are elided from this excerpt; the lines below are fragments of
# that PCU-lookup helper (PLC first, local cache as fallback).
33 #print "GetPCU from PLC %s" % pcuname
# Primary path: query PLC directly for the PCU record by id.
34 l_pcu = plc.GetPCUs({'pcu_id' : pcuname})
# Fallback path: scan the PCU list cached from the previous run.
40 #print "GetPCU from file %s" % pcuname
41 l_pcus = plccache.l_pcus
# Linear search of the cache for a matching pcu_id (loop header elided).
43 if i['pcu_id'] == pcuname:
52 def get_nodes(node_ids):
# Fetch node records (hostname, last_contact, node_id, ports) for the given
# node_ids.  Tries the live PLC API first; on the (elided) fallback path it
# filters the node list cached by plccache instead.  Presumably returns the
# list of matching node dicts -- the return/except lines are elided here.
56 l_node = plc.getNodes(node_ids, ['hostname', 'last_contact', 'node_id', 'ports'])
# Fallback: pick matching entries out of the cached full node list.
59 plc_nodes = plccache.l_plcnodes
# (loop header elided) keep only nodes whose id was requested
61 if n['node_id'] in node_ids:
73 def get_plc_pcu_values(pcuname):
# Docstring body (the triple-quote lines are elided from this excerpt):
75 Try to contact PLC to get the PCU info.
76 If that fails, try a backup copy from the last run.
77 If that fails, return None
# Look up the PCU record itself (PLC first, cache fallback -- see get_pcu).
81 l_pcu = get_pcu(pcuname)
84 site_id = l_pcu['site_id']
85 node_ids = l_pcu['node_ids']
# Resolve the nodes attached to this PCU (initialization of the `values`
# result dict is elided from this excerpt).
86 l_node = get_nodes(node_ids)
# Only enrich the result when the node lookup succeeded.
88 if l_node is not None:
# Map each attached node's hostname to its first PCU port
# (per-node loop header elided).
90 values[node['hostname']] = node['ports'][0]
92 values['nodenames'] = [node['hostname'] for node in l_node]
94 # NOTE: this is for a dry run later. It doesn't matter which node.
95 values['node_id'] = l_node[0]['node_id']
103 def get_plc_site_values(site_id):
104 ### GET PLC SITE ######################
# Look up one site record: live PLC API first, cached site list as fallback.
# On success, fills values['plcsite'] with a summary dict (the surrounding
# try/except and return lines are elided from this excerpt).
110 d_site = plc.getSites({'site_id': site_id}, ['max_slices', 'slice_ids', 'node_ids', 'login_base'])
# Fallback: linear scan of the cached site list for a matching site_id.
115 plc_sites = plccache.l_plcsites
116 for site in plc_sites:
117 if site['site_id'] == site_id:
# Log the lookup failure; the code below tolerates d_site being None.
121 traceback.print_exc()
126 if d_site is not None:
127 max_slices = d_site['max_slices']
128 num_slices = len(d_site['slice_ids'])
129 num_nodes = len(d_site['node_ids'])
130 loginbase = d_site['login_base']
# Summarize the site record for the findbad report.
131 values['plcsite'] = {'num_nodes' : num_nodes,
132 'max_slices' : max_slices,
133 'num_slices' : num_slices,
134 'login_base' : loginbase,
135 'status' : 'SUCCESS'}
143 def collectPingAndSSH(pcuname, cohash):
# Worker-thread probe of one PCU.  Collects PLC metadata, nmap port status,
# record-completeness, DNS/IP agreement, and a reboot dry-run, and returns
# (pcuname, values, errors); returns (None, None, None) when the PCU record
# cannot even be fetched.  Many try/except and else lines are elided from
# this excerpt, so the exact control flow should be confirmed against the
# full file.
145 continue_probe = True
147 values = {'reboot' : 'novalue'}
148 ### GET PCU ######################
152 v = get_plc_pcu_values(pcuname)
# Normalize whitespace on the two fields used for DNS/nmap below.
153 if v['hostname'] is not None: v['hostname'] = v['hostname'].strip()
154 if v['ip'] is not None: v['ip'] = v['ip'].strip()
157 values['plc_pcu_stats'] = v
159 continue_probe = False
162 traceback.print_exc()
163 continue_probe = False
# Bail out early when the PCU record could not be fetched at all.
165 if b_except or not continue_probe: return (None, None, None)
167 #### RUN NMAP ###############################
169 nmap = util.command.CMD()
170 print "nmap -oG - -P0 -p22,23,80,443,5869,9100,16992 %s | grep Host:" % reboot.pcu_name(values['plc_pcu_stats'])
# Probe the common PCU management ports (ssh/telnet/http/https/APC/JetDirect/
# Intel AMT) in nmap's grepable format; run_noexcept never raises.
171 (oval,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,23,80,443,5869,9100,16992 %s | grep Host:" % reboot.pcu_name(values['plc_pcu_stats']))
172 # NOTE: an empty / error value for oval, will still work.
173 (values['port_status'], continue_probe) = nmap_port_status(oval)
175 values['port_status'] = None
177 #### COMPLETE ENTRY #######################
# Track which required DB fields are missing; any missing field aborts the
# dry-run stage below.
179 values['entry_complete'] = []
180 #if values['protocol'] is None or values['protocol'] is "":
181 # values['entry_complete'] += ["protocol"]
# NOTE(review): `is ""` tests object identity, not equality; it should be
# `== ""` (works only by accident of CPython string interning).  Same issue
# on the password/hostname/ip checks below.
182 if values['plc_pcu_stats']['model'] is None or values['plc_pcu_stats']['model'] is "":
183 values['entry_complete'] += ["model"]
184 # Cannot continue due to this condition
185 continue_probe = False
187 if values['plc_pcu_stats']['password'] is None or values['plc_pcu_stats']['password'] is "":
188 values['entry_complete'] += ["password"]
189 # Cannot continue due to this condition
190 continue_probe = False
192 if len(values['entry_complete']) > 0:
193 continue_probe = False
195 if values['plc_pcu_stats']['hostname'] is None or values['plc_pcu_stats']['hostname'] is "":
196 values['entry_complete'] += ["hostname"]
197 if values['plc_pcu_stats']['ip'] is None or values['plc_pcu_stats']['ip'] is "":
198 values['entry_complete'] += ["ip"]
200 # If there are no nodes associated with this PCU, then we cannot continue.
201 if len(values['plc_pcu_stats']['node_ids']) == 0:
202 continue_probe = False
203 values['entry_complete'] += ['nodeids']
206 #### DNS and IP MATCH #######################
# Classify DNS health: OK / MISMATCH / NOENTRY / NOHOSTNAME / NO-DNS-OR-IP.
# When the hostname does not resolve, fall back to using the raw IP as the
# hostname so later probes can still reach the device.
207 if values['plc_pcu_stats']['hostname'] is not None and values['plc_pcu_stats']['hostname'] is not "" and \
208 values['plc_pcu_stats']['ip'] is not None and values['plc_pcu_stats']['ip'] is not "":
209 #print "Calling socket.gethostbyname(%s)" % values['hostname']
211 ipaddr = socket.gethostbyname(values['plc_pcu_stats']['hostname'])
212 if ipaddr == values['plc_pcu_stats']['ip']:
213 values['dns_status'] = "DNS-OK"
215 values['dns_status'] = "DNS-MISMATCH"
216 continue_probe = False
# gethostbyname failure -> no DNS entry; use the recorded IP instead.
218 except Exception, err:
219 values['dns_status'] = "DNS-NOENTRY"
220 values['plc_pcu_stats']['hostname'] = values['plc_pcu_stats']['ip']
223 if values['plc_pcu_stats']['ip'] is not None and values['plc_pcu_stats']['ip'] is not "":
224 values['dns_status'] = "NOHOSTNAME"
225 values['plc_pcu_stats']['hostname'] = values['plc_pcu_stats']['ip']
227 values['dns_status'] = "NO-DNS-OR-IP"
228 values['plc_pcu_stats']['hostname'] = "No_entry_in_DB"
229 continue_probe = False
232 ###### DRY RUN ############################
# Attempt a test reboot against the first attached node, but only when at
# least one node is associated with this PCU.
233 if 'node_ids' in values['plc_pcu_stats'] and len(values['plc_pcu_stats']['node_ids']) > 0:
234 rb_ret = reboot.reboot_test_new(values['plc_pcu_stats']['nodenames'][0],
237 rb_ret = "Not_Run" # No nodes to test"
239 values['reboot'] = rb_ret
# On any exception in the probe, record the traceback both to stdout and
# into the result, so it is persisted with the record.
242 print "____________________________________"
245 print "____________________________________"
246 errors['traceback'] = traceback.format_exc()
247 print errors['traceback']
248 values['reboot'] = errors['traceback']
# Timestamp the probe (epoch seconds; converted to datetime when stored).
250 values['date_checked'] = time.time()
251 return (pcuname, values, errors)
253 def recordPingAndSSH(request, result):
# Threadpool result callback: persist one collectPingAndSSH() result tuple
# into the findbad DB, and dump any errors to the error-state file.
# NOTE(review): the global declarations / counters (e.g. `count`,
# `global_round`, `errorState`) are elided from this excerpt.
257 (nodename, values, errors) = result
259 if values is not None:
# `nodename` is actually the pcu_id returned by the probe.
260 pcu_id = int(nodename)
261 #fbsync = FindbadPCURecordSync.findby_or_create(plc_pcuid=0,
262 # if_new_set={'round': global_round})
263 #global_round = fbsync.round
# Per-PCU sync row: created on first sight, round updated below.
264 fbnodesync = FindbadPCURecordSync.findby_or_create(plc_pcuid=pcu_id,
265 if_new_set={'round' : global_round})
# New findbad record for this probe (some keyword lines elided).
267 fbrec = FindbadPCURecord(
268 date_checked=datetime.fromtimestamp(values['date_checked']),
271 plc_pcu_stats=values['plc_pcu_stats'],
272 dns_status=values['dns_status'],
273 port_status=values['port_status'],
274 entry_complete=" ".join(values['entry_complete']),
275 reboot_trial_status="%s" % values['reboot'],
# Mark this PCU as probed in the current round.
277 fbnodesync.round = global_round
284 print "%d %s %s" % (count, nodename, values)
286 if errors is not None:
287 pcu_id = "id_%s" % nodename
288 errorState[pcu_id] = errors
# Best-effort dump of accumulated errors after every failing probe.
289 database.dbDump("findbadpcu_errors", errorState)
291 # this will be called when an exception occurs within a thread
292 def handle_exception(request, result):
# NOTE(review): "occured" in the string below is a typo in a runtime
# message; left as-is since it is program output, not a comment.
293 print "Exception occured in request %s" % request.requestID
# Dump the exception payload; the loop header binding `i` is elided here.
295 print "Result: %s" % i
298 def checkAndRecordState(l_pcus, cohash):
# Fan the PCU probes out over a 10-thread pool, skipping PCUs already
# probed in the current global round (unless --force), then wait for all
# results with a hard wall-clock cutoff.  Several lines (globals, counters,
# the poll loop, putRequest) are elided from this excerpt.
302 tp = threadpool.ThreadPool(10)
304 # CREATE all the work requests
305 for pcuname in l_pcus:
306 pcu_id = int(pcuname)
# Ensure a per-PCU sync row exists; new rows start at round 0 so they are
# always considered stale on first sight.
307 fbnodesync = FindbadPCURecordSync.findby_or_create(plc_pcuid=pcu_id, if_new_set={'round' : 0})
310 node_round = fbnodesync.round
311 if node_round < global_round or config.force:
312 # recreate node stats when refreshed
313 #print "%s" % nodename
# Probe runs collectPingAndSSH; results go to recordPingAndSSH, thread
# exceptions to handle_exception.
314 req = threadpool.WorkRequest(collectPingAndSSH, [pcuname, cohash], {},
315 None, recordPingAndSSH, handle_exception)
318 # We just skip it, since it's "up to date"
320 print "%d %s %s" % (count, pcu_id, node_round)
322 # WAIT while all the work requests are processed.
328 # NOTE(review): cutoff below is one hour (60*60*1) although this comment
# originally said "two hours" -- confirm which value is intended.
329 if time.time() - begin > (60*60*1):
330 print "findbadpcus.py has run out of time!!!!!!"
332 except KeyboardInterrupt:
335 except threadpool.NoResultsPending:
336 print "All results collected."
# Final record counts for operator visibility.
339 print FindbadPCURecordSync.query.count()
340 print FindbadPCURecord.query.count()
# NOTE(review): the enclosing `def main()` header (and the definitions of
# `global_round`, `cohash`, `pcus`) are elided from this excerpt; the lines
# below select which PCUs to probe based on command-line options, then run
# the probe round.
347 # monitor.database.if_cached_else_refresh(1, config.refresh, "pculist", lambda : plc.GetPCUs())
348 l_pcus = plccache.l_pcus
# Global sync row (plc_pcuid=0) holds the current round number.
351 fbsync = FindbadPCURecordSync.findby_or_create(plc_pcuid=0, if_new_set={'round' : global_round})
353 global_round = fbsync.round
# --site: probe every PCU attached to the given site's nodes.
356 if config.site is not None:
357 api = plc.getAuthAPI()
358 site = api.GetSites(config.site)
359 l_nodes = api.GetNodes(site[0]['node_ids'], ['pcu_ids'])
362 pcus += node['pcu_ids']
# De-duplicate via sets.Set (Python 2 `sets` module).
364 l_pcus = [pcu for pcu in sets.Set(pcus)]
# --pcuselect: probe PCUs matched by a findbad query string.
365 elif config.pcuselect is not None:
366 n, pcus = pcu_select(config.pcuselect)
369 l_pcus = [pcu for pcu in sets.Set(pcus)]
# Default: all PCUs from the cached list.
371 elif config.nodelist == None and config.pcuid == None:
372 print "Calling API GetPCUs() : cachecalls(%s)" % config.cachecalls
373 l_pcus = [pcu['pcu_id'] for pcu in l_pcus]
# --nodelist FILE: read PCU ids from a file.
374 elif config.nodelist is not None:
375 l_pcus = util.file.getListFromFile(config.nodelist)
376 l_pcus = [int(pcu) for pcu in l_pcus]
# --pcuid: probe a single PCU.
377 elif config.pcuid is not None:
378 l_pcus = [ config.pcuid ]
379 l_pcus = [int(pcu) for pcu in l_pcus]
382 # update global round number to force refreshes across all nodes
385 checkAndRecordState(l_pcus, cohash)
388 # update global round number to force refreshes across all nodes
# Persist the (possibly incremented) round back to the global sync row.
389 fbsync.round = global_round
397 if __name__ == '__main__':
# Script entry point: set up file logging, build the option parser, apply
# config side effects, then (elided) run main() under a try/except.
399 logger = logging.getLogger("monitor")
400 logger.setLevel(logging.DEBUG)
# Append-mode log file with timestamped records.
401 fh = logging.FileHandler("monitor.log", mode = 'a')
402 fh.setLevel(logging.DEBUG)
403 formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
404 fh.setFormatter(formatter)
405 logger.addHandler(fh)
406 from monitor import parser as parsermodule
407 parser = parsermodule.getParser()
# Option defaults (some default lines elided in this excerpt).
408 parser.set_defaults(nodelist=None,
413 dbname="findbadpcus",
418 parser.add_option("-f", "--nodelist", dest="nodelist", metavar="FILE",
419 help="Provide the input file for the node list")
420 parser.add_option("", "--site", dest="site", metavar="FILE",
421 help="Get all pcus associated with the given site's nodes")
422 parser.add_option("", "--pcuselect", dest="pcuselect", metavar="FILE",
423 help="Query string to apply to the findbad pcus")
424 parser.add_option("", "--pcuid", dest="pcuid", metavar="id",
425 help="Provide the id for a single pcu")
427 parser.add_option("", "--cachenodes", action="store_true",
428 help="Cache node lookup from PLC")
429 parser.add_option("", "--dbname", dest="dbname", metavar="FILE",
430 help="Specify the name of the database to which the information is saved")
431 parser.add_option("", "--nocachecalls", action="store_false", dest="cachecalls",
432 help="Refresh the cached values")
433 parser.add_option("-i", "--increment", action="store_true", dest="increment",
434 help="Increment round number to force refresh or retry")
435 parser.add_option("", "--force", action="store_true", dest="force",
436 help="Force probe without incrementing global 'round'.")
436b_comment_placeholder = None
437 parser = parsermodule.getParser(['defaults'], parser)
438 config = parsermodule.parse_args(parser)
439 if hasattr(config, 'cachecalls') and not config.cachecalls:
440 # NOTE: if explicitly asked, refresh cached values.
441 print "Reloading PLCCache"
444 # NOTE: evidently, there is a bizarre interaction between iLO and ssh
445 # when LANG is set... Do not know why. Unsetting LANG fixes the problem.
446 if 'LANG' in os.environ:
447 del os.environ['LANG']
# NOTE(review): "exitting" in the message below is a typo in program
# output; left as-is since runtime strings must not change in a doc pass.
450 except Exception, err:
451 traceback.print_exc()
452 print "Exception: %s" % err
453 print "Saving data... exitting."