11 from datetime import datetime,timedelta
16 from monitor.pcu import reboot
17 from monitor import config
18 from monitor.database import FindbadPCURecordSync, FindbadPCURecord
19 from monitor import util
20 from monitor.wrapper import plc
22 from nodequery import pcu_select
# Module-level lock. Its acquire/release sites are not among the lines visible
# in this view; presumably it serialises the PLC lookups below -- TODO confirm.
24 plc_lock = threading.Lock()
# Parse the output of the `nmap -oG` command (see the nmap call in
# collectPingAndSSH, which passes its output here) into a port -> state mapping.
#
# Returns a tuple (ps, continue_probe): `ps` maps the first '/'-separated field
# of each port entry to the second one; `continue_probe` starts out False.
# NOTE(review): the loop header and the statement executed when a port is
# "open" are not visible in this view; presumably the latter sets
# continue_probe to True -- confirm against the full file.
29 def nmap_portstatus(status):
# Whitespace-separated tokens of the nmap output.
31 l_nmap = status.split()
34 continue_probe = False
# Each port entry is '/'-separated: field 0 is used as the key, field 1 as the state.
36 results = port.split('/')
37 ps[results[0]] = results[1]
38 if results[1] == "open":
40 return (ps, continue_probe)
# NOTE(review): the enclosing `def` line is not visible in this view; the call
# in get_plc_pcu_values suggests this is the body of get_pcu(pcuname).
# Two lookups are visible: a live query to PLC, and a scan of the stored
# "pculist" for the entry whose pcu_id equals pcuname. Presumably the stored
# list is only consulted when the PLC call fails -- confirm.
45 #print "GetPCU from PLC %s" % pcuname
46 l_pcu = plc.GetPCUs({'pcu_id' : pcuname})
52 #print "GetPCU from file %s" % pcuname
# NOTE(review): `database` is not bound by any import visible in this view.
53 l_pcus = database.dbLoad("pculist")
55 if i['pcu_id'] == pcuname:
# Look up node records for the given node ids.
#
# Two sources are visible: a PLC query restricted to the fields 'hostname',
# 'last_contact', 'node_id' and 'ports', and a filter over the stored
# "l_plcnodes" list keeping entries whose node_id is in node_ids.
# NOTE(review): the control flow between the two (and the return statement) is
# not visible in this view; presumably the stored list is a fallback used when
# the PLC call fails -- confirm.
64 def get_nodes(node_ids):
68 l_node = plc.getNodes(node_ids, ['hostname', 'last_contact', 'node_id', 'ports'])
# NOTE(review): `database` is not bound by any import visible in this view.
71 plc_nodes = database.dbLoad("l_plcnodes")
73 if n['node_id'] in node_ids:
# Collect the PLC view of one PCU: the PCU record from get_pcu() plus details
# of the nodes attached to it from get_nodes().
# NOTE(review): the initialisation of `values` and the return statement are
# not visible in this view. collectPingAndSSH reads 'hostname', 'ip', 'model',
# 'password' and 'node_ids' from the result, so `values` presumably starts as
# (a copy of) the PCU record -- confirm.
85 def get_plc_pcu_values(pcuname):
87 Try to contact PLC to get the PCU info.
88 If that fails, try a backup copy from the last run.
89 If that fails, return None
93 l_pcu = get_pcu(pcuname)
# site_id is read here but not used in the lines visible in this view.
96 site_id = l_pcu['site_id']
97 node_ids = l_pcu['node_ids']
98 l_node = get_nodes(node_ids)
100 if l_node is not None:
# Map each node's hostname to the first entry of that node's 'ports' list.
102 values[node['hostname']] = node['ports'][0]
104 values['nodenames'] = [node['hostname'] for node in l_node]
106 # NOTE: this is for a dry run later. It doesn't matter which node.
# NOTE(review): l_node[0] raises IndexError on an empty list; whether an
# emptiness guard exists is not visible in this view.
107 values['node_id'] = l_node[0]['node_id']
# Build a summary of one PLC site, stored under values['plcsite'].
#
# The site record is requested from PLC with the fields 'max_slices',
# 'slice_ids', 'node_ids' and 'login_base'; a scan of the stored "l_plcsites"
# list for a matching site_id is also visible.
# NOTE(review): the try/except scaffolding, the initialisation of `values`,
# the branch taken when d_site is None and the return statement are not
# visible in this view; presumably the stored list is a fallback for a failed
# PLC call -- confirm.
115 def get_plc_site_values(site_id):
116 ### GET PLC SITE ######################
122 d_site = plc.getSites({'site_id': site_id}, ['max_slices', 'slice_ids', 'node_ids', 'login_base'])
# NOTE(review): `database` is not bound by any import visible in this view.
127 plc_sites = database.dbLoad("l_plcsites")
128 for site in plc_sites:
129 if site['site_id'] == site_id:
133 traceback.print_exc()
# With a site record in hand, reduce it to counts plus the login base.
138 if d_site is not None:
139 max_slices = d_site['max_slices']
140 num_slices = len(d_site['slice_ids'])
141 num_nodes = len(d_site['node_ids'])
142 loginbase = d_site['login_base']
143 values['plcsite'] = {'num_nodes' : num_nodes,
144 'max_slices' : max_slices,
145 'num_slices' : num_slices,
146 'login_base' : loginbase,
147 'status' : 'SUCCESS'}
# Probe a single PCU: fetch its PLC record, check the record for completeness,
# compare DNS against the recorded IP, port-scan it with nmap and run a reboot
# test via reboot.reboot_test().
#
# pcuname -- identifier handed to get_plc_pcu_values(); recordPingAndSSH later
#            converts the returned value with int().
# cohash  -- not referenced in the lines visible in this view.
#
# Returns (pcuname, values, errors) at the end of the function, or
# (None, None, None) when b_except is set or continue_probe is False right
# after the PLC lookup.
# NOTE(review): this file is Python 2 (print statements, `except X, err`).
# NOTE(review): parts of this function (try/except/else scaffolding and the
# initialisation of `errors` and `b_except`) are not visible in this view.
155 def collectPingAndSSH(pcuname, cohash):
157 continue_probe = True
159 values = {'reboot' : 'novalue'}
160 ### GET PCU ######################
164 v = get_plc_pcu_values(pcuname)
# Strip surrounding whitespace from the hostname and ip stored in PLC.
165 if v['hostname'] is not None: v['hostname'] = v['hostname'].strip()
166 if v['ip'] is not None: v['ip'] = v['ip'].strip()
169 values['plc_pcu_stats'] = v
171 continue_probe = False
174 traceback.print_exc()
175 continue_probe = False
177 if b_except or not continue_probe: return (None, None, None)
180 #### COMPLETE ENTRY #######################
# values['complete_entry'] accumulates the names of the fields found missing.
182 values['complete_entry'] = []
183 #if values['protocol'] is None or values['protocol'] is "":
184 # values['complete_entry'] += ["protocol"]
# NOTE(review): `x is ""` / `x is not ""` (here and below) compare object
# identity, not equality; the result depends on the interpreter sharing a
# single empty-string object. `== ""` / `!= ""` would state the intent.
185 if values['plc_pcu_stats']['model'] is None or values['plc_pcu_stats']['model'] is "":
186 values['complete_entry'] += ["model"]
187 # Cannot continue due to this condition
188 continue_probe = False
190 if values['plc_pcu_stats']['password'] is None or values['plc_pcu_stats']['password'] is "":
191 values['complete_entry'] += ["password"]
192 # Cannot continue due to this condition
193 continue_probe = False
195 if len(values['complete_entry']) > 0:
196 continue_probe = False
# hostname and ip are appended after the length check above, so on their own
# they are recorded as missing without clearing continue_probe at this point.
198 if values['plc_pcu_stats']['hostname'] is None or values['plc_pcu_stats']['hostname'] is "":
199 values['complete_entry'] += ["hostname"]
200 if values['plc_pcu_stats']['ip'] is None or values['plc_pcu_stats']['ip'] is "":
201 values['complete_entry'] += ["ip"]
203 # If there are no nodes associated with this PCU, then we cannot continue.
204 if len(values['plc_pcu_stats']['node_ids']) == 0:
205 continue_probe = False
206 values['complete_entry'] += ['NoNodeIds']
208 #### DNS and IP MATCH #######################
# values['dnsmatch'] ends up as one of: DNS-OK, DNS-MISMATCH, DNS-NOENTRY,
# NOHOSTNAME or NO-DNS-OR-IP.
209 if values['plc_pcu_stats']['hostname'] is not None and values['plc_pcu_stats']['hostname'] is not "" and \
210 values['plc_pcu_stats']['ip'] is not None and values['plc_pcu_stats']['ip'] is not "":
211 #print "Calling socket.gethostbyname(%s)" % values['hostname']
# Resolve the recorded hostname and compare it with the recorded IP.
213 ipaddr = socket.gethostbyname(values['plc_pcu_stats']['hostname'])
214 if ipaddr == values['plc_pcu_stats']['ip']:
215 values['dnsmatch'] = "DNS-OK"
217 values['dnsmatch'] = "DNS-MISMATCH"
218 continue_probe = False
# Any exception here is reported as DNS-NOENTRY, and the IP is used in place
# of the hostname from then on.
220 except Exception, err:
221 values['dnsmatch'] = "DNS-NOENTRY"
222 values['plc_pcu_stats']['hostname'] = values['plc_pcu_stats']['ip']
# Only an IP is recorded: use it as the hostname.
225 if values['plc_pcu_stats']['ip'] is not None and values['plc_pcu_stats']['ip'] is not "":
226 values['dnsmatch'] = "NOHOSTNAME"
227 values['plc_pcu_stats']['hostname'] = values['plc_pcu_stats']['ip']
229 values['dnsmatch'] = "NO-DNS-OR-IP"
230 values['plc_pcu_stats']['hostname'] = "No_entry_in_DB"
231 continue_probe = False
233 #### RUN NMAP ###############################
235 nmap = util.command.CMD()
# NOTE(review): the local name `eval` shadows the builtin of the same name.
# NOTE(review): the command is a shell pipeline assembled by string
# interpolation of reboot.pcu_name(...); how run_noexcept executes it is not
# visible here -- confirm the interpolated value cannot carry shell syntax.
236 (oval,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,23,80,443,5869,9100,16992 %s | grep Host:" % reboot.pcu_name(values['plc_pcu_stats']))
237 # NOTE: an empty / error value for oval, will still work.
# continue_probe is replaced by the scan result at this point.
238 (values['portstatus'], continue_probe) = nmap_portstatus(oval)
240 values['portstatus'] = None
243 ###### DRY RUN ############################
# Run the reboot test against the first node name attached to this PCU, if any.
244 if 'node_ids' in values['plc_pcu_stats'] and len(values['plc_pcu_stats']['node_ids']) > 0:
245 rb_ret = reboot.reboot_test(values['plc_pcu_stats']['nodenames'][0], values, continue_probe, 1, True)
247 rb_ret = "Not_Run" # No nodes to test"
249 values['reboot'] = rb_ret
252 print "____________________________________"
255 print "____________________________________"
# The formatted traceback is kept in `errors` and handed back to the caller.
256 errors['traceback'] = traceback.format_exc()
257 print errors['traceback']
# Timestamp (seconds since the epoch) of this probe.
259 values['date_checked'] = time.time()
260 return (pcuname, values, errors)
# Result callback handed to threadpool.WorkRequest in checkAndRecordState;
# `result` is the tuple returned by collectPingAndSSH.
#
# When `values` is not None, a FindbadPCURecord is built for the PCU and the
# PCU's FindbadPCURecordSync round is set to the global round. When `errors`
# is not None, they are stored in errorState, which is then dumped to
# "findbadpcu_errors".
# NOTE(review): global_round, count and errorState are used here without a
# `global` declaration or initialisation visible in this view; presumably
# these appear on lines not shown -- confirm.
262 def recordPingAndSSH(request, result):
# Despite its name, `nodename` holds the pcuname returned by collectPingAndSSH.
266 (nodename, values, errors) = result
268 if values is not None:
269 pcu_id = int(nodename)
# The sync row with plc_pcuid=0 carries the global round number.
270 fbsync = FindbadPCURecordSync.findby_or_create(plc_pcuid=0,
271 if_new_set={'round': global_round})
272 global_round = fbsync.round
273 fbnodesync = FindbadPCURecordSync.findby_or_create(plc_pcuid=pcu_id,
274 if_new_set={'round' : global_round})
276 fbrec = FindbadPCURecord(
277 date_checked=datetime.fromtimestamp(values['date_checked']),
279 plc_pcu_stats=values['plc_pcu_stats'],
280 dns_status=values['dnsmatch'],
281 port_status=values['portstatus'],
282 entry_complete=" ".join(values['complete_entry']),
283 reboot_trial_status="%s" % values['reboot'],
# Mark this PCU as checked in the current round.
285 fbnodesync.round = global_round
287 print "%d %s %s" % (count, nodename, values)
289 if errors is not None:
290 pcu_id = "id_%s" % nodename
291 errorState[pcu_id] = errors
# NOTE(review): `database` is not bound by any import visible in this view.
292 database.dbDump("findbadpcu_errors", errorState)
294 # this will be called when an exception occurs within a thread
# Passed as the exception callback to threadpool.WorkRequest in
# checkAndRecordState. Prints the id of the failing request followed by the
# items of `result`.
# NOTE(review): the loop that binds `i` is not visible in this view.
295 def handle_exception(request, result):
296 print "Exception occured in request %s" % request.requestID
298 print "Result: %s" % i
# Schedule a probe for every PCU in l_pcus whose recorded round is behind the
# global round, then wait for the results.
#
# l_pcus -- iterable of PCU ids; each is converted with int().
# cohash -- forwarded unchanged to collectPingAndSSH.
#
# Work runs on a threadpool.ThreadPool of 10 workers; results are handed to
# recordPingAndSSH and exceptions to handle_exception.
# NOTE(review): global_round, count and begin are used here without a visible
# declaration or initialisation; the statements that submit each request and
# poll the pool are also not visible in this view.
301 def checkAndRecordState(l_pcus, cohash):
305 tp = threadpool.ThreadPool(10)
307 # CREATE all the work requests
308 for pcuname in l_pcus:
309 pcu_id = int(pcuname)
# A PCU seen for the first time starts at round 0.
310 fbnodesync = FindbadPCURecordSync.findby_or_create(plc_pcuid=pcu_id, if_new_set={'round' : 0})
312 node_round = fbnodesync.round
313 if node_round < global_round:
314 # recreate node stats when refreshed
315 #print "%s" % nodename
316 req = threadpool.WorkRequest(collectPingAndSSH, [pcuname, cohash], {},
317 None, recordPingAndSSH, handle_exception)
320 # We just skip it, since it's "up to date"
322 print "%d %s %s" % (count, pcu_id, node_round)
324 # WAIT while all the work requests are processed.
330 # if more than one hour (60*60*1 seconds) has passed since `begin`
331 if time.time() - begin > (60*60*1):
332 print "findbadpcus.py has run out of time!!!!!!"
334 except KeyboardInterrupt:
# threadpool.NoResultsPending is treated as "all work finished".
337 except threadpool.NoResultsPending:
338 print "All results collected."
# Report how many sync rows and result rows exist.
341 print FindbadPCURecordSync.query.count()
342 print FindbadPCURecord.query.count()
# NOTE(review): the enclosing `def` line is not visible in this view.
# Builds the list of PCU ids to check from the command-line options and hands
# it to checkAndRecordState(). The option precedence, from the if/elif chain
# below, is: --site, --pcuselect, neither --nodelist nor --pcuid (every PCU),
# --nodelist, --pcuid.
# NOTE(review): `monitor.database` is referenced below, but no `import monitor`
# is visible in this view.
348 l_pcus = monitor.database.if_cached_else_refresh(1, config.refresh, "pculist", lambda : plc.GetPCUs())
# The sync row with plc_pcuid=0 carries the global round number.
351 fbsync = FindbadPCURecordSync.findby_or_create(plc_pcuid=0, if_new_set={'round' : global_round})
353 global_round = fbsync.round
356 # update global round number to force refreshes across all nodes
# NOTE(review): the statement that changes global_round is not visible in this
# view; presumably it is gated on the --increment option -- confirm.
358 fbsync.round = global_round
# --site: collect the pcu_ids of every node at the given site.
360 if config.site is not None:
361 api = plc.getAuthAPI()
362 site = api.GetSites(config.site)
363 l_nodes = api.GetNodes(site[0]['node_ids'], ['pcu_ids'])
366 pcus += node['pcu_ids']
# De-duplicate. NOTE(review): the `sets` module is deprecated since Python 2.6
# in favour of the builtin set type.
368 l_pcus = [pcu for pcu in sets.Set(pcus)]
369 elif config.pcuselect is not None:
370 n, pcus = pcu_select(config.pcuselect)
372 l_pcus = [pcu for pcu in sets.Set(pcus)]
# NOTE(review): `== None` works here but `is None` is the idiomatic test.
374 elif config.nodelist == None and config.pcuid == None:
# The PCU list itself was already fetched (or read from cache) at the top;
# this branch only reduces it to ids.
375 print "Calling API GetPCUs() : refresh(%s)" % config.refresh
376 l_pcus = [pcu['pcu_id'] for pcu in l_pcus]
377 elif config.nodelist is not None:
378 l_pcus = util.file.getListFromFile(config.nodelist)
379 l_pcus = [int(pcu) for pcu in l_pcus]
380 elif config.pcuid is not None:
381 l_pcus = [ config.pcuid ]
382 l_pcus = [int(pcu) for pcu in l_pcus]
# NOTE(review): the definition of `cohash` is not visible in this view.
384 checkAndRecordState(l_pcus, cohash)
# Script entry point: set up logging, declare and parse the command-line
# options, clear LANG from the environment, then run the check.
# NOTE(review): the try statement and the call it wraps are not visible in
# this view.
389 if __name__ == '__main__':
# Append DEBUG-level records of the "monitor" logger to monitor.log.
391 logger = logging.getLogger("monitor")
392 logger.setLevel(logging.DEBUG)
393 fh = logging.FileHandler("monitor.log", mode = 'a')
394 fh.setLevel(logging.DEBUG)
395 formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
396 fh.setFormatter(formatter)
397 logger.addHandler(fh)
398 from monitor import parser as parsermodule
399 parser = parsermodule.getParser()
400 parser.set_defaults(nodelist=None,
405 dbname="findbadpcus",
409 parser.add_option("-f", "--nodelist", dest="nodelist", metavar="FILE",
410 help="Provide the input file for the node list")
411 parser.add_option("", "--site", dest="site", metavar="FILE",
412 help="Get all pcus associated with the given site's nodes")
413 parser.add_option("", "--pcuselect", dest="pcuselect", metavar="FILE",
414 help="Query string to apply to the findbad pcus")
415 parser.add_option("", "--pcuid", dest="pcuid", metavar="id",
416 help="Provide the id for a single pcu")
418 parser.add_option("", "--cachenodes", action="store_true",
419 help="Cache node lookup from PLC")
420 parser.add_option("", "--dbname", dest="dbname", metavar="FILE",
421 help="Specify the name of the database to which the information is saved")
422 parser.add_option("", "--refresh", action="store_true", dest="refresh",
423 help="Refresh the cached values")
424 parser.add_option("-i", "--increment", action="store_true", dest="increment",
425 help="Increment round number to force refresh or retry")
426 parser = parsermodule.getParser(['defaults'], parser)
# NOTE(review): this assigns the parsed options to the name `config`, which is
# also the name bound by `from monitor import config` at the top of the file.
427 config = parsermodule.parse_args(parser)
429 # NOTE: evidently, there is a bizarre interaction between iLO and ssh
430 # when LANG is set... Do not know why. Unsetting LANG, fixes the problem.
431 if 'LANG' in os.environ:
432 del os.environ['LANG']
# Last-resort handler: print the traceback and the exception, then announce exit.
435 except Exception, err:
436 traceback.print_exc()
437 print "Exception: %s" % err
438 print "Saving data... exitting."