#!/usr/bin/python
"""Update each node's history record from its most recent findbad observation.

For every hostname in the selected node set, the latest FindbadNodeRecord is
translated into a status on the matching HistoryNodeRecord, and the pending
changes are flushed to the database session.
"""

import os
import sys
import string
import time
import traceback
from datetime import datetime,timedelta

from monitor.common import *
from monitor.query import verify,query_to_dict,node_select
from monitor import config
from monitor.wrapper import plc,plccache
from monitor.const import MINUP
from monitor.database.info.model import FindbadNodeRecord, HistoryNodeRecord
from monitor.database.dborm import mon_session as session
from monitor.model import *

api = plc.getAuthAPI()

# NOTE(review): 'round' shadows the builtin and is not read anywhere in this
# file; it is kept because other modules may import it -- confirm before
# removing.
round = 1
# Running total of nodes processed successfully by checkAndRecordState().
count = 0


def main():
    """Run the status update using the module-level ``config``."""
    main2(config)


def main2(config):
    """Resolve the node set selected by *config* and record each node's state."""
    l_plcnodes = plccache.l_nodes
    l_nodes = get_nodeset(config)
    checkAndRecordState(l_nodes, l_plcnodes)


def get_uptime(uptime_str):
    """Return the uptime, in seconds, parsed from *uptime_str*.

    The first whitespace-separated token is converted to a float (presumably
    the layout of /proc/uptime -- confirm against the collector).  Returns 0
    when the value is missing, empty, or cannot be parsed.
    """
    # A missing (None) or empty value means "no uptime known", not an error.
    if not uptime_str:
        return 0
    try:
        up = float(uptime_str.split()[0])
    except (ValueError, IndexError, AttributeError):
        # Best effort: malformed or non-string input is treated as unknown.
        return 0
    print("uptime: %s" % up)
    return up


# Node states:
def check_node_state(rec, node):
    """Translate the observed findbad state in *rec* into ``node.status``.

    rec  -- latest findbad record for the node; ``observed_status``,
            ``plc_node_stats``, ``firewall`` and ``uptime`` are read.
    node -- history record for the same node; ``status``, ``last_changed``,
            ``haspcu``, ``firewall``, ``plc_nodeid`` and ``plc_siteid`` are
            updated in place.

    Nothing is returned.  Records without PLC stats are handled: the boot
    state is treated as "unknown", last contact as None, and the PLC ids on
    *node* are left untouched.
    """
    node_state = rec.observed_status
    stats = rec.plc_node_stats

    if stats:
        print(stats)
        boot_state = stats['boot_state']
        last_contact = stats['last_contact']
        node.plc_nodeid = stats['node_id']
    else:
        boot_state = "unknown"
        last_contact = None

    # Normalize the boot-state spellings onto the names used for node.status.
    if boot_state == 'disable':
        boot_state = 'disabled'
    if boot_state == 'diag' or boot_state == 'diagnose':
        boot_state = 'safeboot'

    node.haspcu = bool(stats) and len(stats['pcu_ids']) > 0
    node.firewall = rec.firewall
    # Only copy the site id when PLC stats exist; indexing an empty/None
    # stats value here used to abort the whole update for such nodes.
    if stats:
        node.plc_siteid = stats['site_id']

    # NOTE: 'DOWN' and 'DEBUG' are temporary states, so only need
    #       'translations' into the node.status state
    #       'BOOT' is a permanent state, but we want it to have a bit of
    #       hysteresis (less than 0.5 days)

    #################################################################
    # "Initialize" the findbad states into nodebad status if they are not
    # already set

    if node_state == 'DOWN':
        if boot_state == 'disabled' and \
                changed_lessthan(node.last_changed, 60) and \
                node.status != 'disabled':
            # NOTE: if changed less than 2 months, then we can allow this.
            # otherwise, apply 'down' status after greater than 2 months
            # (below).
            print("changed status from %s to %s" % (node.status, boot_state))
            node.status = boot_state
            node.last_changed = datetime.now()

        if node.status not in ['offline', 'down', 'disabled']:
            print("changed status from %s to offline" % node.status)
            node.status = 'offline'
            node.last_changed = datetime.now()

    if node_state == 'DEBUG':
        if boot_state != 'disabled' and boot_state != 'safeboot':
            print("changed status from %s to failboot" % (node.status))
            current_status = "failboot"
        else:
            print("changed status from %s to %s" % (node.status, boot_state))
            current_status = boot_state

        if current_status != node.status and \
                current_status in ['failboot', 'disabled', 'safeboot']:
            node.status = current_status
            node.last_changed = datetime.now()

    if node_state == 'BOOT' and node.status != 'online' and \
            node.status != 'good':
        old_status = node.status
        uptime = get_uptime(rec.uptime)
        if uptime > (60*60*24):
            # Up for more than a day: skip the 'online' stage and back-date
            # the change to when the node actually came up.
            node.status = 'good'
            node.last_changed = datetime.now() - timedelta(seconds=uptime)
        else:
            node.status = 'online'
            node.last_changed = datetime.now()

        print("changed status from %s to %s" % (old_status, node.status))

    #################################################################
    # Switch temporary hysteresis states into their 'firm' states.
    #     online -> good        after half a day
    #     offline -> down       after two days
    #     failboot -> down      after 30 days
    #     safeboot -> failboot  after 60 days
    #     disabled -> down      after 60 days

    if node.status == 'online' and changed_greaterthan(node.last_changed, 0.5):
        print("changed status from %s to good" % node.status)
        node.status = 'good'
        # NOTE: do not reset last_changed, or you lose how long it's been up.

    if node.status == 'offline' and changed_greaterthan(node.last_changed, 2):
        print("changed status from %s to down" % node.status)
        node.status = 'down'
        # NOTE: do not reset last_changed, or you lose how long it's been down.

    if node.status == 'failboot' and changed_greaterthan(node.last_changed, 30):
        print("changed status from %s to down" % node.status)
        node.status = 'down'
        # NOTE: do not reset last_changed, or you lose how long it's been down.

    if node.status == 'safeboot' and changed_greaterthan(node.last_changed, 60):
        # NOTE: change an admin mode back into failboot after two months.
        # The message names the state that is actually assigned below.
        print("changed status from %s to failboot" % node.status)
        node.status = 'failboot'
        node.last_changed = datetime.now()

    # extreme cases of offline nodes
    if (boot_state == 'disabled' or last_contact is None) and \
            changed_greaterthan(node.last_changed, 2*30) and \
            node.status != 'down':
        print("changed status from %s to down" % node.status)
        node.status = 'down'
        node.last_changed = datetime.now()


def checkAndRecordState(l_nodes, l_plcnodes):
    """Update the history record of every hostname in *l_nodes*.

    l_nodes    -- iterable of hostnames to process.
    l_plcnodes -- accepted for interface compatibility; not used here.

    A failure on one node is reported (printed, emailed via
    email_exception, traceback written) and the loop moves on to the next
    node.  Increments the module-level ``count`` per node processed
    successfully, flushes the session once at the end, and returns True.
    """
    global count

    for nodename in l_nodes:
        nodehist = HistoryNodeRecord.findby_or_create(
            hostname=nodename,
            if_new_set={'status' : 'offline',
                        'last_changed' : datetime.now()})
        nodehist.last_checked = datetime.now()

        try:
            # Find the most recent record
            noderec = FindbadNodeRecord.get_latest_by(hostname=nodename)
        except Exception:
            print("COULD NOT FIND %s" % nodename)
            email_exception()
            # print_exc() writes the traceback itself and returns None, so
            # its result must not be printed.
            traceback.print_exc()
            continue

        if not noderec:
            print("none object for %s"% nodename)
            continue

        try:
            check_node_state(noderec, nodehist)
        except Exception:
            print("check_node_state failed %s" % nodename)
            email_exception(nodename)
            traceback.print_exc()
            continue

        count += 1
        since = diff_time(time.mktime(nodehist.last_changed.timetuple()))
        print("%d %35s %s since(%s)" % (count, nodename, nodehist.status, since))

    # NOTE: this commits all pending operations to the DB.  Do not remove.
    session.flush()

    return True


if __name__ == '__main__':
    from monitor import parser as parsermodule
    parser = parsermodule.getParser(['nodesets'])
    parser.set_defaults(filename=None, node=None, nodeselect=False,
                        nodegroup=None, cachenodes=False)
    parser = parsermodule.getParser(['defaults'], parser)
    config = parsermodule.parse_args(parser)

    try:
        main2(config)
    except Exception:
        # sys.exc_info() is used instead of "except Exception, err" so the
        # handler parses on every Python version.
        err = sys.exc_info()[1]
        traceback.print_exc()
        print("Exception: %s" % err)
        # NOTE(review): exits with status 0 even though the run failed;
        # kept as-is because external callers may depend on it -- confirm.
        sys.exit(0)