X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=nodebad.py;h=dc866645eba29d39184df4544391e3b19bfea537;hb=a1d795311ad8facac7399e73123d53e6c8e72c9d;hp=0130c3e642e336ee5cbbf5c660c3225ae7af0783;hpb=6d46ab9b534b60675a3dcb11fcb664589a3691f8;p=monitor.git diff --git a/nodebad.py b/nodebad.py index 0130c3e..dc86664 100755 --- a/nodebad.py +++ b/nodebad.py @@ -4,153 +4,183 @@ import os import sys import string import time +from datetime import datetime,timedelta +from monitor.query import verify,query_to_dict,node_select -import database -import comon -import threadpool -import syncplcdb -from nodequery import verify,query_to_dict,node_select -from nodecommon import * +from monitor.common import * + +from monitor import config +from monitor.wrapper import plc,plccache +from monitor.const import MINUP +from monitor.database.info.model import FindbadNodeRecord, HistoryNodeRecord +from monitor.database.dborm import mon_session as session + +from monitor.model import * -import plc api = plc.getAuthAPI() -from unified_model import * -from const import MINUP round = 1 -externalState = {'round': round, 'nodes': {}} count = 0 +def main(): + main2(config) -def main(config): - global externalState - externalState = database.if_cached_else(1, config.dbname, lambda : externalState) - if config.increment: - # update global round number to force refreshes across all nodes - externalState['round'] += 1 - - l_nodes = syncplcdb.create_plcdb() - l_plcnodes = database.dbLoad("l_plcnodes") +def main2(config): + l_plcnodes = plccache.l_nodes l_nodes = get_nodeset(config) - #if config.node: - # l_nodes = [config.node] - ##else: - # l_nodes = [node['hostname'] for node in l_plcnodes] checkAndRecordState(l_nodes, l_plcnodes) +# Node states: + +def check_node_state(rec, node): + + node_state = rec.observed_status + if rec.plc_node_stats: + print rec.plc_node_stats + boot_state = rec.plc_node_stats['boot_state'] + run_level = rec.plc_node_stats['run_level'] + last_contact = rec.plc_node_stats['last_contact'] + node.plc_nodeid = rec.plc_node_stats['node_id'] + else: + boot_state = "unknown" + last_contact = None + + if boot_state == 'disable': boot_state = 'disabled' + if boot_state == 'diag' or boot_state == 'diagnose': boot_state = 'safeboot' + + if len(rec.plc_node_stats['pcu_ids']) > 0: + node.haspcu = True + else: + node.haspcu = False + + node.firewall = rec.firewall + node.plc_siteid = rec.plc_node_stats['site_id'] + + # NOTE: 'DOWN' and 'DEBUG' are temporary states, so only need + # 'translations' into the node.status state + # 'BOOT' is a permanent state, but we want it to have a bit of + # hysteresis (less than 0.5 days) + ################################################################# + # "Initialize" the findbad states into nodebad status if they are not already set + + if node_state == 'DOWN': + if boot_state == 'disabled' and changed_lessthan(node.last_changed, 60) and \ + node.status != 'disabled': + # NOTE: if changed less than 2 months, then we can allow this. + # otherwise, apply 'down' status after greater than 2 months (below). + + print "changed status from %s to %s" % (node.status, boot_state) + node.status = boot_state + node.last_changed = datetime.now() + + if node.status not in ['offline', 'down', 'disabled']: + print "changed status from %s to offline" % node.status + node.status = 'offline' + node.last_changed = datetime.now() + + if node_state == 'DEBUG': + if boot_state != 'disabled' and boot_state != 'safeboot': + print "changed status from %s to failboot" % (node.status) + current_status = "failboot" + else: + print "changed status from %s to %s" % (node.status, boot_state) + current_status = boot_state + + if current_status != node.status and \ + current_status in ['failboot', 'disabled', 'safeboot']: + + node.status = current_status + node.last_changed = datetime.now() + + if node_state == 'BOOT' and node.status != 'online' and node.status != 'good': + print "changed status from %s to online" % node.status + node.status = 'online' + node.last_changed = datetime.now() + + ################################################################# + # Switch temporary hystersis states into their 'firm' states. + # online -> good after half a day + # offline -> down after two days + # failboot -> down after 30 days + # safeboot -> failboot after 60 days + # disabled -> down after 60 days + + if node.status == 'online' and changed_greaterthan(node.last_changed, 0.5): + print "changed status from %s to good" % node.status + node.status = 'good' + # NOTE: do not reset last_changed, or you lose how long it's been up. + + if node.status == 'offline' and changed_greaterthan(node.last_changed, 2): + print "changed status from %s to down" % node.status + node.status = 'down' + # NOTE: do not reset last_changed, or you lose how long it's been down. + + if node.status == 'failboot' and changed_greaterthan(node.last_changed, 30): + print "changed status from %s to down" % node.status + node.status = 'down' + # NOTE: do not reset last_changed, or you lose how long it's been down. + + if node.status == 'safeboot' and changed_greaterthan(node.last_changed, 60): + print "changed status from %s to down" % node.status + # NOTE: change an admin mode back into failboot after two months. + node.status = 'failboot' + node.last_changed = datetime.now() + + # extreme cases of offline nodes + if ( boot_state == 'disabled' or last_contact == None ) and \ + changed_greaterthan(node.last_changed, 2*30) and \ + node.status != 'down': + print "changed status from %s to down" % node.status + node.status = 'down' + node.last_changed = datetime.now() + def checkAndRecordState(l_nodes, l_plcnodes): - global externalState global count - global_round = externalState['round'] for nodename in l_nodes: - if nodename not in externalState['nodes']: - externalState['nodes'][nodename] = {'round': 0, 'values': []} - - node_round = externalState['nodes'][nodename]['round'] - if node_round < global_round: - # do work - values = collectStatusAndState(nodename, l_plcnodes) - global_round = externalState['round'] - externalState['nodes'][nodename]['values'] = values - externalState['nodes'][nodename]['round'] = global_round - else: - count += 1 - - if count % 20 == 0: - database.dbDump(config.dbname, externalState) - - database.dbDump(config.dbname, externalState) - -fb = database.dbLoad('findbad') - -def getnodesup(nodelist): - up = 0 - for node in nodelist: - if node['hostname'] in fb['nodes'].keys(): - try: - if fb['nodes'][node['hostname']]['values']['state'] == "BOOT": - up = up + 1 - except: - pass - return up - -def get(fb, path): - indexes = path.split("/") - values = fb - for index in indexes: - if index in values: - values = values[index] - else: - return None - return values -def collectStatusAndState(nodename, l_plcnodes): - global count + nodehist = HistoryNodeRecord.findby_or_create(hostname=nodename, + if_new_set={'status' : 'offline', + 'last_changed' : datetime.now()}) + nodehist.last_checked = datetime.now() - d_node = None - for node in l_plcnodes: - if node['hostname'] == nodename: - d_node = node - break - if not d_node: - return None - - pf = PersistFlags(nodename, 1, db='node_persistflags') - - if not pf.checkattr('last_changed'): - pf.last_changed = time.time() - - pf.last_checked = time.time() - - if not pf.checkattr('status'): - pf.status = "unknown" - - state_path = "nodes/" + nodename + "/values/state" - bootstate_path = "nodes/" + nodename + "/values/plcnode/boot_state" - - if get(fb, state_path) == "BOOT": - if pf.status != "good": pf.last_changed = time.time() - pf.status = "good" - elif get(fb, state_path) == "DEBUG": - bs = get(fb, bootstate_path) - if pf.status != bs: pf.last_changed = time.time() - pf.status = bs - else: - if pf.status != "down": pf.last_changed = time.time() - pf.status = "down" + try: + # Find the most recent record + noderec = FindbadNodeRecord.get_latest_by(hostname=nodename) + except: + print "COULD NOT FIND %s" % nodename + import traceback + email_exception() + print traceback.print_exc() + continue + + if not noderec: + print "none object for %s"% nodename + continue - count += 1 - print "%d %35s %s since(%s)" % (count, nodename, pf.status, diff_time(pf.last_changed)) - # updated by other modules - #pf.enabled = - #pf.suspended = + check_node_state(noderec, nodehist) - pf.save() + count += 1 + print "%d %35s %s since(%s)" % (count, nodename, nodehist.status, diff_time(time.mktime(nodehist.last_changed.timetuple()))) + + # NOTE: this commits all pending operations to the DB. Do not remove. + session.flush() return True if __name__ == '__main__': - import parser as parsermodule + from monitor import parser as parsermodule parser = parsermodule.getParser(['nodesets']) - parser.set_defaults(filename=None, node=None, nodeselect=False, nodegroup=None, - increment=False, dbname="nodebad", cachenodes=False) - - parser.add_option("", "--dbname", dest="dbname", metavar="FILE", - help="Specify the name of the database to which the information is saved") - parser.add_option("-i", "--increment", action="store_true", dest="increment", - help="Increment round number to force refresh or retry") + parser.set_defaults(filename=None, node=None, nodeselect=False, nodegroup=None, cachenodes=False) parser = parsermodule.getParser(['defaults'], parser) config = parsermodule.parse_args(parser) try: - main(config) + main2(config) except Exception, err: import traceback print traceback.print_exc() print "Exception: %s" % err - print "Saving data... exitting." - database.dbDump(config.dbname, externalState) sys.exit(0)