# nodebad.py -- reconstructed from the "+"/context side of the gitweb diff
#   http://git.onelab.eu/?a=blobdiff_plain;f=nodebad.py;h=c3aae39619d18335e985f1d62f2da2584d053ce5;hb=6a452e8ece2ca8a47105c128eaebc38507bc76c5;hp=96720fbda5d8bbcb3c4e3d8e895658f1347c36de;hpb=6496f5b4a0220e4055fee76c97f92293f9559117;p=monitor.git
# (blob 96720fb -> c3aae39).  The hunk starts at line 4 of the file, so
# anything above the first import (e.g. a shebang line) is not shown here.
#
# Purpose: for every selected node, fold the latest "findbad" observation
# (FindbadNodeRecord) into the long-lived per-node history record
# (HistoryNodeRecord), applying some hysteresis to the status transitions.

import os
import sys
import string
import time
import traceback
from datetime import datetime,timedelta

from nodequery import verify,query_to_dict,node_select
# NOTE(review): changed_greaterthan, diff_time, get_nodeset and
# email_exception are not defined in this file; presumably they arrive via
# one of the star imports below -- confirm before pruning either of them.
from monitor.common import *

from monitor import config
from monitor.wrapper import plc,plccache
from monitor.const import MINUP
from monitor.database.info.model import FindbadNodeRecord, HistoryNodeRecord
from monitor.database.dborm import mon_session as session

from monitor.model import *

api = plc.getAuthAPI()

# NOTE(review): 'round' shadows the builtin and is not referenced by the code
# visible here; kept in case another module reads nodebad.round.
round = 1
# Running total of nodes processed by checkAndRecordState (used in its output).
count = 0


def main():
    """Entry point for callers that rely on the module-level `config`."""
    main2(config)


def main2(config):
    """Update the history record of every node selected by `config`.

    `config` is handed to get_nodeset() to obtain the hostnames to process;
    the cached PLC node list is passed along unchanged.
    """
    l_plcnodes = plccache.l_nodes
    l_nodes = get_nodeset(config)

    checkAndRecordState(l_nodes, l_plcnodes)


# Node states:

def _change_status(node, new_status, reset_clock=True):
    """Log a status transition and apply it to the history record `node`.

    When `reset_clock` is false, node.last_changed is left untouched so the
    record keeps showing how long the node has been up (or down).
    """
    print("changed status from %s to %s" % (node.status, new_status))
    node.status = new_status
    if reset_clock:
        node.last_changed = datetime.now()


def check_node_state(rec, node):
    """Translate the latest findbad observation into the node's history status.

    rec  -- latest findbad record; reads rec.observed_status and
            rec.plc_node_stats (a mapping with 'boot_state', 'last_contact'
            and 'pcu_ids', or a false value when PLC data is unavailable).
    node -- history record, updated in place: status, last_changed, haspcu.

    The thresholds handed to changed_greaterthan() are in days according to
    the comments that accompany them.
    """
    node_state = rec.observed_status
    stats = rec.plc_node_stats
    if stats:
        # NOTE(review): looks like leftover debug output; kept so the log
        # format does not change.
        print(stats)
        boot_state = stats['boot_state']
        last_contact = stats['last_contact']
        pcu_ids = stats['pcu_ids']
    else:
        # No PLC data at all.  The original indexed plc_node_stats['pcu_ids']
        # unconditionally further down and therefore crashed on this path.
        boot_state = "unknown"
        last_contact = None
        pcu_ids = []

    # Normalise the abbreviated boot-state spellings.
    if boot_state == 'disable':
        boot_state = 'disabled'
    if boot_state == 'diag':
        boot_state = 'diagnose'

    node.haspcu = len(pcu_ids) > 0

    # NOTE: 'DOWN' and 'DEBUG' are temporary states, so only need
    #       'translations' into the node.status state
    #       'BOOT' is a permanent state, but we want it to have a bit of
    #       hysteresis (less than 0.5 days)

    #################################################################
    # "Initialize" the findbad states into nodebad status if they are not
    # already set

    if node_state == 'DOWN' and node.status not in ('offline', 'down') \
            and boot_state != 'disabled':
        _change_status(node, 'offline')

    if node_state == 'DEBUG' and \
            node.status not in ('monitordebug', 'disabled', 'diagnose'):
        if boot_state not in ('disabled', 'diagnose'):
            _change_status(node, 'monitordebug')
        else:
            # An admin-selected boot state wins over the monitor's own label.
            _change_status(node, boot_state)

    if node_state == 'BOOT' and node.status not in ('online', 'good'):
        _change_status(node, 'online')

    #################################################################
    # Switch temporary hysteresis states into their 'firm' states.
    #     online -> good             after half a day
    #     offline -> down            after two days
    #     monitordebug -> down       after 30 days
    #     diagnose -> monitordebug   after 60 days
    #     disabled -> down           after 60 days

    if node.status == 'online' and changed_greaterthan(node.last_changed, 0.5):
        # NOTE: do not reset last_changed, or you lose how long it's been up.
        _change_status(node, 'good', reset_clock=False)

    if node.status == 'offline' and changed_greaterthan(node.last_changed, 2):
        # NOTE: do not reset last_changed, or you lose how long it's been down.
        _change_status(node, 'down', reset_clock=False)

    if node.status == 'monitordebug' and changed_greaterthan(node.last_changed, 30):
        # NOTE: do not reset last_changed, or you lose how long it's been down.
        _change_status(node, 'down', reset_clock=False)

    if node.status == 'diagnose' and changed_greaterthan(node.last_changed, 60):
        # NOTE: change an admin mode back into monitordebug after two months.
        # (The original logged "... to down" here although the status it set
        # was 'monitordebug'; the message now names the real target.)
        _change_status(node, 'monitordebug')

    # extreme cases of offline nodes
    if (boot_state == 'disabled' or last_contact is None) and \
            changed_greaterthan(node.last_changed, 2*30) and \
            node.status != 'down':
        _change_status(node, 'down')


def checkAndRecordState(l_nodes, l_plcnodes):
    """Refresh the history record of each hostname in `l_nodes`.

    A history record is created (status 'offline') for hostnames seen for the
    first time.  Nodes whose latest findbad record cannot be loaded are
    reported and skipped.  `l_plcnodes` is accepted for interface
    compatibility and is not used here.  Always returns True.
    """
    global count

    for nodename in l_nodes:

        nodehist = HistoryNodeRecord.findby_or_create(
            hostname=nodename,
            if_new_set={'status': 'offline',
                        'last_changed': datetime.now()})
        nodehist.last_checked = datetime.now()

        try:
            # Find the most recent record
            noderec = FindbadNodeRecord.get_latest_by(hostname=nodename)
        except Exception:
            # Best effort: report, mail the exception, move on to the next
            # node.  Narrowed from a bare except so Ctrl-C / SystemExit still
            # stop the run.
            print("COULD NOT FIND %s" % nodename)
            email_exception()
            # print_exc() writes the traceback itself and returns None, so it
            # must not be wrapped in print.
            traceback.print_exc()
            continue

        if not noderec:
            print("none object for %s" % nodename)
            continue

        check_node_state(noderec, nodehist)

        count += 1
        since = diff_time(time.mktime(nodehist.last_changed.timetuple()))
        print("%d %35s %s since(%s)" % (count, nodename, nodehist.status, since))

    # NOTE: this commits all pending operations to the DB.  Do not remove.
    session.flush()

    return True


if __name__ == '__main__':
    from monitor import parser as parsermodule
    parser = parsermodule.getParser(['nodesets'])
    parser.set_defaults(filename=None, node=None, nodeselect=False, nodegroup=None, cachenodes=False)
    parser = parsermodule.getParser(['defaults'], parser)
    config = parsermodule.parse_args(parser)

    try:
        main2(config)
    except Exception:
        # sys.exc_info() instead of "except Exception, err" so the module
        # parses on both old and new interpreters.
        err = sys.exc_info()[1]
        traceback.print_exc()
        print("Exception: %s" % err)
        # NOTE(review): exits with status 0 even after a failure, exactly as
        # the original did -- confirm whether callers rely on that.
        sys.exit(0)