X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=pcubad.py;h=59dfe7ae38ed991c78ddf2b5b36b5e76391988f6;hb=refs%2Fheads%2F2.0;hp=fbcdb1714457168a0e2139007ebbf7a1297bb87c;hpb=c51ad794e8dc07072d705b508e79ba06849aa408;p=monitor.git diff --git a/pcubad.py b/pcubad.py index fbcdb17..59dfe7a 100755 --- a/pcubad.py +++ b/pcubad.py @@ -4,39 +4,53 @@ import os import sys import string import time - -from reboot import pcu_name - -import database -import comon -import threadpool -import syncplcdb +import sets +from datetime import datetime,timedelta + +from monitor import database +from monitor import reboot +from monitor import parser as parsermodule +from monitor import config +from monitor.database.info.model import HistoryPCURecord, FindbadPCURecord +from monitor.database.dborm import mon_session as session +from monitor.wrapper import plc,plccache +from monitor.const import MINUP + +from monitor.common import * from nodequery import verify,query_to_dict,node_select +from monitor.model import * -import plc api = plc.getAuthAPI() -from unified_model import * -from monitor_policy import MINUP -round = 1 -externalState = {'round': round, 'nodes': {}} -count = 0 +def main(): + main2(config) + +def main2(config): + + l_plcpcus = plccache.l_pcus -def main(config): - global externalState - externalState = database.if_cached_else(1, config.dbname, lambda : externalState) - if config.increment: - # update global round number to force refreshes across all pcus - externalState['round'] += 1 + l_pcus = None + if config.site is not None: + site = plccache.GetSitesByName([config.site]) + l_nodes = plccache.GetNodesByIds(site[0]['node_ids']) + pcus = [] + for node in l_nodes: + pcus += node['pcu_ids'] + # clear out dups. + l_pcus = [pcu for pcu in sets.Set(pcus)] - l_plcpcus = database.if_cached_else_refresh(1, 1, "pculist", lambda : plc.GetPCUs()) + elif config.node: + node = plccache.GetNodeByName(config.node) + pcus = node['pcu_ids'] + # clear out dups. + l_pcus = [pcu for pcu in sets.Set(pcus)] - l_pcu = None - if config.pcu: + elif config.pcu: for pcu in l_plcpcus: - if pcu['hostname'] == config.pcu or pcu['ip'] == config.pcu: + if ( pcu['hostname'] is not None and config.pcu in pcu['hostname'] ) or \ + ( pcu['ip'] is not None and config.pcu in pcu['ip'] ): l_pcus = [pcu['pcu_id']] - if not l_pcu: + if not l_pcus: print "ERROR: could not find pcu %s" % config.pcu sys.exit(1) else: @@ -44,119 +58,103 @@ def main(config): checkAndRecordState(l_pcus, l_plcpcus) -def checkAndRecordState(l_pcus, l_plcpcus): - global externalState - global count - global_round = externalState['round'] +hn2lb = plccache.plcdb_hn2lb - for pcuname in l_pcus: - if pcuname not in externalState['nodes']: - externalState['nodes'][pcuname] = {'round': 0, 'values': []} - - pcu_round = externalState['nodes'][pcuname]['round'] - if pcu_round < global_round: - # do work - values = collectStatusAndState(pcuname, l_plcpcus) - global_round = externalState['round'] - externalState['nodes'][pcuname]['values'] = values - externalState['nodes'][pcuname]['round'] = global_round - else: - count += 1 - - if count % 20 == 0: - database.dbDump(config.dbname, externalState) - - database.dbDump(config.dbname, externalState) - -fbpcu = database.dbLoad('findbadpcus') -hn2lb = database.dbLoad("plcdb_hn2lb") - -def get(fb, path): - indexes = path.split("/") - values = fb - for index in indexes: - if index in values: - values = values[index] - else: - return None - return values - -def collectStatusAndState(pcuname, l_plcpcus): - global count - - d_pcu = None - for pcu in l_plcpcus: - if pcu['pcu_id'] == pcuname: - d_pcu = pcu - break - if not d_pcu: - return None - - pf = PersistFlags(pcuname, 1, db='pcu_persistflags') - - if not pf.checkattr('last_changed'): - pf.last_changed = time.time() - - pf.last_checked = time.time() - - if not pf.checkattr('valid'): - pf.valid = "unknown" - pf.last_valid = 0 - - if not pf.checkattr('status'): - pf.status = "unknown" - - state_path = "nodes/id_" + str(pcuname) + "/values/reboot" - bootstate_path = "nodes/id_" + str(pcuname) + "/values/plcpcu/boot_state" - - current_state = get(fbpcu, state_path) - if current_state == 0: - if pf.status != "good": pf.last_changed = time.time() - pf.status = "good" - elif current_state == 'NetDown': - if pf.status != "netdown": pf.last_changed = time.time() - pf.status = "netdown" - elif current_state == 'Not_Run': - if pf.status != "badconfig": pf.last_changed = time.time() - pf.status = "badconfig" - else: - if pf.status != "error": pf.last_changed = time.time() - pf.status = "error" +def check_pcu_state(rec, pcu): + + pcu_state = rec.reboot_trial_status - count += 1 - print "%d %35s %s since(%s)" % (count, pcu_name(d_pcu), pf.status, diff_time(pf.last_changed)) - # updated by other modules - #pf.enabled = - #pf.suspended = + if ( pcu_state == 'NetDown' or pcu_state == 'Not_Run' or not ( pcu_state == 0 or pcu_state == "0" ) ) and \ + ( pcu.status == 'online' or pcu.status == 'good' ): + print "changed status from %s to offline" % pcu.status + pcu.status = 'offline' + pcu.last_changed = datetime.now() - pf.save() + if ( pcu_state == 0 or pcu_state == "0" ) and pcu.status not in [ 'online', 'good' ]: + print "changed status from %s to online" % pcu.status + pcu.status = 'online' + pcu.last_changed = datetime.now() + + if pcu.status == 'online' and changed_greaterthan(pcu.last_changed, 0.5): + #send thank you notice, or on-line notice. + print "changed status from %s to good" % pcu.status + pcu.status = 'good' + # NOTE: do not reset last_changed, or you lose how long it's been up. + + if pcu.status == 'offline' and changed_greaterthan(pcu.last_changed, 2): + # send down pcu notice + print "changed status from %s to down" % pcu.status + pcu.status = 'down' + pcu.last_changed = datetime.now() + + if ( pcu.status == 'offline' or pcu.status == 'down' ) and changed_greaterthan(pcu.last_changed, 2*30): + print "changed status from %s to down" % pcu.status + pcu.status = 'down' + pcu.last_changed = datetime.now() + +def checkAndRecordState(l_pcus, l_plcpcus): + count = 0 + for pcuname in l_pcus: + + d_pcu = None + for pcu in l_plcpcus: + if pcu['pcu_id'] == pcuname: + d_pcu = pcu + break + if not d_pcu: + continue + + pcuhist = HistoryPCURecord.findby_or_create(plc_pcuid=d_pcu['pcu_id'], + if_new_set={'status' : 'offline', + 'last_changed' : datetime.now()}) + pcuhist.last_checked = datetime.now() + + try: + # Find the most recent record + pcurec = FindbadPCURecord.query.filter(FindbadPCURecord.plc_pcuid==pcuname).first() + except: + print "COULD NOT FIND FB record for %s" % reboot.pcu_name(d_pcu) + import traceback + email_exception() + print traceback.print_exc() + # don't have the info to create a new entry right now, so continue. + continue + + if not pcurec: + print "none object for pcu %s"% reboot.pcu_name(d_pcu) + continue + + check_pcu_state(pcurec, pcuhist) + + count += 1 + print "%d %35s %s since(%s)" % (count, reboot.pcu_name(d_pcu), pcuhist.status, diff_time(time.mktime(pcuhist.last_changed.timetuple()))) + + # NOTE: this commits all pending operations to the DB. Do not remove, or + # replace with another operations that also commits all pending ops, such + # as session.commit() or flush() or something + session.flush() + print HistoryPCURecord.query.count() return True if __name__ == '__main__': - from config import config - from optparse import OptionParser - parser = OptionParser() - parser.set_defaults(filename=None, pcu=None, pcuselect=False, pcugroup=None, - increment=False, dbname="pcubad", cachepcus=False) + parser = parsermodule.getParser() + parser.set_defaults(filename=None, pcu=None, node=None, site=None, pcuselect=False, pcugroup=None, cachepcus=False) parser.add_option("", "--pcu", dest="pcu", metavar="hostname", help="Provide a single pcu to operate on") + parser.add_option("", "--site", dest="site", metavar="sitename", + help="Provide a single sitename to operate on") + parser.add_option("", "--node", dest="node", metavar="nodename", + help="Provide a single node to operate on") parser.add_option("", "--pculist", dest="pculist", metavar="file.list", help="Provide a list of files to operate on") - parser.add_option("", "--dbname", dest="dbname", metavar="FILE", - help="Specify the name of the database to which the information is saved") - parser.add_option("-i", "--increment", action="store_true", dest="increment", - help="Increment round number to force refresh or retry") - config = config(parser) - config.parse_args() + config = parsermodule.parse_args(parser) try: - main(config) + main2(config) except Exception, err: import traceback - print traceback.print_exc() + traceback.print_exc() print "Exception: %s" % err - print "Saving data... exitting." - database.dbDump(config.dbname, externalState) sys.exit(0)