X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=sitebad.py;h=a779a8ef30f355ede7227c971ad1e7891bf4d753;hb=d56e149e9dbd22321a9d843b41b4d279614937e2;hp=750572a2a938da4c225b5635b43679cc822362fc;hpb=b548c69db3d1f302b4d0d08377f0231eb3c4fd58;p=monitor.git diff --git a/sitebad.py b/sitebad.py index 750572a..a779a8e 100755 --- a/sitebad.py +++ b/sitebad.py @@ -4,54 +4,112 @@ import os import sys import string import time - - -import database -import comon -import threadpool -import syncplcdb -from nodequery import verify,query_to_dict,node_select from datetime import datetime,timedelta -import config -from sqlobject import connectionForURI,sqlhub -connection = connectionForURI(config.sqlobjecturi) -sqlhub.processConnection = connection -from infovacuum.model.findbadrecord import * -from infovacuum.model.historyrecord import * +from monitor import database +from monitor import parser as parsermodule +from monitor import config +from monitor.database.info.model import * +from monitor.wrapper import plc, plccache +from monitor.const import MINUP + +from monitor.common import * +from monitor.query import verify,query_to_dict,node_select +from monitor.model import * -import plc api = plc.getAuthAPI() -from unified_model import * -from const import MINUP +def main(): + main2(config) -def main(config): +def main2(config): - l_nodes = syncplcdb.create_plcdb() - l_plcsites = database.dbLoad("l_plcsites") + l_nodes = plccache.l_nodes + l_plcsites = plccache.l_sites if config.site: l_sites = [config.site] + elif config.node: + l_sites = [plccache.plcdb_hn2lb[config.node]] + elif config.sitelist: + site_list = config.sitelist.split(',') + l_sites = site_list else: l_sites = [site['login_base'] for site in l_plcsites] - checkAndRecordState(l_sites, l_plcsites) + checkAndRecordState(l_sites, l_plcsites, config.checkpcu) -def getnodesup(nodelist): +def getnodesup(nodelist, checkpcu): + # NOTE : assume that a blacklisted node is fine, since we're told not to + # ignore it, no policy actions should be taken for it. up = 0 for node in nodelist: try: - noderec = FindbadNodeRecord.select(FindbadNodeRecord.q.hostname==node['hostname'], - orderBy='date_checked').reversed()[0] - if noderec.observed_status == "BOOT": - up = up + 1 + # NOTE: adding a condition for nodehist.haspcu would include pcus + # in the calculation + nodehist = HistoryNodeRecord.findby_or_create(hostname=node['hostname']) + nodebl = BlacklistRecord.get_by(hostname=node['hostname']) + if checkpcu: + # get pcu history for node + if nodehist.haspcu: + # get node record for pcuid + noderec = FindbadNodeRecord.get_latest_by(hostname=node['hostname']) + # get pcuhistory based on pcuid + pcuhist = HistoryPCURecord.findby_or_create(plc_pcuid=noderec.plc_pcuid) + # if pcu is not down & node is not down + if (nodehist is not None and nodehist.status != 'down' and \ + pcuhist is not None and pcuhist.status != 'down') or \ + (nodebl is not None and not nodebl.expired()): + up = up + 1 + + else: + # todo: don't count + pass + else: + if (nodehist is not None and nodehist.status != 'down') or \ + (nodebl is not None and not nodebl.expired()): + up = up + 1 except: - pass + import traceback + email_exception(node['hostname']) + print traceback.print_exc() return up -def checkAndRecordState(l_sites, l_plcsites): +def check_site_state(rec, sitehist): + + if sitehist.new and sitehist.status not in ['new', 'online', 'good']: + sitehist.status = 'new' + sitehist.penalty_applied = True # because new sites are disabled by default, i.e. have a penalty. 
+ sitehist.last_changed = datetime.now() + + if sitehist.nodes_up >= MINUP: + + if sitehist.status != 'online' and sitehist.status != 'good': + sitehist.last_changed = datetime.now() + + if changed_lessthan(sitehist.last_changed, 0.5) and sitehist.status != 'online': + print "changed status from %s to online" % sitehist.status + sitehist.status = 'online' + + if changed_greaterthan(sitehist.last_changed, 0.5) and sitehist.status != 'good': + print "changed status from %s to good" % sitehist.status + sitehist.status = 'good' + + elif not sitehist.new: + + if sitehist.status != 'offline' and sitehist.status != 'down': + sitehist.last_changed = datetime.now() + + if changed_lessthan(sitehist.last_changed, 0.5) and sitehist.status != 'offline': + print "changed status from %s to offline" % sitehist.status + sitehist.status = 'offline' + + if changed_greaterthan(sitehist.last_changed, 0.5) and sitehist.status != 'down': + print "changed status from %s to down" % sitehist.status + sitehist.status = 'down' + +def checkAndRecordState(l_sites, l_plcsites, checkpcu): count = 0 - lb2hn = database.dbLoad("plcdb_lb2hn") + lb2hn = plccache.plcdb_lb2hn for sitename in l_sites: d_site = None for site in l_plcsites: @@ -62,46 +120,56 @@ def checkAndRecordState(l_sites, l_plcsites): continue if sitename in lb2hn: - try: - pf = HistorySiteRecord.by_loginbase(sitename) - except: - pf = HistorySiteRecord(loginbase=sitename) - - pf.last_checked = datetime.now() - - pf.slices_used = len(d_site['slice_ids']) - pf.nodes_total = len(lb2hn[sitename]) - pf.nodes_up = getnodesup(lb2hn[sitename]) - - if pf.nodes_up >= MINUP: - if pf.status != "good": pf.last_changed = datetime.now() - pf.status = "good" - else: - if pf.status != "down": pf.last_changed = datetime.now() - pf.status = "down" + sitehist = HistorySiteRecord.findby_or_create(loginbase=sitename, + if_new_set={'status' : 'unknown', + 'last_changed' : datetime.now(), + 'message_id': 0, + 'penalty_level' : 0}) + sitehist.last_checked = datetime.now() + + sitehist.plc_siteid = d_site['site_id'] + sitehist.slices_total = d_site['max_slices'] + sitehist.slices_used = len(d_site['slice_ids']) + sitehist.nodes_total = len(lb2hn[sitename]) + if sitehist.message_id != 0: + rtstatus = mailer.getTicketStatus(sitehist.message_id) + sitehist.message_status = rtstatus['Status'] + sitehist.message_queue = rtstatus['Queue'] + sitehist.message_created = datetime.fromtimestamp(rtstatus['Created']) + + sitehist.nodes_up = getnodesup(lb2hn[sitename], checkpcu) + sitehist.new = changed_lessthan(datetime.fromtimestamp(d_site['date_created']), 30) # created < 30 days ago + sitehist.enabled = d_site['enabled'] + + check_site_state(d_site, sitehist) count += 1 - print "%d %15s slices(%2s) nodes(%2s) up(%2s) %s" % (count, sitename, pf.slices_used, - pf.nodes_total, pf.nodes_up, pf.status) + print "%d %15s slices(%2s) nodes(%2s) notdown(%2s) %s" % (count, sitename, sitehist.slices_used, + sitehist.nodes_total, sitehist.nodes_up, sitehist.status) + sitehist.flush() + + print HistorySiteRecord.query.count() + session.flush() return True if __name__ == '__main__': - import parser as parsermodule + from monitor import parser as parsermodule parser = parsermodule.getParser() - parser.set_defaults(filename=None, node=None, site=None, - nodeselect=False, nodegroup=None, cachenodes=False) + parser.set_defaults(checkpcu=False) parser.add_option("", "--site", dest="site", metavar="login_base", help="Provide a single site to operate on") - parser.add_option("", "--sitelist", dest="sitelist", 
metavar="file.list", - help="Provide a list of files to operate on") + parser.add_option("", "--sitelist", dest="sitelist", + help="Provide a list of sites separated by ','") + parser.add_option("", "--checkpcu", dest="checkpcu", action="store_true", + help="whether to include PCUs in the site status") config = parsermodule.parse_args(parser) try: - main(config) + main2(config) except Exception, err: import traceback print traceback.print_exc()