From 90b2e8e7cb145cb1f6b3780867617084441b6ca9 Mon Sep 17 00:00:00 2001 From: Stephen Soltesz Date: Wed, 12 Nov 2008 00:31:22 +0000 Subject: [PATCH] use plccache, instead of directly loading pkl files dumpact.py pcuinfo.py findbad.py showlatlon.py nodebad.py pcubad.py dumpdiag.py findbadpcu.py nodeinfo.py sitebad.py monitor_policy.py operate on new database model nodequery.py clean_policy.py nodecommon.py - updated key names in fields bootman.py nodeinfo.py siteinfo.py nodegroups.py nodeconfig.py Use latest monitor module getconf.py grouprins.py todo --- bootman.py | 9 +- clean_policy.py | 445 ++++++++++++++---------------------- dumpact.py | 3 +- dumpdiag.py | 3 +- findbad.py | 7 +- findbadpcu.py | 12 +- getconf.py | 9 +- grouprins.py | 24 +- monitor/wrapper/plccache.py | 5 +- monitor_policy.py | 7 +- nodebad.py | 8 +- nodecommon.py | 45 ++-- nodeconfig.py | 11 +- nodegroups.py | 18 +- nodeinfo.py | 19 +- nodequery.py | 87 +++---- pcubad.py | 8 +- pcuinfo.py | 8 +- printbadcsv.py | 2 - showlatlon.py | 8 +- sitebad.py | 9 +- siteinfo.py | 16 +- todo | 56 +++++ unified_model.py | 111 ++------- 24 files changed, 406 insertions(+), 524 deletions(-) diff --git a/bootman.py b/bootman.py index 0e13517..e8dc7b8 100755 --- a/bootman.py +++ b/bootman.py @@ -36,13 +36,6 @@ from Rpyc import SocketConnection, Async from Rpyc.Utils import * fb = None -def get_fbnode(node): - global fb - if fb is None: - fb = database.dbLoad("findbad") - fbnode = fb['nodes'][node]['values'] - return fbnode - class NodeConnection: def __init__(self, connection, node, config): self.node = node @@ -314,7 +307,7 @@ def reboot(hostname, config=None, forced_action=None): # NOTE: Nothing works if the bootcd is REALLY old. # So, this is the first step. - fbnode = get_fbnode(hostname) + fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict() if fbnode['category'] == "OLDBOOTCD": print "...NOTIFY OWNER TO UPDATE BOOTCD!!!" 
args = {} diff --git a/clean_policy.py b/clean_policy.py index 8e35903..516a8de 100644 --- a/clean_policy.py +++ b/clean_policy.py @@ -6,6 +6,8 @@ from unified_model import cmpCategoryVal import sys import emailTxt import string +from monitor.wrapper import plccache +from datetime import datetime from rt import is_host_in_rt_tickets import plc @@ -20,101 +22,76 @@ from const import * from unified_model import * -def get_ticket_id(record): - if 'ticket_id' in record and record['ticket_id'] is not "" and record['ticket_id'] is not None: - return record['ticket_id'] - elif 'found_rt_ticket' in record and \ - record['found_rt_ticket'] is not "" and \ - record['found_rt_ticket'] is not None: - return record['found_rt_ticket'] - else: - return None - class MonitorMergeDiagnoseSendEscellate: act_all = None - fb = None def __init__(self, hostname, act): self.hostname = hostname self.act = act self.plcdb_hn2lb = None if self.plcdb_hn2lb is None: - self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb") + self.plcdb_hn2lb = plccache.plcdb_hn2lb self.loginbase = self.plcdb_hn2lb[self.hostname] return - def getFBRecord(self): - if MonitorMergeDiagnoseSendEscellate.fb == None: - MonitorMergeDiagnoseSendEscellate.fb = database.dbLoad("findbad") - - fb = MonitorMergeDiagnoseSendEscellate.fb - - if self.hostname in fb['nodes']: - fbnode = fb['nodes'][self.hostname]['values'] + def getFBRecords(self): + fbrecs = FindbadNodeRecord.get_latest_n_by(hostname=self.hostname) + fbnodes = None + if fbrecs: + fbnodes = fbrecs else: - raise Exception("Hostname %s not in scan database"% self.hostname) - return fbnode - - def getActionRecord(self): - # update ticket status - if MonitorMergeDiagnoseSendEscellate.act_all == None: - MonitorMergeDiagnoseSendEscellate.act_all = database.dbLoad("act_all") - - act_all = MonitorMergeDiagnoseSendEscellate.act_all - - if self.hostname in act_all and len(act_all[self.hostname]) > 0: - actnode = act_all[self.hostname][0] + fbnodes = None + return fbnodes + + def getLastActionRecord(self): + actrec = ActionRecord.get_latest_by(hostname=self.hostname) + actnode = None + if actrec: + actnode = actrec else: actnode = None return actnode - def getKernel(self, unamestr): - s = unamestr.split() - if len(s) > 2: - return s[2] - else: - return "" - - def mergeRecord(self, fbnode, actnode): - fbnode['kernel'] = self.getKernel(fbnode['kernel']) - fbnode['stage'] = "findbad" - fbnode['message'] = None - fbnode['args'] = None - fbnode['info'] = None - fbnode['log'] = None - fbnode['time'] = time.time() - fbnode['email'] = TECH - fbnode['action-level'] = 0 - fbnode['action'] = ['noop'] - fbnode['date_created'] = time.time() - - if actnode is None: # there is no entry in act_all - actnode = {} - actnode.update(fbnode) - actnode['ticket_id'] = "" - actnode['prev_category'] = "ERROR" + def getPreviousCategory(self, actrec): + ret = None + if actrec: + ret = actrec.findbad_records[0].observed_category else: - actnode['prev_category']= actnode['category'] - actnode['comonstats'] = fbnode['comonstats'] - actnode['category'] = fbnode['category'] - actnode['state'] = fbnode['state'] - actnode['kernel'] = fbnode['kernel'] - actnode['bootcd'] = fbnode['bootcd'] - actnode['plcnode'] = fbnode['plcnode'] - ticket = get_ticket_id(actnode) - if ticket is None: actnode['ticket_id'] = "" - actnode['rt'] = mailer.getTicketStatus(ticket) - - #for key in actnode.keys(): - # print "%10s %s %s " % (key, "==", actnode[key]) - #print "----------------------------" + ret = "ERROR" + return ret - return actnode + + def 
mergeRecord(self, fbnodes, actrec): + + actdefault = {} + actdefault['date_created'] = datetime.now() + actdefault['date_action_taken'] = datetime.now() + + actdefault['stage'] = "initial" + actdefault['message_series'] = None + actdefault['message_index'] = None + actdefault['message_arguments'] = None + + actdefault['send_email_to'] = TECH + actdefault['penalty_level'] = 0 + actdefault['action'] = [ 'noop' ] + actdefault['take_action'] = False + + actdefault['ticket_id'] = "" + actdefault['findbad_records'] = fbnodes + actdefault['last_action_record'] = actrec + + actdefault['prev_category'] = self.getPreviousCategory(actrec) + actdefault['category'] = fbnodes[0].observed_category + + actdefault['rt'] = mailer.getTicketStatus(actrec.ticket_id) + + return actdefault def run(self): - fbnode = self.getFBRecord() - actnode= self.getActionRecord() - actrec = self.mergeRecord(fbnode, actnode) + fbnodes = self.getFBRecords() + actnode= self.getLastActionRecord() + actrec = self.mergeRecord(fbnodes, actnode) record = Record(self.hostname, actrec) diag = self.diagnose(record) if self.act and diag is not None: @@ -122,26 +99,21 @@ class MonitorMergeDiagnoseSendEscellate: def diagnose(self, record): - diag = PersistFlags(record.hostname, 60*60*24, db='persist_diagnose_flags') + diag = {} # NOTE: change record stage based on RT status. - #diag.setFlag('ResetStage') if record.stageIswaitforever(): ticket = record.data['rt'] if 'new' in ticket['Status']: print "Resetting Stage!!!!!" - # diag.setFlag('ResetStage') record.reset_stage() - #if diag.getFlag('ResetStage'): - # print "diagnose: resetting stage" - # diag.resetFlag('ResetStage') if 'resolved' in ticket['Status']: - diag.setFlag('RTEndRecord') + diag['RTEndRecord'] = True # NOTE: take category, and prepare action category = record.getCategory() if category == "error": - diag.setFlag('SendNodedown') + diag['SendNodedown'] = True record.data['message_series'] = emailTxt.mailtxt.newdown record.data['log'] = self.getDownLog(record) @@ -149,7 +121,7 @@ class MonitorMergeDiagnoseSendEscellate: state = record.getState() if state == "boot": if record.severity() != 0: - diag.setFlag('SendThankyou') + diag['SendThankyou'] = True print "RESETTING STAGE: improvement" record.data['stage'] = 'improvement' record.data['message_series'] = emailTxt.mailtxt.newthankyou @@ -167,105 +139,85 @@ class MonitorMergeDiagnoseSendEscellate: # TODO: how to not send email?... 
- record = self.checkStageAndTime(diag,record) + record = self.checkStageAndTime(record) #if record: print "diagnose: checkStageAndTime Returned Valid Record" - site = PersistFlags(self.loginbase, 1, db='site_persistflags') + siterec = HistorySiteRecord.by_loginbase(self.loginbase) - if "good" not in site.status: # != "good": + if "good" not in siterec.status: # != "good": print "diagnose: Setting site %s for 'squeeze'" % self.loginbase - diag.setFlag('Squeeze') + diag['Squeeze'] = True else: print "diagnose: Setting site %s for 'backoff'" % self.loginbase - diag.setFlag('BackOff') + diag['BackOff'] = True - diag.save() return diag - #else: - # print "checkStageAndTime Returned NULL Record" - # return None def action(self, record, diag): message = None - #print record.data['stage'] - #print "improvement" in record.data['stage'] - #print self.getSendEmailFlag(record) print "%s %s DAYS DOWN" % ( self.hostname, Record.getDaysDown(record.data) ) if ( self.getSendEmailFlag(record) and Record.getDaysDown(record.data) >= 2 ) or \ "monitor-end-record" in record.data['stage']: print "action: getting message" + #### Send EMAIL message = record.getMessage(record.data['ticket_id']) if message: - #message.reset() print "action: sending email" message.send(record.getContacts()) - #print "DEBUG NOT SENDING MESSAGE WHEN I SHOULD BE!!!!!" - #print "DEBUG NOT SENDING MESSAGE WHEN I SHOULD BE!!!!!" - #print "DEBUG NOT SENDING MESSAGE WHEN I SHOULD BE!!!!!" - #print message if message.rt.ticket_id: print "action: setting record ticket_id" record.data['ticket_id'] = message.rt.ticket_id - if ( record.data['takeaction'] and diag.getFlag('Squeeze') ): + #### APPLY PENALTY + if ( record.data['take_action'] and diag.get('Squeeze') ): print "action: taking action" - record.takeAction(record.data['action-level']) - diag.resetFlag('Squeeze') - diag.save() + record.takeAction(record.data['penalty_level']) + del diag['Squeeze'] if diag.getFlag('BackOff'): record.takeAction(0) - diag.resetFlag('BackOff') - diag.save() + del diag['BackOff'] + #### SAVE TO DB if record.saveAction(): print "action: saving act_all db" self.add_and_save_act_all(record) else: print "action: NOT saving act_all db" - print "stage: %s %s" % ( record.data['stage'], record.data['save-act-all'] ) + print "stage: %s %s" % ( record.data['stage'], record.data['save_act_all'] ) - if record.improved() or diag.getFlag('RTEndRecord'): + #### END RECORD + if record.improved() or diag.get('RTEndRecord'): print "action: end record for %s" % self.hostname record.end_record() - diag.setFlag('CloseRT') - diag.resetFlag('RTEndRecord') - diag.save() - #return None + diag['CloseRT'] = True + diag.pop('RTEndRecord', None) + #### CLOSE RT TICKET if message: - if diag.getFlag('CloseRT'): + if diag.get('CloseRT'): message.rt.closeTicket() - diag.resetFlag('CloseRT') - diag.save() + del diag['CloseRT'] else: print "NOT sending email : %s %s" % (config.mail, record.data['rt']) return - def getSendEmailFlag(self, record): - if not config.mail: - return False - - # resend if open & created longer than 30 days ago. 
- if 'rt' in record.data and \ - 'Status' in record.data['rt'] and \ - "open" in record.data['rt']['Status'] and \ - record.data['rt']['Created'] > int(time.time() - 60*60*24*30): - # if created-time is greater than the thirty days ago from the current time - return False - - return True - def add_and_save_act_all(self, record): - self.act_all = database.dbLoad("act_all") - if self.hostname not in self.act_all: - self.act_all[self.hostname] = [] - self.act_all[self.hostname].insert(0,record.data) - database.dbDump("act_all", self.act_all) - + """ + Read the sync record for this node, and increment the round and + create an ActionRecord for this host using the record.data values. + """ + recsync = RecordActionSync.get_by(hostname=self.hostname) + rec = RecordAction(hostname=self.hostname) + recsync.round += 1 + record.data['round'] = recsync.round + # TODO: we will need to delete some of these before setting them in the DB. + rec.set(**record.data) + rec.flush() + def getDownLog(self, record): record.data['args'] = {'nodename': self.hostname} @@ -300,140 +252,82 @@ class MonitorMergeDiagnoseSendEscellate: log = "IMPR: %s improved to %s " % (self.hostname, record.data['category']) return log - def checkStageAndTime(self, diag, record): + def makeRecord(self, **kwargs): + rec = {} + for key in kwargs.keys(): + rec[key] = kwargs[key] + return rec + + def checkStageAndTime(self, record): + """ + The core variables are: + + send_email_to : defines who to send messages to at this time + take_action : whether or not to take action + penalty_level : how much of a penalty to apply + message_index : where in the escalation sequence we are. + save_act_all : whether or not to save the action record in the db. + + action/stage : stage tracks which state we're in. + """ + stages = { + "initial" : [ { 'action': 'noop', 'next': "weekone" } ], + "weekone" : [ { 'action': 'noop', 'index': 0, 'save': True, 'email': TECH, 'length': 7*SPERDAY, 'next': "weektwo" }, ], + "weektwo" : [ { 'action': 'nocreate', 'index': 1, 'save': True, 'email': TECH|PI, 'length': 7*SPERDAY, 'next': "waitforever" }, ], + "waitforever" : [ { 'action': 'suspendslices', 'index': 2, 'save': True, 'email': TECH|PI|USER, 'length': 7*SPERDAY, 'next': "waitforever" }, ], + "paused" : [ { 'action': 'noop', 'save': True, 'length': 30*SPERDAY, 'next': "weekone" }, ], + "improvement" : [ { 'action': 'close_rt', 'index': 0, 'save': True, 'email': TECH, 'next': "monitor-end-record" }, ], + } + # TODO: make this time relative to the PREVIOUS action taken. 
current_time = time.time() - delta = current_time - record.data['time'] - #print record.data - if 'findbad' in record.data['stage']: + current_stage = record.getMostRecentStage() + recent_time = record.getMostRecentTime() + + delta = current_time - recent_time + + if current_stage in stages: + values = stages[current_stage][0] + new_stage = current_stage + if delta >= values['length']: + print "checkStageAndTime: transition to next stage" + new_stage = values['next'] + values = stages[new_stage][0] + + elif delta >= values['length']/3 and not 'second_mail_at_oneweek' in record.data: + print "checkStageAndTime: second message in one week for stage two" + take_action=False + pass + else: + # DO NOTHING + take_action=False + save_act_all=False + message_index=None + print "checkStageAndTime: ignoring this record for: %s" % self.hostname + + rec = self.makeRecord( stage=new_stage, send_email_to=values['email'], + action=values['action'], message_index=values['index'], + save_act_all=values['save'], penalty_level=values['index'], + date_action_taken=current_time) + record.data.update(rec) + + if 'initial' in record.data['stage']: # The node is bad, and there's no previous record of it. - record.data['email'] = TECH - record.data['action'] = ['noop'] - record.data['takeaction'] = False - record.data['message'] = record.data['message_series'][0] - record.data['stage'] = 'stage_actinoneweek' - record.data['save-act-all'] = True - record.data['action-level'] = 0 - - elif 'reboot_node' in record.data['stage']: - record.data['email'] = TECH - record.data['action'] = ['noop'] - record.data['message'] = record.data['message_series'][0] - record.data['stage'] = 'stage_actinoneweek' - record.data['takeaction'] = False - record.data['save-act-all'] = False - record.data['action-level'] = 0 - + rec = self.makeRecord( + stage="weekone", send_email_to=TECH, + action=['noop'], take_action=False, + message_index=0, save_act_all=True, + penalty_level=0, ) + record.data.update(rec) + elif 'improvement' in record.data['stage']: print "checkStageAndTime: backing off of %s" % self.hostname - record.data['action'] = ['close_rt'] - record.data['takeaction'] = True - record.data['message'] = record.data['message_series'][0] - record.data['stage'] = 'monitor-end-record' - record.data['save-act-all'] = True - record.data['action-level'] = 0 - - elif 'actinoneweek' in record.data['stage']: - if delta >= 7 * SPERDAY: - print "checkStageAndTime: transition to next stage actintwoweeks" - record.data['email'] = TECH | PI - record.data['stage'] = 'stage_actintwoweeks' - record.data['message'] = record.data['message_series'][1] - record.data['action'] = ['nocreate' ] - record.data['time'] = current_time # reset clock for waitforever - record.data['takeaction'] = True - record.data['save-act-all'] = True - record.data['action-level'] = 1 - elif delta >= 3* SPERDAY and not 'second-mail-at-oneweek' in record.data: - print "checkStageAndTime: second message in one week" - record.data['email'] = TECH - record.data['message'] = record.data['message_series'][0] - record.data['action'] = ['sendmailagain-waitforoneweekaction' ] - record.data['second-mail-at-oneweek'] = True - record.data['takeaction'] = False - record.data['save-act-all'] = True - record.data['action-level'] = 0 - else: - record.data['message'] = None - record.data['action'] = ['waitforoneweekaction' ] - record.data['takeaction'] = False - record.data['save-act-all'] = False - record.data['action-level'] = 0 - print "checkStageAndTime: ignoring this record for: %s" % self.hostname - #return None 
# don't send if there's no action - - elif 'actintwoweeks' in record.data['stage']: - if delta >= 7 * SPERDAY: - print "checkStageAndTime: transition to next stage waitforever" - record.data['email'] = TECH | PI | USER - record.data['stage'] = 'stage_waitforever' - record.data['message'] = record.data['message_series'][2] - record.data['action'] = ['suspendslices'] - record.data['time'] = current_time # reset clock for waitforever - record.data['takeaction'] = True - record.data['save-act-all'] = True - record.data['action-level'] = 2 - elif delta >= 3* SPERDAY and not 'second-mail-at-twoweeks' in record.data: - print "checkStageAndTime: second message in one week for stage two" - record.data['email'] = TECH | PI - record.data['message'] = record.data['message_series'][1] - record.data['action'] = ['sendmailagain-waitfortwoweeksaction' ] - record.data['second-mail-at-twoweeks'] = True - record.data['takeaction'] = False - record.data['save-act-all'] = True - record.data['action-level'] = 1 - else: - record.data['message'] = None - record.data['takeaction'] = False - record.data['action'] = ['waitfortwoweeksaction'] - record.data['save-act-all'] = False - print "checkStageAndTime: second message in one week for stage two" - record.data['action-level'] = 1 - #return None # don't send if there's no action - - elif 'ticket_waitforever' in record.data['stage']: - record.data['email'] = TECH - record.data['takeaction'] = True - if 'first-found' not in record.data: - record.data['first-found'] = True - record.data['log'] += " firstfound" - record.data['action'] = ['ticket_waitforever'] - record.data['message'] = None - record.data['time'] = current_time - record.data['save-act-all'] = True - record.data['action-level'] = 2 - else: - if delta >= 7*SPERDAY: - record.data['action'] = ['ticket_waitforever'] - record.data['message'] = None - record.data['time'] = current_time # reset clock - record.data['save-act-all'] = True - record.data['action-level'] = 2 - else: - record.data['action'] = ['ticket_waitforever'] - record.data['message'] = None - record.data['takeaction'] = False - record.data['save-act-all'] = False - record.data['action-level'] = 2 - #return None - - elif 'waitforever' in record.data['stage']: - # more than 3 days since last action - # TODO: send only on weekdays. - # NOTE: expects that 'time' has been reset before entering waitforever stage - record.data['takeaction'] = True - if delta >= 3*SPERDAY: - record.data['action'] = ['email-againwaitforever'] - record.data['message'] = record.data['message_series'][2] - record.data['time'] = current_time # reset clock - record.data['save-act-all'] = True - record.data['action-level'] = 2 - else: - record.data['action'] = ['waitforever'] - record.data['message'] = None - record.data['takeaction'] = False - record.data['save-act-all'] = False - record.data['action-level'] = 2 - #return None # don't send if there's no action + rec = self.makeRecord( + stage='monitor-end-record', send_email_to=TECH, + action=['close_rt'], take_action=True, + message_index=0, save_act_all=True, + penalty_level=0, ) + record.data.update(rec) else: # There is no action to be taken, possibly b/c the stage has @@ -443,16 +337,15 @@ class MonitorMergeDiagnoseSendEscellate: # 2. delta is not big enough to bump it to the next stage. # TODO: figure out which. for now assume 2. 
print "UNKNOWN stage for %s; nothing done" % self.hostname - record.data['action'] = ['unknown'] - record.data['message'] = record.data['message_series'][0] - - record.data['email'] = TECH - record.data['action'] = ['noop'] - record.data['message'] = record.data['message_series'][0] - record.data['stage'] = 'stage_actinoneweek' - record.data['time'] = current_time # reset clock - record.data['takeaction'] = False - record.data['save-act-all'] = True + rec = self.makeRecord( + stage='weekone', send_email_to=TECH, + action=['noop'], + take_action=False, + save_act_all=True, + date_action_taken=current_time, + message_index=0, + penalty_level=0, ) + record.data.update(rec) print "%s" % record.data['log'], print "%15s" % record.data['action'] diff --git a/dumpact.py b/dumpact.py index b710a54..713970c 100755 --- a/dumpact.py +++ b/dumpact.py @@ -6,11 +6,12 @@ import sys import time import getopt import database +from monitor.wrapper import plccache def main(): act_all = database.dbLoad(sys.argv[1]) - plcdb_hn2lb = database.dbLoad("plcdb_hn2lb") + plcdb_hn2lb = plccache.plcdb_hn2lb s_nodenames = "" sickdb = {} diff --git a/dumpdiag.py b/dumpdiag.py index 2a2d753..4e38459 100755 --- a/dumpdiag.py +++ b/dumpdiag.py @@ -6,11 +6,12 @@ import sys import time import getopt import database +from monitor.wrapper import plccache def main(): sickdb = database.dbLoad(sys.argv[1]) - plcdb_hn2lb = database.dbLoad("plcdb_hn2lb") + plcdb_hn2lb = plccache.plcdb_hn2lb s_nodenames = "" sorted_keys = sickdb.keys() diff --git a/findbad.py b/findbad.py index 9d2758c..1e412bc 100755 --- a/findbad.py +++ b/findbad.py @@ -13,9 +13,8 @@ from monitor.util import command from monitor import config from monitor.database import FindbadNodeRecordSync, FindbadNodeRecord from monitor.sources import comon -from monitor.wrapper import plc +from monitor.wrapper import plc, plccache -import syncplcdb from nodequery import verify,query_to_dict,node_select import traceback @@ -255,6 +254,7 @@ def recordPingAndSSH(request, result): fbrec = FindbadNodeRecord( date_checked=datetime.fromtimestamp(values['date_checked']), + round=global_round, hostname=nodename, loginbase=values['loginbase'], kernel_version=values['kernel'], @@ -274,6 +274,7 @@ def recordPingAndSSH(request, result): ssh_status = (values['ssh'] == "SSH"), ssh_error = values['ssherror'], observed_status = values['state'], + observed_category = values['category'], ) fbnodesync.round = global_round @@ -353,7 +354,7 @@ def main(): # history information for all nodes #cohash = {} cohash = cotop.coget(cotop_url) - l_nodes = syncplcdb.create_plcdb() + l_nodes = plccache.l_nodes if config.nodelist: f_nodes = util.file.getListFromFile(config.nodelist) l_nodes = filter(lambda x: x['hostname'] in f_nodes, l_nodes) diff --git a/findbadpcu.py b/findbadpcu.py index 3ab97a3..1af600c 100755 --- a/findbadpcu.py +++ b/findbadpcu.py @@ -17,7 +17,7 @@ from monitor.pcu import reboot from monitor import config from monitor.database import FindbadPCURecordSync, FindbadPCURecord from monitor import util -from monitor.wrapper import plc +from monitor.wrapper import plc, plccache from nodequery import pcu_select plc_lock = threading.Lock() @@ -49,7 +49,7 @@ def get_pcu(pcuname): except: try: #print "GetPCU from file %s" % pcuname - l_pcus = database.dbLoad("pculist") + l_pcus = plccache.l_pcus for i in l_pcus: if i['pcu_id'] == pcuname: l_pcu = i @@ -67,7 +67,7 @@ def get_nodes(node_ids): l_node = plc.getNodes(node_ids, ['hostname', 'last_contact', 'node_id', 'ports']) except: try: - plc_nodes = 
database.dbLoad("l_plcnodes") + plc_nodes = plccache.l_plcnodes for n in plc_nodes: if n['node_id'] in node_ids: l_node.append(n) @@ -123,7 +123,7 @@ def get_plc_site_values(site_id): d_site = d_site[0] except: try: - plc_sites = database.dbLoad("l_plcsites") + plc_sites = plccache.l_plcsites for site in plc_sites: if site['site_id'] == site_id: d_site = site @@ -274,6 +274,7 @@ def recordPingAndSSH(request, result): fbrec = FindbadPCURecord( date_checked=datetime.fromtimestamp(values['date_checked']), + record=fbsync.round, plc_pcuid=pcu_id, plc_pcu_stats=values['plc_pcu_stats'], dns_status=values['dnsmatch'], @@ -344,7 +345,8 @@ def checkAndRecordState(l_pcus, cohash): def main(): global global_round - l_pcus = monitor.database.if_cached_else_refresh(1, config.refresh, "pculist", lambda : plc.GetPCUs()) + # monitor.database.if_cached_else_refresh(1, config.refresh, "pculist", lambda : plc.GetPCUs()) + l_pcus = plccache.l_pcus cohash = {} fbsync = FindbadPCURecordSync.findby_or_create(plc_pcuid=0, if_new_set={'round' : global_round}) diff --git a/getconf.py b/getconf.py index 721932f..1f84674 100755 --- a/getconf.py +++ b/getconf.py @@ -1,10 +1,11 @@ #!/usr/bin/python -import plc +from monitor.wrapper import plc +from monitor import config +import monitor.parser as parsermodule api = plc.getAuthAPI() import sys import os -import config def getconf(hostname, force=False, media=None): n = api.GetNodes(hostname) @@ -36,8 +37,6 @@ def getconf(hostname, force=False, media=None): return args if __name__ == '__main__': - import parser as parsermodule - parser = parsermodule.getParser() parser.set_defaults(media='both', force=False) parser.add_option("", "--media", dest="media", metavar="usb, iso, both", @@ -46,7 +45,7 @@ if __name__ == '__main__': help="""Force the recreation of the usb images.""") parser = parsermodule.getParser(['defaults'], parser) - config = parsesrmodule.parse_args(parser) + config = parsermodule.parse_args(parser) ret = {'url_list' : ''} for i in config.args: diff --git a/grouprins.py b/grouprins.py index cfefc6a..1eeb092 100755 --- a/grouprins.py +++ b/grouprins.py @@ -12,30 +12,29 @@ # * do something else to them all. # -import plc +from monitor import config +from monitor import util +from monitor import const +from monitor import database +from monitor import parser as parsermodule +from monitor.pcu import reboot +from monitor.wrapper import plc api = plc.getAuthAPI() import traceback -import config -import util.file from optparse import OptionParser -import const from nodecommon import * from nodequery import verify,query_to_dict,node_select -import database from unified_model import * import os import time -import parser as parsermodule - from model import * + import bootman # debug nodes -import reboot # down nodes without pcu -import mailmonitor # down nodes with pcu +import mailmonitor # down nodes without pcu from emailTxt import mailtxt -#reboot.verbose = 0 import sys class Reboot(object): @@ -237,10 +236,11 @@ if config.node or config.nodelist: if config.node: hostnames = [ config.node ] else: hostnames = util.file.getListFromFile(config.nodelist) -fb = database.dbLoad("findbad") +fbquery = FindbadNodeRecord.get_all_latest() +fb_nodelist = [ n.hostname for n in fbquery ] if config.nodeselect: - hostnames = node_select(config.nodeselect, fb['nodes'].keys(), fb) + hostnames = node_select(config.nodeselect, fb_nodelist) if config.findbad: # rerun findbad with the nodes in the given nodes. 
diff --git a/monitor/wrapper/plccache.py b/monitor/wrapper/plccache.py index 73a6e57..f872d7a 100755 --- a/monitor/wrapper/plccache.py +++ b/monitor/wrapper/plccache.py @@ -100,6 +100,7 @@ def init(): return l_nodes + def create_plcdb(): # get sites, and stats @@ -132,7 +133,9 @@ def create_plcdb(): database.dbDump("l_plcsites", l_sites) return l_nodes - if __name__ == '__main__': create_plcdb() +else: + print "calling plccache init()" + init() diff --git a/monitor_policy.py b/monitor_policy.py index 45242ea..5049db2 100644 --- a/monitor_policy.py +++ b/monitor_policy.py @@ -6,6 +6,7 @@ from unified_model import cmpCategoryVal import sys import emailTxt import string +from monitor.wrapper import plccache from rt import is_host_in_rt_tickets import plc @@ -56,7 +57,7 @@ class Merge: self.merge_list = l_merge # the hostname to loginbase mapping - self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb") + self.plcdb_hn2lb = plccache.plcdb_hn2lb # Previous actions taken on nodes. self.act_all = database.if_cached_else(1, "act_all", lambda : {}) @@ -264,7 +265,7 @@ class RT: class Diagnose: def __init__(self, record_list): self.record_list = record_list - self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb") + self.plcdb_hn2lb = plccache.plcdb_hn2lb self.findbad = database.if_cached_else(1, "findbad", lambda : {}) self.diagnose_in = {} @@ -845,7 +846,7 @@ def reboot_node(args): class Action: def __init__(self, diagnose_out): # the hostname to loginbase mapping - self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb") + self.plcdb_hn2lb = plccache.plcdb_hn2lb # Actions to take. self.diagnose_db = diagnose_out diff --git a/nodebad.py b/nodebad.py index 57f23c0..185c385 100755 --- a/nodebad.py +++ b/nodebad.py @@ -8,11 +8,10 @@ from datetime import datetime,timedelta from nodequery import verify,query_to_dict,node_select -import syncplcdb from nodecommon import * from monitor import config -from monitor.wrapper import plc +from monitor.wrapper import plc,plccache from monitor.const import MINUP from monitor.database import FindbadNodeRecord, HistoryNodeRecord @@ -25,8 +24,7 @@ count = 0 def main(config): - l_nodes = syncplcdb.create_plcdb() - l_plcnodes = database.dbLoad("l_plcnodes") + l_plcnodes = plccache.l_nodes l_nodes = get_nodeset(config) checkAndRecordState(l_nodes, l_plcnodes) @@ -49,7 +47,7 @@ def checkAndRecordState(l_nodes, l_plcnodes): try: # Find the most recent record noderec = FindbadNodeRecord.query.filter(FindbadNodeRecord.hostname==nodename).order_by(FindbadNodeRecord.date_checked.desc()).first() - print "NODEREC: ", noderec.date_checked + #print "NODEREC: ", noderec.date_checked except: print "COULD NOT FIND %s" % nodename import traceback diff --git a/nodecommon.py b/nodecommon.py index 334bc3e..8e3d5a0 100644 --- a/nodecommon.py +++ b/nodecommon.py @@ -5,7 +5,7 @@ from monitor.pcu import reboot from monitor import util from monitor import database -from monitor.wrapper import plc +from monitor.wrapper import plc, plccache from datetime import datetime from unified_model import PersistFlags @@ -34,8 +34,8 @@ def blue(str): return BLUE + str + NORMAL def get_current_state(fbnode): - if 'state' in fbnode: - state = fbnode['state'] + if 'observed_status' in fbnode: + state = fbnode['observed_status'] else: state = "none" l = state.lower() @@ -122,40 +122,36 @@ def getvalue(fb, path): return None return values -def nodegroup_display(node, fb, conf=None): - if node['hostname'] in fb['nodes']: - node['current'] = get_current_state(fb['nodes'][node['hostname']]['values']) - else: - node['current'] = 
'none' - - if fb['nodes'][node['hostname']]['values'] == []: - return "" +def nodegroup_display(node, fbdata, conf=None): + node['current'] = get_current_state(fbdata) - s = fb['nodes'][node['hostname']]['values']['kernel'].split() + s = fbdata['kernel_version'].split() if len(s) >=3: - node['kernel'] = s[2] + node['kernel_version'] = s[2] else: - node['kernel'] = fb['nodes'][node['hostname']]['values']['kernel'] + node['kernel_version'] = fbdata['kernel_version'] - if '2.6' not in node['kernel']: node['kernel'] = "" + if '2.6' not in node['kernel_version']: node['kernel_version'] = "" if conf and not conf.nocolor: node['boot_state'] = color_boot_state(node['boot_state']) node['current'] = color_boot_state(node['current']) - #node['boot_state'] = node['boot_state'] - #node['current'] = node['current'] - node['pcu'] = fb['nodes'][node['hostname']]['values']['pcu'] + + if type(fbdata['plc_node_stats']['pcu_ids']) == type([]): + node['pcu'] = "PCU" node['lastupdate'] = diff_time(node['last_contact']) + pf = PersistFlags(node['hostname'], 1, db='node_persistflags') try: node['lc'] = diff_time(pf.last_changed) except: node['lc'] = "err" - ut = fb['nodes'][node['hostname']]['values']['comonstats']['uptime'] + + ut = fbdata['comon_stats']['uptime'] if ut != "null": - ut = diff_time(float(fb['nodes'][node['hostname']]['values']['comonstats']['uptime']), False) + ut = diff_time(float(fbdata['comon_stats']['uptime']), False) node['uptime'] = ut - return "%(hostname)-42s %(boot_state)8s %(current)5s %(pcu)6s %(key)10.10s... %(kernel)35.35s %(lastupdate)12s, %(lc)s, %(uptime)s" % node + return "%(hostname)-42s %(boot_state)8s %(current)5s %(pcu)6s %(key)10.10s... %(kernel_version)35.35s %(lastupdate)12s, %(lc)s, %(uptime)s" % node def datetime_fromstr(str): if '-' in str: @@ -176,7 +172,7 @@ def get_nodeset(config): evaluates to. """ api = plc.getAuthAPI() - l_nodes = database.dbLoad("l_plcnodes") + l_nodes = plccache.l_nodes if config.nodelist: f_nodes = util.file.getListFromFile(config.nodelist) @@ -196,8 +192,9 @@ def get_nodeset(config): # perform this query after the above options, so that the filter above # does not break. if config.nodeselect: - fb = database.dbLoad("findbad") - l_nodes = node_select(config.nodeselect, fb['nodes'].keys(), fb) + fbquery = FindbadNodeRecord.get_all_latest() + node_list = [ n.hostname for n in fbquery ] + l_nodes = node_select(config.nodeselect, node_list, None) return l_nodes diff --git a/nodeconfig.py b/nodeconfig.py index 2327ec0..b205900 100755 --- a/nodeconfig.py +++ b/nodeconfig.py @@ -1,14 +1,15 @@ #!/usr/bin/python -import plc +from monitor.wrapper import plc api = plc.getAuthAPI() -import parser as parsermodule +from monitor import parser as parsermodule from sets import Set from nodecommon import * -import database +from monitor import database +from monitor.database import FindbadNodeRecord def network_config_to_str(net): @@ -21,7 +22,6 @@ def network_config_to_str(net): def main(): - fb = database.dbLoad("findbad") parser = parsermodule.getParser() parser.set_defaults(nodelist=None, @@ -67,7 +67,8 @@ def main(): i = 1 for node in nodelist: print "%-2d" % i, - print nodegroup_display(node, fb) + fbdata = FindbadNodeRecord.get_latest_by(hostname=node['hostname']) + print nodegroup_display(node, fbdata.to_dict()) i += 1 elif config.add and config.nodegroup: diff --git a/nodegroups.py b/nodegroups.py index 3f4b980..9e14e2f 100755 --- a/nodegroups.py +++ b/nodegroups.py @@ -13,19 +13,19 @@ # Given a nodelist, it could tag each one with a nodegroup name. 
# * -import plc +from monitor import database +from monitor.database import FindbadNodeRecord +from monitor import util +from monitor.wrapper import plc +from monitor import parser as parsermodule + api = plc.getAuthAPI() -import parser as parsermodule -from sets import Set from nodequery import verify,query_to_dict,node_select - from nodecommon import * -import database -import util.file +from sets import Set def main(): - fb = database.dbLoad("findbad") parser = parsermodule.getParser(['nodesets']) parser.set_defaults( list=True, @@ -121,7 +121,9 @@ def main(): i = 1 for node in nodelist: print "%-2d" % i, - print nodegroup_display(node, fb, config) + fbrec = FindbadNodeRecord.query.filter(FindbadNodeRecord.hostname==node['hostname']).order_by(FindbadNodeRecord.date_checked.desc()).first() + fbdata = fbrec.to_dict() + print nodegroup_display(node, fbdata, config) i += 1 else: diff --git a/nodeinfo.py b/nodeinfo.py index fee8eb3..4a946c5 100755 --- a/nodeinfo.py +++ b/nodeinfo.py @@ -1,21 +1,20 @@ #!/usr/bin/python -import plc +from monitor.wrapper import plc api = plc.getAuthAPI() from monitor import * -#import database -import reboot +from monitor import util +from monitor import parser as parsermodule + +from monitor import database +from monitor.pcu import reboot import time from model import * from nodecommon import * from unified_model import node_end_record, PersistFlags -import util.file - -import parser as parsermodule - parser = parsermodule.getParser() parser.set_defaults(node=None, findbad=False, @@ -138,11 +137,11 @@ if config.findbad: for node in config.args: config.node = node - fb = database.dbLoad("findbad") plc_nodeinfo = api.GetNodes({'hostname': config.node}, None)[0] - fb_nodeinfo = fb['nodes'][config.node]['values'] - + fb_noderec = FindbadNodeRecord.get_latest_by(hostname=node) + fb_nodeinfo = fb_noderec.to_dict() plc_print_nodeinfo(plc_nodeinfo) + fb_nodeinfo['hostname'] = node fb_print_nodeinfo(fb_nodeinfo) diff --git a/nodequery.py b/nodequery.py index 5e182e1..71c62bc 100755 --- a/nodequery.py +++ b/nodequery.py @@ -14,15 +14,13 @@ import re import string from monitor.pcu import reboot -from monitor.wrapper import plc +from monitor.wrapper import plc, plccache api = plc.getAuthAPI() -from monitor.database import FindbadNodeRecord, FindbadNodeRecordSync +from monitor.database import FindbadNodeRecord, FindbadPCURecord from monitor import util from monitor import config -fb = None -fbpcu = None class NoKeyException(Exception): pass @@ -69,8 +67,12 @@ def fb_print_nodeinfo(fbnode, hostname, fields=None): format += "%%(%s)s " % f print format % fbnode +def first(path): + indexes = path.split(".") + return indexes[0] + def get(fb, path): - indexes = path.split("/") + indexes = path.split(".") values = fb for index in indexes: if index in values: @@ -216,19 +218,18 @@ def verify(constraints, data): for key in con.keys(): #print "looking at key: %s" % key - if key in data: + if first(key) in data: value_re = re.compile(con[key]) - if type([]) == type(data[key]): + if type([]) == type(get(data,key)): local_or_true = False - for val in data[key]: + for val in get(data,key): local_or_true = local_or_true | (value_re.search(val) is not None) con_and_true = con_and_true & local_or_true else: - if data[key] is not None: - con_and_true = con_and_true & (value_re.search(data[key]) is not None) - elif key not in data: - print "missing key %s" % key, - pass + if get(data,key) is not None: + con_and_true = con_and_true & (value_re.search(get(data,key)) is not None) + elif 
first(key) not in data: + print "missing key %s" % first(key) con_or_true = con_or_true | con_and_true @@ -260,38 +261,35 @@ def pcu_in(fbdata): return False def pcu_select(str_query, nodelist=None): - global fb - global fbpcu pcunames = [] nodenames = [] if str_query is None: return (nodenames, pcunames) - if fb is None: - fb = database.dbLoad("findbad") - if fbpcu is None: - fbpcu = database.dbLoad("findbadpcus") + if True: + fbquery = FindbadNodeRecord.get_all_latest() + fb_nodelist = [ n.hostname for n in fbquery ] + if True: + fbpcuquery = FindbadPCURecord.get_all_latest() + fbpcu_list = [ p.plc_pcuid for p in fbpcuquery ] - #print str_query dict_query = query_to_dict(str_query) - #print dict_query - for node in fb['nodes'].keys(): + for noderec in fbquery: if nodelist is not None: - if node not in nodelist: continue + if noderec.hostname not in nodelist: continue - fb_nodeinfo = fb['nodes'][node]['values'] + fb_nodeinfo = noderec.to_dict() if pcu_in(fb_nodeinfo): - pcuinfo = fbpcu['nodes']['id_%s' % fb_nodeinfo['plcnode']['pcu_ids'][0]]['values'] + pcurec = FindbadPCURecord.get_latest_by(plc_pcuid=get(fb_nodeinfo, 'plc_node_stats.pcu_ids')[0]) + pcuinfo = pcurec.to_dict() if verify(dict_query, pcuinfo): - nodenames.append(node) + nodenames.append(noderec.hostname) str = "cmdhttps/locfg.pl -s %s -f iloxml/License.xml -u %s -p '%s' | grep MESSAGE" % \ (reboot.pcu_name(pcuinfo), pcuinfo['username'], pcuinfo['password']) - #pcunames.append(str) - pcunames.append(pcuinfo['pcu_id']) + pcunames.append(pcuinfo['plc_pcuid']) return (nodenames, pcunames) -def node_select(str_query, nodelist=None, fbdb=None): - global fb +def node_select(str_query, nodelist=None, fb=None): hostnames = [] if str_query is None: return hostnames @@ -300,16 +298,14 @@ def node_select(str_query, nodelist=None, fbdb=None): dict_query = query_to_dict(str_query) #print dict_query - if fbdb is not None: - fb = fbdb - for node in nodelist: #if nodelist is not None: # if node not in nodelist: continue try: fb_noderec = None - fb_noderec = FindbadNodeRecord.query.filter(FindbadNodeRecord.hostname==node).order_by(FindbadNodeRecord.date_checked.desc()).first() + #fb_noderec = FindbadNodeRecord.query.filter(FindbadNodeRecord.hostname==node).order_by(FindbadNodeRecord.date_checked.desc()).first() + fb_noderec = FindbadNodeRecord.get_latest_by(hostname=node) except: print traceback.print_exc() continue @@ -323,6 +319,7 @@ def node_select(str_query, nodelist=None, fbdb=None): #if verifyDBrecord(dict_query, fb_nodeinfo): if verify(dict_query, fb_nodeinfo): + #print fb_nodeinfo.keys() #print node #fb_nodeinfo hostnames.append(node) else: @@ -333,13 +330,11 @@ def node_select(str_query, nodelist=None, fbdb=None): def main(): - global fb - global fbpcu from monitor import parser as parsermodule parser = parsermodule.getParser() - parser.set_defaults(node=None, fromtime=None, select=None, list=None, + parser.set_defaults(node=None, fromtime=None, select=None, list=None, listkeys=False, pcuselect=None, nodelist=None, daysdown=None, fields=None) parser.add_option("", "--daysdown", dest="daysdown", action="store_true", help="List the node state and days down...") @@ -353,6 +348,8 @@ def main(): help="List all nodes with the given key=value pattern") parser.add_option("", "--nodelist", dest="nodelist", metavar="nodelist.txt", help="A list of nodes to bring out of debug mode.") + parser.add_option("", "--listkeys", dest="listkeys", action="store_true", + help="A list of nodes to bring out of debug mode.") parser.add_option("", 
"--fromtime", dest="fromtime", metavar="YYYY-MM-DD", help="Specify a starting date from which to begin the query.") @@ -372,18 +369,16 @@ def main(): fb = archive.load(file[:-4]) else: #fbnodes = FindbadNodeRecord.select(FindbadNodeRecord.q.hostname, orderBy='date_checked',distinct=True).reversed() - #fb = database.dbLoad("findbad") fb = None - fbpcu = database.dbLoad("findbadpcus") - reboot.fb = fbpcu + #reboot.fb = fbpcu if config.nodelist: nodelist = util.file.getListFromFile(config.nodelist) else: # NOTE: list of nodes should come from findbad db. Otherwise, we # don't know for sure that there's a record in the db.. - plcnodes = database.dbLoad("l_plcnodes") + plcnodes = plccache.l_nodes nodelist = [ node['hostname'] for node in plcnodes ] #nodelist = ['planetlab-1.cs.princeton.edu'] @@ -411,7 +406,15 @@ def main(): fb_noderec = FindbadNodeRecord.query.filter(FindbadNodeRecord.hostname==node).order_by(FindbadNodeRecord.date_checked.desc()).first() except: print traceback.print_exc() - pass #fb_nodeinfo = fb['nodes'][node]['values'] + pass + + if config.listkeys: + fb_nodeinfo = fb_noderec.to_dict() + print "Primary keys available in the findbad object:" + for key in fb_nodeinfo.keys(): + print "\t",key + sys.exit(0) + if config.list: print node diff --git a/pcubad.py b/pcubad.py index 1fd3371..6a1098b 100755 --- a/pcubad.py +++ b/pcubad.py @@ -11,19 +11,19 @@ from monitor.pcu import reboot from monitor import parser as parsermodule from monitor import config from monitor.database import HistoryPCURecord, FindbadPCURecord -from monitor.wrapper import plc +from monitor.wrapper import plc,plccache from monitor.const import MINUP from nodecommon import * from nodequery import verify,query_to_dict,node_select -import syncplcdb from unified_model import * api = plc.getAuthAPI() def main(config): - l_plcpcus = database.if_cached_else_refresh(1, 1, "pculist", lambda : plc.GetPCUs()) + #l_plcpcus = database.if_cached_else_refresh(1, 1, "pculist", lambda : plc.GetPCUs()) + l_plcpcus = plccache.l_pcus l_pcus = None if config.pcu: @@ -39,7 +39,7 @@ def main(config): checkAndRecordState(l_pcus, l_plcpcus) -hn2lb = database.dbLoad("plcdb_hn2lb") +hn2lb = plccache.plcdb_hn2lb def checkAndRecordState(l_pcus, l_plcpcus): count = 0 diff --git a/pcuinfo.py b/pcuinfo.py index d6d5e87..c9d1e90 100755 --- a/pcuinfo.py +++ b/pcuinfo.py @@ -30,10 +30,10 @@ if not config.run: print "Add --run to actually perform the command" sys.exit(1) -pculist = database.if_cached_else_refresh(1, - config.refresh, - "pculist", - lambda : plc.GetPCUs()) +pculist = plccache.l_pcus # database.if_cached_else_refresh(1, + # config.refresh, + # "pculist", + # lambda : plc.GetPCUs()) for pcu in pculist: #print pcu #sys.exit(1) diff --git a/printbadcsv.py b/printbadcsv.py index f064c11..cae8480 100755 --- a/printbadcsv.py +++ b/printbadcsv.py @@ -6,9 +6,7 @@ import parser as parsermodule from www.printbadnodes import * def main(): - global fb db = database.dbLoad(config.dbname) - fb = database.dbLoad("findbadpcus") act= database.dbLoad("act_all") ## Field widths used for printing diff --git a/showlatlon.py b/showlatlon.py index af01bd7..a556953 100755 --- a/showlatlon.py +++ b/showlatlon.py @@ -1,6 +1,6 @@ #!/usr/bin/python -import plc +from monitor.wrapper import plc, plccache api = plc.getAuthAPI() import sys @@ -86,9 +86,9 @@ def main(): fbstr = get_filefromglob(d, "production.findbad") fbpcustr = get_filefromglob(d, "production.findbadpcus") - l_plcnodes = database.dbLoad("l_plcnodes") - l_plcsites = 
database.dbLoad("l_plcsites") - lb2hn = database.dbLoad("plcdb_lb2hn") + l_plcnodes = plccache.l_nodes + l_plcsites = plccache.l_sites + lb2hn = plccache.plcdb_lb2hn fb = archive.load(fbstr) fbpcu = archive.load(fbpcustr) reboot.fb = fbpcu diff --git a/sitebad.py b/sitebad.py index 48ac79c..aff0444 100755 --- a/sitebad.py +++ b/sitebad.py @@ -11,20 +11,19 @@ from monitor.pcu import reboot from monitor import parser as parsermodule from monitor import config from monitor.database import HistorySiteRecord, FindbadNodeRecord -from monitor.wrapper import plc +from monitor.wrapper import plc, plccache from monitor.const import MINUP from nodecommon import * from nodequery import verify,query_to_dict,node_select -import syncplcdb from unified_model import * api = plc.getAuthAPI() def main(config): - l_nodes = syncplcdb.create_plcdb() - l_plcsites = database.dbLoad("l_plcsites") + l_nodes = plccache.l_nodes + l_plcsites = plccache.l_sites if config.site: l_sites = [config.site] @@ -49,7 +48,7 @@ def getnodesup(nodelist): def checkAndRecordState(l_sites, l_plcsites): count = 0 - lb2hn = database.dbLoad("plcdb_lb2hn") + lb2hn = plccache.plcdb_lb2hn for sitename in l_sites: d_site = None for site in l_plcsites: diff --git a/siteinfo.py b/siteinfo.py index e9dc9d5..041bf1c 100755 --- a/siteinfo.py +++ b/siteinfo.py @@ -1,18 +1,18 @@ #!/usr/bin/python -import plc +from monitor.wrapper import plc api = plc.getAuthAPI() -import database -import reboot +from monitor import database +from monitor.pcu import reboot import time from model import * from nodecommon import * -import util.file - -import parser as parsermodule +from monitor import util +from monitor import parser as parsermodule +from unified_model import * parser = parsermodule.getParser() @@ -31,7 +31,6 @@ parser.add_option("", "--disable", dest="disable", action="store_true", help="") config = parsermodule.parse_args(parser) -from unified_model import * def color_sitestatus(status): if status == "good": return green(status) @@ -69,7 +68,7 @@ def plc_print_siteinfo(plcsite): print " Checked: %s" % time.ctime() print "\t host | state | obs | created | updated | last_contact " for plcnode in nodes: - fbnode = fb['nodes'][plcnode['hostname']]['values'] + fbnode = FindbadNodeRecord.get_latest_by(hostname=plcnode['hostname']).to_dict() plcnode['state'] = color_boot_state(get_current_state(fbnode)) print "\t %37s | %5s | %5s | %11.11s | %11.11s | %12s " % \ (plcnode['hostname'], color_boot_state(plcnode['boot_state']), plcnode['state'], @@ -77,7 +76,6 @@ def plc_print_siteinfo(plcsite): diff_time(plcnode['last_contact'])) -fb = database.dbLoad("findbad") act_all = database.dbLoad("act_all") for site in config.args: diff --git a/todo b/todo index 98ace66..b3dc4de 100644 --- a/todo +++ b/todo @@ -1,4 +1,60 @@ +for each node: + Check Status -> + if Pass Threshold -> + Create Issue -> + Take Action -> + email + bm + pcu + plc reset + apply penalties + flag for admin + +for each issue + check issue.status + if issue.status is "open": + issue.take_next_action() + if issue.closed: + issue.shutdown() + if issue.paused: + pass + +action_list for issuetype (pcudown) + send email + yield + send email, apply penalty + yield + send email, apply second penalty + yield + send email + +action_list for issuetype (badhardware) +action_list for issuetype (dnserror) +action_list for issuetype (nodeconfig) +action_list for issuetype (oldbootcd) + +action_list for issuetype (nodedown) + if pcuok, reboot + yield + if pcuok, and reboot failed, set rins, reboot + yield + 
create_issue pcubroken + send email + yield + send email, apply penalty + yield + send email, apply second penalty + yield + send email + + +TOOLS: + * add a '--nocache' to the default set of options. + * add a cache parameter in the monitor.conf file. + + + TODO: * install openssh-server, passwd, perl-libwww-perl (for rt), rt-3.4.1, MySQL-python * had to mount -t devpts devpts /dev/pts to get ssh to work inside the diff --git a/unified_model.py b/unified_model.py index 31b0ef6..805dd0e 100755 --- a/unified_model.py +++ b/unified_model.py @@ -2,7 +2,7 @@ from monitor import database -from monitor.wrapper import plc +from monitor.wrapper import plc, plccache from monitor.wrapper import mailer import time @@ -65,8 +65,6 @@ class PenaltyMap: # condition/penalty is applied, move to the next phase. -#fb = database.dbLoad("findbad") - class RT(object): def __init__(self, ticket_id = None): self.ticket_id = ticket_id @@ -410,7 +408,7 @@ class Record(object): def __init__(self, hostname, data): self.hostname = hostname self.data = data - self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb") + self.plcdb_hn2lb = plccache.plcdb_hn2lb self.loginbase = self.plcdb_hn2lb[self.hostname] return @@ -490,15 +488,27 @@ class Record(object): return daysdown getStrDaysDown = classmethod(getStrDaysDown) - #def getStrDaysDown(cls, diag_record): - # daysdown = cls.getDaysDown(diag_record) - # if daysdown > 0: - # return "%d days down"%daysdown - # elif daysdown == -1: - # return "Never online" - # else: - # return "%d days up"% -daysdown - #getStrDaysDown = classmethod(getStrDaysDown) + def getSendEmailFlag(self): + if not config.mail: + return False + + # resend if open & created longer than 30 days ago. + if 'rt' in self.data and \ + 'Status' in self.data['rt'] and \ + "open" in self.data['rt']['Status'] and \ + self.data['rt']['Created'] > int(time.time() - 60*60*24*30): + # if created-time is greater than the thirty days ago from the current time + return False + + return True + + def getMostRecentStage(self): + lastact = self.data['last_action_record'] + return lastact.stage + + def getMostRecentTime(self): + lastact = self.data['last_action_record'] + return lastact.date_action_taken def takeAction(self, index=0): pp = PersistSitePenalty(self.hostname, 0, db='persistpenalty_hostnames') @@ -524,7 +534,7 @@ class Record(object): hlist = " %s %s - %s\n" % (info[0], info[2], info[1]) #(node,ver,daysdn) return hlist def saveAction(self): - if 'save-act-all' in self.data and self.data['save-act-all'] == True: + if 'save_act_all' in self.data and self.data['save_act_all'] == True: return True else: return False @@ -579,79 +589,6 @@ class NodeRecord: self.hostname = hostname self.ticket = None self.target = target - #if hostname in fb['nodes']: - # self.data = fb['nodes'][hostname]['values'] - #else: - # raise Exception("Hostname not in scan database") - - def stageIswaitforever(self): - if 'waitforever' in self.data['stage']: - return True - else: - return False - - def severity(self): - category = self.data['category'] - prev_category = self.data['prev_category'] - print "IMPROVED: ", category, prev_category - val = cmpCategoryVal(category, prev_category) - return val - - def improved(self): - return self.severity() > 0 - - def end_record(self): - return node_end_record(self.hostname) - - def reset_stage(self): - self.data['stage'] = 'findbad' - return True - - def open_tickets(self): - if self.ticket and self.ticket.status['status'] == 'open': - return 1 - return 0 - def setIntrospect(self): - pass - - def 
email_notice(self): - message = self._get_message_for_condition() - message.send(self._get_contacts_for_condition()) - return True - def close_ticket(self): - if self.ticket: - self.ticket.closeTicket() - - def exempt_from_penalties(self): - bl = database.dbLoad("l_blacklist") - return self.hostname in bl - - def penalties(self): - return [] - def escellate_penalty(self): - return True - def reduce_penalty(self): - return True - - - def atTarget(self): - return self.target.verify(self.data) - - def _get_condition(self): - return self.data['category'].lower() - - def _get_stage(self): - "improvement" - "firstnotice_noop" - "secondnotice_noslicecreation" - "thirdnotice_disableslices" - - delta = current_time - self.data['time'] - - def _get_message_for_condition(self): - pass - def _get_contacts_for_condition(self): - pass class Action(MonRecord): def __init__(self, host, data): -- 2.43.0
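Note (illustrative, not part of the commit): a minimal sketch of the data-access pattern this patch standardizes on. The helper names are taken from the diff above (plccache.plcdb_hn2lb, FindbadNodeRecord.get_latest_by, to_dict); their exact signatures and return types are assumed here.

    # Sketch only: assumes plccache populates its lists and maps when the module
    # is imported (see the init() call added at the bottom of plccache.py) and
    # that get_latest_by() returns the most recent findbad record for a host.
    from monitor.wrapper import plccache
    from monitor.database import FindbadNodeRecord

    def summarize_node(hostname):
        # hostname -> loginbase mapping, formerly database.dbLoad("plcdb_hn2lb")
        loginbase = plccache.plcdb_hn2lb[hostname]

        # latest scan data, formerly fb['nodes'][hostname]['values'] from the
        # "findbad" pickle
        fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict()

        # keys were renamed: 'state' -> 'observed_status', 'kernel' -> 'kernel_version'
        return "%s (%s): %s, kernel %s" % (
            hostname, loginbase,
            fbnode.get('observed_status', 'none'),
            fbnode.get('kernel_version', ''))

The same substitution replaces the dbLoad() pickle lookups (l_plcnodes, l_plcsites, plcdb_lb2hn, pculist) with the corresponding plccache attributes in nodebad.py, sitebad.py, showlatlon.py, pcubad.py, pcuinfo.py, and findbadpcu.py.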
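A second note on nodequery.py (again illustrative, not part of the commit): select queries now address nested findbad fields with dotted paths, since get() splits on "." instead of "/", and first() keeps the plain top-level membership test in verify() working. A self-contained usage sketch of those two helpers against a made-up record:

    def first(path):
        return path.split(".")[0]

    def get(fb, path):
        # walk a dotted path into a nested dict; None when any component is missing
        indexes = path.split(".")
        values = fb
        for index in indexes:
            if index in values:
                values = values[index]
            else:
                return None
        return values

    fbnode = {'observed_status': 'BOOT', 'plc_node_stats': {'pcu_ids': [42]}}
    assert first('plc_node_stats.pcu_ids') in fbnode
    assert get(fbnode, 'plc_node_stats.pcu_ids') == [42]
    assert get(fbnode, 'plc_node_stats.missing') is None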