X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=clean_policy.py;h=2dd737bd35c387ee9da494d6e95bba1735aec01e;hb=refs%2Fheads%2F1.0;hp=dba9b9b8e75a9b68a4256e106d0b74fc7f480ab9;hpb=d0652340b89d51c6115edb13d5c7c72b34dea66f;p=monitor.git diff --git a/clean_policy.py b/clean_policy.py index dba9b9b..2dd737b 100644 --- a/clean_policy.py +++ b/clean_policy.py @@ -1,15 +1,11 @@ -from config import config -#print "policy" -config = config() -import soltesz +import config +import database import time import mailer -from www.printbadnodes import cmpCategoryVal import sys import emailTxt import string -from policy import get_ticket_id, print_stats, close_rt_backoff, reboot_node from rt import is_host_in_rt_tickets import plc @@ -23,18 +19,35 @@ from const import * from unified_model import * +def get_ticket_id(record): + if 'ticket_id' in record and record['ticket_id'] is not "" and record['ticket_id'] is not None: + return record['ticket_id'] + elif 'found_rt_ticket' in record and \ + record['found_rt_ticket'] is not "" and \ + record['found_rt_ticket'] is not None: + return record['found_rt_ticket'] + else: + return None + class MonitorMergeDiagnoseSendEscellate: + act_all = None + fb = None + def __init__(self, hostname, act): self.hostname = hostname self.act = act self.plcdb_hn2lb = None if self.plcdb_hn2lb is None: - self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb") + self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb") self.loginbase = self.plcdb_hn2lb[self.hostname] return def getFBRecord(self): - fb = soltesz.dbLoad("findbad") + if MonitorMergeDiagnoseSendEscellate.fb == None: + MonitorMergeDiagnoseSendEscellate.fb = database.dbLoad("findbad") + + fb = MonitorMergeDiagnoseSendEscellate.fb + if self.hostname in fb['nodes']: fbnode = fb['nodes'][self.hostname]['values'] else: @@ -43,12 +56,15 @@ class MonitorMergeDiagnoseSendEscellate: def getActionRecord(self): # update ticket status - act_all = soltesz.dbLoad("act_all") + if MonitorMergeDiagnoseSendEscellate.act_all == None: + MonitorMergeDiagnoseSendEscellate.act_all = database.dbLoad("act_all") + + act_all = MonitorMergeDiagnoseSendEscellate.act_all + if self.hostname in act_all and len(act_all[self.hostname]) > 0: actnode = act_all[self.hostname][0] else: actnode = None - del act_all return actnode def getKernel(self, unamestr): @@ -66,15 +82,20 @@ class MonitorMergeDiagnoseSendEscellate: fbnode['info'] = None fbnode['log'] = None fbnode['time'] = time.time() + fbnode['email'] = TECH + fbnode['action-level'] = 0 + fbnode['action'] = ['noop'] fbnode['date_created'] = time.time() - if actnode is None: + if actnode is None: # there is no entry in act_all actnode = {} actnode.update(fbnode) actnode['ticket_id'] = "" - actnode['prev_category'] = "NORECORD" + actnode['prev_category'] = "ERROR" + actnode['prev_state'] = "DOWN" else: actnode['prev_category']= actnode['category'] + actnode['prev_state'] = actnode['state'] actnode['comonstats'] = fbnode['comonstats'] actnode['category'] = fbnode['category'] actnode['state'] = fbnode['state'] @@ -96,6 +117,10 @@ class MonitorMergeDiagnoseSendEscellate: actnode= self.getActionRecord() actrec = self.mergeRecord(fbnode, actnode) record = Record(self.hostname, actrec) + #print record + #print actrec + #print record.data['time'] + #print time.time() - record.data['time'] diag = self.diagnose(record) if self.act and diag is not None: self.action(record,diag) @@ -104,29 +129,43 @@ class MonitorMergeDiagnoseSendEscellate: diag = PersistFlags(record.hostname, 60*60*24, db='persist_diagnose_flags') # NOTE: change record stage based on RT status. - diag.setFlag('ResetStage') + #diag.setFlag('ResetStage') if record.stageIswaitforever(): ticket = record.data['rt'] if 'new' in ticket['Status']: - diag.setFlag('ResetStage') + print "Resetting Stage!!!!!" + # diag.setFlag('ResetStage') + record.reset_stage() + #if diag.getFlag('ResetStage'): + # print "diagnose: resetting stage" + # diag.resetFlag('ResetStage') if 'resolved' in ticket['Status']: - diag.setFlag('EndRecord') + diag.setFlag('RTEndRecord') + # NOTE: try to give a default value to catch the errors for + # planetlab1.ias.csusb.edu which seems to have an out-of-date node config + record.data['message_series'] = emailTxt.mailtxt.newdown # NOTE: take category, and prepare action category = record.getCategory() if category == "error": diag.setFlag('SendNodedown') - record.data['message'] = emailTxt.mailtxt.newdown + record.data['message_series'] = emailTxt.mailtxt.newdown record.data['log'] = self.getDownLog(record) - elif category == "prod": - state = diag.getState() + elif category == "prod" or category == "alpha": + state = record.getState() if state == "boot": - diag.setFlag('SendThankyou') - record.data['message'] = emailTxt.mailtxt.newthankyou - record.data['log'] = self.getThankyouLog(record) - + if record.severity() != 0: + diag.setFlag('SendThankyou') + print "RESETTING STAGE: improvement" + record.data['stage'] = 'improvement' + record.data['message_series'] = emailTxt.mailtxt.newthankyou + record.data['log'] = self.getThankyouLog(record) + else: + # NOTE: do nothing, since we've already done the above. + print "DIAGNOSED: %s is boot. no further action necessary." % record.hostname + return None elif state == "debug": pass else: @@ -134,54 +173,83 @@ class MonitorMergeDiagnoseSendEscellate: else: print "unknown category: %s" % category - if diag.getFlag('ResetStage'): - print "resetting stage" - record.reset_stage() + # TODO: how to not send email?... record = self.checkStageAndTime(diag,record) - if record: - print "checkStageAndTime Returned Valid Record" - site = PersistFlags(self.loginbase, 1, db='site_persistflags') + #if record: + print "diagnose: checkStageAndTime Returned Valid Record" + site = PersistFlags(self.loginbase, 1, db='site_persistflags') - if site.status is not "good": - print "Setting site %s for 'squeeze'" % self.loginbase - diag.setFlag('Squeeze') - else: - print "Setting site %s for 'backoff'" % self.loginbase - diag.setFlag('BackOff') - - diag.save() - return diag + if "good" not in site.status: # != "good": + print "diagnose: Setting site %s for 'squeeze'" % self.loginbase + diag.setFlag('Squeeze') else: - print "checkStageAndTime Returned NULL Record" - return None + print "diagnose: Setting site %s for 'backoff'" % self.loginbase + diag.setFlag('BackOff') + + diag.save() + return diag + #else: + # print "checkStageAndTime Returned NULL Record" + # return None def action(self, record, diag): - if record.improved() or diag.getFlag('EndRecord'): - print "end record for %s" % self.hostname - record.end_record() - diag.setFlag('CloseRT') - return None - - if self.getSendEmailFlag(record): - print "sending email" - message = record.getMessage(record.data['ticket_id']) - message.reset() - message.send(record.getContacts()) - if message.rt.ticket_id: - print "setting record ticket_id" - record.data['ticket_id'] = message.rt.ticket_id - if diag.getFlag('CloseRT'): - message.rt.closeTicket() - else: - print "NOT sending email : %s %s" % (config.mail, record.data['rt']) - if record.data['takeaction'] and diag.getFlag('Squeeze'): - print "taking action" - record.takeAction() + message = None + + #print record.data['stage'] + #print "improvement" in record.data['stage'] + #print self.getSendEmailFlag(record) + print "%s %s DAYS DOWN" % ( self.hostname, Record.getDaysDown(record.data) ) + if ( self.getSendEmailFlag(record) and Record.getDaysDown(record.data) >= 2 ) or \ + "monitor-end-record" in record.data['stage']: + print "action: getting message" + message = record.getMessage(record.data['ticket_id']) + if message: + print "action: sending email" + message.send(record.getContacts()) + #print "DEBUG NOT SENDING MESSAGE WHEN I SHOULD BE!!!!!" + #print "DEBUG NOT SENDING MESSAGE WHEN I SHOULD BE!!!!!" + #print "DEBUG NOT SENDING MESSAGE WHEN I SHOULD BE!!!!!" + #print message + if message.rt.ticket_id: + print "action: setting record ticket_id" + record.data['ticket_id'] = message.rt.ticket_id + + if ( record.data['takeaction'] and diag.getFlag('Squeeze') ): + print "action: taking squeeze action" + record.takeAction(record.data['action-level']) + diag.resetFlag('Squeeze') + diag.save() + if diag.getFlag('BackOff'): + print "action: taking backoff action" + record.takeAction(0) + diag.resetFlag('BackOff') + diag.save() + + if record.saveAction(): + print "action: saving act_all db" + self.add_and_save_act_all(record) + else: + print "action: NOT saving act_all db" + print "stage: %s %s" % ( record.data['stage'], record.data['save-act-all'] ) + + if record.improved() or diag.getFlag('RTEndRecord'): + print "action: end record for %s" % self.hostname + record.end_record() + diag.setFlag('CloseRT') + diag.resetFlag('RTEndRecord') + diag.save() + #return None + + if message: + if diag.getFlag('CloseRT'): + message.rt.closeTicket() + diag.resetFlag('CloseRT') + diag.save() - print "saving act_all db" - self.add_and_save_act_all(record) + else: + print "NOT sending email : %s" % config.mail return @@ -193,15 +261,18 @@ class MonitorMergeDiagnoseSendEscellate: if 'rt' in record.data and \ 'Status' in record.data['rt'] and \ "open" in record.data['rt']['Status'] and \ - record.data['rt']['Created'] < 60*60*24*30: + record.data['rt']['Created'] > int(time.time() - 60*60*24*30): + # if created-time is greater than the thirty days ago from the current time return False return True def add_and_save_act_all(self, record): - self.act_all = soltesz.dbLoad("act_all") + self.act_all = database.dbLoad("act_all") + if self.hostname not in self.act_all: + self.act_all[self.hostname] = [] self.act_all[self.hostname].insert(0,record.data) - soltesz.dbDump("act_all", self.act_all) + database.dbDump("act_all", self.act_all) def getDownLog(self, record): @@ -211,7 +282,7 @@ class MonitorMergeDiagnoseSendEscellate: #for key in record.data.keys(): # print "%10s %s %s " % (key, "==", record.data[key]) - if record.data['ticket_id'] == "": + if record.data['ticket_id'] == "" and 'found_rt_ticket' in record.data: log = "DOWN: %20s : %-40s == %20s %s" % \ (self.loginbase, self.hostname, record.data['info'][1:], record.data['found_rt_ticket']) else: @@ -224,79 +295,108 @@ class MonitorMergeDiagnoseSendEscellate: record.data['args'] = {'nodename': self.hostname} record.data['info'] = (self.hostname, record.data['prev_category'], record.data['category']) - if record.data['ticket_id'] == "": - log = "IMPR: %20s : %-40s == %20s %20s %s %s" % \ + try: + if record.data['ticket_id'] == "" and 'found_rt_ticket' in record.data: + log = "IMPR: %20s : %-40s == %20s %20s %s %s" % \ (self.loginbase, self.hostname, record.data['stage'], - state, category, record.data['found_rt_ticket']) - else: - log = "IMPR: %20s : %-40s == %20s %20s %s %s" % \ + record.data['prev_category'], record.data['category'], record.data['found_rt_ticket']) + else: + log = "IMPR: %20s : %-40s == %20s %20s %s %s" % \ (self.loginbase, self.hostname, record.data['stage'], - state, category, record.data['ticket_id']) + record.data['prev_category'], record.data['category'], record.data['ticket_id']) + except: + log = "IMPR: %s improved to %s " % (self.hostname, record.data['category']) return log def checkStageAndTime(self, diag, record): current_time = time.time() delta = current_time - record.data['time'] + #print record.data if 'findbad' in record.data['stage']: # The node is bad, and there's no previous record of it. record.data['email'] = TECH record.data['action'] = ['noop'] record.data['takeaction'] = False - record.data['message'] = record.data['message'][0] + record.data['message'] = record.data['message_series'][0] record.data['stage'] = 'stage_actinoneweek' + record.data['save-act-all'] = True + record.data['action-level'] = 0 elif 'reboot_node' in record.data['stage']: record.data['email'] = TECH record.data['action'] = ['noop'] - record.data['message'] = record.data['message'][0] + record.data['message'] = record.data['message_series'][0] record.data['stage'] = 'stage_actinoneweek' record.data['takeaction'] = False + record.data['save-act-all'] = False + record.data['action-level'] = 0 elif 'improvement' in record.data['stage']: - print "backing off of %s" % self.hostname + print "checkStageAndTime: backing off of %s" % self.hostname record.data['action'] = ['close_rt'] record.data['takeaction'] = True - record.data['message'] = record.data['message'][0] + record.data['message'] = record.data['message_series'][0] record.data['stage'] = 'monitor-end-record' + record.data['save-act-all'] = True + record.data['action-level'] = 0 elif 'actinoneweek' in record.data['stage']: if delta >= 7 * SPERDAY: + print "checkStageAndTime: transition to next stage actintwoweeks" record.data['email'] = TECH | PI record.data['stage'] = 'stage_actintwoweeks' - record.data['message'] = record.data['message'][1] + record.data['message'] = record.data['message_series'][1] record.data['action'] = ['nocreate' ] record.data['time'] = current_time # reset clock for waitforever record.data['takeaction'] = True + record.data['save-act-all'] = True + record.data['action-level'] = 1 elif delta >= 3* SPERDAY and not 'second-mail-at-oneweek' in record.data: + print "checkStageAndTime: second message in one week" record.data['email'] = TECH - record.data['message'] = record.data['message'][0] + record.data['message'] = record.data['message_series'][0] record.data['action'] = ['sendmailagain-waitforoneweekaction' ] record.data['second-mail-at-oneweek'] = True record.data['takeaction'] = False + record.data['save-act-all'] = True + record.data['action-level'] = 0 else: record.data['message'] = None record.data['action'] = ['waitforoneweekaction' ] - print "ignoring this record for: %s" % self.hostname - return None # don't send if there's no action + record.data['takeaction'] = False + record.data['save-act-all'] = False + record.data['action-level'] = 0 + print "checkStageAndTime: ignoring this record for: %s" % self.hostname + #return None # don't send if there's no action elif 'actintwoweeks' in record.data['stage']: if delta >= 7 * SPERDAY: + print "checkStageAndTime: transition to next stage waitforever" record.data['email'] = TECH | PI | USER record.data['stage'] = 'stage_waitforever' - record.data['message'] = record.data['message'][2] + record.data['message'] = record.data['message_series'][2] record.data['action'] = ['suspendslices'] record.data['time'] = current_time # reset clock for waitforever record.data['takeaction'] = True + record.data['save-act-all'] = True + record.data['action-level'] = 2 elif delta >= 3* SPERDAY and not 'second-mail-at-twoweeks' in record.data: + print "checkStageAndTime: second message in one week for stage two" record.data['email'] = TECH | PI - record.data['message'] = record.data['message'][1] + record.data['message'] = record.data['message_series'][1] record.data['action'] = ['sendmailagain-waitfortwoweeksaction' ] record.data['second-mail-at-twoweeks'] = True record.data['takeaction'] = False + record.data['save-act-all'] = True + record.data['action-level'] = 1 else: record.data['message'] = None + record.data['takeaction'] = False record.data['action'] = ['waitfortwoweeksaction'] - return None # don't send if there's no action + record.data['save-act-all'] = False + print "checkStageAndTime: second message in one week for stage two" + record.data['action-level'] = 1 + #return None # don't send if there's no action elif 'ticket_waitforever' in record.data['stage']: record.data['email'] = TECH @@ -307,15 +407,22 @@ class MonitorMergeDiagnoseSendEscellate: record.data['action'] = ['ticket_waitforever'] record.data['message'] = None record.data['time'] = current_time + record.data['save-act-all'] = True + record.data['action-level'] = 2 else: if delta >= 7*SPERDAY: record.data['action'] = ['ticket_waitforever'] record.data['message'] = None record.data['time'] = current_time # reset clock + record.data['save-act-all'] = True + record.data['action-level'] = 2 else: record.data['action'] = ['ticket_waitforever'] record.data['message'] = None - return None + record.data['takeaction'] = False + record.data['save-act-all'] = False + record.data['action-level'] = 2 + #return None elif 'waitforever' in record.data['stage']: # more than 3 days since last action @@ -324,12 +431,17 @@ class MonitorMergeDiagnoseSendEscellate: record.data['takeaction'] = True if delta >= 3*SPERDAY: record.data['action'] = ['email-againwaitforever'] - record.data['message'] = record.data['message'][2] + record.data['message'] = record.data['message_series'][2] record.data['time'] = current_time # reset clock + record.data['save-act-all'] = True + record.data['action-level'] = 2 else: record.data['action'] = ['waitforever'] record.data['message'] = None - return None # don't send if there's no action + record.data['takeaction'] = False + record.data['save-act-all'] = False + record.data['action-level'] = 2 + #return None # don't send if there's no action else: # There is no action to be taken, possibly b/c the stage has @@ -340,14 +452,15 @@ class MonitorMergeDiagnoseSendEscellate: # TODO: figure out which. for now assume 2. print "UNKNOWN stage for %s; nothing done" % self.hostname record.data['action'] = ['unknown'] - record.data['message'] = record.data['message'][0] + record.data['message'] = record.data['message_series'][0] record.data['email'] = TECH record.data['action'] = ['noop'] - record.data['message'] = record.data['message'][0] + record.data['message'] = record.data['message_series'][0] record.data['stage'] = 'stage_actinoneweek' record.data['time'] = current_time # reset clock record.data['takeaction'] = False + record.data['save-act-all'] = True print "%s" % record.data['log'], print "%15s" % record.data['action']