# X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=monitor%2Fpolicy.py;fp=monitor%2Fpolicy.py;h=cc61fee35a3297f09d905a0c5e4a5063b34a2a2a;hb=da913fbd1629fc4669b186915df8ff3a340482d3;hp=0000000000000000000000000000000000000000;hpb=96d9a3873c32ddbf89aca0d5fb09b77fe92b16fc;p=monitor.git
#
# monitor/policy.py (new file, blob cc61fee), reconstructed from the
# whitespace-collapsed diff export referenced above.
"""Per-node monitoring policy: merge observations, diagnose, and act.

For one host, the class below combines the latest findbad observations with
the last recorded action, decides which escalation stage applies, and
optionally sends notices / applies penalties / stores the resulting record.
"""

import config
import database
import time
import mailer
import sys
import emailTxt
import string
from monitor.wrapper import plccache
from datetime import datetime

from rt import is_host_in_rt_tickets
import plc

# Time to enforce policy
POLSLEEP = 7200

# Where to email the summary
SUMTO = "soltesz@cs.princeton.edu"

from const import *

from monitor.model import *


class MonitorMergeDiagnoseSendEscellate:
    """Run the merge -> diagnose -> act pipeline for a single hostname.

    hostname -- node to evaluate; must be a key of plccache.plcdb_hn2lb.
    act      -- when false, run() only diagnoses and never calls action().
    """

    # Not read or written by any method in this file.
    act_all = None

    def __init__(self, hostname, act):
        self.hostname = hostname
        self.act = act
        self.plcdb_hn2lb = plccache.plcdb_hn2lb
        # Raises KeyError for a hostname the cache does not know about.
        self.loginbase = self.plcdb_hn2lb[self.hostname]

    def getFBRecords(self):
        """Return the latest findbad records for this host, or None if none."""
        fbrecs = FindbadNodeRecord.get_latest_n_by(hostname=self.hostname)
        # The original tested the undefined name 'fbrec' here.
        return fbrecs or None

    def getLastActionRecord(self):
        """Return the most recent ActionRecord for this host, or None."""
        actrec = ActionRecord.get_latest_by(hostname=self.hostname)
        return actrec or None

    def getPreviousCategory(self, actrec):
        """Return the category observed at the last action, or "ERROR".

        "ERROR" is returned when there is no previous action record (or it
        carries no findbad records).
        """
        if actrec and actrec.findbad_records:
            return actrec.findbad_records[0].observed_category
        return "ERROR"

    def mergeRecord(self, fbnodes, actrec):
        """Build the default action dict from observations and history.

        fbnodes -- non-empty sequence of findbad records, newest first
                   (fbnodes[0] supplies the current category).
        actrec  -- last action record for the host, or None.

        Returns a dict; its 'rt' entry is None when there is no previous
        action record to take a ticket id from.
        """
        now = datetime.now()
        actdefault = {
            'date_created': now,
            'date_action_taken': now,

            'stage': "initial",
            'message_series': None,
            'message_index': None,
            'message_arguments': None,

            'send_email_to': TECH,
            'penalty_level': 0,
            'action': ['noop'],
            'take_action': False,

            'ticket_id': "",
            'findbad_records': fbnodes,
            'last_action_record': actrec,

            'prev_category': self.getPreviousCategory(actrec),
            'category': fbnodes[0].observed_category,
        }

        # A host seen for the first time has no action record, so there is
        # no ticket to look up.
        if actrec:
            actdefault['rt'] = mailer.getTicketStatus(actrec.ticket_id)
        else:
            actdefault['rt'] = None

        return actdefault

    def run(self):
        """Merge, diagnose and (if self.act) act on this host."""
        fbnodes = self.getFBRecords()
        if not fbnodes:
            # mergeRecord needs fbnodes[0]; without observations there is
            # nothing to diagnose.
            print("run: no findbad records for %s; nothing done" % self.hostname)
            return

        actnode = self.getLastActionRecord()
        actrec = self.mergeRecord(fbnodes, actnode)
        record = Record(self.hostname, actrec)
        diag = self.diagnose(record)
        if self.act and diag is not None:
            self.action(record, diag)

    def diagnose(self, record):
        """Decide what should happen to `record`.

        Returns a dict of flags ('RTEndRecord', 'SendNodedown',
        'SendThankyou', 'Squeeze', 'BackOff'), or None when the node is
        booted with severity 0 and needs no further handling.
        Mutates record.data (stage, message_series, log, ...).
        """
        diag = {}

        # NOTE: change record stage based on RT status.
        if record.stageIswaitforever():
            ticket = record.data.get('rt')
            # 'rt' is None when there was no previous action record.
            status = ticket['Status'] if ticket else ""
            if 'new' in status:
                print("Resetting Stage!!!!!")
                record.reset_stage()

            if 'resolved' in status:
                diag['RTEndRecord'] = True

        # NOTE: take category, and prepare action
        category = record.getCategory()
        if category == "error":
            diag['SendNodedown'] = True
            record.data['message_series'] = emailTxt.mailtxt.newdown
            record.data['log'] = self.getDownLog(record)

        elif category == "prod" or category == "alpha":
            state = record.getState()
            if state == "boot":
                if record.severity() != 0:
                    diag['SendThankyou'] = True
                    print("RESETTING STAGE: improvement")
                    record.data['stage'] = 'improvement'
                    record.data['message_series'] = emailTxt.mailtxt.newthankyou
                    record.data['log'] = self.getThankyouLog(record)
                else:
                    # NOTE: do nothing, since we've already done the above.
                    print("DIAGNOSED: %s is boot. no further action necessary."
                          % self.hostname)
                    return None
            elif state == "debug":
                pass
            else:
                print("unknown state %s for host %s" % (state, self.hostname))
        else:
            print("unknown category: %s" % category)

        # TODO: how to not send email?...
        record = self.checkStageAndTime(record)
        print("diagnose: checkStageAndTime Returned Valid Record")
        siterec = HistorySiteRecord.by_loginbase(self.loginbase)

        if "good" not in siterec.status:
            print("diagnose: Setting site %s for 'squeeze'" % self.loginbase)
            diag['Squeeze'] = True
        else:
            print("diagnose: Setting site %s for 'backoff'" % self.loginbase)
            diag['BackOff'] = True

        return diag

    def action(self, record, diag):
        """Carry out the flags in `diag` for `record`.

        Sends mail, applies or lifts penalties, stores the action record and
        closes the RT ticket as requested.  Consumed flags are removed from
        `diag`.  Flags that are absent are treated as false.
        """
        message = None

        days_down = Record.getDaysDown(record.data)
        print("%s %s DAYS DOWN" % (self.hostname, days_down))

        # NOTE(review): getSendEmailFlag is not defined anywhere in this
        # file; it must be supplied elsewhere (subclass/mixin) or this call
        # raises AttributeError -- confirm.
        proceed = ((self.getSendEmailFlag(record) and days_down >= 2)
                   or "monitor-end-record" in record.data['stage'])
        if not proceed:
            print("NOT sending email : %s" % config.mail)
            return

        print("action: getting message")
        #### Send EMAIL
        message = record.getMessage(record.data['ticket_id'])
        if message:
            print("action: sending email")
            message.send(record.getContacts())
            if message.rt.ticket_id:
                print("action: setting record ticket_id")
                record.data['ticket_id'] = message.rt.ticket_id

        #### APPLY PENALTY
        # diagnose() sets only one of 'Squeeze'/'BackOff', so both must be
        # looked up tolerantly.
        if record.data['take_action'] and diag.get('Squeeze'):
            print("action: taking action")
            record.takeAction(record.data['penalty_level'])
            del diag['Squeeze']
        if diag.get('BackOff'):
            record.takeAction(0)
            del diag['BackOff']

        #### SAVE TO DB
        if record.saveAction():
            print("action: saving act_all db")
            self.add_and_save_act_all(record)
        else:
            print("action: NOT saving act_all db")
            print("stage: %s %s" % (record.data['stage'],
                                    record.data.get('save_act_all')))

        #### END RECORD
        if record.improved() or diag.get('RTEndRecord'):
            print("action: end record for %s" % self.hostname)
            record.end_record()
            diag['CloseRT'] = True
            # Absent when we got here through record.improved() alone.
            diag.pop('RTEndRecord', None)

        #### CLOSE RT TICKET
        if message and diag.get('CloseRT'):
            message.rt.closeTicket()
            del diag['CloseRT']

    def add_and_save_act_all(self, record):
        """
        Read the sync record for this node, and increment the round and
        create an ActionRecord for this host using the record.data values.
        """
        # NOTE(review): RecordActionSync / RecordAction are not visible in
        # this file (other methods use ActionRecord); get_by presumably
        # returns None for an unknown host, which would fail below -- verify.
        recsync = RecordActionSync.get_by(hostname=self.hostname)
        rec = RecordAction(hostname=self.hostname)
        recsync.round += 1
        record.data['round'] = recsync.round
        # TODO: we will need to delete some of these before setting them in the DB.
        rec.set(**record.data)
        rec.flush()

    def getDownLog(self, record):
        """Fill record.data['args'/'info'] and return the DOWN log line."""
        data = record.data
        data['args'] = {'nodename': self.hostname}
        data['info'] = (self.hostname, Record.getStrDaysDown(data), "")

        # Prefer a ticket discovered in RT when we have not opened our own.
        if data['ticket_id'] == "" and 'found_rt_ticket' in data:
            ticket = data['found_rt_ticket']
        else:
            ticket = data['ticket_id']

        return "DOWN: %20s : %-40s == %20s %s" % \
            (self.loginbase, self.hostname, data['info'][1:], ticket)

    def getThankyouLog(self, record):
        """Fill record.data['args'/'info'] and return the IMPR log line."""
        data = record.data
        data['args'] = {'nodename': self.hostname}
        data['info'] = (self.hostname, data['prev_category'], data['category'])

        try:
            if data['ticket_id'] == "" and 'found_rt_ticket' in data:
                ticket = data['found_rt_ticket']
            else:
                ticket = data['ticket_id']
            log = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
                (self.loginbase, self.hostname, data['stage'],
                 data['prev_category'], data['category'], ticket)
        except (KeyError, TypeError):
            # Best effort: fall back to the short form if a field is missing.
            log = "IMPR: %s improved to %s " % (self.hostname, data['category'])
        return log

    def makeRecord(self, **kwargs):
        """Return the keyword arguments as a new dict."""
        return dict(kwargs)

    def _timedStageRecord(self, record, stages, current_stage, now):
        """Return the record update for a stage that has a 'length'.

        Moves to the stage's 'next' once 'length' seconds have passed since
        the most recent action; otherwise stays in the current stage with
        actions (and, in the quiet case, saving and messaging) switched off.
        """
        entry = stages[current_stage]
        new_stage = current_stage
        overrides = {}

        delta = now - record.getMostRecentTime()
        if delta >= entry['length']:
            print("checkStageAndTime: transition to next stage")
            new_stage = entry['next']
            entry = stages[new_stage]
            # NOTE(review): take_action is deliberately left untouched here
            # (it keeps the value already in record.data), matching the
            # original; confirm whether a transition should enable it.
        elif (delta >= entry['length'] / 3
              and 'second_mail_at_oneweek' not in record.data):
            print("checkStageAndTime: second message in one week for stage two")
            overrides['take_action'] = False
        else:
            # DO NOTHING
            print("checkStageAndTime: no action necessary at this time")
            overrides['take_action'] = False
            overrides['save_act_all'] = False
            overrides['message_index'] = None

        # 'action' is stored as a list everywhere else in this file.
        fields = {
            'stage': new_stage,
            'action': [entry['action']],
            'save_act_all': entry.get('save', False),
            'date_action_taken': now,
        }
        # Not every row defines these; absent ones keep their current value.
        if 'email' in entry:
            fields['send_email_to'] = entry['email']
        if 'index' in entry:
            fields['message_index'] = entry['index']
            fields['penalty_level'] = entry['index']
        fields.update(overrides)
        return self.makeRecord(**fields)

    def checkStageAndTime(self, record):
        """
        The core variables are:

            send_email_to : defines who to send messages to at this time
            take_action   : whether or not to take action
            penalty_level : how much of a penalty to apply
            message_index : where in the escellation sequence we are.
            save_act_all  : whether or not to save the action record in the db.

            action/stage  : stage tracks which state we're in.

        Updates record.data in place and returns `record`.

        Order of evaluation:
          1. record.data['stage'] contains 'improvement' -> end the record.
          2. the most recent stage is a timed row of the table -> time-based
             handling (see _timedStageRecord).
          3. record.data['stage'] contains 'initial' -> start at "weekone".
          4. anything else is treated as an unknown stage -> "weekone".
        """
        stages = {
            "initial":     dict(action='noop', next="weekone"),
            "weekone":     dict(action='noop', index=0, save=True,
                                email=TECH, length=7*SPERDAY,
                                next="weektwo"),
            "weektwo":     dict(action='nocreate', index=1, save=True,
                                email=TECH|PI, length=7*SPERDAY,
                                next="waitforever"),
            "waitforever": dict(action='suspendslices', index=2, save=True,
                                email=TECH|PI|USER, length=7*SPERDAY,
                                next="waitforever"),
            "paused":      dict(action='noop', save=True,
                                length=30*SPERDAY, next="weekone"),
            "improvement": dict(action='close_rt', index=0, save=True,
                                email=TECH, next="monitor-end-record"),
        }
        # TODO: make this time relative to the PREVIOUS action taken.
        current_time = time.time()
        data = record.data

        if 'improvement' in data['stage']:
            # Checked first so the timed path cannot overwrite it.
            print("checkStageAndTime: backing off of %s" % self.hostname)
            rec = self.makeRecord(
                stage='monitor-end-record', send_email_to=TECH,
                action=['close_rt'], take_action=True,
                message_index=0, save_act_all=True,
                penalty_level=0)
        else:
            current_stage = record.getMostRecentStage()
            entry = stages.get(current_stage)
            if entry is not None and 'length' in entry:
                rec = self._timedStageRecord(record, stages, current_stage,
                                             current_time)
            elif 'initial' in data['stage']:
                # The node is bad, and there's no previous record of it.
                rec = self.makeRecord(
                    stage="weekone", send_email_to=TECH,
                    action=['noop'], take_action=False,
                    message_index=0, save_act_all=True,
                    penalty_level=0)
            else:
                # The stage is not one the table knows how to advance.
                print("UNKNOWN stage for %s; nothing done" % self.hostname)
                rec = self.makeRecord(
                    stage='weekone', send_email_to=TECH,
                    action=['noop'],
                    take_action=False,
                    save_act_all=True,
                    date_action_taken=current_time,
                    message_index=0,
                    penalty_level=0)

        data.update(rec)

        # 'log' is only set by diagnose() for some categories.
        print("%s %15s" % (data.get('log', ''), data['action']))
        return record