Moved some files around and merged from 1.0 branch:
[monitor.git] / clean_policy.py
index f1249cf..3ae3811 100644 (file)
@@ -2,12 +2,12 @@ import config
 import database 
 import time
 import mailer
-from www.printbadnodes import cmpCategoryVal
 import sys
 import emailTxt
 import string
+from monitor.wrapper import plccache
+from datetime import datetime
 
-from policy import get_ticket_id, print_stats, close_rt_backoff, reboot_node
 from rt import is_host_in_rt_tickets
 import plc
 
@@ -22,77 +22,75 @@ from const import *
 from unified_model import *
 
 class MonitorMergeDiagnoseSendEscellate:
+       act_all = None
+
        def __init__(self, hostname, act):
                self.hostname = hostname
                self.act = act
                self.plcdb_hn2lb = None
                if self.plcdb_hn2lb is None:
-                       self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")
+                       self.plcdb_hn2lb = plccache.plcdb_hn2lb 
                self.loginbase = self.plcdb_hn2lb[self.hostname]
                return
 
-       def getFBRecord(self):
-               fb = database.dbLoad("findbad")
-               if self.hostname in fb['nodes']:
-                       fbnode = fb['nodes'][self.hostname]['values']
+       def getFBRecords(self):
+               fbrecs = FindbadNodeRecord.get_latest_n_by(hostname=self.hostname)
+               fbnodes = None
+               if fbrec: 
+                       fbnodes = fbrecs
                else:
-                       raise Exception("Hostname %s not in scan database"% self.hostname)
-               return fbnode
-
-       def getActionRecord(self):
-               # update ticket status
-               act_all = database.dbLoad("act_all")
-               if self.hostname in act_all and len(act_all[self.hostname]) > 0:
-                       actnode = act_all[self.hostname][0]
+                       fbnodes = None
+               return fbnodes
+
+       def getLastActionRecord(self):
+               actrec = ActionRecord.get_latest_by(hostname=self.hostname)
+               actnode = None
+               if actrec:
+                       actnode = actrec
                else:
                        actnode = None
-               del act_all
                return actnode
 
-       def getKernel(self, unamestr):
-               s = unamestr.split()
-               if len(s) > 2:
-                       return s[2]
-               else:
-                       return ""
-
-       def mergeRecord(self, fbnode, actnode):
-               fbnode['kernel'] = self.getKernel(fbnode['kernel'])
-               fbnode['stage'] = "findbad"
-               fbnode['message'] = None
-               fbnode['args'] = None
-               fbnode['info'] = None
-               fbnode['log'] = None
-               fbnode['time'] = time.time()
-               fbnode['date_created'] = time.time()
-
-               if actnode is None:
-                       actnode = {} 
-                       actnode.update(fbnode)
-                       actnode['ticket_id'] = ""
-                       actnode['prev_category'] = "NORECORD" 
+       def getPreviousCategory(self, actrec):
+               ret = None
+               if actrec:
+                       ret = actrec.findbad_records[0].observed_category
                else:
-                       actnode['prev_category']= actnode['category']
-                       actnode['comonstats']   = fbnode['comonstats']
-                       actnode['category']             = fbnode['category']
-                       actnode['state']                = fbnode['state']
-                       actnode['kernel']               = fbnode['kernel']
-                       actnode['bootcd']               = fbnode['bootcd']
-                       actnode['plcnode']              = fbnode['plcnode']
-                       ticket = get_ticket_id(actnode)
-                       if ticket is None: actnode['ticket_id'] = ""
-                       actnode['rt'] = mailer.getTicketStatus(ticket)
-
-                       #for key in actnode.keys():
-                       #       print "%10s %s %s " % (key, "==", actnode[key])
-                       #print "----------------------------"
+                       ret = "ERROR"
+               return ret
 
-               return actnode
+
+       def mergeRecord(self, fbnodes, actrec):
+
+               actdefault = {}
+               actdefault['date_created'] = datetime.now()
+               actdefault['date_action_taken'] = datetime.now()
+
+               actdefault['stage'] = "initial"
+               actdefault['message_series'] = None
+               actdefault['message_index'] = None
+               actdefault['message_arguments'] = None
+
+               actdefault['send_email_to'] = TECH
+               actdefault['penalty_level'] = 0
+               actdefault['action'] = [ 'noop' ]
+               actdefault['take_action'] = False
+
+               actdefault['ticket_id'] = ""
+               actdefault['findbad_records'] = fbnodes
+               actdefault['last_action_record'] = actrec
+
+               actdefault['prev_category'] = self.getPreviousCategory(actrec)
+               actdefault['category']          = fbnodes[0].observed_category
+
+               actdefault['rt'] = mailer.getTicketStatus(actrec.ticket_id)
+
+               return actdefault
 
        def run(self):
-               fbnode = self.getFBRecord()
-               actnode= self.getActionRecord()
-               actrec = self.mergeRecord(fbnode, actnode)
+               fbnodes = self.getFBRecords()
+               actnode= self.getLastActionRecord()
+               actrec = self.mergeRecord(fbnodes, actnode)
                record = Record(self.hostname, actrec)
                diag   = self.diagnose(record)
                if self.act and diag is not None:
@@ -100,31 +98,37 @@ class MonitorMergeDiagnoseSendEscellate:
        
        def diagnose(self, record):
 
-               diag = PersistFlags(record.hostname, 60*60*24, db='persist_diagnose_flags')
+               diag = {}
                # NOTE: change record stage based on RT status.
-               diag.setFlag('ResetStage')
                if record.stageIswaitforever():
                        ticket = record.data['rt']
                        if 'new' in ticket['Status']:
-                               diag.setFlag('ResetStage')
+                               print "Resetting Stage!!!!!"
+                               record.reset_stage()
                                
                        if 'resolved' in ticket['Status']:
-                               diag.setFlag('EndRecord')
+                               diag['RTEndRecord'] = True
 
                # NOTE: take category, and prepare action
                category = record.getCategory()
                if category == "error":
-                       diag.setFlag('SendNodedown')
-                       record.data['message'] = emailTxt.mailtxt.newdown
+                       diag['SendNodedown'] = True
+                       record.data['message_series'] = emailTxt.mailtxt.newdown
                        record.data['log'] = self.getDownLog(record)
 
-               elif category == "prod":
+               elif category == "prod" or category == "alpha":
                        state = record.getState()
                        if state == "boot":
-                               diag.setFlag('SendThankyou')
-                               record.data['message'] = emailTxt.mailtxt.newthankyou
-                               record.data['log'] = self.getThankyouLog(record)
-
+                               if record.severity() != 0:
+                                       diag['SendThankyou'] = True
+                                       print "RESETTING STAGE: improvement"
+                                       record.data['stage'] = 'improvement'
+                                       record.data['message_series'] = emailTxt.mailtxt.newthankyou
+                                       record.data['log'] = self.getThankyouLog(record)
+                               else:
+                                       # NOTE: do nothing, since we've already done the above.
+                                       print "DIAGNOSED: %s is boot. no further action necessary." % record.hostname
+                                       return None
                        elif state == "debug":
                                pass
                        else:
@@ -132,75 +136,87 @@ class MonitorMergeDiagnoseSendEscellate:
                else:
                        print "unknown category: %s" % category
 
-               if diag.getFlag('ResetStage'):
-                       print "resetting stage"
-                       record.reset_stage()
-
-               record = self.checkStageAndTime(diag,record)
-               if record:
-                       print "checkStageAndTime Returned Valid Record"
-                       site = PersistFlags(self.loginbase, 1, db='site_persistflags')
 
-                       if site.status is not "good":
-                               print "Setting site %s for 'squeeze'" % self.loginbase
-                               diag.setFlag('Squeeze')
-                       else:
-                               print "Setting site %s for 'backoff'" % self.loginbase
-                               diag.setFlag('BackOff')
+               # TODO: how to not send email?...
+               record = self.checkStageAndTime(record)
+               #if record:
+               print "diagnose: checkStageAndTime Returned Valid Record"
+               siterec = HistorySiteRecord.by_loginbase(self.loginbase)
 
-                       diag.save()
-                       return diag
+               if "good" not in siterec.status: #  != "good":
+                       print "diagnose: Setting site %s for 'squeeze'" % self.loginbase
+                       diag['Squeeze'] = True
                else:
-                       print "checkStageAndTime Returned NULL Record"
-                       return None
+                       print "diagnose: Setting site %s for 'backoff'" % self.loginbase
+                       diag['BackOff'] = True
 
-       def action(self, record, diag):
-               if record.improved() or diag.getFlag('EndRecord'):
-                       print "end record for %s" % self.hostname
-                       record.end_record()
-                       diag.setFlag('CloseRT')
-                       return None
-
-               if self.getSendEmailFlag(record): 
-                       print "sending email"
-                       message = record.getMessage(record.data['ticket_id'])
-                       message.reset()
-                       message.send(record.getContacts())
-                       if message.rt.ticket_id:
-                               print "setting record ticket_id"
-                               record.data['ticket_id'] = message.rt.ticket_id
-                       if diag.getFlag('CloseRT'):
-                               message.rt.closeTicket()
-               else:
-                       print "NOT sending email : %s %s" % (config.mail, record.data['rt'])
+               return diag
 
-               if record.data['takeaction'] and diag.getFlag('Squeeze'):
-                       print "taking action"
-                       record.takeAction()
+       def action(self, record, diag):
 
-               print "saving act_all db"
-               self.add_and_save_act_all(record)
+               message = None
 
-               return
-
-       def getSendEmailFlag(self, record):
-               if not config.mail:
-                       return False
+               print "%s %s DAYS DOWN" % ( self.hostname, Record.getDaysDown(record.data) )
+               if ( self.getSendEmailFlag(record) and Record.getDaysDown(record.data) >= 2 ) or \
+                       "monitor-end-record" in record.data['stage']:
+                       print "action: getting message"
+                       #### Send EMAIL
+                       message = record.getMessage(record.data['ticket_id'])
+                       if message:
+                               print "action: sending email"
+                               message.send(record.getContacts())
+                               if message.rt.ticket_id:
+                                       print "action: setting record ticket_id"
+                                       record.data['ticket_id'] = message.rt.ticket_id
+
+                       #### APPLY PENALTY
+                       if ( record.data['take_action'] and diag['Squeeze'] ): 
+                               print "action: taking action"
+                               record.takeAction(record.data['penalty_level'])
+                               del diag['Squeeze']
+                       if diag.getFlag('BackOff'):
+                               record.takeAction(0)
+                               del diag['BackOff']
+
+                       #### SAVE TO DB
+                       if record.saveAction():
+                               print "action: saving act_all db"
+                               self.add_and_save_act_all(record)
+                       else:
+                               print "action: NOT saving act_all db"
+                               print "stage: %s %s" % ( record.data['stage'], record.data['save_act_all'] )
+
+                       #### END RECORD
+                       if record.improved() or diag['RTEndRecord']:
+                               print "action: end record for %s" % self.hostname
+                               record.end_record()
+                               diag['CloseRT'] = True
+                               del diag['RTEndRecord']
+
+                       #### CLOSE RT TICKET
+                       if message:
+                               if diag['CloseRT']:
+                                       message.rt.closeTicket()
+                                       del diag['CloseRT']
 
-               # resend if open & created longer than 30 days ago.
-               if  'rt' in record.data and \
-                       'Status' in record.data['rt'] and \
-                       "open" in record.data['rt']['Status'] and \
-                       record.data['rt']['Created'] < 60*60*24*30:
-                       return False
+               else:
+                       print "NOT sending email : %s" % config.mail
 
-               return True
+               return
 
        def add_and_save_act_all(self, record):
-               self.act_all = database.dbLoad("act_all")
-               self.act_all[self.hostname].insert(0,record.data)
-               database.dbDump("act_all", self.act_all)
-               
+               """
+                       Read the sync record for this node, and increment the round and
+                       create an ActionRecord for this host using the record.data values.
+               """
+               recsync = RecordActionSync.get_by(hostname=self.hostname)
+               rec = RecordAction(hostname=self.hostname)
+               recsync.round += 1
+               record.data['round'] = recsync.round
+               # TODO: we will need to delete some of these before setting them in the DB.
+               rec.set(**record.data)
+               rec.flush()
+
        def getDownLog(self, record):
 
                record.data['args'] = {'nodename': self.hostname}
@@ -209,7 +225,7 @@ class MonitorMergeDiagnoseSendEscellate:
                #for key in record.data.keys():
                #       print "%10s %s %s " % (key, "==", record.data[key])
 
-               if record.data['ticket_id'] == "":
+               if record.data['ticket_id'] == "" and 'found_rt_ticket' in record.data:
                        log = "DOWN: %20s : %-40s == %20s %s" % \
                                (self.loginbase, self.hostname, record.data['info'][1:], record.data['found_rt_ticket'])
                else:
@@ -222,112 +238,95 @@ class MonitorMergeDiagnoseSendEscellate:
                record.data['args'] = {'nodename': self.hostname}
                record.data['info'] = (self.hostname, record.data['prev_category'], record.data['category'])
 
-               if record.data['ticket_id'] == "":
-                       log = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
+               try:
+                       if record.data['ticket_id'] == "" and 'found_rt_ticket' in record.data:
+                               log = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
                                                (self.loginbase, self.hostname, record.data['stage'], 
-                                                state, category, record.data['found_rt_ticket'])
-               else:
-                       log = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
+                                                record.data['prev_category'], record.data['category'], record.data['found_rt_ticket'])
+                       else:
+                               log = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
                                                (self.loginbase, self.hostname, record.data['stage'], 
-                                                state, category, record.data['ticket_id'])
+                                                record.data['prev_category'], record.data['category'], record.data['ticket_id'])
+               except:
+                       log = "IMPR: %s improved to %s " % (self.hostname, record.data['category'])
                return log
 
-       def checkStageAndTime(self, diag, record):
+       def makeRecord(self, **kwargs):
+               rec = {}
+               for key in kwargs.keys():
+                       rec[key] = kwargs[key]
+               return rec
+
+       def checkStageAndTime(self, record):
+       """
+               The core variables are:
+
+                       send_email_to  : defines who to send messages to at this time
+                       take_action    : whether or not to take action
+                       penalty_level  : how much of a penalty to apply
+                       message_index  : where in the escalation sequence we are.
+                       save_act_all   : whether or not to save the action record in the db.
+
+                       action/stage   : stage tracks which state we're in.
+       """
+               stages = {
+                       "initial"               : [ { action='noop', next="weekone"}],
+                       "weekone"               : [ { action='noop',         index=0, save=True, email=TECH,         length=7*SPERDAY,  next="weektwo" }, ],
+                       "weektwo"               : [ { action='nocreate',     index=1, save=True, email=TECH|PI,      length=7*SPERDAY,  next="waitforever" }, ],
+                       "waitforever"   : [ { action='suspendslices',index=2, save=True, email=TECH|PI|USER, length=7*SPERDAY,  next="waitforever" }, ],
+                       "paused"                : [ { action='noop',                              save=True                                              length=30*SPERDAY, next="weekone" }, ]
+                       "improvement"   : [ { action='close_rt',     index=0, save=True, email=TECH,         next="monitor-end-record" }, ],
+               }
+               # TODO: make this time relative to the PREVIOUS action taken.
                current_time = time.time()
-               delta = current_time - record.data['time']
-               if   'findbad' in record.data['stage']:
+               current_stage = record.getMostRecentStage()
+               recent_time   = record.getMostRecentTime()
+
+               delta = current_time - recent_time
+
+               if current_stage in stages:
+                       values = stages[current_stage][0]
+
+               if delta >= values['length']:
+                       print "checkStageAndTime: transition to next stage"
+                       new_stage = values['next']
+                       values = stages[new_stage]
+
+               elif delta >= values['length']/3 and not 'second_mail_at_oneweek' in record.data:
+                       print "checkStageAndTime: second message in one week for stage two"
+                       take_action=False
+                       pass
+               else:
+                       # DO NOTHING
+                       take_action=False, 
+                       save_act_all=False, 
+                       message_index=None, 
+                       print "checkStageAndTime: second message in one week for stage two"
+
+               rec = self.makeRecord( stage=new_stage, send_email_to=values['email'],
+                                                          action=values['action'], message_index=values['index'], 
+                                                          save_act_all=values['save'], penalty_level=values['index'], 
+                                                          date_action_taken=current_time)
+               record.data.update(rec)
+
+
+               if   'initial' in record.data['stage']:
                        # The node is bad, and there's no previous record of it.
-                       record.data['email'] = TECH
-                       record.data['action'] = ['noop']
-                       record.data['takeaction'] = False
-                       record.data['message'] = record.data['message'][0]
-                       record.data['stage'] = 'stage_actinoneweek'
-
-               elif 'reboot_node' in record.data['stage']:
-                       record.data['email'] = TECH
-                       record.data['action'] = ['noop']
-                       record.data['message'] = record.data['message'][0]
-                       record.data['stage'] = 'stage_actinoneweek'
-                       record.data['takeaction'] = False
-                       
-               elif 'improvement' in record.data['stage']:
-                       print "backing off of %s" % self.hostname
-                       record.data['action'] = ['close_rt']
-                       record.data['takeaction'] = True
-                       record.data['message'] = record.data['message'][0]
-                       record.data['stage'] = 'monitor-end-record'
-
-               elif 'actinoneweek' in record.data['stage']:
-                       if delta >= 7 * SPERDAY: 
-                               record.data['email'] = TECH | PI
-                               record.data['stage'] = 'stage_actintwoweeks'
-                               record.data['message'] = record.data['message'][1]
-                               record.data['action'] = ['nocreate' ]
-                               record.data['time'] = current_time              # reset clock for waitforever
-                               record.data['takeaction'] = True
-                       elif delta >= 3* SPERDAY and not 'second-mail-at-oneweek' in record.data:
-                               record.data['email'] = TECH 
-                               record.data['message'] = record.data['message'][0]
-                               record.data['action'] = ['sendmailagain-waitforoneweekaction' ]
-                               record.data['second-mail-at-oneweek'] = True
-                               record.data['takeaction'] = False
-                       else:
-                               record.data['message'] = None
-                               record.data['action'] = ['waitforoneweekaction' ]
-                               print "ignoring this record for: %s" % self.hostname
-                               return None                     # don't send if there's no action
-
-               elif 'actintwoweeks' in record.data['stage']:
-                       if delta >= 7 * SPERDAY:
-                               record.data['email'] = TECH | PI | USER
-                               record.data['stage'] = 'stage_waitforever'
-                               record.data['message'] = record.data['message'][2]
-                               record.data['action'] = ['suspendslices']
-                               record.data['time'] = current_time              # reset clock for waitforever
-                               record.data['takeaction'] = True
-                       elif delta >= 3* SPERDAY and not 'second-mail-at-twoweeks' in record.data:
-                               record.data['email'] = TECH | PI
-                               record.data['message'] = record.data['message'][1]
-                               record.data['action'] = ['sendmailagain-waitfortwoweeksaction' ]
-                               record.data['second-mail-at-twoweeks'] = True
-                               record.data['takeaction'] = False
-                       else:
-                               record.data['message'] = None
-                               record.data['action'] = ['waitfortwoweeksaction']
-                               return None                     # don't send if there's no action
-
-               elif 'ticket_waitforever' in record.data['stage']:
-                       record.data['email'] = TECH
-                       record.data['takeaction'] = True
-                       if 'first-found' not in record.data:
-                               record.data['first-found'] = True
-                               record.data['log'] += " firstfound"
-                               record.data['action'] = ['ticket_waitforever']
-                               record.data['message'] = None
-                               record.data['time'] = current_time
-                       else:
-                               if delta >= 7*SPERDAY:
-                                       record.data['action'] = ['ticket_waitforever']
-                                       record.data['message'] = None
-                                       record.data['time'] = current_time              # reset clock
-                               else:
-                                       record.data['action'] = ['ticket_waitforever']
-                                       record.data['message'] = None
-                                       return None
+                       rec = self.makeRecord(
+                                                       stage="weekone", send_email_to=TECH, 
+                                                       action=['noop'], take_action=False, 
+                                                       message_index=0, save_act_all=True, 
+                                                       penalty_level=0, )
+                       record.data.update(rec)
 
-               elif 'waitforever' in record.data['stage']:
-                       # more than 3 days since last action
-                       # TODO: send only on weekdays.
-                       # NOTE: expects that 'time' has been reset before entering waitforever stage
-                       record.data['takeaction'] = True
-                       if delta >= 3*SPERDAY:
-                               record.data['action'] = ['email-againwaitforever']
-                               record.data['message'] = record.data['message'][2]
-                               record.data['time'] = current_time              # reset clock
-                       else:
-                               record.data['action'] = ['waitforever']
-                               record.data['message'] = None
-                               return None                     # don't send if there's no action
+               elif 'improvement' in record.data['stage']:
+                       print "checkStageAndTime: backing off of %s" % self.hostname
+                       rec = self.makeRecord(
+                                                       stage='monitor-end-record', send_email_to=TECH, 
+                                                       action=['close_rt'], take_action=True, 
+                                                       message_index=0, save_act_all=True, 
+                                                       penalty_level=0, )
+                       record.data.update(rec)
 
                else:
                        # There is no action to be taken, possibly b/c the stage has
@@ -337,15 +336,15 @@ class MonitorMergeDiagnoseSendEscellate:
                        #       2. delta is not big enough to bump it to the next stage.
                        # TODO: figure out which. for now assume 2.
                        print "UNKNOWN stage for %s; nothing done" % self.hostname
-                       record.data['action'] = ['unknown']
-                       record.data['message'] = record.data['message'][0]
-
-                       record.data['email'] = TECH
-                       record.data['action'] = ['noop']
-                       record.data['message'] = record.data['message'][0]
-                       record.data['stage'] = 'stage_actinoneweek'
-                       record.data['time'] = current_time              # reset clock
-                       record.data['takeaction'] = False
+                       rec = self.makeRecord(
+                                                       stage='weekone', send_email_to=TECH,
+                                                       action=['noop'], 
+                                                       take_action=False, 
+                                                       save_act_all=True, 
+                                                       date_action_taken=current_time,
+                                                       message_index=0, 
+                                                       penalty_level=0, )
+                       record.data.update(rec)
 
                print "%s" % record.data['log'],
                print "%15s" % record.data['action']