instructs user how to create the 'auth.py' file.
[monitor.git] / policy.py
index 0bdf4bf..3673b8e 100644 (file)
--- a/policy.py
+++ b/policy.py
@@ -87,6 +87,16 @@ def getdebug():
 def print_stats(key, stats):
        if key in stats: print "%20s : %d" % (key, stats[key])
 
+def get_ticket_id(record):
+       """Return record's 'ticket_id' if set, else 'found_rt_ticket', else None.
+
+       A value counts as set when it is neither "" nor None; these are compared
+       by value because `is not ""` relies on string identity, not equality."""
+       for key in ('ticket_id', 'found_rt_ticket'):
+               if key in record and record[key] != "" and record[key] is not None:
+                       return record[key]
+       return None
+
 class Merge(Thread):
        def __init__(self, l_merge, toRT):
                self.toRT = toRT
@@ -199,6 +209,16 @@ class Merge(Thread):
                                if loginbase not in self.mergedb:
                                        self.mergedb[loginbase] = {}
 
+                               # take the info either from act_all or fb-record.
+                               # if node not in act_all
+                               #       then take it from fbrecord, obviously.
+                               # else node in act_all
+                               #   if act_all == 0 length (no previous records)
+                               #               then take it from fbrecord.
+                               #   else
+                               #           take it from act_all.
+                               #   
+
                                # We must compare findbad state with act_all state
                                if nodename not in self.act_all:
                                        # 1) ok, b/c it's a new problem. set ticket_id to null
@@ -208,65 +228,25 @@ class Merge(Thread):
                                        self.mergedb[loginbase][nodename]['prev_category'] = "NORECORD" 
                                else: 
                                        if len(self.act_all[nodename]) == 0:
-                                               print "len(act_all[%s]) == 0, skipping %s %s" % (nodename, loginbase, nodename)
-                                               continue
-
-                                       y = self.act_all[nodename][0]
-
-                                       ## skip if end-stage
-                                       #if 'stage' in y and "monitor-end-record" in y['stage']:
-                                       #       # 1) ok, b/c it's a new problem. set ticket_id to null
-                                       ##      self.mergedb[loginbase][nodename] = {} 
-                                       #       self.mergedb[loginbase][nodename].update(x)
-                                       #       self.mergedb[loginbase][nodename]['ticket_id'] = ""
-                                       #       self.mergedb[loginbase][nodename]['prev_category'] = None
-                                       #       continue
-
-                                       ## for legacy actions
-                                       #if 'bucket' in y and y['bucket'][0] == 'dbg':
-                                       #       # Only bootcd debugs made it to the act_all db.
-                                       #       y['prev_category'] = "OLDBOOTCD"
-                                       #elif 'bucket' in y and y['bucket'][0] == 'down':
-                                       #       y['prev_category'] = "ERROR"
-                                       #elif 'bucket' not in y:
-                                       #       # for all other actions, just carry over the
-                                       #       # previous category
-                                       #       y['prev_category'] = y['category']
-                                       #else:
-                                       #       print "UNKNOWN state for record: %s" % y
-                                       #       sys.exit(1)
-
-                                       # determine through translation, if the buckets match
-                                       #if 'category' in y and x['category'] == y['category']:
-                                       #       b_match = True
-                                       #elif x['category'] == "OLDBOOTCD" and y['bucket'][0] == 'dbg':
-                                       #       b_match = True
-                                       #elif x['category'] == "ERROR" and y['bucket'][0] == 'down':
-                                       #       b_match = True
-                                       #else:
-                                       #       b_match = False
-
-                                       #if b_match: 
-                                       #       # 2b) ok, b/c they agree that there's still a problem..
-                                       #       # 2b) Comon & Monitor still agree; RT ticket?
-                                       #else:
-                                       #       # 2a) mismatch, need a policy for how to resolve
-                                       #       #     resolution will be handled in __diagnoseNode()
-                                       #       #         for now just record the two categories.
-                                       #       #if x['category'] == "PROD" and x['state'] == "BOOT" and \
-                                       #       # ( y['bucket'][0] == 'down' or  y['bucket'][0] == 'dbg'):
-                                       #       print "FINDBAD and MONITOR have a mismatch: %s vs %s" % \
-                                       #                               (x['category'], y['bucket'])
-
-                                       y['prev_category'] = y['category']
-                                       self.mergedb[loginbase][nodename] = {}
-                                       self.mergedb[loginbase][nodename].update(y)
-                                       self.mergedb[loginbase][nodename]['comonstats'] = x['comonstats']
-                                       self.mergedb[loginbase][nodename]['category']   = x['category']
-                                       self.mergedb[loginbase][nodename]['state'] = x['state']
-                                       self.mergedb[loginbase][nodename]['kernel']=x['kernel']
-                                       self.mergedb[loginbase][nodename]['bootcd']=x['bootcd']
-                                       self.mergedb[loginbase][nodename]['plcnode']=x['plcnode']
+                                               self.mergedb[loginbase][nodename] = {} 
+                                               self.mergedb[loginbase][nodename].update(x)
+                                               self.mergedb[loginbase][nodename]['ticket_id'] = ""
+                                               self.mergedb[loginbase][nodename]['prev_category'] = "NORECORD" 
+                                       else:
+                                               y = self.act_all[nodename][0]
+                                               y['prev_category'] = y['category']
+
+                                               self.mergedb[loginbase][nodename] = {}
+                                               self.mergedb[loginbase][nodename].update(y)
+                                               self.mergedb[loginbase][nodename]['comonstats'] = x['comonstats']
+                                               self.mergedb[loginbase][nodename]['category']   = x['category']
+                                               self.mergedb[loginbase][nodename]['state'] = x['state']
+                                               self.mergedb[loginbase][nodename]['kernel']=x['kernel']
+                                               self.mergedb[loginbase][nodename]['bootcd']=x['bootcd']
+                                               self.mergedb[loginbase][nodename]['plcnode']=x['plcnode']
+                                               ticket = get_ticket_id(self.mergedb[loginbase][nodename])
+                                               self.mergedb[loginbase][nodename]['rt'] = mailer.getTicketStatus(ticket)
+
                                        # delete the entry from cache_all to keep it out of case 3)
                                        del self.cache_all[nodename]
 
@@ -552,6 +532,14 @@ class Diagnose(Thread):
                                        diag_record['args'] = {'nodename': nodename}
                                        diag_record['info'] = (nodename, node_record['prev_category'], 
                                                                                                         node_record['category'])
+                                       if 'email_pcu' in diag_record:
+                                               if diag_record['email_pcu']:
+                                                       # previously, the pcu failed to reboot, so send
+                                                       # email. Now, reset these values to try the reboot
+                                                       # again.
+                                                       diag_record['email_pcu'] = False
+                                                       del diag_record['reboot_node_failed']
+
                                        if diag_record['ticket_id'] == "":
                                                diag_record['log'] = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
                                                                        (loginbase, nodename, diag_record['stage'], 
@@ -561,27 +549,27 @@ class Diagnose(Thread):
                                                                        (loginbase, nodename, diag_record['stage'], 
                                                                         state, category, diag_record['ticket_id'])
                                        return diag_record
-                               elif time_diff >= 6*SPERHOUR:
-                                       # heartbeat is older than 30 min.
-                                       # then reset NM.
-                                       #print "Possible NM problem!! %s - %s = %s" % (now, last_contact, time_diff)
-                                       diag_record = {}
-                                       diag_record.update(node_record)
-                                       diag_record['message'] = emailTxt.mailtxt.NMReset
-                                       diag_record['args'] = {'nodename': nodename}
-                                       diag_record['stage'] = "nmreset"
-                                       diag_record['info'] = (nodename, 
-                                                                                       node_record['prev_category'], 
-                                                                                       node_record['category'])
-                                       if diag_record['ticket_id'] == "":
-                                               diag_record['log'] = "NM  : %20s : %-40s == %20s %20s %s %s" % \
-                                                                       (loginbase, nodename, diag_record['stage'], 
-                                                                        state, category, diag_record['found_rt_ticket'])
-                                       else:
-                                               diag_record['log'] = "NM  : %20s : %-40s == %20s" % \
-                                                                       (loginbase, nodename, diag_record['stage'])
-
-                                       return diag_record
+                               #elif time_diff >= 6*SPERHOUR:
+                               #       # heartbeat is older than 30 min.
+                               #       # then reset NM.
+                               #       #print "Possible NM problem!! %s - %s = %s" % (now, last_contact, time_diff)
+                               #       diag_record = {}
+                               #       diag_record.update(node_record)
+                               #       diag_record['message'] = emailTxt.mailtxt.NMReset
+                               #       diag_record['args'] = {'nodename': nodename}
+                               #       diag_record['stage'] = "nmreset"
+                               #       diag_record['info'] = (nodename, 
+                               #                                                       node_record['prev_category'], 
+                               #                                                       node_record['category'])
+                               #       if diag_record['ticket_id'] == "":
+                               #               diag_record['log'] = "NM  : %20s : %-40s == %20s %20s %s %s" % \
+                               #                                       (loginbase, nodename, diag_record['stage'], 
+                               #                                        state, category, diag_record['found_rt_ticket'])
+                               #       else:
+                               #               diag_record['log'] = "NM  : %20s : %-40s == %20s" % \
+                               #                                       (loginbase, nodename, diag_record['stage'])
+#
+#                                      return diag_record
                                else:
                                        return None
                        else:
@@ -642,6 +630,23 @@ class Diagnose(Thread):
                                #values are equal, carry on.
                                #print "why are we here?"
                                pass
+
+               if 'rt' in node_record and 'Status' in node_record['rt']:
+                       if node_record['stage'] == 'ticket_waitforever':
+                               if 'resolved' in node_record['rt']['Status']:
+                                       print "ending waitforever record for: ", node_record['nodename']
+                                       node_record['action'] = ['noop']
+                                       node_record['message'] = None
+                                       node_record['stage'] = 'monitor-end-record'
+                                       print "oldlog: %s" % node_record['log'],
+                                       print "%15s" % node_record['action']
+                                       return node_record
+                               if 'new' in node_record['rt']['Status'] and \
+                                       'Queue' in node_record['rt'] and \
+                                       'Monitor' in node_record['rt']['Queue']:
+
+                                       print "RESETTING stage to findbad"
+                                       node_record['stage'] = 'findbad'
                        
                #### COMPARE category and prev_category
                # if not_equal
@@ -655,6 +660,8 @@ class Diagnose(Thread):
 
                #### found_RT_ticket
                # TODO: need to record time found, and maybe add a stage for acting on it...
+               # NOTE: once an RT ticket has been found, the block is not removed
+               #               even if that support ticket is later resolved. How should it be lifted?
                if 'found_rt_ticket' in diag_record and \
                        diag_record['found_rt_ticket'] is not None:
                        if diag_record['stage'] is not 'improvement':
@@ -696,6 +703,7 @@ class Diagnose(Thread):
                elif 'improvement' in diag_record['stage']:
                        # - backoff previous squeeze actions (slice suspend, nocreate)
                        # TODO: add a backoff_squeeze section... Needs to runthrough
+                       print "backing off of %s" % nodename
                        act_record['action'] = ['close_rt']
                        act_record['message'] = message[0]
                        act_record['stage'] = 'monitor-end-record'
@@ -776,8 +784,14 @@ class Diagnose(Thread):
                        print "UNKNOWN stage for %s; nothing done" % nodename
                        act_record['action'] = ['unknown']
                        act_record['message'] = message[0]
+
+                       act_record['email'] = TECH
+                       act_record['action'] = ['noop']
+                       act_record['message'] = message[0]
+                       act_record['stage'] = 'stage_actinoneweek'
+                       act_record['time'] = current_time               # reset clock
                        #print "Exiting..."
-                       return None
+                       #return None
                        #sys.exit(1)
 
                print "%s" % act_record['log'],
@@ -878,7 +892,7 @@ def close_rt_backoff(args):
 
 def reboot_node(args):
        host = args['hostname']
-       return reboot.reboot_new(host, True, config.debug)
+       return reboot.reboot_policy(host, True, config.debug)
 
 def reset_nodemanager(args):
        os.system("ssh root@%s /sbin/service nm restart" % nodename)
@@ -1045,7 +1059,7 @@ class Action(Thread):
                return hlist
 
 
-       def get_email_args(self, act_recordlist):
+       def get_email_args(self, act_recordlist, loginbase=None):
 
                email_args = {}
                email_args['hostname_list'] = ""
@@ -1062,7 +1076,19 @@ class Action(Thread):
                                email_args['pcu_id'] = "-1"
                                        
                        if 'ticket_id' in act_record:
-                               email_args['ticket_id'] = act_record['ticket_id']
+                               if act_record['ticket_id'] == 0 or act_record['ticket_id'] == '0':
+                                       print "Enter the ticket_id for %s @ %s" % (loginbase, act_record['nodename'])
+                                       sys.stdout.flush()
+                                       line = sys.stdin.readline()
+                                       try:
+                                               ticket_id = int(line)
+                                       except:
+                                               print "could not get ticket_id from stdin..."
+                                               os._exit(1)
+                               else:
+                                       ticket_id = act_record['ticket_id']
+                                       
+                               email_args['ticket_id'] = ticket_id
 
                return email_args
 
@@ -1097,20 +1123,25 @@ class Action(Thread):
                for issue in unique_issues.keys():
                        print "\tworking on issue: %s" % issue
                        issue_record_list = unique_issues[issue]
-                       email_args = self.get_email_args(issue_record_list)
+                       email_args = self.get_email_args(issue_record_list, loginbase)
 
                        # for each record.
                        for act_record in issue_record_list:
                                # if there's a pcu record and email config is set
                                if 'email_pcu' in act_record:
-                                       if act_record['email_pcu'] and \
-                                               site_record['config']['email']:
+                                       if act_record['message'] != None and act_record['email_pcu'] and site_record['config']['email']:
+                                               # and 'reboot_node' in act_record['stage']:
 
                                                email_args['hostname'] = act_record['nodename']
                                                ticket_id = self.__emailSite(loginbase, 
                                                                                        act_record['email'], 
                                                                                        emailTxt.mailtxt.pcudown[0],
                                                                                        email_args)
+                                               if ticket_id == 0:
+                                                       # error.
+                                                       print "got a ticket_id == 0!!!! %s" % act_record['nodename']
+                                                       os._exit(1)
+                                                       pass
                                                email_args['ticket_id'] = ticket_id
 
                        
@@ -1122,12 +1153,25 @@ class Action(Thread):
                                ticket_id = self.__emailSite(loginbase, act_record['email'], 
                                                                                         act_record['message'], email_args)
 
+                               if ticket_id == 0:
+                                       # error.
+                                       print "ticket_id == 0 for %s %s" % (loginbase, act_record['nodename'])
+                                       os._exit(1)
+                                       pass
+
                                # Add ticket_id to ALL nodenames
                                for act_record in issue_record_list:
                                        nodename = act_record['nodename']
                                        # update node record with RT ticket_id
                                        if nodename in self.act_all:
                                                self.act_all[nodename][0]['ticket_id'] = "%s" % ticket_id
+                                               # if the ticket was previously resolved, reset it to new.
+                                               if 'rt' in act_record and \
+                                                       'Status' in act_record['rt'] and \
+                                                       act_record['rt']['Status'] == 'resolved':
+                                                       mailer.setTicketStatus(ticket_id, "new")
+                                               status = mailer.getTicketStatus(ticket_id)
+                                               self.act_all[nodename][0]['rt'] = status
                                        if config.mail: i_nodes_emailed += 1
 
                        print "\t\tconfig.squeeze: %s and %s" % (config.squeeze,
@@ -1144,11 +1188,11 @@ class Action(Thread):
                        del self.diagnose_db[loginbase]
                        soltesz.dbDump("diagnose_out", self.diagnose_db)
 
-               #print "sleeping for 1 sec"
-               #time.sleep(1)
-               print "Hit enter to continue..."
-               sys.stdout.flush()
-               line = sys.stdin.readline()
+               print "sleeping for 1 sec"
+               time.sleep(1)
+               #print "Hit enter to continue..."
+               #sys.stdout.flush()
+               #line = sys.stdin.readline()
 
                return (i_nodes_actedon, i_nodes_emailed)