moving pcu transport code
[monitor.git] / monitor_policy.py
index f7985d0..5049db2 100644 (file)
@@ -1,18 +1,26 @@
-from config import config
-#print "policy"
-config = config()
-import soltesz
+import config
+import database
 import time
 import mailer
-from www.printbadnodes import cmpCategoryVal
+from unified_model import cmpCategoryVal
 import sys
 import emailTxt
 import string
+from monitor.wrapper import plccache
 
-from policy import get_ticket_id, print_stats, close_rt_backoff, reboot_node
 from rt import is_host_in_rt_tickets
 import plc
 
+def get_ticket_id(record):
+       """
+       Return the RT ticket id stored in a record, or None if there is none.
+
+       The 'ticket_id' key is consulted first, then 'found_rt_ticket'.  A key
+       counts only when it is present and its value is neither the empty
+       string nor None; the first usable value is returned unchanged.
+       """
+       for key in ('ticket_id', 'found_rt_ticket'):
+               # Compare by value: an identity test against "" depends on string
+               # interning and can treat an empty id as a valid ticket.
+               if key in record and record[key] != "" and record[key] is not None:
+                       return record[key]
+       return None
+
 # Time to enforce policy
 POLSLEEP = 7200
 
@@ -42,18 +50,20 @@ PI=2
 USER=4
 ADMIN=8
 
+from unified_model import *
+
 class Merge:
        def __init__(self, l_merge):
                self.merge_list = l_merge
 
                # the hostname to loginbase mapping
-               self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")
+               self.plcdb_hn2lb = plccache.plcdb_hn2lb
 
                # Previous actions taken on nodes.
-               self.act_all = soltesz.if_cached_else(1, "act_all", lambda : {})
-               self.findbad = soltesz.if_cached_else(1, "findbad", lambda : {})
+               self.act_all = database.if_cached_else(1, "act_all", lambda : {})
+               self.findbad = database.if_cached_else(1, "findbad", lambda : {})
 
-               self.cache_all = soltesz.if_cached_else(1, "act_all", lambda : {})
+               self.cache_all = database.if_cached_else(1, "act_all", lambda : {})
                self.sickdb = {}
                self.mergedb = {}
 
@@ -255,8 +265,8 @@ class RT:
 class Diagnose:
        def __init__(self, record_list):
                self.record_list = record_list
-               self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")
-               self.findbad = soltesz.if_cached_else(1, "findbad", lambda : {})
+               self.plcdb_hn2lb = plccache.plcdb_hn2lb
+               self.findbad = database.if_cached_else(1, "findbad", lambda : {})
 
                self.diagnose_in = {}
                self.diagnose_out = {}
@@ -396,12 +406,13 @@ class Diagnose:
 
                # NOTE: these settings can be overridden by command line arguments,
                #       or the state of a record, i.e. if already in RT's Support Queue.
-               nodes_up = self.getUpAtSite(loginbase, d_diag_site)
+               pf = PersistFlags(loginbase, 1, db='site_persistflags')
+               nodes_up = pf.nodes_up
                if nodes_up < MINUP:
                        d_diag_site[loginbase]['config']['squeeze'] = True
 
                max_slices = self.getMaxSlices(loginbase)
-               num_nodes = self.getNumNodes(loginbase)
+               num_nodes = pf.nodes_total #self.getNumNodes(loginbase)
                # NOTE: when max_slices == 0, this is either a new site (the old way)
                #       or an old disabled site from previous monitor (before site['enabled'])
                if nodes_up < num_nodes and max_slices != 0:
@@ -433,15 +444,15 @@ class Diagnose:
                        diag_record['args'] = {'nodename': nodename}
                        diag_record['info'] = (nodename, s_daysdown, "")
 
-                       if 'reboot_node_failed' in node_record:
-                               # there was a previous attempt to use the PCU.
-                               if node_record['reboot_node_failed'] == False:
-                                       # then the last attempt apparently, succeeded.
-                                       # But, the category is still 'ERROR'.  Therefore, the
-                                       # PCU-to-Node mapping is broken.
-                                       #print "Setting message for ERROR node to PCU2NodeMapping: %s" % nodename
-                                       diag_record['message'] = emailTxt.mailtxt.pcutonodemapping
-                                       diag_record['email_pcu'] = True
+                       #if 'reboot_node_failed' in node_record:
+                       #       # there was a previous attempt to use the PCU.
+                       #       if node_record['reboot_node_failed'] == False:
+                       #               # then the last attempt apparently, succeeded.
+                       #               # But, the category is still 'ERROR'.  Therefore, the
+                       #               # PCU-to-Node mapping is broken.
+                       #               #print "Setting message for ERROR node to PCU2NodeMapping: %s" % nodename
+                       #               diag_record['message'] = emailTxt.mailtxt.pcutonodemapping
+                       #               diag_record['email_pcu'] = True
 
                        if diag_record['ticket_id'] == "":
                                diag_record['log'] = "DOWN: %20s : %-40s == %20s %s" % \
@@ -493,13 +504,13 @@ class Diagnose:
                                        diag_record['args'] = {'nodename': nodename}
                                        diag_record['info'] = (nodename, node_record['prev_category'], 
                                                                                                         node_record['category'])
-                                       if 'email_pcu' in diag_record:
-                                               if diag_record['email_pcu']:
-                                                       # previously, the pcu failed to reboot, so send
-                                                       # email. Now, reset these values to try the reboot
-                                                       # again.
-                                                       diag_record['email_pcu'] = False
-                                                       del diag_record['reboot_node_failed']
+                                       #if 'email_pcu' in diag_record:
+                                       #       if diag_record['email_pcu']:
+                                       #               # previously, the pcu failed to reboot, so send
+                                       #               # email. Now, reset these values to try the reboot
+                                       #               # again.
+                                       #               diag_record['email_pcu'] = False
+                                       #               del diag_record['reboot_node_failed']
 
                                        if diag_record['ticket_id'] == "":
                                                diag_record['log'] = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
@@ -820,16 +831,27 @@ class Diagnose:
 
                return up
 
+def close_rt_backoff(args):
+       """
+       Close the RT ticket referenced by args and re-enable the host in PLC.
+
+       Nothing happens unless args has a 'ticket_id' entry that is neither
+       the empty string nor None.  Otherwise the ticket is closed through
+       mailer.closeTicketViaRT(), and then plc.enableSlices() and
+       plc.enableSliceCreation() are called with args['hostname'] (which
+       must therefore be present).  Always returns None.
+       """
+       if 'ticket_id' not in args:
+               return
+
+       ticket = args['ticket_id']
+       if ticket == "" or ticket is None:
+               # no usable ticket recorded; leave RT and PLC untouched.
+               return
+
+       mailer.closeTicketViaRT(ticket, "Ticket CLOSED automatically by SiteAssist.")
+       node = args['hostname']
+       plc.enableSlices(node)
+       plc.enableSliceCreation(node)
+
+def reboot_node(args):
+       """
+       Invoke reboot.reboot_policy() for the host named in args['hostname'].
+
+       The call passes True and config.debug as the second and third
+       arguments and hands back whatever reboot_policy() returns.
+
+       NOTE(review): no import of 'reboot' is visible in the hunks of this
+       change -- confirm it is in scope (e.g. via 'from unified_model
+       import *'), otherwise this raises NameError when called.
+       """
+       target = args['hostname']
+       result = reboot.reboot_policy(target, True, config.debug)
+       return result
 
 class Action:
        def __init__(self, diagnose_out):
                # the hostname to loginbase mapping
-               self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")
+               self.plcdb_hn2lb = plccache.plcdb_hn2lb
 
                # Actions to take.
                self.diagnose_db = diagnose_out
                # Actions taken.
-               self.act_all   = soltesz.if_cached_else(1, "act_all", lambda : {})
+               self.act_all   = database.if_cached_else(1, "act_all", lambda : {})
 
                # A dict of actions to specific functions. PICKLE doesnt' like lambdas.
                self.actions = {}
@@ -866,22 +888,24 @@ class Action:
                        print err
                        if config.policysavedb:
                                print "Saving Databases... act_all"
-                               soltesz.dbDump("act_all", self.act_all)
+                               database.dbDump("act_all", self.act_all)
+                               database.dbDump("diagnose_out", self.diagnose_db)
                        sys.exit(1)
 
                #print_stats("sites_observed", stats)
                #print_stats("sites_diagnosed", stats)
                #print_stats("nodes_diagnosed", stats)
-               print_stats("sites_emailed", stats)
+               self.print_stats("sites_emailed", stats)
                #print_stats("nodes_actedon", stats)
                print string.join(stats['allsites'], ",")
 
                if config.policysavedb:
                        print "Saving Databases... act_all"
-                       #soltesz.dbDump("policy.eventlog", self.eventlog)
+                       #database.dbDump("policy.eventlog", self.eventlog)
                        # TODO: remove 'diagnose_out', 
                        #       or at least the entries that were acted on.
-                       soltesz.dbDump("act_all", self.act_all)
+                       database.dbDump("act_all", self.act_all)
+                       database.dbDump("diagnose_out", self.diagnose_db)
 
        def accumSites(self):
                """
@@ -914,18 +938,22 @@ class Action:
                if ADMIN & roles:
                        contacts += [config.email]
                if TECH & roles:
-                       contacts += [TECHEMAIL % loginbase]
+                       #contacts += [TECHEMAIL % loginbase]
+                       contacts += plc.getTechEmails(loginbase)
                if PI & roles:
-                       contacts += [PIEMAIL % loginbase]
+                       #contacts += [PIEMAIL % loginbase]
+                       contacts += plc.getPIEmails(loginbase)
                if USER & roles:
+                       contacts += plc.getSliceUserEmails(loginbase)
                        slices = plc.slices(loginbase)
                        if len(slices) >= 1:
-                               for slice in slices:
-                                       contacts += [SLICEMAIL % slice]
                                print "SLIC: %20s : %d slices" % (loginbase, len(slices))
                        else:
                                print "SLIC: %20s : 0 slices" % loginbase
 
+               unique_contacts = set(contacts)
+               contacts = [ c for c in unique_contacts ]       # convert back into list
+
                try:
                        subject = message[0] % args
                        body = message[1] % args
@@ -1028,23 +1056,23 @@ class Action:
                        email_args = self.get_email_args(issue_record_list, loginbase)
 
                        # for each record.
-                       for act_record in issue_record_list:
-                               # if there's a pcu record and email config is set
-                               if 'email_pcu' in act_record:
-                                       if act_record['message'] != None and act_record['email_pcu'] and site_record['config']['email']:
-                                               # and 'reboot_node' in act_record['stage']:
-
-                                               email_args['hostname'] = act_record['nodename']
-                                               ticket_id = self.__emailSite(loginbase, 
-                                                                                       act_record['email'], 
-                                                                                       emailTxt.mailtxt.pcudown[0],
-                                                                                       email_args)
-                                               if ticket_id == 0:
-                                                       # error.
-                                                       print "got a ticket_id == 0!!!! %s" % act_record['nodename']
-                                                       os._exit(1)
-                                                       pass
-                                               email_args['ticket_id'] = ticket_id
+                       #for act_record in issue_record_list:
+                       #       # if there's a pcu record and email config is set
+                       #       if 'email_pcu' in act_record:
+                       #               if act_record['message'] != None and act_record['email_pcu'] and site_record['config']['email']:
+                       #                       # and 'reboot_node' in act_record['stage']:
+
+                       #                       email_args['hostname'] = act_record['nodename']
+                       #                       ticket_id = self.__emailSite(loginbase, 
+                       #                                                               act_record['email'], 
+                       #                                                               emailTxt.mailtxt.pcudown[0],
+                       #                                                               email_args)
+                       #                       if ticket_id == 0:
+                       #                               # error.
+                       #                               print "got a ticket_id == 0!!!! %s" % act_record['nodename']
+                       #                               os._exit(1)
+                       #                               pass
+                       #                       email_args['ticket_id'] = ticket_id
 
                        
                        act_record = issue_record_list[0]
@@ -1058,6 +1086,7 @@ class Action:
                                if ticket_id == 0:
                                        # error.
                                        print "ticket_id == 0 for %s %s" % (loginbase, act_record['nodename'])
+                                       import os
                                        os._exit(1)
                                        pass
 
@@ -1084,11 +1113,11 @@ class Action:
                                i_nodes_actedon += 1
                
                if config.policysavedb:
-                       print "Saving Databases... act_all, diagnose_out"
-                       soltesz.dbDump("act_all", self.act_all)
+                       #print "Saving Databases... act_all, diagnose_out"
+                       #database.dbDump("act_all", self.act_all)
                        # remove site record from diagnose_out, it's in act_all as done.
                        del self.diagnose_db[loginbase]
-                       #soltesz.dbDump("diagnose_out", self.diagnose_db)
+                       #database.dbDump("diagnose_out", self.diagnose_db)
 
                print "sleeping for 1 sec"
                time.sleep(1)
@@ -1111,52 +1140,52 @@ class Action:
                # avoid end records, and nmreset records                                        
                # reboot_node_failed, is set below, so don't reboot repeatedly.
 
-               if 'monitor-end-record' not in act_record['stage'] and \
-                  'nmreset' not in act_record['stage'] and \
-                  'reboot_node_failed' not in act_record:
-
-                       if "DOWN" in act_record['log'] and \
-                                       'pcu_ids' in act_record['plcnode'] and \
-                                       len(act_record['plcnode']['pcu_ids']) > 0:
-
-                               print "%s" % act_record['log'],
-                               print "%15s" % (['reboot_node'],)
-                               # Set node to re-install
-                               plc.nodeBootState(act_record['nodename'], "rins")       
-                               try:
-                                       ret = reboot_node({'hostname': act_record['nodename']})
-                               except Exception, exc:
-                                       print "exception on reboot_node:"
-                                       import traceback
-                                       print traceback.print_exc()
-                                       ret = False
-
-                               if ret: # and ( 'reboot_node_failed' not in act_record or act_record['reboot_node_failed'] == False):
-                                       # Reboot Succeeded
-                                       print "reboot succeeded for %s" % act_record['nodename']
-                                       act_record2 = {}
-                                       act_record2.update(act_record)
-                                       act_record2['action'] = ['reboot_node']
-                                       act_record2['stage'] = "reboot_node"
-                                       act_record2['reboot_node_failed'] = False
-                                       act_record2['email_pcu'] = False
-
-                                       if nodename not in self.act_all: 
-                                               self.act_all[nodename] = []
-                                       print "inserting 'reboot_node' record into act_all"
-                                       self.act_all[nodename].insert(0,act_record2)
-
-                                       # return None to avoid further action
-                                       print "Taking no further action"
-                                       return None
-                               else:
-                                       print "reboot failed for %s" % act_record['nodename']
-                                       # set email_pcu to also send pcu notice for this record.
-                                       act_record['reboot_node_failed'] = True
-                                       act_record['email_pcu'] = True
+               #if 'monitor-end-record' not in act_record['stage'] and \
+               #   'nmreset' not in act_record['stage'] and \
+               #   'reboot_node_failed' not in act_record:
 
-                       print "%s" % act_record['log'],
-                       print "%15s" % act_record['action']
+               #       if "DOWN" in act_record['log'] and \
+               #                       'pcu_ids' in act_record['plcnode'] and \
+               #                       len(act_record['plcnode']['pcu_ids']) > 0:
+#
+#                              print "%s" % act_record['log'],
+#                              print "%15s" % (['reboot_node'],)
+#                              # Set node to re-install
+#                              plc.nodeBootState(act_record['nodename'], "rins")       
+#                              try:
+#                                      ret = reboot_node({'hostname': act_record['nodename']})
+#                              except Exception, exc:
+#                                      print "exception on reboot_node:"
+#                                      import traceback
+#                                      print traceback.print_exc()
+#                                      ret = False
+#
+#                              if ret: # and ( 'reboot_node_failed' not in act_record or act_record['reboot_node_failed'] == False):
+#                                      # Reboot Succeeded
+#                                      print "reboot succeeded for %s" % act_record['nodename']
+#                                      act_record2 = {}
+#                                      act_record2.update(act_record)
+#                                      act_record2['action'] = ['reboot_node']
+#                                      act_record2['stage'] = "reboot_node"
+#                                      act_record2['reboot_node_failed'] = False
+#                                      act_record2['email_pcu'] = False
+#
+#                                      if nodename not in self.act_all: 
+#                                              self.act_all[nodename] = []
+#                                      print "inserting 'reboot_node' record into act_all"
+#                                      self.act_all[nodename].insert(0,act_record2)
+#
+#                                      # return None to avoid further action
+#                                      print "Taking no further action"
+#                                      return None
+#                              else:
+#                                      print "reboot failed for %s" % act_record['nodename']
+#                                      # set email_pcu to also send pcu notice for this record.
+#                                      act_record['reboot_node_failed'] = True
+#                                      act_record['email_pcu'] = True
+#
+#                      print "%s" % act_record['log'],
+#                      print "%15s" % act_record['action']
 
                if act_record['stage'] is not 'monitor-end-record' and \
                   act_record['stage'] is not 'nmreset':