Moved util dir from root to the 'monitor' python module directory.

[monitor.git] / monitor_policy.py
diff --git a/monitor_policy.py b/monitor_policy.py

index f7985d0..f7c3edb 100644 (file)
--- a/monitor_policy.py
+++ b/monitor_policy.py
@@ -1,18 +1,25 @@
-from config import config
-#print "policy"
-config = config()
-import soltesz
+import config
+import database
  import time
  import mailer
-from www.printbadnodes import cmpCategoryVal
+from unified_model import cmpCategoryVal
  import sys
  import emailTxt
  import string
  
-from policy import get_ticket_id, print_stats, close_rt_backoff, reboot_node
  from rt import is_host_in_rt_tickets
  import plc
  
+def get_ticket_id(record):
+       """Return the RT ticket id stored in record, or None if it has none.
+
+       'ticket_id' is preferred over 'found_rt_ticket'; "" and None count as unset.
+       """
+       for key in ('ticket_id', 'found_rt_ticket'):
+               if key in record and record[key] != "" and record[key] is not None:
+                       return record[key]
+       return None
+
  # Time to enforce policy
  POLSLEEP = 7200
  
@@ -42,18 +49,20 @@ PI=2
  USER=4
  ADMIN=8
  
+from unified_model import *
+
  class Merge:
         def __init__(self, l_merge):
                 self.merge_list = l_merge
  
                 # the hostname to loginbase mapping
-               self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")
+               self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")
  
                 # Previous actions taken on nodes.
-               self.act_all = soltesz.if_cached_else(1, "act_all", lambda : {})
-               self.findbad = soltesz.if_cached_else(1, "findbad", lambda : {})
+               self.act_all = database.if_cached_else(1, "act_all", lambda : {})
+               self.findbad = database.if_cached_else(1, "findbad", lambda : {})
  
-               self.cache_all = soltesz.if_cached_else(1, "act_all", lambda : {})
+               self.cache_all = database.if_cached_else(1, "act_all", lambda : {})
                 self.sickdb = {}
                 self.mergedb = {}
  
@@ -255,8 +264,8 @@ class RT:
  class Diagnose:
         def __init__(self, record_list):
                 self.record_list = record_list
-               self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")
-               self.findbad = soltesz.if_cached_else(1, "findbad", lambda : {})
+               self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")
+               self.findbad = database.if_cached_else(1, "findbad", lambda : {})
  
                 self.diagnose_in = {}
                 self.diagnose_out = {}
@@ -396,12 +405,13 @@ class Diagnose:
  
                 # NOTE: these settings can be overridden by command line arguments,
                 #       or the state of a record, i.e. if already in RT's Support Queue.
-               nodes_up = self.getUpAtSite(loginbase, d_diag_site)
+               pf = PersistFlags(loginbase, 1, db='site_persistflags')
+               nodes_up = pf.nodes_up
                 if nodes_up < MINUP:
                         d_diag_site[loginbase]['config']['squeeze'] = True
  
                 max_slices = self.getMaxSlices(loginbase)
-               num_nodes = self.getNumNodes(loginbase)
+               num_nodes = pf.nodes_total #self.getNumNodes(loginbase)
                 # NOTE: when max_slices == 0, this is either a new site (the old way)
                 #       or an old disabled site from previous monitor (before site['enabled'])
                 if nodes_up < num_nodes and max_slices != 0:
@@ -433,15 +443,15 @@ class Diagnose:
                         diag_record['args'] = {'nodename': nodename}
                         diag_record['info'] = (nodename, s_daysdown, "")
  
-                       if 'reboot_node_failed' in node_record:
-                               # there was a previous attempt to use the PCU.
-                               if node_record['reboot_node_failed'] == False:
-                                       # then the last attempt apparently, succeeded.
-                                       # But, the category is still 'ERROR'.  Therefore, the
-                                       # PCU-to-Node mapping is broken.
-                                       #print "Setting message for ERROR node to PCU2NodeMapping: %s" % nodename
-                                       diag_record['message'] = emailTxt.mailtxt.pcutonodemapping
-                                       diag_record['email_pcu'] = True
+                       #if 'reboot_node_failed' in node_record:
+                       #       # there was a previous attempt to use the PCU.
+                       #       if node_record['reboot_node_failed'] == False:
+                       #               # then the last attempt apparently, succeeded.
+                       #               # But, the category is still 'ERROR'.  Therefore, the
+                       #               # PCU-to-Node mapping is broken.
+                       #               #print "Setting message for ERROR node to PCU2NodeMapping: %s" % nodename
+                       #               diag_record['message'] = emailTxt.mailtxt.pcutonodemapping
+                       #               diag_record['email_pcu'] = True
  
                         if diag_record['ticket_id'] == "":
                                 diag_record['log'] = "DOWN: %20s : %-40s == %20s %s" % \
@@ -493,13 +503,13 @@ class Diagnose:
                                         diag_record['args'] = {'nodename': nodename}
                                         diag_record['info'] = (nodename, node_record['prev_category'], 
                                                                                                          node_record['category'])
-                                       if 'email_pcu' in diag_record:
-                                               if diag_record['email_pcu']:
-                                                       # previously, the pcu failed to reboot, so send
-                                                       # email. Now, reset these values to try the reboot
-                                                       # again.
-                                                       diag_record['email_pcu'] = False
-                                                       del diag_record['reboot_node_failed']
+                                       #if 'email_pcu' in diag_record:
+                                       #       if diag_record['email_pcu']:
+                                       #               # previously, the pcu failed to reboot, so send
+                                       #               # email. Now, reset these values to try the reboot
+                                       #               # again.
+                                       #               diag_record['email_pcu'] = False
+                                       #               del diag_record['reboot_node_failed']
  
                                         if diag_record['ticket_id'] == "":
                                                 diag_record['log'] = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
@@ -820,16 +830,27 @@ class Diagnose:
  
                 return up
  
+def close_rt_backoff(args):
+       """Close args['ticket_id'] in RT when set, then call plc.enableSlices and plc.enableSliceCreation."""
+       if not ('ticket_id' in args and args['ticket_id'] != "" and args['ticket_id'] != None):
+               return
+       mailer.closeTicketViaRT(args['ticket_id'], "Ticket CLOSED automatically by SiteAssist.")
+       plc.enableSlices(args['hostname'])
+       plc.enableSliceCreation(args['hostname'])
+
+def reboot_node(args):
+       target = args['hostname']  # read first, so a missing key fails before any reboot call
+       return reboot.reboot_policy(target, True, config.debug)  # NOTE(review): no `import reboot` in the visible hunks -- confirm it is in scope
  
  class Action:
         def __init__(self, diagnose_out):
                 # the hostname to loginbase mapping
-               self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")
+               self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")
  
                 # Actions to take.
                 self.diagnose_db = diagnose_out
                 # Actions taken.
-               self.act_all   = soltesz.if_cached_else(1, "act_all", lambda : {})
+               self.act_all   = database.if_cached_else(1, "act_all", lambda : {})
  
                 # A dict of actions to specific functions. PICKLE doesnt' like lambdas.
                 self.actions = {}
@@ -866,22 +887,24 @@ class Action:
                         print err
                         if config.policysavedb:
                                 print "Saving Databases... act_all"
-                               soltesz.dbDump("act_all", self.act_all)
+                               database.dbDump("act_all", self.act_all)
+                               database.dbDump("diagnose_out", self.diagnose_db)
                         sys.exit(1)
  
                 #print_stats("sites_observed", stats)
                 #print_stats("sites_diagnosed", stats)
                 #print_stats("nodes_diagnosed", stats)
-               print_stats("sites_emailed", stats)
+               self.print_stats("sites_emailed", stats)
                 #print_stats("nodes_actedon", stats)
                 print string.join(stats['allsites'], ",")
  
                 if config.policysavedb:
                         print "Saving Databases... act_all"
-                       #soltesz.dbDump("policy.eventlog", self.eventlog)
+                       #database.dbDump("policy.eventlog", self.eventlog)
                         # TODO: remove 'diagnose_out', 
                         #       or at least the entries that were acted on.
-                       soltesz.dbDump("act_all", self.act_all)
+                       database.dbDump("act_all", self.act_all)
+                       database.dbDump("diagnose_out", self.diagnose_db)
  
         def accumSites(self):
                 """
@@ -1028,23 +1051,23 @@ class Action:
                         email_args = self.get_email_args(issue_record_list, loginbase)
  
                         # for each record.
-                       for act_record in issue_record_list:
-                               # if there's a pcu record and email config is set
-                               if 'email_pcu' in act_record:
-                                       if act_record['message'] != None and act_record['email_pcu'] and site_record['config']['email']:
-                                               # and 'reboot_node' in act_record['stage']:
-
-                                               email_args['hostname'] = act_record['nodename']
-                                               ticket_id = self.__emailSite(loginbase, 
-                                                                                       act_record['email'], 
-                                                                                       emailTxt.mailtxt.pcudown[0],
-                                                                                       email_args)
-                                               if ticket_id == 0:
-                                                       # error.
-                                                       print "got a ticket_id == 0!!!! %s" % act_record['nodename']
-                                                       os._exit(1)
-                                                       pass
-                                               email_args['ticket_id'] = ticket_id
+                       #for act_record in issue_record_list:
+                       #       # if there's a pcu record and email config is set
+                       #       if 'email_pcu' in act_record:
+                       #               if act_record['message'] != None and act_record['email_pcu'] and site_record['config']['email']:
+                       #                       # and 'reboot_node' in act_record['stage']:
+
+                       #                       email_args['hostname'] = act_record['nodename']
+                       #                       ticket_id = self.__emailSite(loginbase, 
+                       #                                                               act_record['email'], 
+                       #                                                               emailTxt.mailtxt.pcudown[0],
+                       #                                                               email_args)
+                       #                       if ticket_id == 0:
+                       #                               # error.
+                       #                               print "got a ticket_id == 0!!!! %s" % act_record['nodename']
+                       #                               os._exit(1)
+                       #                               pass
+                       #                       email_args['ticket_id'] = ticket_id
  
                         
                         act_record = issue_record_list[0]
@@ -1058,6 +1081,7 @@ class Action:
                                 if ticket_id == 0:
                                         # error.
                                         print "ticket_id == 0 for %s %s" % (loginbase, act_record['nodename'])
+                                       import os
                                         os._exit(1)
                                         pass
  
@@ -1084,11 +1108,11 @@ class Action:
                                 i_nodes_actedon += 1
                 
                 if config.policysavedb:
-                       print "Saving Databases... act_all, diagnose_out"
-                       soltesz.dbDump("act_all", self.act_all)
+                       #print "Saving Databases... act_all, diagnose_out"
+                       #database.dbDump("act_all", self.act_all)
                         # remove site record from diagnose_out, it's in act_all as done.
                         del self.diagnose_db[loginbase]
-                       #soltesz.dbDump("diagnose_out", self.diagnose_db)
+                       #database.dbDump("diagnose_out", self.diagnose_db)
  
                 print "sleeping for 1 sec"
                 time.sleep(1)
@@ -1111,52 +1135,52 @@ class Action:
                 # avoid end records, and nmreset records                                        
                 # reboot_node_failed, is set below, so don't reboot repeatedly.
  
-               if 'monitor-end-record' not in act_record['stage'] and \
-                  'nmreset' not in act_record['stage'] and \
-                  'reboot_node_failed' not in act_record:
-
-                       if "DOWN" in act_record['log'] and \
-                                       'pcu_ids' in act_record['plcnode'] and \
-                                       len(act_record['plcnode']['pcu_ids']) > 0:
-
-                               print "%s" % act_record['log'],
-                               print "%15s" % (['reboot_node'],)
-                               # Set node to re-install
-                               plc.nodeBootState(act_record['nodename'], "rins")       
-                               try:
-                                       ret = reboot_node({'hostname': act_record['nodename']})
-                               except Exception, exc:
-                                       print "exception on reboot_node:"
-                                       import traceback
-                                       print traceback.print_exc()
-                                       ret = False
-
-                               if ret: # and ( 'reboot_node_failed' not in act_record or act_record['reboot_node_failed'] == False):
-                                       # Reboot Succeeded
-                                       print "reboot succeeded for %s" % act_record['nodename']
-                                       act_record2 = {}
-                                       act_record2.update(act_record)
-                                       act_record2['action'] = ['reboot_node']
-                                       act_record2['stage'] = "reboot_node"
-                                       act_record2['reboot_node_failed'] = False
-                                       act_record2['email_pcu'] = False
-
-                                       if nodename not in self.act_all: 
-                                               self.act_all[nodename] = []
-                                       print "inserting 'reboot_node' record into act_all"
-                                       self.act_all[nodename].insert(0,act_record2)
-
-                                       # return None to avoid further action
-                                       print "Taking no further action"
-                                       return None
-                               else:
-                                       print "reboot failed for %s" % act_record['nodename']
-                                       # set email_pcu to also send pcu notice for this record.
-                                       act_record['reboot_node_failed'] = True
-                                       act_record['email_pcu'] = True
+               #if 'monitor-end-record' not in act_record['stage'] and \
+               #   'nmreset' not in act_record['stage'] and \
+               #   'reboot_node_failed' not in act_record:
  
-                       print "%s" % act_record['log'],
-                       print "%15s" % act_record['action']
+               #       if "DOWN" in act_record['log'] and \
+               #                       'pcu_ids' in act_record['plcnode'] and \
+               #                       len(act_record['plcnode']['pcu_ids']) > 0:
+#
+#                              print "%s" % act_record['log'],
+#                              print "%15s" % (['reboot_node'],)
+#                              # Set node to re-install
+#                              plc.nodeBootState(act_record['nodename'], "rins")       
+#                              try:
+#                                      ret = reboot_node({'hostname': act_record['nodename']})
+#                              except Exception, exc:
+#                                      print "exception on reboot_node:"
+#                                      import traceback
+#                                      print traceback.print_exc()
+#                                      ret = False
+#
+#                              if ret: # and ( 'reboot_node_failed' not in act_record or act_record['reboot_node_failed'] == False):
+#                                      # Reboot Succeeded
+#                                      print "reboot succeeded for %s" % act_record['nodename']
+#                                      act_record2 = {}
+#                                      act_record2.update(act_record)
+#                                      act_record2['action'] = ['reboot_node']
+#                                      act_record2['stage'] = "reboot_node"
+#                                      act_record2['reboot_node_failed'] = False
+#                                      act_record2['email_pcu'] = False
+#
+#                                      if nodename not in self.act_all: 
+#                                              self.act_all[nodename] = []
+#                                      print "inserting 'reboot_node' record into act_all"
+#                                      self.act_all[nodename].insert(0,act_record2)
+#
+#                                      # return None to avoid further action
+#                                      print "Taking no further action"
+#                                      return None
+#                              else:
+#                                      print "reboot failed for %s" % act_record['nodename']
+#                                      # set email_pcu to also send pcu notice for this record.
+#                                      act_record['reboot_node_failed'] = True
+#                                      act_record['email_pcu'] = True
+#
+#                      print "%s" % act_record['log'],
+#                      print "%15s" % act_record['action']
  
                 if act_record['stage'] is not 'monitor-end-record' and \
                    act_record['stage'] is not 'nmreset':