-from config import config
-#print "policy"
-config = config()
-import soltesz
+import config
+import database
import time
import mailer
-from www.printbadnodes import cmpCategoryVal
+from unified_model import cmpCategoryVal
import sys
import emailTxt
import string
-from policy import get_ticket_id, print_stats, close_rt_backoff, reboot_node
from rt import is_host_in_rt_tickets
import plc
def get_ticket_id(record):
    """Return the ticket id stored on *record*, or None if there is none.

    The 'ticket_id' key is consulted first and 'found_rt_ticket' second; the
    first one holding a value other than None or the empty string wins.

    Args:
        record: mapping that may contain 'ticket_id' and/or
            'found_rt_ticket'.

    Returns:
        The stored value exactly as found (no conversion), or None when
        neither key holds a usable value.
    """
    for key in ('ticket_id', 'found_rt_ticket'):
        if key not in record:
            continue
        value = record[key]
        # Compare by value, not identity: `is not ""` only works when the
        # interpreter happens to reuse a single empty-string object, and it
        # lets unicode / str-subclass empties through as "valid" ids.
        if value is not None and value != "":
            return value
    return None
+
# Policy enforcement timing.
# NOTE(review): presumably seconds (7200 s = 2 h) -- confirm against the caller.
POLSLEEP = 7200
# Numeric codes named USER and ADMIN; what consumes them is not visible here.
USER = 4
ADMIN = 8
+from unified_model import *
+
class Merge:
def __init__(self, l_merge):
self.merge_list = l_merge
# the hostname to loginbase mapping
- self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")
+ self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")
# Previous actions taken on nodes.
- self.act_all = soltesz.if_cached_else(1, "act_all", lambda : {})
- self.findbad = soltesz.if_cached_else(1, "findbad", lambda : {})
+ self.act_all = database.if_cached_else(1, "act_all", lambda : {})
+ self.findbad = database.if_cached_else(1, "findbad", lambda : {})
- self.cache_all = soltesz.if_cached_else(1, "act_all", lambda : {})
+ self.cache_all = database.if_cached_else(1, "act_all", lambda : {})
self.sickdb = {}
self.mergedb = {}
class Diagnose:
def __init__(self, record_list):
self.record_list = record_list
- self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")
- self.findbad = soltesz.if_cached_else(1, "findbad", lambda : {})
+ self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")
+ self.findbad = database.if_cached_else(1, "findbad", lambda : {})
self.diagnose_in = {}
self.diagnose_out = {}
# NOTE: these settings can be overridden by command line arguments,
# or the state of a record, i.e. if already in RT's Support Queue.
- nodes_up = self.getUpAtSite(loginbase, d_diag_site)
+ pf = PersistFlags(loginbase, 1, db='site_persistflags')
+ nodes_up = pf.nodes_up
if nodes_up < MINUP:
d_diag_site[loginbase]['config']['squeeze'] = True
max_slices = self.getMaxSlices(loginbase)
- num_nodes = self.getNumNodes(loginbase)
+ num_nodes = pf.nodes_total #self.getNumNodes(loginbase)
# NOTE: when max_slices == 0, this is either a new site (the old way)
# or an old disabled site from previous monitor (before site['enabled'])
if nodes_up < num_nodes and max_slices != 0:
diag_record['args'] = {'nodename': nodename}
diag_record['info'] = (nodename, s_daysdown, "")
- if 'reboot_node_failed' in node_record:
- # there was a previous attempt to use the PCU.
- if node_record['reboot_node_failed'] == False:
- # then the last attempt apparently, succeeded.
- # But, the category is still 'ERROR'. Therefore, the
- # PCU-to-Node mapping is broken.
- #print "Setting message for ERROR node to PCU2NodeMapping: %s" % nodename
- diag_record['message'] = emailTxt.mailtxt.pcutonodemapping
- diag_record['email_pcu'] = True
+ #if 'reboot_node_failed' in node_record:
+ # # there was a previous attempt to use the PCU.
+ # if node_record['reboot_node_failed'] == False:
+ # # then the last attempt apparently, succeeded.
+ # # But, the category is still 'ERROR'. Therefore, the
+ # # PCU-to-Node mapping is broken.
+ # #print "Setting message for ERROR node to PCU2NodeMapping: %s" % nodename
+ # diag_record['message'] = emailTxt.mailtxt.pcutonodemapping
+ # diag_record['email_pcu'] = True
if diag_record['ticket_id'] == "":
diag_record['log'] = "DOWN: %20s : %-40s == %20s %s" % \
diag_record['args'] = {'nodename': nodename}
diag_record['info'] = (nodename, node_record['prev_category'],
node_record['category'])
- if 'email_pcu' in diag_record:
- if diag_record['email_pcu']:
- # previously, the pcu failed to reboot, so send
- # email. Now, reset these values to try the reboot
- # again.
- diag_record['email_pcu'] = False
- del diag_record['reboot_node_failed']
+ #if 'email_pcu' in diag_record:
+ # if diag_record['email_pcu']:
+ # # previously, the pcu failed to reboot, so send
+ # # email. Now, reset these values to try the reboot
+ # # again.
+ # diag_record['email_pcu'] = False
+ # del diag_record['reboot_node_failed']
if diag_record['ticket_id'] == "":
diag_record['log'] = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
return up
# close_rt_backoff(args): when args carries a 'ticket_id' that is neither ""
# nor None, close that ticket through mailer.closeTicketViaRT with an
# automatic-closure message. It also calls plc.enableSlices and
# plc.enableSliceCreation with args['hostname'].
# NOTE(review): indentation was lost in this excerpt, so it cannot be told
# from here whether the two plc calls run only when a ticket was closed or
# unconditionally -- confirm against the real file before relying on either.
# The function ends with a bare return, i.e. it yields None.
+def close_rt_backoff(args):
+ if 'ticket_id' in args and (args['ticket_id'] != "" and args['ticket_id'] != None):
+ mailer.closeTicketViaRT(args['ticket_id'],
+ "Ticket CLOSED automatically by SiteAssist.")
+ plc.enableSlices(args['hostname'])
+ plc.enableSliceCreation(args['hostname'])
+ return
+
def reboot_node(args):
    """Run the reboot policy for the host named in *args*.

    Args:
        args: mapping with a 'hostname' entry.

    Returns:
        Whatever reboot.reboot_policy() returns for that host.
    """
    # NOTE(review): no `import reboot` is visible in this excerpt -- confirm
    # the module is imported elsewhere in the file.
    target = args['hostname']
    outcome = reboot.reboot_policy(target, True, config.debug)
    return outcome
class Action:
def __init__(self, diagnose_out):
# the hostname to loginbase mapping
- self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")
+ self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")
# Actions to take.
self.diagnose_db = diagnose_out
# Actions taken.
- self.act_all = soltesz.if_cached_else(1, "act_all", lambda : {})
+ self.act_all = database.if_cached_else(1, "act_all", lambda : {})
# A dict of actions to specific functions. PICKLE doesn't like lambdas.
self.actions = {}
print err
if config.policysavedb:
print "Saving Databases... act_all"
- soltesz.dbDump("act_all", self.act_all)
+ database.dbDump("act_all", self.act_all)
+ database.dbDump("diagnose_out", self.diagnose_db)
sys.exit(1)
#print_stats("sites_observed", stats)
#print_stats("sites_diagnosed", stats)
#print_stats("nodes_diagnosed", stats)
- print_stats("sites_emailed", stats)
+ self.print_stats("sites_emailed", stats)
#print_stats("nodes_actedon", stats)
print string.join(stats['allsites'], ",")
if config.policysavedb:
print "Saving Databases... act_all"
- #soltesz.dbDump("policy.eventlog", self.eventlog)
+ #database.dbDump("policy.eventlog", self.eventlog)
# TODO: remove 'diagnose_out',
# or at least the entries that were acted on.
- soltesz.dbDump("act_all", self.act_all)
+ database.dbDump("act_all", self.act_all)
+ database.dbDump("diagnose_out", self.diagnose_db)
def accumSites(self):
"""
email_args = self.get_email_args(issue_record_list, loginbase)
# for each record.
- for act_record in issue_record_list:
- # if there's a pcu record and email config is set
- if 'email_pcu' in act_record:
- if act_record['message'] != None and act_record['email_pcu'] and site_record['config']['email']:
- # and 'reboot_node' in act_record['stage']:
-
- email_args['hostname'] = act_record['nodename']
- ticket_id = self.__emailSite(loginbase,
- act_record['email'],
- emailTxt.mailtxt.pcudown[0],
- email_args)
- if ticket_id == 0:
- # error.
- print "got a ticket_id == 0!!!! %s" % act_record['nodename']
- os._exit(1)
- pass
- email_args['ticket_id'] = ticket_id
+ #for act_record in issue_record_list:
+ # # if there's a pcu record and email config is set
+ # if 'email_pcu' in act_record:
+ # if act_record['message'] != None and act_record['email_pcu'] and site_record['config']['email']:
+ # # and 'reboot_node' in act_record['stage']:
+
+ # email_args['hostname'] = act_record['nodename']
+ # ticket_id = self.__emailSite(loginbase,
+ # act_record['email'],
+ # emailTxt.mailtxt.pcudown[0],
+ # email_args)
+ # if ticket_id == 0:
+ # # error.
+ # print "got a ticket_id == 0!!!! %s" % act_record['nodename']
+ # os._exit(1)
+ # pass
+ # email_args['ticket_id'] = ticket_id
act_record = issue_record_list[0]
if ticket_id == 0:
# error.
print "ticket_id == 0 for %s %s" % (loginbase, act_record['nodename'])
+ import os
os._exit(1)
pass
i_nodes_actedon += 1
if config.policysavedb:
- print "Saving Databases... act_all, diagnose_out"
- soltesz.dbDump("act_all", self.act_all)
+ #print "Saving Databases... act_all, diagnose_out"
+ #database.dbDump("act_all", self.act_all)
# remove site record from diagnose_out, it's in act_all as done.
del self.diagnose_db[loginbase]
- #soltesz.dbDump("diagnose_out", self.diagnose_db)
+ #database.dbDump("diagnose_out", self.diagnose_db)
print "sleeping for 1 sec"
time.sleep(1)
# avoid end records, and nmreset records
# reboot_node_failed, is set below, so don't reboot repeatedly.
- if 'monitor-end-record' not in act_record['stage'] and \
- 'nmreset' not in act_record['stage'] and \
- 'reboot_node_failed' not in act_record:
-
- if "DOWN" in act_record['log'] and \
- 'pcu_ids' in act_record['plcnode'] and \
- len(act_record['plcnode']['pcu_ids']) > 0:
-
- print "%s" % act_record['log'],
- print "%15s" % (['reboot_node'],)
- # Set node to re-install
- plc.nodeBootState(act_record['nodename'], "rins")
- try:
- ret = reboot_node({'hostname': act_record['nodename']})
- except Exception, exc:
- print "exception on reboot_node:"
- import traceback
- print traceback.print_exc()
- ret = False
-
- if ret: # and ( 'reboot_node_failed' not in act_record or act_record['reboot_node_failed'] == False):
- # Reboot Succeeded
- print "reboot succeeded for %s" % act_record['nodename']
- act_record2 = {}
- act_record2.update(act_record)
- act_record2['action'] = ['reboot_node']
- act_record2['stage'] = "reboot_node"
- act_record2['reboot_node_failed'] = False
- act_record2['email_pcu'] = False
-
- if nodename not in self.act_all:
- self.act_all[nodename] = []
- print "inserting 'reboot_node' record into act_all"
- self.act_all[nodename].insert(0,act_record2)
-
- # return None to avoid further action
- print "Taking no further action"
- return None
- else:
- print "reboot failed for %s" % act_record['nodename']
- # set email_pcu to also send pcu notice for this record.
- act_record['reboot_node_failed'] = True
- act_record['email_pcu'] = True
+ #if 'monitor-end-record' not in act_record['stage'] and \
+ # 'nmreset' not in act_record['stage'] and \
+ # 'reboot_node_failed' not in act_record:
- print "%s" % act_record['log'],
- print "%15s" % act_record['action']
+ # if "DOWN" in act_record['log'] and \
+ # 'pcu_ids' in act_record['plcnode'] and \
+ # len(act_record['plcnode']['pcu_ids']) > 0:
+#
+# print "%s" % act_record['log'],
+# print "%15s" % (['reboot_node'],)
+# # Set node to re-install
+# plc.nodeBootState(act_record['nodename'], "rins")
+# try:
+# ret = reboot_node({'hostname': act_record['nodename']})
+# except Exception, exc:
+# print "exception on reboot_node:"
+# import traceback
+# print traceback.print_exc()
+# ret = False
+#
+# if ret: # and ( 'reboot_node_failed' not in act_record or act_record['reboot_node_failed'] == False):
+# # Reboot Succeeded
+# print "reboot succeeded for %s" % act_record['nodename']
+# act_record2 = {}
+# act_record2.update(act_record)
+# act_record2['action'] = ['reboot_node']
+# act_record2['stage'] = "reboot_node"
+# act_record2['reboot_node_failed'] = False
+# act_record2['email_pcu'] = False
+#
+# if nodename not in self.act_all:
+# self.act_all[nodename] = []
+# print "inserting 'reboot_node' record into act_all"
+# self.act_all[nodename].insert(0,act_record2)
+#
+# # return None to avoid further action
+# print "Taking no further action"
+# return None
+# else:
+# print "reboot failed for %s" % act_record['nodename']
+# # set email_pcu to also send pcu notice for this record.
+# act_record['reboot_node_failed'] = True
+# act_record['email_pcu'] = True
+#
+# print "%s" % act_record['log'],
+# print "%15s" % act_record['action']
if act_record['stage'] is not 'monitor-end-record' and \
act_record['stage'] is not 'nmreset':