X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=monitor_policy.py;h=5db440f6824bef780f49f017df14ac19cb62744a;hb=refs%2Fheads%2F1.0;hp=f7985d0a1d90ecf6ae7767add36f8bbdd81a9886;hpb=77e8cfd3570139709c16c59418c9a5fc6ddf953b;p=monitor.git diff --git a/monitor_policy.py b/monitor_policy.py index f7985d0..5db440f 100644 --- a/monitor_policy.py +++ b/monitor_policy.py @@ -1,18 +1,25 @@ -from config import config -#print "policy" -config = config() -import soltesz +import config +import database import time import mailer -from www.printbadnodes import cmpCategoryVal +from unified_model import cmpCategoryVal import sys import emailTxt import string -from policy import get_ticket_id, print_stats, close_rt_backoff, reboot_node from rt import is_host_in_rt_tickets import plc +def get_ticket_id(record): + if 'ticket_id' in record and record['ticket_id'] is not "" and record['ticket_id'] is not None: + return record['ticket_id'] + elif 'found_rt_ticket' in record and \ + record['found_rt_ticket'] is not "" and \ + record['found_rt_ticket'] is not None: + return record['found_rt_ticket'] + else: + return None + # Time to enforce policy POLSLEEP = 7200 @@ -42,18 +49,20 @@ PI=2 USER=4 ADMIN=8 +from unified_model import * + class Merge: def __init__(self, l_merge): self.merge_list = l_merge # the hostname to loginbase mapping - self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb") + self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb") # Previous actions taken on nodes. - self.act_all = soltesz.if_cached_else(1, "act_all", lambda : {}) - self.findbad = soltesz.if_cached_else(1, "findbad", lambda : {}) + self.act_all = database.if_cached_else(1, "act_all", lambda : {}) + self.findbad = database.if_cached_else(1, "findbad", lambda : {}) - self.cache_all = soltesz.if_cached_else(1, "act_all", lambda : {}) + self.cache_all = database.if_cached_else(1, "act_all", lambda : {}) self.sickdb = {} self.mergedb = {} @@ -255,8 +264,8 @@ class RT: class Diagnose: def __init__(self, record_list): self.record_list = record_list - self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb") - self.findbad = soltesz.if_cached_else(1, "findbad", lambda : {}) + self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb") + self.findbad = database.if_cached_else(1, "findbad", lambda : {}) self.diagnose_in = {} self.diagnose_out = {} @@ -272,6 +281,8 @@ class Diagnose: print "----------------" import traceback print traceback.print_exc() + from nodecommon import email_exception + email_exception() print err #if config.policysavedb: sys.exit(1) @@ -396,12 +407,13 @@ class Diagnose: # NOTE: these settings can be overridden by command line arguments, # or the state of a record, i.e. if already in RT's Support Queue. - nodes_up = self.getUpAtSite(loginbase, d_diag_site) + pf = PersistFlags(loginbase, 1, db='site_persistflags') + nodes_up = pf.nodes_up if nodes_up < MINUP: d_diag_site[loginbase]['config']['squeeze'] = True max_slices = self.getMaxSlices(loginbase) - num_nodes = self.getNumNodes(loginbase) + num_nodes = pf.nodes_total #self.getNumNodes(loginbase) # NOTE: when max_slices == 0, this is either a new site (the old way) # or an old disabled site from previous monitor (before site['enabled']) if nodes_up < num_nodes and max_slices != 0: @@ -433,15 +445,15 @@ class Diagnose: diag_record['args'] = {'nodename': nodename} diag_record['info'] = (nodename, s_daysdown, "") - if 'reboot_node_failed' in node_record: - # there was a previous attempt to use the PCU. - if node_record['reboot_node_failed'] == False: - # then the last attempt apparently, succeeded. - # But, the category is still 'ERROR'. Therefore, the - # PCU-to-Node mapping is broken. - #print "Setting message for ERROR node to PCU2NodeMapping: %s" % nodename - diag_record['message'] = emailTxt.mailtxt.pcutonodemapping - diag_record['email_pcu'] = True + #if 'reboot_node_failed' in node_record: + # # there was a previous attempt to use the PCU. + # if node_record['reboot_node_failed'] == False: + # # then the last attempt apparently, succeeded. + # # But, the category is still 'ERROR'. Therefore, the + # # PCU-to-Node mapping is broken. + # #print "Setting message for ERROR node to PCU2NodeMapping: %s" % nodename + # diag_record['message'] = emailTxt.mailtxt.pcutonodemapping + # diag_record['email_pcu'] = True if diag_record['ticket_id'] == "": diag_record['log'] = "DOWN: %20s : %-40s == %20s %s" % \ @@ -493,13 +505,13 @@ class Diagnose: diag_record['args'] = {'nodename': nodename} diag_record['info'] = (nodename, node_record['prev_category'], node_record['category']) - if 'email_pcu' in diag_record: - if diag_record['email_pcu']: - # previously, the pcu failed to reboot, so send - # email. Now, reset these values to try the reboot - # again. - diag_record['email_pcu'] = False - del diag_record['reboot_node_failed'] + #if 'email_pcu' in diag_record: + # if diag_record['email_pcu']: + # # previously, the pcu failed to reboot, so send + # # email. Now, reset these values to try the reboot + # # again. + # diag_record['email_pcu'] = False + # del diag_record['reboot_node_failed'] if diag_record['ticket_id'] == "": diag_record['log'] = "IMPR: %20s : %-40s == %20s %20s %s %s" % \ @@ -820,16 +832,27 @@ class Diagnose: return up +def close_rt_backoff(args): + if 'ticket_id' in args and (args['ticket_id'] != "" and args['ticket_id'] != None): + mailer.closeTicketViaRT(args['ticket_id'], + "Ticket CLOSED automatically by SiteAssist.") + plc.enableSlices(args['hostname']) + plc.enableSliceCreation(args['hostname']) + return + +def reboot_node(args): + host = args['hostname'] + return reboot.reboot_policy(host, True, config.debug) class Action: def __init__(self, diagnose_out): # the hostname to loginbase mapping - self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb") + self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb") # Actions to take. self.diagnose_db = diagnose_out # Actions taken. - self.act_all = soltesz.if_cached_else(1, "act_all", lambda : {}) + self.act_all = database.if_cached_else(1, "act_all", lambda : {}) # A dict of actions to specific functions. PICKLE doesnt' like lambdas. self.actions = {} @@ -863,25 +886,29 @@ class Action: print "----------------" import traceback print traceback.print_exc() + from nodecommon import email_exception + email_exception() print err if config.policysavedb: print "Saving Databases... act_all" - soltesz.dbDump("act_all", self.act_all) + database.dbDump("act_all", self.act_all) + database.dbDump("diagnose_out", self.diagnose_db) sys.exit(1) #print_stats("sites_observed", stats) #print_stats("sites_diagnosed", stats) #print_stats("nodes_diagnosed", stats) - print_stats("sites_emailed", stats) + self.print_stats("sites_emailed", stats) #print_stats("nodes_actedon", stats) print string.join(stats['allsites'], ",") if config.policysavedb: print "Saving Databases... act_all" - #soltesz.dbDump("policy.eventlog", self.eventlog) + #database.dbDump("policy.eventlog", self.eventlog) # TODO: remove 'diagnose_out', # or at least the entries that were acted on. - soltesz.dbDump("act_all", self.act_all) + database.dbDump("act_all", self.act_all) + database.dbDump("diagnose_out", self.diagnose_db) def accumSites(self): """ @@ -914,18 +941,22 @@ class Action: if ADMIN & roles: contacts += [config.email] if TECH & roles: - contacts += [TECHEMAIL % loginbase] + #contacts += [TECHEMAIL % loginbase] + contacts += plc.getTechEmails(loginbase) if PI & roles: - contacts += [PIEMAIL % loginbase] + #contacts += [PIEMAIL % loginbase] + contacts += plc.getPIEmails(loginbase) if USER & roles: + contacts += plc.getSliceUserEmails(loginbase) slices = plc.slices(loginbase) if len(slices) >= 1: - for slice in slices: - contacts += [SLICEMAIL % slice] print "SLIC: %20s : %d slices" % (loginbase, len(slices)) else: print "SLIC: %20s : 0 slices" % loginbase + unique_contacts = set(contacts) + contacts = [ c for c in unique_contacts ] # convert back into list + try: subject = message[0] % args body = message[1] % args @@ -943,6 +974,8 @@ class Action: print "exception on message:" import traceback print traceback.print_exc() + from nodecommon import email_exception + email_exception() print message return ticket_id @@ -1028,23 +1061,23 @@ class Action: email_args = self.get_email_args(issue_record_list, loginbase) # for each record. - for act_record in issue_record_list: - # if there's a pcu record and email config is set - if 'email_pcu' in act_record: - if act_record['message'] != None and act_record['email_pcu'] and site_record['config']['email']: - # and 'reboot_node' in act_record['stage']: - - email_args['hostname'] = act_record['nodename'] - ticket_id = self.__emailSite(loginbase, - act_record['email'], - emailTxt.mailtxt.pcudown[0], - email_args) - if ticket_id == 0: - # error. - print "got a ticket_id == 0!!!! %s" % act_record['nodename'] - os._exit(1) - pass - email_args['ticket_id'] = ticket_id + #for act_record in issue_record_list: + # # if there's a pcu record and email config is set + # if 'email_pcu' in act_record: + # if act_record['message'] != None and act_record['email_pcu'] and site_record['config']['email']: + # # and 'reboot_node' in act_record['stage']: + + # email_args['hostname'] = act_record['nodename'] + # ticket_id = self.__emailSite(loginbase, + # act_record['email'], + # emailTxt.mailtxt.pcudown[0], + # email_args) + # if ticket_id == 0: + # # error. + # print "got a ticket_id == 0!!!! %s" % act_record['nodename'] + # os._exit(1) + # pass + # email_args['ticket_id'] = ticket_id act_record = issue_record_list[0] @@ -1058,6 +1091,7 @@ class Action: if ticket_id == 0: # error. print "ticket_id == 0 for %s %s" % (loginbase, act_record['nodename']) + import os os._exit(1) pass @@ -1084,11 +1118,11 @@ class Action: i_nodes_actedon += 1 if config.policysavedb: - print "Saving Databases... act_all, diagnose_out" - soltesz.dbDump("act_all", self.act_all) + #print "Saving Databases... act_all, diagnose_out" + #database.dbDump("act_all", self.act_all) # remove site record from diagnose_out, it's in act_all as done. del self.diagnose_db[loginbase] - #soltesz.dbDump("diagnose_out", self.diagnose_db) + #database.dbDump("diagnose_out", self.diagnose_db) print "sleeping for 1 sec" time.sleep(1) @@ -1111,52 +1145,52 @@ class Action: # avoid end records, and nmreset records # reboot_node_failed, is set below, so don't reboot repeatedly. - if 'monitor-end-record' not in act_record['stage'] and \ - 'nmreset' not in act_record['stage'] and \ - 'reboot_node_failed' not in act_record: - - if "DOWN" in act_record['log'] and \ - 'pcu_ids' in act_record['plcnode'] and \ - len(act_record['plcnode']['pcu_ids']) > 0: - - print "%s" % act_record['log'], - print "%15s" % (['reboot_node'],) - # Set node to re-install - plc.nodeBootState(act_record['nodename'], "rins") - try: - ret = reboot_node({'hostname': act_record['nodename']}) - except Exception, exc: - print "exception on reboot_node:" - import traceback - print traceback.print_exc() - ret = False - - if ret: # and ( 'reboot_node_failed' not in act_record or act_record['reboot_node_failed'] == False): - # Reboot Succeeded - print "reboot succeeded for %s" % act_record['nodename'] - act_record2 = {} - act_record2.update(act_record) - act_record2['action'] = ['reboot_node'] - act_record2['stage'] = "reboot_node" - act_record2['reboot_node_failed'] = False - act_record2['email_pcu'] = False - - if nodename not in self.act_all: - self.act_all[nodename] = [] - print "inserting 'reboot_node' record into act_all" - self.act_all[nodename].insert(0,act_record2) - - # return None to avoid further action - print "Taking no further action" - return None - else: - print "reboot failed for %s" % act_record['nodename'] - # set email_pcu to also send pcu notice for this record. - act_record['reboot_node_failed'] = True - act_record['email_pcu'] = True + #if 'monitor-end-record' not in act_record['stage'] and \ + # 'nmreset' not in act_record['stage'] and \ + # 'reboot_node_failed' not in act_record: - print "%s" % act_record['log'], - print "%15s" % act_record['action'] + # if "DOWN" in act_record['log'] and \ + # 'pcu_ids' in act_record['plcnode'] and \ + # len(act_record['plcnode']['pcu_ids']) > 0: +# +# print "%s" % act_record['log'], +# print "%15s" % (['reboot_node'],) +# # Set node to re-install +# plc.nodeBootState(act_record['nodename'], "rins") +# try: +# ret = reboot_node({'hostname': act_record['nodename']}) +# except Exception, exc: +# print "exception on reboot_node:" +# import traceback +# print traceback.print_exc() +# ret = False +# +# if ret: # and ( 'reboot_node_failed' not in act_record or act_record['reboot_node_failed'] == False): +# # Reboot Succeeded +# print "reboot succeeded for %s" % act_record['nodename'] +# act_record2 = {} +# act_record2.update(act_record) +# act_record2['action'] = ['reboot_node'] +# act_record2['stage'] = "reboot_node" +# act_record2['reboot_node_failed'] = False +# act_record2['email_pcu'] = False +# +# if nodename not in self.act_all: +# self.act_all[nodename] = [] +# print "inserting 'reboot_node' record into act_all" +# self.act_all[nodename].insert(0,act_record2) +# +# # return None to avoid further action +# print "Taking no further action" +# return None +# else: +# print "reboot failed for %s" % act_record['nodename'] +# # set email_pcu to also send pcu notice for this record. +# act_record['reboot_node_failed'] = True +# act_record['email_pcu'] = True +# +# print "%s" % act_record['log'], +# print "%15s" % act_record['action'] if act_record['stage'] is not 'monitor-end-record' and \ act_record['stage'] is not 'nmreset':