X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=monitor_policy.py;h=5db440f6824bef780f49f017df14ac19cb62744a;hb=refs%2Fheads%2F1.0;hp=f7985d0a1d90ecf6ae7767add36f8bbdd81a9886;hpb=77e8cfd3570139709c16c59418c9a5fc6ddf953b;p=monitor.git

diff --git a/monitor_policy.py b/monitor_policy.py
index f7985d0..5db440f 100644
--- a/monitor_policy.py
+++ b/monitor_policy.py
@@ -1,18 +1,25 @@
-from config import config
-#print "policy"
-config = config()
-import soltesz
+import config
+import database
 import time
 import mailer
-from www.printbadnodes import cmpCategoryVal
+from unified_model import cmpCategoryVal
 import sys
 import emailTxt
 import string
 
-from policy import get_ticket_id, print_stats, close_rt_backoff, reboot_node
 from rt import is_host_in_rt_tickets
 import plc
 
+def get_ticket_id(record):
+	"""Return record's RT ticket id, preferring 'ticket_id' over 'found_rt_ticket'.
+
+	Keys that are missing, None, or equal to "" are skipped; returns None if both are."""
+	# Compare with != rather than 'is not': identity against a literal is unreliable.
+	for key in ('ticket_id', 'found_rt_ticket'):
+		if key in record and record[key] != "" and record[key] is not None:
+			return record[key]
+	return None
+
 # Time to enforce policy
 POLSLEEP = 7200
 
@@ -42,18 +49,20 @@ PI=2
 USER=4
 ADMIN=8
 
+from unified_model import *
+
 class Merge:
 	def __init__(self, l_merge):
 		self.merge_list = l_merge
 
 		# the hostname to loginbase mapping
-		self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")
+		self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")
 
 		# Previous actions taken on nodes.
-		self.act_all = soltesz.if_cached_else(1, "act_all", lambda : {})
-		self.findbad = soltesz.if_cached_else(1, "findbad", lambda : {})
+		self.act_all = database.if_cached_else(1, "act_all", lambda : {})
+		self.findbad = database.if_cached_else(1, "findbad", lambda : {})
 
-		self.cache_all = soltesz.if_cached_else(1, "act_all", lambda : {})
+		self.cache_all = database.if_cached_else(1, "act_all", lambda : {})
 		self.sickdb = {}
 		self.mergedb = {}
 
@@ -255,8 +264,8 @@ class RT:
 class Diagnose:
 	def __init__(self, record_list):
 		self.record_list = record_list
-		self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")
-		self.findbad = soltesz.if_cached_else(1, "findbad", lambda : {})
+		self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")
+		self.findbad = database.if_cached_else(1, "findbad", lambda : {})
 
 		self.diagnose_in = {}
 		self.diagnose_out = {}
@@ -272,6 +281,8 @@ class Diagnose:
 			print "----------------"
 			import traceback
 			print traceback.print_exc()
+			from nodecommon import email_exception
+			email_exception()
 			print err
 			#if config.policysavedb:
 			sys.exit(1)
@@ -396,12 +407,13 @@ class Diagnose:
 
 		# NOTE: these settings can be overridden by command line arguments,
 		#       or the state of a record, i.e. if already in RT's Support Queue.
-		nodes_up = self.getUpAtSite(loginbase, d_diag_site)
+		pf = PersistFlags(loginbase, 1, db='site_persistflags')
+		nodes_up = pf.nodes_up
 		if nodes_up < MINUP:
 			d_diag_site[loginbase]['config']['squeeze'] = True
 
 		max_slices = self.getMaxSlices(loginbase)
-		num_nodes = self.getNumNodes(loginbase)
+		num_nodes = pf.nodes_total #self.getNumNodes(loginbase)
 		# NOTE: when max_slices == 0, this is either a new site (the old way)
 		#       or an old disabled site from previous monitor (before site['enabled'])
 		if nodes_up < num_nodes and max_slices != 0:
@@ -433,15 +445,15 @@ class Diagnose:
 			diag_record['args'] = {'nodename': nodename}
 			diag_record['info'] = (nodename, s_daysdown, "")
 
-			if 'reboot_node_failed' in node_record:
-				# there was a previous attempt to use the PCU.
-				if node_record['reboot_node_failed'] == False:
-					# then the last attempt apparently, succeeded.
-					# But, the category is still 'ERROR'.  Therefore, the
-					# PCU-to-Node mapping is broken.
-					#print "Setting message for ERROR node to PCU2NodeMapping: %s" % nodename
-					diag_record['message'] = emailTxt.mailtxt.pcutonodemapping
-					diag_record['email_pcu'] = True
+			#if 'reboot_node_failed' in node_record:
+			#	# there was a previous attempt to use the PCU.
+			#	if node_record['reboot_node_failed'] == False:
+			#		# then the last attempt apparently, succeeded.
+			#		# But, the category is still 'ERROR'.  Therefore, the
+			#		# PCU-to-Node mapping is broken.
+			#		#print "Setting message for ERROR node to PCU2NodeMapping: %s" % nodename
+			#		diag_record['message'] = emailTxt.mailtxt.pcutonodemapping
+			#		diag_record['email_pcu'] = True
 
 			if diag_record['ticket_id'] == "":
 				diag_record['log'] = "DOWN: %20s : %-40s == %20s %s" % \
@@ -493,13 +505,13 @@ class Diagnose:
 					diag_record['args'] = {'nodename': nodename}
 					diag_record['info'] = (nodename, node_record['prev_category'], 
 													 node_record['category'])
-					if 'email_pcu' in diag_record:
-						if diag_record['email_pcu']:
-							# previously, the pcu failed to reboot, so send
-							# email. Now, reset these values to try the reboot
-							# again.
-							diag_record['email_pcu'] = False
-							del diag_record['reboot_node_failed']
+					#if 'email_pcu' in diag_record:
+					#	if diag_record['email_pcu']:
+					#		# previously, the pcu failed to reboot, so send
+					#		# email. Now, reset these values to try the reboot
+					#		# again.
+					#		diag_record['email_pcu'] = False
+					#		del diag_record['reboot_node_failed']
 
 					if diag_record['ticket_id'] == "":
 						diag_record['log'] = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
@@ -820,16 +832,27 @@ class Diagnose:
 
 		return up
 
+def close_rt_backoff(args):
+	"""If args['ticket_id'] is set, close it in RT and call plc.enableSlices/enableSliceCreation on args['hostname']."""
+	if 'ticket_id' not in args or args['ticket_id'] in ("", None):
+		return	# no ticket recorded: nothing to close or re-enable
+	mailer.closeTicketViaRT(args['ticket_id'], "Ticket CLOSED automatically by SiteAssist.")
+	plc.enableSlices(args['hostname'])
+	plc.enableSliceCreation(args['hostname'])
+
+def reboot_node(args):	# args: dict with a 'hostname' key; returns reboot.reboot_policy's result
+	node = args['hostname']
+	return reboot.reboot_policy(node, True, config.debug)
 
 class Action:
 	def __init__(self, diagnose_out):
 		# the hostname to loginbase mapping
-		self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")
+		self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")
 
 		# Actions to take.
 		self.diagnose_db = diagnose_out
 		# Actions taken.
-		self.act_all   = soltesz.if_cached_else(1, "act_all", lambda : {})
+		self.act_all   = database.if_cached_else(1, "act_all", lambda : {})
 
 		# A dict of actions to specific functions. PICKLE doesnt' like lambdas.
 		self.actions = {}
@@ -863,25 +886,29 @@ class Action:
 			print "----------------"
 			import traceback
 			print traceback.print_exc()
+			from nodecommon import email_exception
+			email_exception()
 			print err
 			if config.policysavedb:
 				print "Saving Databases... act_all"
-				soltesz.dbDump("act_all", self.act_all)
+				database.dbDump("act_all", self.act_all)
+				database.dbDump("diagnose_out", self.diagnose_db)
 			sys.exit(1)
 
 		#print_stats("sites_observed", stats)
 		#print_stats("sites_diagnosed", stats)
 		#print_stats("nodes_diagnosed", stats)
-		print_stats("sites_emailed", stats)
+		self.print_stats("sites_emailed", stats)
 		#print_stats("nodes_actedon", stats)
 		print string.join(stats['allsites'], ",")
 
 		if config.policysavedb:
 			print "Saving Databases... act_all"
-			#soltesz.dbDump("policy.eventlog", self.eventlog)
+			#database.dbDump("policy.eventlog", self.eventlog)
 			# TODO: remove 'diagnose_out', 
 			#	or at least the entries that were acted on.
-			soltesz.dbDump("act_all", self.act_all)
+			database.dbDump("act_all", self.act_all)
+			database.dbDump("diagnose_out", self.diagnose_db)
 
 	def accumSites(self):
 		"""
@@ -914,18 +941,22 @@ class Action:
 		if ADMIN & roles:
 			contacts += [config.email]
 		if TECH & roles:
-			contacts += [TECHEMAIL % loginbase]
+			#contacts += [TECHEMAIL % loginbase]
+			contacts += plc.getTechEmails(loginbase)
 		if PI & roles:
-			contacts += [PIEMAIL % loginbase]
+			#contacts += [PIEMAIL % loginbase]
+			contacts += plc.getPIEmails(loginbase)
 		if USER & roles:
+			contacts += plc.getSliceUserEmails(loginbase)
 			slices = plc.slices(loginbase)
 			if len(slices) >= 1:
-				for slice in slices:
-					contacts += [SLICEMAIL % slice]
 				print "SLIC: %20s : %d slices" % (loginbase, len(slices))
 			else:
 				print "SLIC: %20s : 0 slices" % loginbase
 
+		unique_contacts = set(contacts)
+		contacts = [ c for c in unique_contacts ]	# convert back into list
+
 		try:
 			subject = message[0] % args
 			body = message[1] % args
@@ -943,6 +974,8 @@ class Action:
 			print "exception on message:"
 			import traceback
 			print traceback.print_exc()
+			from nodecommon import email_exception
+			email_exception()
 			print message
 
 		return ticket_id
@@ -1028,23 +1061,23 @@ class Action:
 			email_args = self.get_email_args(issue_record_list, loginbase)
 
 			# for each record.
-			for act_record in issue_record_list:
-				# if there's a pcu record and email config is set
-				if 'email_pcu' in act_record:
-					if act_record['message'] != None and act_record['email_pcu'] and site_record['config']['email']:
-						# and 'reboot_node' in act_record['stage']:
-
-						email_args['hostname'] = act_record['nodename']
-						ticket_id = self.__emailSite(loginbase, 
-											act_record['email'], 
-											emailTxt.mailtxt.pcudown[0],
-											email_args)
-						if ticket_id == 0:
-							# error.
-							print "got a ticket_id == 0!!!! %s" % act_record['nodename']
-							os._exit(1)
-							pass
-						email_args['ticket_id'] = ticket_id
+			#for act_record in issue_record_list:
+			#	# if there's a pcu record and email config is set
+			#	if 'email_pcu' in act_record:
+			#		if act_record['message'] != None and act_record['email_pcu'] and site_record['config']['email']:
+			#			# and 'reboot_node' in act_record['stage']:
+
+			#			email_args['hostname'] = act_record['nodename']
+			#			ticket_id = self.__emailSite(loginbase, 
+			#								act_record['email'], 
+			#								emailTxt.mailtxt.pcudown[0],
+			#								email_args)
+			#			if ticket_id == 0:
+			#				# error.
+			#				print "got a ticket_id == 0!!!! %s" % act_record['nodename']
+			#				os._exit(1)
+			#				pass
+			#			email_args['ticket_id'] = ticket_id
 
 			
 			act_record = issue_record_list[0]
@@ -1058,6 +1091,7 @@ class Action:
 				if ticket_id == 0:
 					# error.
 					print "ticket_id == 0 for %s %s" % (loginbase, act_record['nodename'])
+					import os
 					os._exit(1)
 					pass
 
@@ -1084,11 +1118,11 @@ class Action:
 				i_nodes_actedon += 1
 		
 		if config.policysavedb:
-			print "Saving Databases... act_all, diagnose_out"
-			soltesz.dbDump("act_all", self.act_all)
+			#print "Saving Databases... act_all, diagnose_out"
+			#database.dbDump("act_all", self.act_all)
 			# remove site record from diagnose_out, it's in act_all as done.
 			del self.diagnose_db[loginbase]
-			#soltesz.dbDump("diagnose_out", self.diagnose_db)
+			#database.dbDump("diagnose_out", self.diagnose_db)
 
 		print "sleeping for 1 sec"
 		time.sleep(1)
@@ -1111,52 +1145,52 @@ class Action:
 		# avoid end records, and nmreset records					
 		# reboot_node_failed, is set below, so don't reboot repeatedly.
 
-		if 'monitor-end-record' not in act_record['stage'] and \
-		   'nmreset' not in act_record['stage'] and \
-		   'reboot_node_failed' not in act_record:
-
-			if "DOWN" in act_record['log'] and \
-					'pcu_ids' in act_record['plcnode'] and \
-					len(act_record['plcnode']['pcu_ids']) > 0:
-
-				print "%s" % act_record['log'],
-				print "%15s" % (['reboot_node'],)
-				# Set node to re-install
-				plc.nodeBootState(act_record['nodename'], "rins")	
-				try:
-					ret = reboot_node({'hostname': act_record['nodename']})
-				except Exception, exc:
-					print "exception on reboot_node:"
-					import traceback
-					print traceback.print_exc()
-					ret = False
-
-				if ret: # and ( 'reboot_node_failed' not in act_record or act_record['reboot_node_failed'] == False):
-					# Reboot Succeeded
-					print "reboot succeeded for %s" % act_record['nodename']
-					act_record2 = {}
-					act_record2.update(act_record)
-					act_record2['action'] = ['reboot_node']
-					act_record2['stage'] = "reboot_node"
-					act_record2['reboot_node_failed'] = False
-					act_record2['email_pcu'] = False
-
-					if nodename not in self.act_all: 
-						self.act_all[nodename] = []
-					print "inserting 'reboot_node' record into act_all"
-					self.act_all[nodename].insert(0,act_record2)
-
-					# return None to avoid further action
-					print "Taking no further action"
-					return None
-				else:
-					print "reboot failed for %s" % act_record['nodename']
-					# set email_pcu to also send pcu notice for this record.
-					act_record['reboot_node_failed'] = True
-					act_record['email_pcu'] = True
+		#if 'monitor-end-record' not in act_record['stage'] and \
+		#   'nmreset' not in act_record['stage'] and \
+		#   'reboot_node_failed' not in act_record:
 
-			print "%s" % act_record['log'],
-			print "%15s" % act_record['action']
+		#	if "DOWN" in act_record['log'] and \
+		#			'pcu_ids' in act_record['plcnode'] and \
+		#			len(act_record['plcnode']['pcu_ids']) > 0:
+#
+#				print "%s" % act_record['log'],
+#				print "%15s" % (['reboot_node'],)
+#				# Set node to re-install
+#				plc.nodeBootState(act_record['nodename'], "rins")	
+#				try:
+#					ret = reboot_node({'hostname': act_record['nodename']})
+#				except Exception, exc:
+#					print "exception on reboot_node:"
+#					import traceback
+#					print traceback.print_exc()
+#					ret = False
+#
+#				if ret: # and ( 'reboot_node_failed' not in act_record or act_record['reboot_node_failed'] == False):
+#					# Reboot Succeeded
+#					print "reboot succeeded for %s" % act_record['nodename']
+#					act_record2 = {}
+#					act_record2.update(act_record)
+#					act_record2['action'] = ['reboot_node']
+#					act_record2['stage'] = "reboot_node"
+#					act_record2['reboot_node_failed'] = False
+#					act_record2['email_pcu'] = False
+#
+#					if nodename not in self.act_all: 
+#						self.act_all[nodename] = []
+#					print "inserting 'reboot_node' record into act_all"
+#					self.act_all[nodename].insert(0,act_record2)
+#
+#					# return None to avoid further action
+#					print "Taking no further action"
+#					return None
+#				else:
+#					print "reboot failed for %s" % act_record['nodename']
+#					# set email_pcu to also send pcu notice for this record.
+#					act_record['reboot_node_failed'] = True
+#					act_record['email_pcu'] = True
+#
+#			print "%s" % act_record['log'],
+#			print "%15s" % act_record['action']
 
 		if act_record['stage'] is not 'monitor-end-record' and \
 		   act_record['stage'] is not 'nmreset':