#!/usr/bin/python

# This script is used to manipulate the operational state of nodes in
# different node groups.  These are basically set operations on nodes via the
# PLC api.
# 
# Take the ng name as an argument....
# optionally, 
#  * get a list of nodes in the given nodegroup.
#  * set some or all in the set to rins.
#  * restart them all.
#  * do something else to them all.
# 

import os
import time
import traceback
import sys
from optparse import OptionParser

import bootman 		# debug nodes

from monitor import util
from monitor import const
from monitor import reboot
from monitor import config
from monitor import database
from monitor import parser as parsermodule
from monitor.common import *
from monitor.model import *
from monitor.wrapper import plc
from monitor.wrapper import plccache
from monitor.wrapper.emailTxt import mailtxt
from monitor.database.info.model import *

from nodequery import verify,query_to_dict,node_select

api = plc.getAuthAPI()


class SiteInterface(HistorySiteRecord):
	@classmethod
	def get_or_make(cls, if_new_set={}, **kwargs):
		if 'hostname' in kwargs:
			kwargs['loginbase'] = plccache.plcdb_hn2lb[kwargs['hostname']]
			del kwargs['hostname']
		res = HistorySiteRecord.findby_or_create(if_new_set, **kwargs)
		return SiteInterface(res)
	
	def __init__(self, sitehist):
		self.db = sitehist

	def getRecentActions(self, **kwargs):
		# TODO: make query only return records within a certin time range,
		# i.e. greater than 0.5 days ago. or 5 days, etc.

		#print "kwargs: ", kwargs

		recent_actions = []
		if 'loginbase' in kwargs:
			recent_actions = ActionRecord.query.filter_by(loginbase=kwargs['loginbase']).order_by(ActionRecord.date_created.desc())
		elif 'hostname' in kwargs:
			recent_actions = ActionRecord.query.filter_by(hostname=kwargs['hostname']).order_by(ActionRecord.date_created.desc())
		return recent_actions
	
	def increasePenalty(self):
		#act = ActionRecord(loginbase=self.db.loginbase, action='penalty', action_type='increase_penalty',)
		self.db.penalty_level += 1
		# NOTE: this is to prevent overflow or index errors in applyPenalty.
		#       there's probably a better approach to this.
		if self.db.penalty_level >= 2:
			self.db.penalty_level = 2
		self.db.penalty_applied = True
	
	def applyPenalty(self):
		penalty_map = [] 
		penalty_map.append( { 'name': 'noop',      		'enable'   : lambda site: None,
														'disable'  : lambda site: None } )
		penalty_map.append( { 'name': 'nocreate',		'enable'   : lambda site: plc.removeSiteSliceCreation(site),
														'disable'  : lambda site: plc.enableSiteSliceCreation(site) } )
		penalty_map.append( { 'name': 'suspendslices',	'enable'   : lambda site: plc.suspendSiteSlices(site),
														'disable'  : lambda site: plc.enableSiteSlices(site) } )

		for i in range(len(penalty_map)-1,self.db.penalty_level,-1):
			print "\tdisabling %s on %s" % (penalty_map[i]['name'], self.db.loginbase)
			penalty_map[i]['disable'](self.db.loginbase) 

		for i in range(0,self.db.penalty_level+1):
			print "\tapplying %s on %s" % (penalty_map[i]['name'], self.db.loginbase)
			penalty_map[i]['enable'](self.db.loginbase)

		return

	def pausePenalty(self):
		act = ActionRecord(loginbase=self.db.loginbase,
							action='penalty',
							action_type='pause_penalty',)
	
	def clearPenalty(self):
		#act = ActionRecord(loginbase=self.db.loginbase, action='penalty', action_type='clear_penalty',)
		self.db.penalty_level = 0
		self.db.penalty_applied = False
	
	def getTicketStatus(self):
		if self.db.message_id != 0:
			rtstatus = mailer.getTicketStatus(self.db.message_id)
			self.db.message_status = rtstatus['Status']
			self.db.message_queue = rtstatus['Queue']
			self.db.message_created = datetime.fromtimestamp(rtstatus['Created'])

	def setTicketStatus(self, status):
		print 'SETTING status %s' % status
		if self.db.message_id != 0:
			rtstatus = mailer.setTicketStatus(self.db.message_id, status)

	def getContacts(self):
		contacts = []
		if self.db.penalty_level >= 0:
			contacts += plc.getTechEmails(self.db.loginbase)

		if self.db.penalty_level >= 1:
			contacts += plc.getPIEmails(self.db.loginbase)

		if self.db.penalty_level >= 2:
			contacts += plc.getSliceUserEmails(self.db.loginbase)

		return contacts

	def sendMessage(self, type, **kwargs):

		# NOTE: evidently changing an RT message's subject opens the ticket.
		#       the logic in this policy depends up a ticket only being 'open'
        #       if a user has replied to it.
        #       So, to preserve these semantics, we check the status before
        #           sending, then after sending, reset the status to the
        #           previous status.
        #       There is a very tiny race here, where a user sends a reply
        #           within the time it takes to check, send, and reset.
        #       This sucks.  It's almost certainly fragile.

		# 
		# TODO: catch any errors here, and add an ActionRecord that contains
		#       those errors.
		
		args = {'loginbase' : self.db.loginbase, 'penalty_level' : self.db.penalty_level}
		args.update(kwargs)

		hostname = None
		if 'hostname' in args:
			hostname = args['hostname']

		if hasattr(mailtxt, type):

			message = getattr(mailtxt, type)
			viart = True
			if 'viart' in kwargs:
				viart = kwargs['viart']

			if viart:
				self.getTicketStatus()		# get current message status

			m = Message(message[0] % args, message[1] % args, viart, self.db.message_id)

			contacts = self.getContacts()
			contacts = [config.cc_email]	# TODO: remove after testing...

			print "sending message: %s to site %s for host %s" % (type, self.db.loginbase, hostname)

			ret = m.send(contacts)
			if viart:
				self.db.message_id = ret
				# reset to previous status, since a new subject 'opens' RT tickets.
				self.setTicketStatus(self.db.message_status) 

				# NOTE: only make a record of it if it's in RT.
				act = ActionRecord(loginbase=self.db.loginbase, hostname=hostname, action='notice', 
								action_type=type, message_id=self.db.message_id)

		else:
			print "+-- WARNING! ------------------------------"
			print "| No such message name in emailTxt.mailtxt: %s" % type
			print "+------------------------------------------"

		return

	def closeTicket(self):
		# TODO: close the rt ticket before overwriting the message_id
		mailer.closeTicketViaRT(self.db.message_id, "Ticket Closed by Monitor")
		act = ActionRecord(loginbase=self.db.loginbase, action='notice', 
							action_type='end_notice', message_id=self.db.message_id)
		self.db.message_id = 0
		self.db.message_status = "new"

	def runBootManager(self, hostname):
		print "attempting BM reboot of %s" % hostname
		ret = ""
		try:
			ret = bootman.restore(self, hostname)
			err = ""
		except:
			err = traceback.format_exc()
			print err

		act = ActionRecord(loginbase=self.db.loginbase,
							hostname=hostname,
							action='reboot',
							action_type='bootmanager_restore',
							error_string=err)
		return ret

	def attemptReboot(self, hostname):
		print "attempting PCU reboot of %s" % hostname
		ret = reboot.reboot_str(hostname)
		if ret == 0 or ret == "0":
			ret = ""
		act = ActionRecord(loginbase=self.db.loginbase,
							hostname=hostname,
							action='reboot',
							action_type='first_try_reboot',
							error_string=ret)

def logic():

	plc.nodeBootState(host, 'rins')
	node_end_record(host)


def main(hostnames, sitenames):
	# commands:
	i = 1
	node_count = 1
	site_count = 1
	#print "hosts: %s" % hostnames
	for host in hostnames:
		try:
			lb = plccache.plcdb_hn2lb[host]
		except:
			print "unknown host in plcdb_hn2lb %s" % host
			continue

		nodeblack = BlacklistRecord.get_by(hostname=host)

		if nodeblack and not nodeblack.expired():
			print "skipping %s due to blacklist.  will expire %s" % (host, nodeblack.willExpire() )
			continue

		sitehist = SiteInterface.get_or_make(loginbase=lb)

		recent_actions = sitehist.getRecentActions(hostname=host)

		nodehist = HistoryNodeRecord.findby_or_create(hostname=host)

		print "%s %s" % ( nodehist.hostname, nodehist.status)
		if nodehist.status == 'good' and \
			changed_lessthan(nodehist.last_changed, 1.0) and \
			not found_within(recent_actions, 'online_notice', 0.5):
				# NOTE: there is a narrow window in which this command must be
				# evaluated, otherwise the notice will not go out.  this is not ideal.
				sitehist.sendMessage('online_notice', hostname=host)
				print "send message for host %s online" % host

				pass

		if ( nodehist.status == 'offline' or nodehist.status == 'down' ) and \
			changed_greaterthan(nodehist.last_changed,1.0) and \
			not found_between(recent_actions, 'first_try_reboot', 3.5, 1):

				sitehist.attemptReboot(host)
				print "send message for host %s first_try_reboot" % host
				pass

		# NOTE: non-intuitive is that found_between(first_try_reboot, 3.5, 1)
		# 		will be false for a day after the above condition is satisfied
		if ( nodehist.status == 'offline' or nodehist.status == 'down' ) and \
			changed_greaterthan(nodehist.last_changed,1.5) and \
			found_between(recent_actions, 'first_try_reboot', 3.5, 1) and \
			not found_within(recent_actions, 'pcufailed_notice', 3.5):
			# found_within(recent_actions, 'first_try_reboot', 3.5) and \
				
				# send pcu failure message
				#act = ActionRecord(**kwargs)
				sitehist.sendMessage('pcufailed_notice', hostname=host)
				print "send message for host %s PCU Failure" % host
				pass

		if nodehist.status == 'monitordebug' and \
			changed_greaterthan(nodehist.last_changed, 1) and \
			not found_between(recent_actions, 'bootmanager_restore', 0.5, 0):
				# send down node notice
				# delay 0.5 days before retrying...

				print "send message for host %s bootmanager_restore" % host
				sitehist.runBootManager(host)
			#	sitehist.sendMessage('retry_bootman', hostname=host)

		if nodehist.status == 'down' and \
			changed_greaterthan(nodehist.last_changed, 2) and \
			not found_within(recent_actions, 'down_notice', 3.5):
				# send down node notice

				sitehist.sendMessage('down_notice', hostname=host)
				print "send message for host %s offline" % host
				pass

		node_count = node_count + 1

	for site in sitenames:
		sitehist = SiteInterface.get_or_make(loginbase=site)
		# TODO: make query only return records within a certin time range,
		# 		i.e. greater than 0.5 days ago. or 5 days, etc.
		recent_actions = sitehist.getRecentActions(loginbase=site)

		#sitehist.sendMessage('test_notice', host)

		print "%s %s" % ( sitehist.db.loginbase , sitehist.db.status)
		if sitehist.db.status == 'down':
			if  not found_within(recent_actions, 'pause_penalty', 30) and \
				not found_within(recent_actions, 'increase_penalty', 7) and \
				changed_greaterthan(sitehist.db.last_changed, 7):

				# TODO: catch errors
				sitehist.increasePenalty()
				#sitehist.applyPenalty()
				sitehist.sendMessage('increase_penalty')

				print "send message for site %s penalty increase" % site

		if sitehist.db.status == 'good':
			# clear penalty
			# NOTE: because 'all clear' should have an indefinite status, we
			# 		have a boolean value rather than a 'recent action'
			if sitehist.db.penalty_applied:
				# send message that penalties are cleared.

				sitehist.clearPenalty()
				#sitehist.applyPenalty()
				sitehist.sendMessage('clear_penalty')
				sitehist.closeTicket()

				print "send message for site %s penalty cleared" % site

		# find all ticket ids for site ( could be on the site record? )
		# determine if there are penalties within the last 30 days?
		# if so, add a 'pause_penalty' action.
		if sitehist.db.message_id != 0 and sitehist.db.message_status == 'open' and sitehist.db.penalty_level > 0:
			#	pause escalation
			print "Pausing penalties for %s" % site
			sitehist.pausePenalty()

		site_count = site_count + 1

	session.flush()

	return


if __name__ == "__main__":
	parser = parsermodule.getParser(['nodesets'])
	parser.set_defaults( timewait=0,
						skip=0,
						rins=False,
						reboot=False,
						findbad=False,
						force=False, 
						nosetup=False, 
						verbose=False, 
						quiet=False,
						)

	parser.add_option("", "--stopselect", dest="stopselect", metavar="", 
						help="The select string that must evaluate to true for the node to be considered 'done'")
	parser.add_option("", "--findbad", dest="findbad", action="store_true", 
						help="Re-run findbad on the nodes we're going to check before acting.")
	parser.add_option("", "--force", dest="force", action="store_true", 
						help="Force action regardless of previous actions/logs.")
	parser.add_option("", "--rins", dest="rins", action="store_true", 
						help="Set the boot_state to 'rins' for all nodes.")
	parser.add_option("", "--reboot", dest="reboot", action="store_true", 
						help="Actively try to reboot the nodes, keeping a log of actions.")

	parser.add_option("", "--verbose", dest="verbose", action="store_true", 
						help="Extra debug output messages.")
	parser.add_option("", "--nosetup", dest="nosetup", action="store_true", 
						help="Do not perform the orginary setup phase.")
	parser.add_option("", "--skip", dest="skip", 
						help="Number of machines to skip on the input queue.")
	parser.add_option("", "--timewait", dest="timewait", 
						help="Minutes to wait between iterations of 10 nodes.")

	parser = parsermodule.getParser(['defaults'], parser)
	config = parsermodule.parse_args(parser)

#	# COLLECT nodegroups, nodes and node lists
#	if config.nodegroup:
#		ng = api.GetNodeGroups({'name' : config.nodegroup})
#		nodelist = api.GetNodes(ng[0]['node_ids'])
#		hostnames = [ n['hostname'] for n in nodelist ]

	fbquery = HistoryNodeRecord.query.all()
	hostnames = [ n.hostname for n in fbquery ]
	
	fbquery = HistorySiteRecord.query.all()
	sitenames = [ s.loginbase for s in fbquery ]

	if config.site:
		# TODO: replace with calls to local db.  the api fails so often that
		# 		these calls should be regarded as unreliable.
		site = api.GetSites(config.site)
		l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname'])
		filter_hostnames = [ n['hostname'] for n in l_nodes ]

		hostnames = filter(lambda x: x in filter_hostnames, hostnames)
		sitenames = [config.site]

	if config.node:
		hostnames = [ config.node ] 
		sitenames = [ plccache.plcdb_hn2lb[config.node] ]

	try:
		main(hostnames, sitenames)
	except KeyboardInterrupt:
		print "Killed by interrupt"
		sys.exit(0)
	except:
		#email_exception()
		print traceback.print_exc();
		print "Continuing..."