#!/usr/bin/python

# This script is used to manipulate the operational state of nodes in
# different node groups.  These are basically set operations on nodes via the
# PLC api.
# 
# Takes the nodegroup (ng) name as an argument and then, optionally:
#  * get a list of nodes in the given nodegroup.
#  * set some or all in the set to rins.
#  * restart them all.
#  * do something else to them all.
# 

from monitor import config
from monitor import util
from monitor import const
from monitor import database
from monitor import parser as parsermodule
from pcucontrol  import reboot
from monitor.wrapper import plc
api = plc.getAuthAPI()

import traceback
from optparse import OptionParser

from monitor.common import *
from nodequery import verify,query_to_dict,node_select
from monitor.model import *
import os

import time

import bootman 		# debug nodes
import mailmonitor 	# down nodes without pcu
from monitor.wrapper.emailTxt import mailtxt
import sys

class Reboot(object):
	"""Fallback strategies for getting a node restarted.

	Subclasses supply ``direct(host)``; this base class provides the two
	fallbacks ``pcu(host)`` and ``mail(host)``.  Each strategy stores a
	textual description of what it attempted in ``self.action``; callers
	treat a true return value as success.

	The single-argument ``print(...)`` calls used here behave exactly like
	the print statement under Python 2.
	"""

	def __init__(self, fbnode):
		"""Keep the node record used to choose a strategy.

		fbnode -- mapping describing the node; only its 'pcu' entry is read
		          by this class (presumably the findbad values -- confirm).
		"""
		self.fbnode = fbnode

	def _report_exception(self):
		"""Print the traceback, then the message, of the exception being handled."""
		# print_exc() writes the traceback itself and returns None, so its
		# result must not be printed.
		traceback.print_exc()
		# sys.exc_info() is used instead of binding the exception in the
		# except clause so the same syntax is valid on every Python version.
		print(sys.exc_info()[1])

	def _send_pcunotice(self, host):
		"""Send the 'pcudown_one' message about host to the site's TECHEMAIL address."""
		args = {'hostname': host}
		try:
			args['pcu_id'] = plc.getpcu(host)['pcu_id']
		except Exception:
			# no PCU id available: fall back to the hostname in the message
			args['pcu_id'] = host

		# presumably pcudown_one is a (subject, body) template pair -- confirm
		m = PersistMessage(host, mailtxt.pcudown_one[0] % args,
								 mailtxt.pcudown_one[1] % args, True, db='pcu_persistmessages')

		loginbase = plc.siteId(host)
		m.send([const.TECHEMAIL % loginbase])

	def _pcu_attempt(self, host, pflags, tried_flag, banner, rins=False):
		"""Run one PCU reboot attempt and record the outcome in pflags.

		host       -- hostname handed to reboot.reboot()
		pflags     -- PersistFlags object that receives the flags
		tried_flag -- flag set (and saved) once the reboot call has returned
		banner     -- message printed before the attempt
		rins       -- when true, set the node's boot state to 'rins' first

		Returns whatever reboot.reboot() returns, or False when anything in
		the attempt raises; in that case 'pcufailed' is set instead.
		"""
		try:
			print(banner)
			if rins:
				# set node to 'rins' boot state.
				plc.nodeBootState(host, 'rins')
			ret = reboot.reboot(host)

			pflags.setRecentFlag(tried_flag)
			pflags.save()
			return ret

		except Exception:
			self._report_exception()

			# NOTE: this failure could be an implementation issue on
			# 		our end.  So, extra notices are confusing...
			# self._send_pcunotice(host)

			pflags.setRecentFlag('pcufailed')
			pflags.save()
			return False

	def pcu(self, host):
		"""Try to restart host through its PCU, escalating from call to call.

		Nothing is attempted unless the node's 'pcu' entry equals "PCU" or
		contains "PCUOK".  Otherwise, depending on which flags are still
		recent: a plain PCU reboot first, then a reboot with the boot state
		set to 'rins', and finally a notice (sent at most once while the
		'pcumessagesent' flag is recent).

		Returns the result of reboot.reboot() when an attempt was made and
		did not raise, False in every other case.
		"""
		# TODO: It should be possible to diagnose the various conditions of
		# 		the PCU here, and send different messages as appropriate.
		pcu_state = self.fbnode['pcu']
		print("'%s'" % pcu_state)
		if pcu_state != "PCU" and "PCUOK" not in pcu_state:
			print("NO PCUOK")
			self.action = "None"
			return False

		self.action = "reboot.reboot('%s')" % host

		# 2*60*60*24 is two days in seconds; presumably how long a flag
		# counts as "recent" -- confirm against PersistFlags.
		pflags = PersistFlags(host, 2*60*60*24, db='pcu_persistflags')
		#pflags.resetRecentFlag('pcutried')
		if not pflags.getRecentFlag('pcutried'):
			return self._pcu_attempt(host, pflags, 'pcutried',
									 "CALLING REBOOT!!!")

		if not pflags.getRecentFlag('pcu_rins_tried'):
			return self._pcu_attempt(host, pflags, 'pcu_rins_tried',
									 "CALLING REBOOT +++ RINS", rins=True)

		# we've tried the pcu recently, but it didn't work,
		# so did we send a message about it recently?
		if not pflags.getRecentFlag('pcumessagesent'):
			self._send_pcunotice(host)

			pflags.setRecentFlag('pcumessagesent')
			pflags.save()

		# This will result in mail() being called next, to try to
		# engage the technical contact to take care of it also.
		print("RETURNING FALSE")
		return False

	def mail(self, host):
		"""Fall back to mailmonitor.reboot(host).

		node_end_record(host) is called first unless the 'endrecord' flag is
		still recent.  Returns the result of mailmonitor.reboot(), or False
		when it raises.
		"""
		# Reset every 4 weeks or so
		pflags = PersistFlags(host, 27*60*60*24, db='mail_persistflags')
		if not pflags.getRecentFlag('endrecord'):
			node_end_record(host)
			pflags.setRecentFlag('endrecord')
			pflags.save()

		# Then in either case, run mailmonitor.reboot()
		self.action = "mailmonitor.reboot('%s')" % host
		try:
			return mailmonitor.reboot(host)
		except Exception:
			self._report_exception()
			return False

class RebootDebug(Reboot):
	"""Strategy chosen by the main loop for nodes observed in the "dbg " state."""

	def direct(self, host):
		"""Call bootman.reboot(host, config, None) and return its result.

		The call is described in self.action before it is made.  `config`
		is the module-level name (rebound to the parsed options by the time
		the main loop runs).
		"""
		description = "bootman.reboot('%s', config, None)"
		self.action = description % host
		outcome = bootman.reboot(host, config, None)
		return outcome
	
	
class RebootBoot(Reboot):
	"""Strategy chosen by the main loop for nodes observed in the "boot" state."""

	def direct(self, host):
		"""Call bootman.reboot(host, config, 'reboot') and return its result.

		The call is described in self.action before it is made.  `config`
		is the module-level name (rebound to the parsed options by the time
		the main loop runs).
		"""
		description = "bootman.reboot('%s', config, 'reboot')"
		self.action = description % host
		outcome = bootman.reboot(host, config, 'reboot')
		return outcome

class RebootDown(Reboot):
	"""Strategy chosen by the main loop for nodes observed in the "down" state."""

	def direct(self, host):
		"""Attempt nothing: record "None" as the action and report failure.

		A direct restart always fails here, since the node will be down;
		returning False lets the caller move on to the next strategy.
		"""
		succeeded = False
		self.action = "None"
		return succeeded

def set_node_to_rins(host, fb):

	node = api.GetNodes(host, ['boot_state', 'last_contact', 'last_updated', 'date_created'])
	record = {'observation' : node[0], 
			  'model' : 'USER_REQUEST', 
			  'action' : 'api.UpdateNode(%s, {"boot_state" : "rins"})' % host, 
			  'time' : time.time()}
	l = Log(host, record)

	ret = api.UpdateNode(host, {'boot_state' : 'rins'})
	if ret:
		# it's nice to see the current status rather than the previous status on the console
		node = api.GetNodes(host)[0]
		print l
		print "%-2d" % (i-1), nodegroup_display(node, fb)
		return l
	else:
		print "FAILED TO UPDATE NODE BOOT STATE : %s" % host
		return None


# Load the log of earlier actions; it is consulted in the main loop and
# written back with database.dbDump() while the script runs and at exit.
try:
	rebootlog = database.dbLoad("rebootlog")
except (KeyboardInterrupt, SystemExit):
	# An interrupted load must abort the run: carrying on with an empty
	# log would end up being dumped over the stored one.
	raise
except:
	# Any other load failure: start from an empty log (best effort).
	rebootlog = LogRoll()

# Build the command-line parser: start from the shared 'nodesets' options,
# add this script's own options, then layer the shared 'defaults' on top.
parser = parsermodule.getParser(['nodesets'])
parser.set_defaults(
	timewait=0,
	skip=0,
	rins=False,
	reboot=False,
	findbad=False,
	force=False,
	nosetup=False,
	verbose=False,
	quiet=False,
)

parser.add_option("", "--stopselect", dest="stopselect", metavar="",
	help="The select string that must evaluate to true for the node to be considered 'done'")

# Boolean switches: each one stores True under a dest equal to its name.
for _name, _help in (
	("findbad", "Re-run findbad on the nodes we're going to check before acting."),
	("force", "Force action regardless of previous actions/logs."),
	("rins", "Set the boot_state to 'rins' for all nodes."),
	("reboot", "Actively try to reboot the nodes, keeping a log of actions."),
	("verbose", "Extra debug output messages."),
	("nosetup", "Do not perform the orginary setup phase."),
):
	parser.add_option("", "--" + _name, dest=_name, action="store_true", help=_help)

# Value options: stored as given on the command line and converted with
# int() where the main loop uses them.
for _name, _help in (
	("skip", "Number of machines to skip on the input queue."),
	("timewait", "Minutes to wait between iterations of 10 nodes."),
):
	parser.add_option("", "--" + _name, dest=_name, help=_help)

parser = parsermodule.getParser(['defaults'], parser)
# NOTE: from here on `config` is the parsed options object, not the
# `monitor.config` module imported at the top of the file.
config = parsermodule.parse_args(parser)

# COLLECT nodegroups, nodes and node lists
#
# Each selector below replaces the result of the previous ones, so when
# several are given the last matching one wins.  Starting from an empty
# list keeps the rest of the script from failing with a NameError when no
# selector was given at all.
hostnames = []

if config.nodegroup:
	ng = api.GetNodeGroups({'name' : config.nodegroup})
	nodelist = api.GetNodes(ng[0]['node_ids'])
	hostnames = [ n['hostname'] for n in nodelist ]

if config.site:
	site = api.GetSites(config.site)
	l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname'])
	hostnames = [ n['hostname'] for n in l_nodes ]

if config.node:
	# a single node takes precedence over a node list file
	hostnames = [ config.node ]
elif config.nodelist:
	hostnames = util.file.getListFromFile(config.nodelist)

fbquery = FindbadNodeRecord.get_all_latest()
fb_nodelist = [ n.hostname for n in fbquery ]

if config.nodeselect:
	hostnames = node_select(config.nodeselect, fb_nodelist)

if config.findbad:
	# rerun findbad with the nodes in the given nodes.
	findbad_input = "findbad.txt"
	util.file.setFileFromList(findbad_input, hostnames)
	os.system("./findbad.py --cachenodes --increment --nodelist %s" % findbad_input)
	# TODO: shouldn't we reload the node list now?

# Hosts listed here are shown but never acted on by the main loop.
l_blacklist = database.if_cached_else(1, "l_blacklist", lambda : [])
# commands:
i = 1
count = 1
#print "hosts: %s" % hostnames
for host in hostnames:

	#if 'echo' in host or 'hptest-1' in host: continue

	try:
		try:
			node = api.GetNodes(host)[0]
		except:
			print traceback.print_exc(); 
			print "FAILED GETNODES for host: %s" % host
			continue
			
		print "%-2d" % i, nodegroup_display(node, fb)
		i += 1
		if i-1 <= int(config.skip): continue
		if host in l_blacklist:
			print "%s is blacklisted.  Skipping." % host
			continue

		if config.stopselect:
			dict_query = query_to_dict(config.stopselect)
			fbnode = fb['nodes'][host]['values']
			observed_state = get_current_state(fbnode)

			if verify(dict_query, fbnode) and observed_state != "dbg ":
				# evaluates to true, therefore skip.
				print "%s evaluates true for %s ; skipping..." % ( config.stopselect, host )
				try:
					# todo: clean up act_all record here.
					# todo: send thank you, etc.
					mailmonitor.reboot(host)
				except Exception, e:
					print traceback.print_exc(); print e

				continue
			#else:
				#print "%s failed to match %s: -%s-" % ( host, dict_query, observed_state )
				#sys.exit(1)

		if not config.force and rebootlog.find(host, {'action' : ".*reboot"}, 60*60*2):
			print "recently rebooted %s.  skipping... " % host
			continue

		if config.reboot:

			fbnode = fb['nodes'][host]['values']
			observed_state = get_current_state(fbnode)

			if	 observed_state == "dbg ":
				o = RebootDebug(fbnode)

			elif observed_state == "boot" :
				if config.rins:
					l = set_node_to_rins(host, fb)
					if l: rebootlog.add(l)

				o = RebootBoot(fbnode)

			elif observed_state == "down":
				if config.rins:
					l = set_node_to_rins(host, fb)
					if l: rebootlog.add(l)

				o = RebootDown(fbnode)


			if o.direct(host):
				record = {'observation' : "DIRECT_SUCCESS: %s" % observed_state, 
						  'action' : o.action,
						  'model' : "none",
						  'time' : time.time()}
			elif o.pcu(host):
				record = {'observation' : "PCU_SUCCESS: %s" % observed_state, 
						  'action' : o.action,
						  'model' : "none",
						  'time' : time.time()}
			elif o.mail(host):
				record = {'observation' : "MAIL_SUCCESS: %s" % observed_state, 
						  'action' : o.action,
						  'model' : "none",
						  'time' : time.time()}
			else:
				record = {'observation' : "REBOOT_FAILED: %s" %  observed_state,
						  'action' : "log failure",
						  'model' : "none",
						  'time' : time.time()}

				print "ALL METHODS OF RESTARTING %s FAILED" % host
				args = {}
				args['hostname'] = host
				#m = PersistMessage(host, "ALL METHODS FAILED for %(hostname)s" % args,
				#							 "CANNOT CONTACT", False, db='suspect_persistmessages')
				#m.reset()
				#m.send(['monitor-list@lists.planet-lab.org'])

			l = Log(host, record)
			print l
			rebootlog.add(l)
	except KeyboardInterrupt:
		print "Killed by interrupt"
		sys.exit(0)
	except:
		print traceback.print_exc();
		print "Continuing..."

	time.sleep(1)
	if count % 10 == 0:
		print "Saving rebootlog"
		database.dbDump("rebootlog", rebootlog)
		wait_time = int(config.timewait)
		print "Sleeping %d minutes" % wait_time
		ti = 0
		print "Minutes slept: ",
		sys.stdout.flush()
		while ti < wait_time:
			print "%s" % ti,
			sys.stdout.flush()
			time.sleep(60)
			ti = ti+1

	count = count + 1

print "Saving rebootlog"
database.dbDump("rebootlog", rebootlog)