From: Stephen Soltesz <soltesz@cs.princeton.edu>
Date: Wed, 11 Mar 2009 20:13:45 +0000 (+0000)
Subject: break out the functions that are needed by the monitor module for rebooting
X-Git-Tag: Monitor-2.0-4~8
X-Git-Url: http://git.onelab.eu/?a=commitdiff_plain;h=7ba32adfce46497868ab1aa373761a0165408df1;p=monitor.git

break out the functions that are needed by the monitor module for rebooting
and pcucontrol, in order to make pcucontrol a stand-alone module with no
dependencies.
---

diff --git a/monitor/reboot.py b/monitor/reboot.py
new file mode 100755
index 0000000..c3f6103
--- /dev/null
+++ b/monitor/reboot.py
@@ -0,0 +1,573 @@
+#!/usr/bin/python
+#
+# Reboot specified nodes
+#
+
+import getpass, getopt
+import os, sys
+import xml, xmlrpclib
+import errno, time, traceback
+import urllib2
+import urllib
+import threading, popen2
+import array, struct
+import base64
+from subprocess import PIPE, Popen
+import pcucontrol.transports.ssh.pxssh as pxssh
+import pcucontrol.transports.ssh.pexpect as pexpect
+import socket
+
+
+
+# Use our versions of telnetlib and pyssh
+sys.path.insert(0, os.path.dirname(sys.argv[0]))
+import pcucontrol.transports.telnetlib as telnetlib
+sys.path.insert(0, os.path.dirname(sys.argv[0]) + "/pyssh")    
+import pcucontrol.transports.pyssh as pyssh
+
+from monitor import config
+from monitor.util import command
+from monitor.wrapper import plc
+
+
+# Event class ID from pcu events
+#NODE_POWER_CONTROL = 3
+
+# Monitor user ID
+#MONITOR_USER_ID = 11142
+
+import logging
+logger = logging.getLogger("monitor")
+verbose = 1
+#dryrun = 0;
+
+class ExceptionNoTransport(Exception): pass
+class ExceptionNotFound(Exception): pass
+class ExceptionPassword(Exception): pass
+class ExceptionTimeout(Exception): pass
+class ExceptionPrompt(Exception): pass
+class ExceptionSequence(Exception): pass
+class ExceptionReset(Exception): pass
+class ExceptionPort(Exception): pass
+class ExceptionUsername(Exception): pass
+
+
+
+# PCU has model, host, preferred-port, user, passwd, 
+
+# This is an object derived directly form the PLCAPI DB fields
+class PCU(object):
+	def __init__(self, plc_pcu_dict):
+		for field in ['username', 'password', 'site_id', 
+						'hostname', 'ip', 
+						'pcu_id', 'model', 
+						'node_ids', 'ports', ]:
+			if field in plc_pcu_dict:
+				self.__setattr__(field, plc_pcu_dict[field])
+			else:
+				raise Exception("No such field %s in PCU object" % field)
+
+# These are the convenience functions build around the PCU object.
+class PCUModel(PCU):
+	def __init__(self, plc_pcu_dict):
+		PCU.__init__(self, plc_pcu_dict)
+		self.host = self.pcu_name()
+
+	def pcu_name(self):
+		if self.hostname is not None and self.hostname is not "":
+			return self.hostname
+		elif self.ip is not None and self.ip is not "":
+			return self.ip
+		else:
+			return None
+
+	def nodeidToPort(self, node_id):
+		if node_id in self.node_ids:
+			for i in range(0, len(self.node_ids)):
+				if node_id == self.node_ids[i]:
+					return self.ports[i]
+
+		raise Exception("No such Node ID: %d" % node_id)
+
+# This class captures the observed pcu records from FindBadPCUs.py
+class PCURecord:
+	def __init__(self, pcu_record_dict):
+		for field in ['port_status', 
+						'dns_status', 
+						'entry_complete', ]:
+			if field in pcu_record_dict:
+				if field == "reboot":
+					self.__setattr__("reboot_str", pcu_record_dict[field])
+				else:
+					self.__setattr__(field, pcu_record_dict[field])
+			#else:
+			#	raise Exception("No such field %s in pcu record dict" % field)
+
+class Transport:
+	TELNET = "telnet"
+	SSH    = "ssh"
+	HTTP   = "http"
+	HTTPS  = "https"
+	IPAL   = "ipal"
+	DRAC   = "drac"
+	AMT    = "amt"
+
+	TELNET_TIMEOUT = 120
+
+	porttypemap = {
+			5869 : DRAC,
+			22 : SSH,
+			23 : TELNET,
+			443 : HTTPS,
+			80 :  HTTP,
+			9100 : IPAL,
+			16992 : AMT,
+		}
+
+	def __init__(self, type, verbose):
+		self.type = type
+		self.verbose = verbose
+		self.transport = None
+
+	def open(self, host, username=None, password=None, prompt="User Name"):
+		transport = None
+
+		if self.type == self.TELNET:
+			transport = telnetlib.Telnet(host, timeout=self.TELNET_TIMEOUT)
+			transport.set_debuglevel(self.verbose)
+			if username is not None:
+				self.transport = transport
+				self.ifThenSend(prompt, username, ExceptionUsername)
+
+		elif self.type == self.SSH:
+			if username is not None:
+				transport = pyssh.Ssh(username, host)
+				transport.set_debuglevel(self.verbose)
+				transport.open()
+				# TODO: have an ssh set_debuglevel() also...
+			else:
+				raise Exception("Username cannot be None for ssh transport.")
+		elif self.type == self.HTTP:
+			# NOTE: this does not work for all web-based services...
+			self.url = "http://%s:%d/" % (host,80)
+			uri = "%s:%d" % (host,80)
+
+			# create authinfo
+			authinfo = urllib2.HTTPPasswordMgrWithDefaultRealm()
+			authinfo.add_password (None, uri, username, password)
+			authhandler = urllib2.HTTPBasicAuthHandler( authinfo )
+
+			transport = urllib2.build_opener(authhandler)
+		else:
+			raise Exception("Unknown transport type: %s" % self.type)
+
+		self.transport = transport
+		return True
+
+	def close(self):
+		if self.type == self.TELNET:
+			self.transport.close() 
+		elif self.type == self.SSH:
+			self.transport.close() 
+		elif self.type == self.HTTP:
+			pass
+		else:
+			raise Exception("Unknown transport type %s" % self.type)
+		self.transport = None
+
+	def write(self, msg):
+		return self.send(msg)
+
+	def send(self, msg):
+		if self.transport == None:
+			raise ExceptionNoTransport("transport object is type None")
+			
+		return self.transport.write(msg)
+
+	def sendPassword(self, password, prompt=None):
+		if self.type == self.TELNET:
+			if prompt == None:
+				self.ifThenSend("Password", password, ExceptionPassword)
+			else:
+				self.ifThenSend(prompt, password, ExceptionPassword)
+		elif self.type == self.SSH:
+			self.ifThenSend("password:", password, ExceptionPassword)
+		elif self.type == self.HTTP:
+			pass
+		else:
+			raise Exception("Unknown transport type: %s" % self.type)
+
+	def sendHTTP(self, resource, data):
+		if self.verbose:
+			print "POSTing '%s' to %s" % (data,self.url + resource)
+
+		try:
+			f = self.transport.open(self.url + resource ,data)
+			r = f.read()
+			if self.verbose:
+				print r
+
+		except urllib2.URLError,err:
+			logger.info('Could not open http connection', err)
+			return "http transport error"
+
+		return 0
+
+	def ifThenSend(self, expected, buffer, ErrorClass=ExceptionPrompt):
+
+		if self.transport != None:
+			output = self.transport.read_until(expected, self.TELNET_TIMEOUT)
+			if output.find(expected) == -1:
+				print "OUTPUT: --%s--" % output
+				raise ErrorClass, "'%s' not found" % expected
+			else:
+				self.transport.write(buffer + "\r\n")
+		else:
+			raise ExceptionNoTransport("transport object is type None")
+
+	def ifElse(self, expected, ErrorClass):
+		try:
+			self.transport.read_until(expected, self.TELNET_TIMEOUT)
+		except:
+			raise ErrorClass("Could not find '%s' within timeout" % expected)
+
+class PCUControl(PCUModel,PCURecord):
+
+	""" 
+		There are three cases:
+			1) the pcu_record passed below includes port_status from an
+				external probe.
+			2) the external probe failed, and the values are empty
+			3) this call is made independent of port_status.
+
+		In the first case, the first open port is used.
+		In the third case, the ports are tried in sequence.
+
+		In this way, the port_status value serves only as an optimization,
+		because closed ports are avoided.  The supported_ports value should
+		order ports by their preferred usage.
+	"""
+
+	supported_ports = []
+
+	def __init__(self, plc_pcu_record, verbose, ignored=None):
+		PCUModel.__init__(self, plc_pcu_record)
+		PCURecord.__init__(self, plc_pcu_record)
+
+	def reboot(self, node_port, dryrun):
+
+		port_list = []
+		# There are two sources of potential ports.  Those that are open and
+		# those that are part of the PCU's supported_ports.  
+		#  I think we should start with supported_ports and then filter that
+		#  by the open ports.
+
+		port_list = self.supported_ports
+
+		if hasattr(self, 'port_status') and self.port_status:
+			# get out the open ports
+			port_list = filter(lambda x: self.port_status[x] == "open" , self.port_status.keys())
+			port_list = [ int(x) for x in port_list ]
+			# take only the open ports that are supported_ports
+			port_list = filter(lambda x: x in self.supported_ports, port_list)
+			if port_list == []:
+				raise ExceptionPort("No Open Port: No transport from open ports")
+
+		print port_list
+
+		ret = "No implementation for open ports on selected PCU model"
+		for port in port_list:
+			if port not in Transport.porttypemap:
+				continue
+
+			type = Transport.porttypemap[port]
+			self.transport = Transport(type, verbose)
+
+			print "checking for run_%s" % type
+			if hasattr(self, "run_%s" % type):
+				print "found run_%s" % type
+				fxn = getattr(self, "run_%s" % type)
+				ret = self.catcherror(fxn, node_port, dryrun)
+				if ret == 0: # NOTE: success!, so stop
+					break
+			else:
+				continue
+
+		return ret
+
+	def run(self, node_port, dryrun):
+		""" This function is to be defined by the specific PCU instance.  """
+		raise Exception("This function is not implemented")
+		pass
+
+	#def reboot(self, node_port, dryrun):
+
+	def catcherror(self, function, node_port, dryrun):
+		try:
+			return function(node_port, dryrun)
+		except ExceptionNotFound, err:
+			return "error: " + str(err)
+		except ExceptionPassword, err:
+			return "Password exception: " + str(err)
+		except ExceptionTimeout, err:
+			return "Timeout exception: " + str(err)
+		except ExceptionUsername, err:
+			return "No username prompt: " + str(err)
+		except ExceptionSequence, err:
+			return "Sequence error: " + str(err)
+		except ExceptionPrompt, err:
+			return "Prompt exception: " + str(err)
+		except ExceptionNoTransport, err:
+			return "No Transport: " + str(err)
+		except ExceptionPort, err:
+			return "No ports exception: " + str(err)
+		except socket.error, err:
+			return "socket error: timeout: " + str(err)
+		except urllib2.HTTPError, err:
+			return "HTTPError: " + str(err)
+		except urllib2.URLError, err:
+			return "URLError: " + str(err)
+		except EOFError, err:
+			self.transport.close()
+			import traceback
+			traceback.print_exc()
+			return "EOF connection reset" + str(err)
+		except Exception, err:
+			from monitor.common import email_exception
+			email_exception(self.host)
+			raise Exception(err)
+
+from pcucontrol.models import *
+
+def pcu_name(pcu):
+	if pcu['hostname'] is not None and pcu['hostname'] is not "":
+		return pcu['hostname']
+	elif pcu['ip'] is not None and pcu['ip'] is not "":
+		return pcu['ip']
+	else:
+		return None
+
+def get_pcu_values(pcu_id):
+	from monitor.database.info.model import FindbadPCURecord
+	print "pcuid: %s" % pcu_id
+	try:
+		pcurec = FindbadPCURecord.get_latest_by(plc_pcuid=pcu_id).first()
+		if pcurec:
+			values = pcurec.to_dict()
+		else:
+			values = None
+	except:
+		values = None
+
+	return values
+
+def reboot(nodename):
+	return reboot_policy(nodename, True, False)
+
+def reboot_str(nodename):
+	global verbose
+	continue_probe = True
+	dryrun=False
+
+	pcu = plc.getpcu(nodename)
+	if not pcu:
+		logger.debug("no pcu for %s" % nodename)
+		print "no pcu for %s" % nodename
+		return False # "%s has no pcu" % nodename
+
+	values = get_pcu_values(pcu['pcu_id'])
+	if values == None:
+		logger.debug("No values for pcu probe %s" % nodename)
+		print "No values for pcu probe %s" % nodename
+		return False #"no info for pcu_id %s" % pcu['pcu_id']
+	
+	# Try the PCU first
+	logger.debug("Trying PCU %s %s" % (pcu['hostname'], pcu['model']))
+
+	ret = reboot_test_new(nodename, values, verbose, dryrun)
+	return ret
+	
+def reboot_policy(nodename, continue_probe, dryrun):
+	global verbose
+
+	pcu = plc.getpcu(nodename)
+	if not pcu:
+		logger.debug("no pcu for %s" % nodename)
+		print "no pcu for %s" % nodename
+		return False # "%s has no pcu" % nodename
+
+	values = get_pcu_values(pcu['pcu_id'])
+	if values == None:
+		logger.debug("No values for pcu probe %s" % nodename)
+		print "No values for pcu probe %s" % nodename
+		return False #"no info for pcu_id %s" % pcu['pcu_id']
+	
+	# Try the PCU first
+	logger.debug("Trying PCU %s %s" % (pcu['hostname'], pcu['model']))
+
+	ret = reboot_test_new(nodename, values, verbose, dryrun)
+
+	if ret != 0:
+		print ret
+		return False
+	else:
+		print "return true"
+		return True
+
+class Unknown(PCUControl):
+	supported_ports = [22,23,80,443,5869,9100,16992]
+
+def model_to_object(modelname):
+	if modelname is None:
+		return ManualPCU 
+	if "AMT" in modelname:
+		return IntelAMT
+	elif "BayTech" in modelname:
+		return BayTech
+	elif "HPiLO" in modelname:
+		return HPiLO
+	elif "IPAL" in modelname:
+		return IPAL
+	elif "APC" in modelname:
+		return APCControl
+	elif "DRAC" in modelname:
+		return DRAC
+	elif "WTI" in modelname:
+		return WTIIPS4
+	elif "ePowerSwitch" in modelname:
+		return ePowerSwitchNew
+	elif "IPMI" in modelname:
+		return IPMI
+	elif "BlackBoxPSMaverick" in modelname:
+		return BlackBoxPSMaverick
+	elif "PM211MIP" in modelname:
+		return PM211MIP
+	elif "ManualPCU" in modelname:
+		return ManualPCU 
+	else:
+		print "UNKNOWN model %s"%modelname
+		return Unknown
+
+def reboot_api(node, pcu): #, verbose, dryrun):
+	rb_ret = ""
+
+	try:
+		modelname = pcu['model']
+		if modelname:
+			# get object instance 
+			instance = eval('%s(pcu, verbose)' % modelname)
+			# get pcu port 
+			i = pcu['node_ids'].index(node['node_id'])
+			p = pcu['ports'][i]
+			# reboot
+			rb_ret = instance.reboot(p, False)
+		else:
+			rb_ret =  "No modelname in PCU record."
+		# TODO: how to handle the weird, georgetown pcus, the drac faults, and ilo faults
+	except Exception, err:
+		rb_ret = str(err)
+
+	return rb_ret
+
+def convert_oldmodelname_to_newmodelname(oldmodelname, pcu_id):
+	newmodelname = None
+	update = {	'AP79xx' : 'APCControl13p13',
+				'Masterswitch' : 'APCControl13p13',
+				'DS4-RPC' : 'BayTech',
+				'IP-41x_IP-81x' : 'IPAL',
+				'DRAC3' : 'DRAC',
+				'DRAC4' : 'DRAC',
+				'ePowerSwitch' : 'ePowerSwitchOld',
+				'ilo2' : 'HPiLO',
+				'ilo1' : 'HPiLO',
+				'PM211-MIP' : 'PM211MIP',
+				'AMT2.5' : 'IntelAMT',
+				'AMT3.0' : 'IntelAMT',
+				'WTI_IPS-4' : 'WTIIPS4',
+				'unknown'  : 'ManualPCU',
+				'DRAC5'	: 'DRAC',
+				'ipmi'	: 'OpenIPMI',
+				'bbsemaverick' : 'BlackBoxPSMaverick',
+				'manualadmin'  : 'ManualPCU',
+	}
+
+	if oldmodelname in update:
+		newmodelname = update[oldmodelname]
+	else:
+		newmodelname = oldmodelname
+
+	if pcu_id in [1102,1163,1055,1111,1231,1113,1127,1128,1148]:
+		newmodelname = 'APCControl12p3'
+	elif pcu_id in [1110,86]:
+		newmodelname = 'APCControl1p4'
+	elif pcu_id in [1221,1225,1220,1192]:
+		newmodelname = 'APCControl121p3'
+	elif pcu_id in [1173,1240,47,1363,1405,1401,1372,1371]:
+		newmodelname = 'APCControl121p1'
+	elif pcu_id in [1056,1237,1052,1209,1002,1008,1013,1022]:
+		newmodelname = 'BayTechCtrlC'
+	elif pcu_id in [93]:
+		newmodelname = 'BayTechRPC3NC'
+	elif pcu_id in [1057]:
+		newmodelname = 'BayTechCtrlCUnibe'
+	elif pcu_id in [1012]:
+		newmodelname = 'BayTechRPC16'
+	elif pcu_id in [1089, 1071, 1046, 1035, 1118]:
+		newmodelname = 'ePowerSwitchNew'
+
+	return newmodelname
+
+def reboot_test_new(nodename, values, verbose, dryrun):
+	rb_ret = ""
+	if 'plc_pcu_stats' in values:
+		values.update(values['plc_pcu_stats'])
+
+	try:
+		modelname = convert_oldmodelname_to_newmodelname(values['model'], values['pcu_id'])
+		if modelname:
+			object = eval('%s(values, verbose)' % modelname)
+			rb_ret = object.reboot(values[nodename], dryrun)
+		else:
+			rb_ret =  "Not_Run"
+		# TODO: how to handle the weird, georgetown pcus, the drac faults, and ilo faults
+	except ExceptionPort, err:
+		rb_ret = str(err)
+	except NameError, err:
+		rb_ret = str(err)
+
+	return rb_ret
+
+def main():
+	logger.setLevel(logging.DEBUG)
+	ch = logging.StreamHandler()
+	ch.setLevel(logging.DEBUG)
+	formatter = logging.Formatter('LOGGER - %(message)s')
+	ch.setFormatter(formatter)
+	logger.addHandler(ch)
+
+	try:
+		if "test" in sys.argv:
+			dryrun = True
+		else:
+			dryrun = False
+
+		for node in sys.argv[1:]:
+			if node == "test": continue
+
+			print "Rebooting %s" % node
+			if reboot_policy(node, True, dryrun):
+				print "success"
+			else:
+				print "failed"
+	except Exception, err:
+		import traceback; traceback.print_exc()
+		from monitor.common import email_exception
+		email_exception(node)
+		print err
+
+if __name__ == '__main__':
+	logger = logging.getLogger("monitor")
+	main()
+	f = open("/tmp/rebootlog", 'a')
+	f.write("reboot %s\n" % sys.argv)
+	f.close()