X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=reboot.py;h=461009597f6f88328d93e6c996439c839d5d4d18;hb=ba703068c955a5cb38c65a5744e0a69672f290f3;hp=297abe631d56bcc2e7d0048fe121c103e0893f55;hpb=652c9acfcc50bdaa6df1e3b88440f7afc50e0892;p=monitor.git diff --git a/reboot.py b/reboot.py index 297abe6..4610095 100755 --- a/reboot.py +++ b/reboot.py @@ -10,9 +10,12 @@ import errno, time, traceback import urllib2 import threading, popen2 import array, struct -from socket import * +#from socket import * +import socket import plc +plc_lock = threading.Lock() + # Use our versions of telnetlib and pyssh sys.path.insert(0, os.path.dirname(sys.argv[0])) import telnetlib @@ -20,7 +23,7 @@ sys.path.insert(0, os.path.dirname(sys.argv[0]) + "/pyssh") import pyssh # Timeouts in seconds -TELNET_TIMEOUT = 20 +TELNET_TIMEOUT = 30 # Event class ID from pcu events #NODE_POWER_CONTROL = 3 @@ -31,27 +34,72 @@ TELNET_TIMEOUT = 20 import logging logger = logging.getLogger("monitor") verbose = 1 -dryrun = 0; +#dryrun = 0; + +class ExceptionNotFound(Exception): pass +class ExceptionPassword(Exception): pass +class ExceptionTimeout(Exception): pass +class ExceptionPrompt(Exception): pass +class ExceptionPort(Exception): pass def telnet_answer(telnet, expected, buffer): global verbose output = telnet.read_until(expected, TELNET_TIMEOUT) - if verbose: - logger.debug(output) + #if verbose: + # logger.debug(output) if output.find(expected) == -1: - raise Exception, "'%s' not found" % expected + raise ExceptionNotFound, "'%s' not found" % expected else: telnet.write(buffer + "\r\n") -def ipal_reboot(ip, password, port): - global dryrun, verbose +# PCU has model, host, preferred-port, user, passwd, + +class PCUExpect: + def __init__(self, protocol, verbose, dryrun): + self.verbose = verbose + self.protocol = protocol + self.dryrun = dryrun + + def telnet_answer(telnet, expected, buffer): + global verbose + + output = telnet.read_until(expected, TELNET_TIMEOUT) + #if verbose: + # logger.debug(output) + if output.find(expected) == -1: + raise ExceptionNotFound, "'%s' not found" % expected + else: + telnet.write(buffer + "\r\n") + + def _run(self, host, user, passwd, node_port, protocols): + self.run() + + def run(self): + pass + + + +def ipal_reboot(ip, password, port, dryrun): + global verbose + global plc_lock + telnet = None try: + #plc_lock.acquire() + #print "lock acquired" + + #try: + #telnet = telnetlib.Telnet(ip) # , timeout=TELNET_TIMEOUT) telnet = telnetlib.Telnet(ip, timeout=TELNET_TIMEOUT) + #except: + # import traceback + # traceback.print_exc() + + telnet.set_debuglevel(verbose) # XXX Some iPals require you to hit Enter a few times first @@ -60,6 +108,9 @@ def ipal_reboot(ip, password, port): # Login telnet_answer(telnet, "Password >", password) + # XXX Some iPals require you to hit Enter a few times first + telnet.write("\r\n\r\n") + # P# - Pulse relay if not dryrun: telnet_answer(telnet, "Enter >", "P%d" % port) @@ -68,33 +119,184 @@ def ipal_reboot(ip, password, port): # Close telnet.close() + + #print "lock released" + #plc_lock.release() return 0 except EOFError, err: if verbose: + logger.debug("ipal_reboot: EOF") logger.debug(err) telnet.close() + import traceback + traceback.print_exc() + #print "lock released" + #plc_lock.release() return errno.ECONNRESET + except socket.error, err: + logger.debug("ipal_reboot: Socket Error") + logger.debug(err) + import traceback + traceback.print_exc() + + return errno.ETIMEDOUT + except Exception, err: if verbose: + logger.debug("ipal_reboot: Exception") logger.debug(err) if telnet: telnet.close() + import traceback + traceback.print_exc() + #print "lock released" + #plc_lock.release() + return "ipal error" + +def apc_reboot_original(ip, username, password, port, protocol, dryrun): + global verbose + + transport = None + + # TODO: I may need to differentiate between different models of APC + # hardware... + # for instance, the original code didn't work for: + # planetdev03.fm.intel.com + # American Power Conversion + # Network Management Card AOS v3.3.0 + # (c) Copyright 2005 All Rights Reserved + # Rack PDU APP v3.3.1 + + + try: + #if "ssh" in protocol: + if "22" in protocol and protocol['22'] == "open": + transport = pyssh.Ssh(username, ip) + transport.open() + # Login + telnet_answer(transport, "password:", password) + #elif "telnet" in protocol: + elif "23" in protocol and protocol['23'] == "open": + transport = telnetlib.Telnet(ip, timeout=TELNET_TIMEOUT) + #transport = telnetlib.Telnet(ip) + transport.set_debuglevel(verbose) + # Login + telnet_answer(transport, "User Name", username) + telnet_answer(transport, "Password", password) + else: + logger.debug("Unknown protocol %s" %protocol) + raise "Closed protocol ports!" + + + # 1- Device Manager + # 2- Network + # 3- System + # 4- Logout + + # 1- Device Manager + telnet_answer(transport, "\r\n> ", "1") + + # 1- Phase Monitor/Configuration + # 2- Outlet Restriction Configuration + # 3- Outlet Control/Config + # 4- Power Supply Status + + # 3- Outlet Control/Config + #telnet_answer(transport, "\r\n> ", "2") + #telnet_answer(transport, "\r\n> ", "1") + + # 3- Outlet Control/Config + telnet_answer(transport, "\r\n> ", "3") + + # 1- Outlet 1 + # 2- Outlet 2 + # ... + + # n- Outlet n + telnet_answer(transport, "\r\n> ", str(port)) + + # 1- Control Outlet + # 2- Configure Outlet + + # 1- Control Outlet + telnet_answer(transport, "\r\n> ", "1") + + # 1- Immediate On + # 2- Immediate Off + # 3- Immediate Reboot + # 4- Delayed On + # 5- Delayed Off + # 6- Delayed Reboot + # 7- Cancel + + # 3- Immediate Reboot + telnet_answer(transport, "\r\n> ", "3") + + if not dryrun: + telnet_answer(transport, + "Enter 'YES' to continue or to cancel", "YES\r\n") + telnet_answer(transport, + "Press to continue...", "") + + # Close + transport.close() + return 0 + + except EOFError, err: + if verbose: + logger.debug(err) + if transport: + transport.close() + return errno.ECONNRESET + except socket.error, err: + if verbose: + logger.debug(err) return errno.ETIMEDOUT + except Exception, err: + import traceback + traceback.print_exc() + if verbose: + logger.debug(err) + if transport: + transport.close() + return "apc error: check password" + +def apc_reboot(ip, username, password, port, protocol, dryrun): + global verbose -def apc_reboot(ip, username, password, port): - global dryrun, verbose + transport = None + + # TODO: I may need to differentiate between different models of APC + # hardware... + # for instance, the original code didn't work for: + # planetdev03.fm.intel.com + # American Power Conversion + # Network Management Card AOS v3.3.0 + # (c) Copyright 2005 All Rights Reserved + # Rack PDU APP v3.3.1 - telnet = None try: - telnet = telnetlib.Telnet(ip, timeout=TELNET_TIMEOUT) - telnet.set_debuglevel(verbose) + #if "ssh" in protocol: + if "22" in protocol and protocol['22'] == "open": + transport = pyssh.Ssh(username, ip) + transport.open() + # Login + telnet_answer(transport, "password:", password) + #elif "telnet" in protocol: + elif "23" in protocol and protocol['23'] == "open": + transport = telnetlib.Telnet(ip, timeout=TELNET_TIMEOUT) + #transport = telnetlib.Telnet(ip) + transport.set_debuglevel(verbose) + # Login + telnet_answer(transport, "User Name", username) + telnet_answer(transport, "Password", password) + else: + logger.debug("Unknown protocol %s" %protocol) + raise "Closed protocol ports!" - # Login - telnet_answer(telnet, "User Name", username) - telnet_answer(telnet, "Password", password) # 1- Device Manager # 2- Network @@ -102,7 +304,7 @@ def apc_reboot(ip, username, password, port): # 4- Logout # 1- Device Manager - telnet_answer(telnet, "\r\n> ", "1") + telnet_answer(transport, "\r\n> ", "1") # 1- Phase Monitor/Configuration # 2- Outlet Restriction Configuration @@ -110,20 +312,24 @@ def apc_reboot(ip, username, password, port): # 4- Power Supply Status # 3- Outlet Control/Config - telnet_answer(telnet, "\r\n> ", "3") + telnet_answer(transport, "\r\n> ", "2") + telnet_answer(transport, "\r\n> ", "1") + + # 3- Outlet Control/Config + #telnet_answer(transport, "\r\n> ", "3") # 1- Outlet 1 # 2- Outlet 2 # ... # n- Outlet n - telnet_answer(telnet, "\r\n> ", str(port)) + telnet_answer(transport, "\r\n> ", str(port)) # 1- Control Outlet # 2- Configure Outlet # 1- Control Outlet - telnet_answer(telnet, "\r\n> ", "1") + telnet_answer(transport, "\r\n> ", "1") # 1- Immediate On # 2- Immediate Off @@ -134,39 +340,158 @@ def apc_reboot(ip, username, password, port): # 7- Cancel # 3- Immediate Reboot - telnet_answer(telnet, "\r\n> ", "3") + telnet_answer(transport, "\r\n> ", "3") if not dryrun: - telnet_answer(telnet, + telnet_answer(transport, "Enter 'YES' to continue or to cancel", "YES\r\n") - telnet_answer(telnet, + telnet_answer(transport, "Press to continue...", "") # Close - telnet.close() + transport.close() return 0 except EOFError, err: if verbose: logger.debug(err) - if telnet: - telnet.close() + if transport: + transport.close() return errno.ECONNRESET + except socket.error, err: + if verbose: + logger.debug(err) + return errno.ETIMEDOUT + except Exception, err: + import traceback + traceback.print_exc() if verbose: logger.debug(err) - if telnet: - telnet.close() + if transport: + transport.close() + return apc_reboot_original(ip, username, password, port, protocol, dryrun) + +def drac_reboot(ip, username, password, dryrun): + global verbose + ssh = None + try: + ssh = pyssh.Ssh(username, ip) + ssh.set_debuglevel(verbose) + ssh.open() + # Login + print "password" + telnet_answer(ssh, "password:", password) + + # Testing Reboot ? + print "reset or power" + if dryrun: + telnet_answer(ssh, "[%s]#" % username, "getsysinfo") + else: + # Reset this machine + telnet_answer(ssh, "[%s]#" % username, "serveraction powercycle") + + print "exit" + telnet_answer(ssh, "[%s]#" % username, "exit") + + # Close + print "close" + output = ssh.close() + return 0 + + except socket.error, err: + print "exception" + import traceback + traceback.print_exc() + if verbose: + logger.debug(err) + if ssh: + output = ssh.close() + if verbose: + logger.debug(err) return errno.ETIMEDOUT + except Exception, err: + print "exception" + import traceback + traceback.print_exc() + if verbose: + logger.debug(err) + if ssh: + output = ssh.close() + if verbose: + logger.debug(err) + return "drac error: check password" + +def ilo_reboot(ip, username, password, dryrun): + global verbose + ssh = None + + try: + ssh = pyssh.Ssh(username, ip) + ssh.set_debuglevel(verbose) + ssh.open() + # Login + print "password" + telnet_answer(ssh, "password:", password) + + # User:vici logged-in to ILOUSE701N7N4.CS.Princeton.EDU(128.112.154.171) + # iLO Advanced 1.26 at 10:01:40 Nov 17 2006 + # Server Name: USE701N7N400 + # Server Power: On + # + # hpiLO-> + print "cd system1" + telnet_answer(ssh, "hpiLO->", "cd system1") + + # Reboot Outlet N (Y/N)? + print "reset or power" + if dryrun: + telnet_answer(ssh, "hpiLO->", "POWER") + else: + # Reset this machine + telnet_answer(ssh, "hpiLO->", "reset") + + print "exit" + telnet_answer(ssh, "hpiLO->", "exit") + + # Close + print "close" + output = ssh.close() + return 0 + + except socket.error, err: + print "exception" + import traceback + traceback.print_exc() + if verbose: + logger.debug(err) + if ssh: + output = ssh.close() + if verbose: + logger.debug(err) + return errno.ETIMEDOUT + except Exception, err: + print "exception" + import traceback + traceback.print_exc() + if verbose: + logger.debug(err) + if ssh: + output = ssh.close() + if verbose: + logger.debug(err) + return "ilo error: check password" -def baytech_reboot(ip, username, password, port): - global dryrun, verbose +def baytech_reboot(ip, username, password, port, dryrun): + global verbose ssh = None + #verbose = 1 try: ssh = pyssh.Ssh(username, ip) + ssh.set_debuglevel(verbose) ssh.open() # Login @@ -183,7 +508,15 @@ def baytech_reboot(ip, username, password, port): telnet_answer(ssh, "Enter Request :", "5") # Reboot N - telnet_answer(ssh, "DS-RPC>", "Reboot %d" % port) + try: + telnet_answer(ssh, "DS-RPC>", "Reboot %d" % port) + except ExceptionNotFound, msg: + # one machine is configured to ask for a username, + # even after login... + print "msg: %s" % msg + ssh.write(username + "\r\n") + telnet_answer(ssh, "DS-RPC>", "Reboot %d" % port) + # Reboot Outlet N (Y/N)? if dryrun: @@ -194,11 +527,12 @@ def baytech_reboot(ip, username, password, port): # Close output = ssh.close() - if verbose: - logger.debug(err) return 0 - except Exception, err: + except socket.error, err: + print "exception" + import traceback + traceback.print_exc() if verbose: logger.debug(err) if ssh: @@ -206,6 +540,17 @@ def baytech_reboot(ip, username, password, port): if verbose: logger.debug(err) return errno.ETIMEDOUT + except Exception, err: + print "exception" + import traceback + traceback.print_exc() + if verbose: + logger.debug(err) + if ssh: + output = ssh.close() + if verbose: + logger.debug(err) + return "baytech error: check password" ### rebooting european BlackBox PSE boxes # Thierry Parmentelat - May 11 2005 @@ -216,9 +561,9 @@ def baytech_reboot(ip, username, password, port): # curl --http1.0 --basic --user : --data P=r \ # http://:/cmd.html && echo OK -def bbpse_reboot (pcu_ip,username,password,port_in_pcu,http_port): +def bbpse_reboot (pcu_ip,username,password,port_in_pcu,http_port, dryrun): - global dryrun, verbose + global verbose url = "http://%s:%d/cmd.html" % (pcu_ip,http_port) data= "P%d=r" % port_in_pcu @@ -246,7 +591,7 @@ def bbpse_reboot (pcu_ip,username,password,port_in_pcu,http_port): except urllib2.URLError,err: logger.info('Could not open http connection', err) - return -1 + return "bbpse error" ### rebooting x10toggle based systems addressed by port # Marc E. Fiuczynski - May 31 2005 @@ -254,8 +599,8 @@ def bbpse_reboot (pcu_ip,username,password,port_in_pcu,http_port): # uses ssh and password to login to an account # that will cause the system to be powercycled. -def x10toggle_reboot(ip, username, password, port): - global dryrun, verbose +def x10toggle_reboot(ip, username, password, port, dryrun): + global verbose ssh = None try: @@ -359,8 +704,8 @@ def runcmd(command, args, username, password, timeout = None): out += "; output follows:\n" + data raise Exception, out -def racadm_reboot(ip, username, password, port): - global dryrun, verbose +def racadm_reboot(ip, username, password, port, dryrun): + global verbose try: cmd = "/usr/sbin/racadm" @@ -382,8 +727,156 @@ def racadm_reboot(ip, username, password, port): logger.debug(err) return errno.ETIMEDOUT +def pcu_name(pcu): + if pcu['hostname'] is not None and pcu['hostname'] is not "": + return pcu['hostname'] + elif pcu['ip'] is not None and pcu['ip'] is not "": + return pcu['ip'] + else: + return None + +def get_pcu_values(pcu_id): + # TODO: obviously, this shouldn't be loaded each time... + import soltesz + fb =soltesz.dbLoad("findbadpcus") + + try: + values = fb['nodes']["id_%s" % pcu_id]['values'] + except: + values = None + + return values + +def check_open_port(values, port_list): + ret = False + + if 'portstatus' in values: + for port in port_list: + if port in values['portstatus'] and \ + values['portstatus'][port] == "open": + + ret = True + + return ret + + +def reboot_new(nodename, continue_probe, dryrun): + + pcu = plc.getpcu(nodename) + if not pcu: + return False + + values = get_pcu_values(pcu['pcu_id']) + if values == None: + return False + + # Try the PCU first + logger.debug("Trying PCU %s %s" % (pcu['hostname'], pcu['model'])) + + # DataProbe iPal (many sites) + if continue_probe and values['model'].find("Dataprobe IP-41x/IP-81x") >= 0: + if check_open_port(values, ['23']): + rb_ret = ipal_reboot(pcu_name(values), + values['password'], + pcu[nodename], + dryrun) + else: + rb_ret = "Unsupported_Port" + + + # APC Masterswitch (Berkeley) + elif continue_probe and values['model'].find("APC AP79xx/Masterswitch") >= 0: + if check_open_port(values, ['22', '23']): + rb_ret = apc_reboot(pcu_name(values), + values['username'], + values['password'], + pcu[nodename], + values['portstatus'], + dryrun) + else: + rb_ret = "Unsupported_Port" + # BayTech DS4-RPC + elif continue_probe and values['model'].find("Baytech DS4-RPC") >= 0: + if check_open_port(values, ['22']): + rb_ret = baytech_reboot(pcu_name(values), + values['username'], + values['password'], + pcu[nodename], + dryrun) + else: + rb_ret = "Unsupported_Port" + + + # iLO + elif continue_probe and values['model'].find("HP iLO") >= 0: + if check_open_port(values, ['22']): + rb_ret = ilo_reboot(pcu_name(values), + values['username'], + values['password'], + dryrun) + else: + rb_ret = "Unsupported_Port" + + # DRAC ssh + elif continue_probe and values['model'].find("Dell RAC") >= 0: + if check_open_port(values, ['22']): + rb_ret = drac_reboot(pcu_name(values), + values['username'], + values['password'], + dryrun) + else: + rb_ret = "Unsupported_Port" + + + # BlackBox PSExxx-xx (e.g. PSE505-FR) + elif continue_probe and \ + (values['model'].find("BlackBox PS5xx") >= 0 or + values['model'].find("ePowerSwitch 1/4/8x") >=0 ): + if check_open_port(values, ['80']): + rb_ret = bbpse_reboot(pcu_name(values), + values['username'], + values['password'], + pcu[nodename], + 80, + dryrun) + else: + rb_ret = "Unsupported_PCU" + + # x10toggle + elif continue_probe and values['protocol'] == "ssh" and \ + values['model'] == "x10toggle": + rb_ret = x10toggle_reboot(pcu_name(values), + values['username'], + values['password'], + pcu[nodename], + dryrun) + # ???? + elif continue_probe and values['protocol'] == "racadm" and \ + values['model'] == "RAC": + rb_ret = racadm_reboot(pcu_name(values), + values['username'], + values['password'], + pcu[nodename], + dryrun) + elif continue_probe: + rb_ret = "Unsupported_PCU" + + elif continue_probe == False: + if 'portstatus' in values: + rb_ret = "NetDown" + else: + rb_ret = "Not_Run" + else: + rb_ret = -1 + + if rb_ret != 0: + return False + else: + return True + + # Returns true if rebooted via PCU -def reboot(nodename): +def reboot(nodename, dryrun): pcu = plc.getpcu(nodename) if not pcu: plc.nodePOD(nodename) @@ -392,29 +885,30 @@ def reboot(nodename): logger.debug("Trying PCU %s %s" % (pcu['hostname'], pcu['model'])) # APC Masterswitch (Berkeley) - if pcu['protocol'] == "telnet" and pcu['model'] == "APC Masterswitch": - err = apc_reboot(pcu['ip'], pcu['username'],pcu['password'], pcu[nodename]) + if pcu['model'] == "APC Masterswitch": + err = apc_reboot(pcu['ip'], pcu['username'],pcu['password'], + pcu[nodename], pcu['protocol'], dryrun) # DataProbe iPal (many sites) elif pcu['protocol'] == "telnet" and pcu['model'].find("IP-4") >= 0: - err = ipal_reboot(pcu['ip'],pcu['password'], pcu[nodename]) + err = ipal_reboot(pcu['ip'],pcu['password'], pcu[nodename], dryrun) # BayTech DS4-RPC elif pcu['protocol'] == "ssh" and \ (pcu['model'].find("Baytech") >= 0 or pcu['model'].find("DS4") >= 0): - err = baytech_reboot(pcu['ip'], pcu['username'],pcu['password'], pcu[nodename]) + err = baytech_reboot(pcu['ip'], pcu['username'],pcu['password'], pcu[nodename], dryrun) # BlackBox PSExxx-xx (e.g. PSE505-FR) elif pcu['protocol'] == "http" and (pcu['model'] == "bbpse"): - err = bbpse_reboot(pcu['ip'], pcu['username'], pcu['password'], pcu[nodename],80) + err = bbpse_reboot(pcu['ip'], pcu['username'], pcu['password'], pcu[nodename],80, dryrun) # x10toggle elif pcu['protocol'] == "ssh" and (pcu['model'] == "x10toggle"): - err = x10toggle_reboot(pcu['ip'], pcu['username'],pcu['password'], pcu[nodename]) + err = x10toggle_reboot(pcu['ip'], pcu['username'],pcu['password'], pcu[nodename], dryrun) - # x10toggle + # elif pcu['protocol'] == "racadm" and (pcu['model'] == "RAC"): - err = racadm_reboot(pcu['ip'], pcu['username'],pcu['password'], pcu_[nodename]) + err = racadm_reboot(pcu['ip'], pcu['username'],pcu['password'], pcu_[nodename], dryrun) # Unknown or unsupported else: