3 # Reboot specified nodes
9 import errno, time, traceback
11 import threading, popen2
17 plc_lock = threading.Lock()
19 # Use our versions of telnetlib and pyssh
20 sys.path.insert(0, os.path.dirname(sys.argv[0]))
22 sys.path.insert(0, os.path.dirname(sys.argv[0]) + "/pyssh")
28 # Event class ID from pcu events
29 #NODE_POWER_CONTROL = 3
32 #MONITOR_USER_ID = 11142
35 logger = logging.getLogger("monitor")
# Exception types for the PCU reboot helpers. In the visible code only
# ExceptionNotFound is raised (telnet_answer) and caught (baytech_reboot);
# the other four are not referenced in this excerpt.
39 class ExceptionNotFound(Exception): pass
40 class ExceptionPassword(Exception): pass
41 class ExceptionTimeout(Exception): pass
42 class ExceptionPrompt(Exception): pass
43 class ExceptionPort(Exception): pass
# telnet_answer: wait for `expected` on the connection, then reply with
# `buffer` followed by CRLF.
#   telnet   -- object offering read_until() and write(); callers in this
#               file pass both telnetlib.Telnet and pyssh.Ssh instances.
#   expected -- prompt text to wait for (for at most TELNET_TIMEOUT).
#   buffer   -- text to send once the prompt has been seen.
# Raises ExceptionNotFound when the text read does not contain `expected`.
# NOTE(review): TELNET_TIMEOUT is not defined in the visible lines; presumably
# a module-level constant -- confirm.
45 def telnet_answer(telnet, expected, buffer):
48 output = telnet.read_until(expected, TELNET_TIMEOUT)
50 # logger.debug(output)
# The result is checked explicitly because the read is bounded by a timeout
# and may therefore come back without the prompt.
51 if output.find(expected) == -1:
# Python 2-only raise syntax (class, message).
52 raise ExceptionNotFound, "'%s' not found" % expected
54 telnet.write(buffer + "\r\n")
57 # PCU has model, host, preferred-port, user, passwd,
# Constructor: stores `verbose` and `protocol` on the instance.
# NOTE(review): `dryrun` is accepted, but no visible line stores or uses it;
# lines may be missing from this excerpt -- confirm. The enclosing class
# header is not visible here either.
60 def __init__(self, protocol, verbose, dryrun):
61 self.verbose = verbose
62 self.protocol = protocol
# Second copy of telnet_answer; the visible statements are the same as the
# earlier definition: wait for `expected`, raise ExceptionNotFound when it is
# absent from the text read, otherwise send `buffer` plus CRLF.
# NOTE(review): if this def sits inside the class that owns __init__ above,
# it has no `self` parameter, so an instance call would bind the instance to
# `telnet` -- confirm the intended scope. If it is at module level it simply
# rebinds the earlier definition.
65 def telnet_answer(telnet, expected, buffer):
68 output = telnet.read_until(expected, TELNET_TIMEOUT)
70 # logger.debug(output)
71 if output.find(expected) == -1:
# Python 2-only raise syntax (class, message).
72 raise ExceptionNotFound, "'%s' not found" % expected
74 telnet.write(buffer + "\r\n")
# Only the signature is visible in this excerpt; the body is not shown, so
# its behavior cannot be documented from here.
76 def _run(self, host, user, passwd, node_port, protocols):
# ipal_reboot: drive a telnet session to a PCU (callers label this model
# "DataProbe iPal") and send the command "P<port>" at its "Enter >" prompt.
#   ip       -- host passed to telnetlib.Telnet.
#   password -- sent in reply to the "Password >" prompt.
#   port     -- integer outlet number, formatted with %d.
#   dryrun   -- its handling is not visible in this excerpt.
# Visible return values: errno.ECONNRESET after EOFError and errno.ETIMEDOUT
# after socket.error. What the generic handler and the success path return
# is not visible here.
# NOTE(review): `verbose` is not defined in the visible lines (presumably a
# module global), and the "lock acquired/released" remarks suggest plc_lock
# is taken around the session -- neither can be confirmed from this excerpt.
84 def ipal_reboot(ip, password, port, dryrun):
93 #print "lock acquired"
96 #telnet = telnetlib.Telnet(ip) # , timeout=TELNET_TIMEOUT)
97 telnet = telnetlib.Telnet(ip, timeout=TELNET_TIMEOUT)
100 # traceback.print_exc()
103 telnet.set_debuglevel(verbose)
105 # XXX Some iPals require you to hit Enter a few times first
# First wait for the password prompt answers with blank lines only ...
106 telnet_answer(telnet, "Password >", "\r\n\r\n")
# ... the second wait sends the actual password.
109 telnet_answer(telnet, "Password >", password)
111 # XXX Some iPals require you to hit Enter a few times first
112 telnet.write("\r\n\r\n")
# "P<port>" is the command sent at the device's command prompt.
116 telnet_answer(telnet, "Enter >", "P%d" % port)
# Read up to the next prompt after the command; the result is discarded.
118 telnet.read_until("Enter >", TELNET_TIMEOUT)
123 #print "lock released"
127 except EOFError, err:
129 logger.debug("ipal_reboot: EOF")
133 traceback.print_exc()
134 #print "lock released"
136 return errno.ECONNRESET
137 except socket.error, err:
138 logger.debug("ipal_reboot: Socket Error")
141 traceback.print_exc()
143 return errno.ETIMEDOUT
145 except Exception, err:
147 logger.debug("ipal_reboot: Exception")
152 traceback.print_exc()
153 #print "lock released"
# apc_reboot: log in to an APC power controller over ssh or telnet and walk
# its text menus to reach "Immediate Reboot" for outlet `port`.
#   ip, username, password -- login target and credentials.
#   port     -- outlet number; sent as str(port) at a menu prompt.
#   protocol -- indexed as a mapping from port-number strings ("22", "23")
#               to a status string; a port is used when its status is "open".
#   dryrun   -- its handling is not visible in this excerpt.
# Visible return values: errno.ECONNRESET after EOFError, errno.ETIMEDOUT
# after socket.error, and the string "apc error: check password" following
# the generic handler. Callers therefore see both ints and strings.
# NOTE(review): reboot() passes pcu['protocol'] here, and that value is
# compared against plain strings such as "telnet" elsewhere in the file. With
# a plain string, `"22" in protocol` becomes a substring test and
# protocol['22'] raises TypeError -- confirm what reboot() should pass.
158 def apc_reboot(ip, username, password, port, protocol, dryrun):
163 # TODO: I may need to differentiate between different models of APC
165 # for instance, the original code didn't work for:
166 # planetdev03.fm.intel.com
167 # American Power Conversion
168 # Network Management Card AOS v3.3.0
169 # (c) Copyright 2005 All Rights Reserved
170 # Rack PDU APP v3.3.1
174 #if "ssh" in protocol:
# Prefer ssh when port 22 is reported open.
175 if "22" in protocol and protocol['22'] == "open":
176 transport = pyssh.Ssh(username, ip)
179 telnet_answer(transport, "password:", password)
180 #elif "telnet" in protocol:
# Otherwise fall back to telnet when port 23 is reported open.
181 elif "23" in protocol and protocol['23'] == "open":
182 transport = telnetlib.Telnet(ip, timeout=TELNET_TIMEOUT)
183 #transport = telnetlib.Telnet(ip)
184 transport.set_debuglevel(verbose)
186 telnet_answer(transport, "User Name", username)
187 telnet_answer(transport, "Password", password)
189 logger.debug("Unknown protocol %s" %protocol)
# NOTE(review): this raises a bare string. String exceptions are rejected by
# Python 2.6+ (a TypeError is raised instead); an Exception subclass would be
# the safe form -- confirm the interpreter version this runs on.
190 raise "Closed protocol ports!"
# From here on every step waits for the "\r\n> " menu prompt and sends one
# menu choice.
199 telnet_answer(transport, "\r\n> ", "1")
201 # 1- Phase Monitor/Configuration
202 # 2- Outlet Restriction Configuration
203 # 3- Outlet Control/Config
204 # 4- Power Supply Status
206 # 3- Outlet Control/Config
207 telnet_answer(transport, "\r\n> ", "2")
208 telnet_answer(transport, "\r\n> ", "1")
210 # 3- Outlet Control/Config
211 #telnet_answer(transport, "\r\n> ", "3")
# Select the outlet by number.
218 telnet_answer(transport, "\r\n> ", str(port))
221 # 2- Configure Outlet
224 telnet_answer(transport, "\r\n> ", "1")
228 # 3- Immediate Reboot
234 # 3- Immediate Reboot
235 telnet_answer(transport, "\r\n> ", "3")
# telnet_answer appends its own CRLF, so this reply ends with two CRLFs.
238 telnet_answer(transport,
239 "Enter 'YES' to continue or <ENTER> to cancel", "YES\r\n")
240 telnet_answer(transport,
241 "Press <ENTER> to continue...", "")
247 except EOFError, err:
252 return errno.ECONNRESET
253 except socket.error, err:
256 return errno.ETIMEDOUT
258 except Exception, err:
260 traceback.print_exc()
265 return "apc error: check password"
# drac_reboot: open an ssh session (pyssh.Ssh) to a remote-access controller
# and issue commands at its "[<username>]#" prompt.
#   ip, username, password -- login target and credentials.
#   dryrun -- its handling is not visible; presumably it selects between the
#             "getsysinfo" and "serveraction powercycle" commands -- confirm.
# Visible return values: errno.ETIMEDOUT after socket.error and the string
# "drac error: check password" following the generic handler.
# NOTE(review): `verbose` is not defined in the visible lines.
267 def drac_reboot(ip, username, password, dryrun):
271 ssh = pyssh.Ssh(username, ip)
272 ssh.set_debuglevel(verbose)
276 telnet_answer(ssh, "password:", password)
# Debug output written straight to stdout (Python 2 print statement).
279 print "reset or power"
281 telnet_answer(ssh, "[%s]#" % username, "getsysinfo")
284 telnet_answer(ssh, "[%s]#" % username, "serveraction powercycle")
287 telnet_answer(ssh, "[%s]#" % username, "exit")
294 except socket.error, err:
297 traceback.print_exc()
304 return errno.ETIMEDOUT
305 except Exception, err:
308 traceback.print_exc()
315 return "drac error: check password"
# ilo_reboot: open an ssh session (pyssh.Ssh) to an iLO management processor,
# change to "system1" and issue commands at the "</system1>hpiLO->" prompt.
#   ip, username, password -- login target and credentials.
#   dryrun -- its handling is not visible; presumably it selects between the
#             "POWER" and "reset" commands -- confirm.
# Visible return values: errno.ETIMEDOUT after socket.error and the string
# "ilo error: check password" following the generic handler.
# NOTE(review): `verbose` is not defined in the visible lines.
317 def ilo_reboot(ip, username, password, dryrun):
323 ssh = pyssh.Ssh(username, ip)
324 ssh.set_debuglevel(verbose)
328 telnet_answer(ssh, "password:", password)
330 # User:vici logged-in to ILOUSE701N7N4.CS.Princeton.EDU(128.112.154.171)
331 # iLO Advanced 1.26 at 10:01:40 Nov 17 2006
332 # Server Name: USE701N7N400
337 telnet_answer(ssh, "</>hpiLO->", "cd system1")
339 # Reboot Outlet N (Y/N)?
# Debug output written straight to stdout (Python 2 print statement).
340 print "reset or power"
342 telnet_answer(ssh, "</system1>hpiLO->", "POWER")
345 telnet_answer(ssh, "</system1>hpiLO->", "reset")
348 telnet_answer(ssh, "</system1>hpiLO->", "exit")
355 except socket.error, err:
358 traceback.print_exc()
365 return errno.ETIMEDOUT
366 except Exception, err:
369 traceback.print_exc()
376 return "ilo error: check password"
# baytech_reboot: open an ssh session (pyssh.Ssh) to a Baytech controller,
# pick menu entry "5" (Control Outlets) and send "Reboot <port>" at the
# "DS-RPC>" prompt, then answer the "(Y/N)?" confirmation.
#   ip, username, password -- login target and credentials.
#   port   -- integer outlet number, formatted with %d.
#   dryrun -- its handling is not visible; presumably it selects between the
#             "N" and "Y" confirmation replies -- confirm.
# Visible return values: errno.ETIMEDOUT after socket.error and the string
# "baytech error: check password" following the generic handler.
# NOTE(review): `verbose` is not defined in the visible lines.
378 def baytech_reboot(ip, username, password, port, dryrun):
385 ssh = pyssh.Ssh(username, ip)
386 ssh.set_debuglevel(verbose)
390 telnet_answer(ssh, "password:", password)
392 # PL1 comm output (2 ,1).........1
393 # PL2 comm output (2 ,2).........2
394 # PL3 comm output (2 ,3).........3
395 # no machine (2 ,4).........4
396 # Control Outlets (5 ,1).........5
397 # Logout..........................T
399 # Control Outlets (5 ,1).........5
400 telnet_answer(ssh, "Enter Request :", "5")
404 telnet_answer(ssh, "DS-RPC>", "Reboot %d" % port)
# Fallback: if the DS-RPC prompt was not seen, send the username and retry.
405 except ExceptionNotFound, msg:
406 # one machine is configured to ask for a username,
407 # even after login...
408 print "msg: %s" % msg
409 ssh.write(username + "\r\n")
410 telnet_answer(ssh, "DS-RPC>", "Reboot %d" % port)
413 # Reboot Outlet N (Y/N)?
415 telnet_answer(ssh, "(Y/N)?", "N")
417 telnet_answer(ssh, "(Y/N)?", "Y")
# Wait for the prompt to come back; an empty reply sends just CRLF.
418 telnet_answer(ssh, "DS-RPC>", "")
424 except socket.error, err:
427 traceback.print_exc()
434 return errno.ETIMEDOUT
435 except Exception, err:
438 traceback.print_exc()
445 return "baytech error: check password"
447 ### rebooting european BlackBox PSE boxes
448 # Thierry Parmentelat - May 11 2005
449 # tested on 4-ports models known as PSE505-FR
450 # uses http to POST a data 'P<port>=r'
451 # relies on basic authentication within http1.0
452 # first curl-based script was
453 # curl --http1.0 --basic --user <username>:<password> --data P<port>=r \
454 # http://<hostname>:<http_port>/cmd.html && echo OK
# bbpse_reboot: POST "P<port_in_pcu>=r" to http://<pcu_ip>:<http_port>/cmd.html
# using HTTP basic authentication via urllib2.
#   pcu_ip, username, password -- target host and credentials.
#   port_in_pcu, http_port     -- integers (both are formatted with %d).
#   dryrun                     -- its handling is not visible in this excerpt.
# Return values are not visible in this excerpt.
456 def bbpse_reboot (pcu_ip,username,password,port_in_pcu,http_port, dryrun):
460 url = "http://%s:%d/cmd.html" % (pcu_ip,http_port)
461 data= "P%d=r" % port_in_pcu
463 logger.debug("POSTing '%s' on %s" % (data,url))
# Register the credentials for "<ip>:<port>"; realm None means they apply to
# whatever realm the device announces.
465 authinfo = urllib2.HTTPPasswordMgrWithDefaultRealm()
466 uri = "%s:%d" % (pcu_ip,http_port)
467 authinfo.add_password (None, uri, username, password)
468 authhandler = urllib2.HTTPBasicAuthHandler( authinfo )
# install_opener replaces the process-wide default opener, so later
# urllib2.urlopen calls anywhere in the process go through this handler.
470 opener = urllib2.build_opener(authhandler)
471 urllib2.install_opener(opener)
# Supplying `data` makes urlopen issue a POST.
477 f = urllib2.urlopen(url,data)
484 except urllib2.URLError,err:
# NOTE(review): the message has no %s placeholder but `err` is passed as a
# formatting argument, so the logging module cannot format this record;
# the message probably needs ": %s" appended -- confirm.
485 logger.info('Could not open http connection', err)
488 ### rebooting x10toggle based systems addressed by port
489 # Marc E. Fiuczynski - May 31 2005
490 # tested on 4-ports models known as PSE505-FR
491 # uses ssh and password to login to an account
492 # that will cause the system to be powercycled.
# x10toggle_reboot: open an ssh session (pyssh.Ssh), answer the "password:"
# prompt and send "A<port>" at the "x10toggle>" prompt.
#   ip, username, password -- login target and credentials.
#   port   -- integer, formatted with %d.
#   dryrun -- its handling is not visible in this excerpt.
# Visible return value: errno.ETIMEDOUT following the generic handler; the
# success-path return is not visible.
494 def x10toggle_reboot(ip, username, password, port, dryrun):
499 ssh = pyssh.Ssh(username, ip)
503 telnet_answer(ssh, "password:", password)
507 telnet_answer(ssh, "x10toggle>", "A%d" % port)
515 except Exception, err:
522 return errno.ETIMEDOUT
524 ### rebooting Dell systems via RAC card
525 # Marc E. Fiuczynski - June 01 2005
526 # tested with David Lowenthal's itchy/scratchy nodes at UGA
# runcmd: run an external command in a daemon worker thread, feed `username`
# and `password` to its stdin (one per line), and wait up to `timeout`
# seconds for the result on a threading.Condition.
#   command  -- program to run; joined with `args` by single spaces.
#   args     -- list of argument strings.
#   timeout  -- passed to Condition.wait(); None waits indefinitely.
# Raises Exception("command timed-out: ...") on the timeout path. The visible
# lines also build a failure message from the wait status (exit code or
# terminating signal) plus the captured output; how that message and the
# success value are delivered (raise/return) is not visible -- confirm.
# NOTE(review): the command is handed to popen2.Popen4 as one string, which
# popen2 documents as being run through the shell; arguments are not quoted.
# popen2 is also deprecated since Python 2.6 in favour of subprocess.
529 def runcmd(command, args, username, password, timeout = None):
# Condition used to hand the worker's result back to the waiting caller.
532 result_ready = threading.Condition()
# Lines 536-541 appear to belong to a nested set_result helper whose header
# is not visible: it takes the condition, notifies the waiter and releases.
536 result_ready.acquire()
540 result_ready.notify()
541 result_ready.release()
# Worker body: runs in the background thread started below.
543 def do_command(command, username, password):
546 # Popen4 is a popen-type class that combines stdout and stderr
547 p = popen2.Popen4(command)
549 # read all output data
550 p.tochild.write("%s\n" % username)
551 p.tochild.write("%s\n" % password)
553 data = p.fromchild.read()
556 # might get interrupted by a signal in poll() or waitpid()
# Publish (wait status, combined output) to the caller.
559 set_result((retval, data))
# EINTR is singled out here; the handling after this test is not visible.
562 if ex.errno == errno.EINTR:
565 except Exception, ex:
569 command = " ".join([command] + args)
571 worker = threading.Thread(target = do_command, args = (command, username, password, ))
# Daemon thread: it will not keep the interpreter alive if the command hangs.
572 worker.setDaemon(True)
573 result_ready.acquire()
575 result_ready.wait(timeout)
578 raise Exception, "command timed-out: '%s'" % command
580 result_ready.release()
# The worker may hand back an exception object instead of a result tuple.
583 if isinstance(result, Exception):
586 (retval, data) = result
# `retval` is a wait status as decoded by the os.W* helpers.
587 if os.WIFEXITED(retval) and os.WEXITSTATUS(retval) == 0:
590 out = "system command ('%s') " % command
591 if os.WIFEXITED(retval):
592 out += "failed, rc = %d" % os.WEXITSTATUS(retval)
594 out += "killed by signal %d" % os.WTERMSIG(retval)
596 out += "; output follows:\n" + data
# racadm_reboot: run /usr/sbin/racadm through runcmd with the arguments
# "-r <ip> -i serveraction powercycle"; username and password are passed to
# runcmd. On a dry run `output` is set to a fixed string instead.
#   port -- not used by any visible line.
# Visible return value: errno.ETIMEDOUT following the exception handler; the
# success-path return is not visible.
599 def racadm_reboot(ip, username, password, port, dryrun):
603 cmd = "/usr/sbin/racadm"
# The whole argument string is passed as a single list element; runcmd joins
# it with the command using spaces.
606 output = runcmd(cmd, ["-r %s -i serveraction powercycle" % ip],
609 output = "dryrun of racadm command"
# NOTE(review): the message says "without output" yet logs the output value;
# the wording looks off -- confirm the intent.
611 logger.debug("runcmd returned without output %s" % output)
616 except Exception, err:
617 logger.debug("runcmd raised exception %s" % err)
620 return errno.ETIMEDOUT
# Body of a helper whose def line is not visible in this excerpt (presumably
# pcu_name(pcu), which reboot_new calls): it returns pcu['hostname'] when
# that is set and otherwise tests pcu['ip'].
# NOTE(review): `is not ""` compares object identity, not value; whether an
# empty string is rejected depends on interpreter string interning. `!= ""`
# (or a plain truth test) is the reliable form -- confirm and fix.
623 if pcu['hostname'] is not None and pcu['hostname'] is not "":
624 return pcu['hostname']
625 elif pcu['ip'] is not None and pcu['ip'] is not "":
# get_pcu_values: load the "findbadpcus" data set via soltesz.dbLoad and read
# the 'values' entry stored under fb['nodes']["id_<pcu_id>"].
# The data set is reloaded on every call (see the TODO below). The return
# statement and any handling of a missing key are not visible here.
630 def get_pcu_values(pcu_id):
631 # TODO: obviously, this shouldn't be loaded each time...
633 fb =soltesz.dbLoad("findbadpcus")
636 values = fb['nodes']["id_%s" % pcu_id]['values']
# reboot_new: look up the PCU for `nodename`, load its probed values with
# get_pcu_values, and dispatch to the model-specific *_reboot helper.
#   nodename       -- node whose PCU is looked up via plc.getpcu.
#   continue_probe -- every reboot branch requires it to be true; a separate
#                     branch handles continue_probe == False.
#   dryrun         -- presumably forwarded to the helpers; the argument lists
#                     of the helper calls are cut off in this excerpt.
# A branch stores the helper's result in rb_ret, or the string
# "Unsupported_Port" / "Unsupported_PCU" when the required port is not
# "open". The final return is not visible.
# NOTE(review): values['portstatus'][<port>] is indexed directly, so a missing
# port key raises KeyError unless a handler outside this excerpt catches it.
642 def reboot_new(nodename, continue_probe, dryrun):
644 pcu = plc.getpcu(nodename)
648 values = get_pcu_values(pcu['pcu_id'])
653 logger.debug("Trying PCU %s %s" % (pcu['hostname'], pcu['model']))
655 # DataProbe iPal (many sites)
656 if continue_probe and values['model'].find("Dataprobe IP-41x/IP-81x") >= 0:
657 if values['portstatus']['23'] == "open":
658 rb_ret = ipal_reboot(pcu_name(values),
663 rb_ret = "Unsupported_Port"
666 # APC Masterswitch (Berkeley)
667 elif continue_probe and values['model'].find("APC AP79xx/Masterswitch") >= 0:
668 if values['portstatus']['22'] == "open" or \
669 values['portstatus']['23'] == "open":
670 rb_ret = apc_reboot(pcu_name(values),
# The whole portstatus mapping is handed over as apc_reboot's `protocol`.
674 values['portstatus'],
677 rb_ret = "Unsupported_Port"
679 elif continue_probe and values['model'].find("Baytech DS4-RPC") >= 0:
680 if values['portstatus']['22'] == "open":
681 rb_ret = baytech_reboot(pcu_name(values),
687 rb_ret = "Unsupported_Port"
691 elif continue_probe and values['model'].find("HP iLO") >= 0:
692 if values['portstatus']['22'] == "open":
693 rb_ret = ilo_reboot(pcu_name(values),
698 rb_ret = "Unsupported_Port"
701 elif continue_probe and values['model'].find("Dell RAC") >= 0:
702 if values['portstatus']['22'] == "open":
703 rb_ret = drac_reboot(pcu_name(values),
708 rb_ret = "Unsupported_Port"
711 # BlackBox PSExxx-xx (e.g. PSE505-FR)
712 elif continue_probe and \
713 (values['model'].find("BlackBox PS5xx") >= 0 or
714 values['model'].find("ePowerSwitch 1/4/8x") >=0 ):
715 if values['portstatus']['80'] == "open":
716 rb_ret = bbpse_reboot(pcu_name(values),
# NOTE(review): the other closed-port branches use "Unsupported_Port"; this
# one reports "Unsupported_PCU" -- confirm whether that is intentional.
723 rb_ret = "Unsupported_PCU"
726 elif continue_probe and values['protocol'] == "ssh" and \
727 values['model'] == "x10toggle":
728 rb_ret = x10toggle_reboot(pcu_name(values),
734 elif continue_probe and values['protocol'] == "racadm" and \
735 values['model'] == "RAC":
736 rb_ret = racadm_reboot(pcu_name(values),
742 rb_ret = "Unsupported_PCU"
744 elif continue_probe == False:
745 if 'portstatus' in values:
# reboot: look up the PCU record for `nodename` with plc.getpcu and dispatch
# on pcu['model'] / pcu['protocol'] to the matching *_reboot helper. The
# outlet number is read from pcu[nodename]. Unmatched PCUs set
# err = errno.EPROTONOSUPPORT.
# NOTE(review): plc.nodePOD(nodename) is called near the top; the condition
# guarding it (presumably "no PCU found") is not visible -- confirm. The
# final return statement is not visible either.
758 # Returns true if rebooted via PCU
759 def reboot(nodename, dryrun):
760 pcu = plc.getpcu(nodename)
762 plc.nodePOD(nodename)
765 logger.debug("Trying PCU %s %s" % (pcu['hostname'], pcu['model']))
767 # APC Masterswitch (Berkeley)
768 if pcu['model'] == "APC Masterswitch":
# NOTE(review): apc_reboot indexes its `protocol` argument like a dict
# (protocol['22']); pcu['protocol'] is compared to plain strings below, so
# this call looks incompatible -- confirm.
769 err = apc_reboot(pcu['ip'], pcu['username'],pcu['password'],
770 pcu[nodename], pcu['protocol'], dryrun)
772 # DataProbe iPal (many sites)
773 elif pcu['protocol'] == "telnet" and pcu['model'].find("IP-4") >= 0:
774 err = ipal_reboot(pcu['ip'],pcu['password'], pcu[nodename], dryrun)
777 elif pcu['protocol'] == "ssh" and \
778 (pcu['model'].find("Baytech") >= 0 or pcu['model'].find("DS4") >= 0):
779 err = baytech_reboot(pcu['ip'], pcu['username'],pcu['password'], pcu[nodename], dryrun)
781 # BlackBox PSExxx-xx (e.g. PSE505-FR)
782 elif pcu['protocol'] == "http" and (pcu['model'] == "bbpse"):
# HTTP port is fixed at 80 for this model.
783 err = bbpse_reboot(pcu['ip'], pcu['username'], pcu['password'], pcu[nodename],80, dryrun)
786 elif pcu['protocol'] == "ssh" and (pcu['model'] == "x10toggle"):
787 err = x10toggle_reboot(pcu['ip'], pcu['username'],pcu['password'], pcu[nodename], dryrun)
790 elif pcu['protocol'] == "racadm" and (pcu['model'] == "RAC"):
# NOTE(review): `pcu_` is not defined anywhere in the visible code, so this
# branch raises NameError when reached; every sibling branch uses
# pcu[nodename] -- almost certainly a typo, confirm and fix.
791 err = racadm_reboot(pcu['ip'], pcu['username'],pcu['password'], pcu_[nodename], dryrun)
793 # Unknown or unsupported
795 err = errno.EPROTONOSUPPORT
799 #def get_suggested(suggestion_id,db):
801 # sql= """select node_id,pcu_id from nodes where suggestion = %d """\
804 # nodes = db.query(sql).dictresult()
805 # except pg.ProgrammingError, err:
806 # print( "Database error for query: %s\n%s" % (sql,err) )
810 #def get_pcu_info(node_id,pcu_id,db):
811 # sql= """select port_number from pcu_ports where node_id = %d and pcu_id = %d """\
814 # port_number = db.query(sql).dictresult()
815 # except pg.ProgrammingError, err:
816 # print( "Database error for query: %s\n%s" % (sql,err) )
819 # sql= """select * from pcu where pcu_id = %d """\
822 # pcu = db.query(sql).dictresult()
823 # except pg.ProgrammingError, err:
824 # print( "Database error for query: %s\n%s" % (sql,err) )
827 # result = {'node_id':node_id,'pcu_id':pcu_id,'port_number':port_number[0]['port_number'],
828 # 'ip':pcu[0]['ip'],'username':pcu[0]['username'],'password':pcu[0]['password'],\
829 # 'model':pcu[0]['model'],'protocol':pcu[0]['protocol'],'hostname':pcu[0]['hostname']}
833 #def add_plc_event(node_id,err,db):
834 # site_id = plc_db_utils.get_site_from_node_id(node_id,db)
835 # message = "PCU reboot by monitor-msgs@planet-lab.org: %s" % os.strerror(err)
837 # sql = """insert into events (event_class_id,message,person_id,node_id,site_id) values """\
838 # """(%d,'%s',%d,%d,%d)""" % (NODE_POWER_CONTROL,message,MONITOR_USER_ID,node_id,site_id)
843 # except pg.ProgrammingError, err:
844 # print( "Database error for: %s\n%s" % (sql,err) )
# Body of the script entry point (its def line is not visible in this
# excerpt): sends DEBUG-level "monitor" log records to a stream handler with
# a "LOGGER - " prefix, then tries to reboot two hard-coded nodes.
849 logger.setLevel(logging.DEBUG)
850 ch = logging.StreamHandler()
851 ch.setLevel(logging.DEBUG)
852 formatter = logging.Formatter('LOGGER - %(message)s')
853 ch.setFormatter(formatter)
854 logger.addHandler(ch)
# NOTE(review): reboot() is defined as reboot(nodename, dryrun); these calls
# pass only the node name, which raises TypeError. The handler below would
# catch it if the calls sit inside its try block -- confirm.
858 reboot("planetlab2.cs.uchicago.edu")
859 reboot("alice.cs.princeton.edu")
860 except Exception, err:
862 # used later for pretty printing
863 # pp = pprint.PrettyPrinter(indent=2)
868 # plc_db = plc_dbs.open_plc_db_write()
869 # mon_db = plc_dbs.open_mon_db()
871 # 5 = needs script reboot - fix this later
872 # nodes = get_suggested(5,mon_db)
876 # pcu = get_pcu_info(row['node_id'],row['pcu_id'],plc_db)
877 # add_plc_event(row['node_id'],err,plc_db)
# Script entry guard: runs only when the file is executed directly. It
# re-fetches the "monitor" logger, the same name used at module level; any
# further statements in this block are not visible in this excerpt.
879 if __name__ == '__main__':
881 logger = logging.getLogger("monitor")