3 # Reboot specified nodes
9 import errno, time, traceback
11 import threading, popen2
17 plc_lock = threading.Lock()
19 # Use our versions of telnetlib and pyssh
20 sys.path.insert(0, os.path.dirname(sys.argv[0]))
22 sys.path.insert(0, os.path.dirname(sys.argv[0]) + "/pyssh")
28 # Event class ID from pcu events
29 #NODE_POWER_CONTROL = 3
32 #MONITOR_USER_ID = 11142
35 logger = logging.getLogger("monitor")
class ExceptionNotFound(Exception):
    """Raised by telnet_answer when the expected text is not in the output read."""


class ExceptionPassword(Exception):
    """Password-related failure (presumably a rejected login; no raise site visible here)."""


class ExceptionTimeout(Exception):
    """Timeout-related failure (presumably an unresponsive device; no raise site visible here)."""


class ExceptionPrompt(Exception):
    """Prompt-related failure (presumably an unexpected prompt; no raise site visible here)."""


class ExceptionPort(Exception):
    """Port-related failure (presumably an unusable port; no raise site visible here)."""
def telnet_answer(telnet, expected, buffer):
    """Wait for a prompt on a telnet-like connection, then send a reply.

    telnet   -- object offering read_until(text, timeout) and write(text)
    expected -- text that must appear in the data returned by read_until
    buffer   -- reply to send; a CR LF pair is appended before writing

    Raises ExceptionNotFound when `expected` is not contained in the data
    read within TELNET_TIMEOUT.
    """
    output = telnet.read_until(expected, TELNET_TIMEOUT)
    # logger.debug(output)
    if expected not in output:
        # Call-form raise: valid on every Python version, unlike the legacy
        # "raise Class, message" statement.
        raise ExceptionNotFound("'%s' not found" % expected)
    telnet.write(buffer + "\r\n")
57 # PCU has model, host, preferred-port, user, passwd,
60 def __init__(self, protocol, verbose, dryrun):
61 self.verbose = verbose
62 self.protocol = protocol
def telnet_answer(telnet, expected, buffer):
    """Wait for `expected` on the connection, then write `buffer` plus CR LF.

    telnet   -- object offering read_until(text, timeout) and write(text)
    expected -- text that must appear in the data returned by read_until
    buffer   -- reply to send once the expected text has been seen

    Raises ExceptionNotFound when `expected` is not contained in the data
    read within TELNET_TIMEOUT.

    NOTE(review): this definition follows an __init__ that takes `self`
    but has no `self` parameter of its own -- confirm how it is invoked.
    """
    output = telnet.read_until(expected, TELNET_TIMEOUT)
    # logger.debug(output)
    if expected not in output:
        # Call-form raise: valid on every Python version, unlike the legacy
        # "raise Class, message" statement.
        raise ExceptionNotFound("'%s' not found" % expected)
    telnet.write(buffer + "\r\n")
76 def _run(self, host, user, passwd, node_port, protocols):
84 def ipal_reboot(ip, password, port, dryrun):
93 #print "lock acquired"
96 #telnet = telnetlib.Telnet(ip) # , timeout=TELNET_TIMEOUT)
97 telnet = telnetlib.Telnet(ip, timeout=TELNET_TIMEOUT)
100 # traceback.print_exc()
103 telnet.set_debuglevel(verbose)
105 # XXX Some iPals require you to hit Enter a few times first
106 telnet_answer(telnet, "Password >", "\r\n\r\n")
109 telnet_answer(telnet, "Password >", password)
111 # XXX Some iPals require you to hit Enter a few times first
112 telnet.write("\r\n\r\n")
116 telnet_answer(telnet, "Enter >", "P%d" % port)
118 telnet.read_until("Enter >", TELNET_TIMEOUT)
123 #print "lock released"
127 except EOFError, err:
129 logger.debug("ipal_reboot: EOF")
133 traceback.print_exc()
134 #print "lock released"
136 return errno.ECONNRESET
137 except socket.error, err:
138 logger.debug("ipal_reboot: Socket Error")
141 traceback.print_exc()
143 return errno.ETIMEDOUT
145 except Exception, err:
147 logger.debug("ipal_reboot: Exception")
152 traceback.print_exc()
153 #print "lock released"
158 def apc_reboot(ip, username, password, port, protocol, dryrun):
164 #if "ssh" in protocol:
165 if "22" in protocol and protocol['22'] == "open":
166 transport = pyssh.Ssh(username, ip)
169 telnet_answer(transport, "password:", password)
170 #elif "telnet" in protocol:
171 elif "23" in protocol and protocol['23'] == "open":
172 transport = telnetlib.Telnet(ip, timeout=TELNET_TIMEOUT)
173 #transport = telnetlib.Telnet(ip)
174 transport.set_debuglevel(verbose)
176 telnet_answer(transport, "User Name", username)
177 telnet_answer(transport, "Password", password)
logger.debug("Unknown protocol %s" %protocol)
# A bare string is not a valid exception object; raise a real Exception so
# the "except Exception" handler of this function receives the intended
# message instead of an unrelated TypeError.
raise Exception("Closed protocol ports!")
189 telnet_answer(transport, "\r\n> ", "1")
191 # 1- Phase Monitor/Configuration
192 # 2- Outlet Restriction Configuration
193 # 3- Outlet Control/Config
194 # 4- Power Supply Status
196 # 3- Outlet Control/Config
197 telnet_answer(transport, "\r\n> ", "3")
204 telnet_answer(transport, "\r\n> ", str(port))
207 # 2- Configure Outlet
210 telnet_answer(transport, "\r\n> ", "1")
214 # 3- Immediate Reboot
220 # 3- Immediate Reboot
221 telnet_answer(transport, "\r\n> ", "3")
224 telnet_answer(transport,
225 "Enter 'YES' to continue or <ENTER> to cancel", "YES\r\n")
226 telnet_answer(transport,
227 "Press <ENTER> to continue...", "")
233 except EOFError, err:
238 return errno.ECONNRESET
239 except socket.error, err:
242 return errno.ETIMEDOUT
244 except Exception, err:
246 traceback.print_exc()
251 return "apc error: check password"
253 def drac_reboot(ip, username, password, dryrun):
257 ssh = pyssh.Ssh(username, ip)
258 ssh.set_debuglevel(verbose)
262 telnet_answer(ssh, "password:", password)
265 print "reset or power"
267 telnet_answer(ssh, "[%s]#" % username, "getsysinfo")
270 telnet_answer(ssh, "[%s]#" % username, "serveraction powercycle")
273 telnet_answer(ssh, "[%s]#" % username, "exit")
280 except socket.error, err:
283 traceback.print_exc()
290 return errno.ETIMEDOUT
291 except Exception, err:
294 traceback.print_exc()
301 return "drac error: check password"
303 def ilo_reboot(ip, username, password, dryrun):
309 ssh = pyssh.Ssh(username, ip)
310 ssh.set_debuglevel(verbose)
314 telnet_answer(ssh, "password:", password)
316 # User:vici logged-in to ILOUSE701N7N4.CS.Princeton.EDU(128.112.154.171)
317 # iLO Advanced 1.26 at 10:01:40 Nov 17 2006
318 # Server Name: USE701N7N400
323 telnet_answer(ssh, "</>hpiLO->", "cd system1")
325 # Reboot Outlet N (Y/N)?
326 print "reset or power"
328 telnet_answer(ssh, "</system1>hpiLO->", "POWER")
331 telnet_answer(ssh, "</system1>hpiLO->", "reset")
334 telnet_answer(ssh, "</system1>hpiLO->", "exit")
341 except socket.error, err:
344 traceback.print_exc()
351 return errno.ETIMEDOUT
352 except Exception, err:
355 traceback.print_exc()
362 return "ilo error: check password"
364 def baytech_reboot(ip, username, password, port, dryrun):
371 ssh = pyssh.Ssh(username, ip)
372 ssh.set_debuglevel(verbose)
376 telnet_answer(ssh, "password:", password)
378 # PL1 comm output (2 ,1).........1
379 # PL2 comm output (2 ,2).........2
380 # PL3 comm output (2 ,3).........3
381 # no machine (2 ,4).........4
382 # Control Outlets (5 ,1).........5
383 # Logout..........................T
385 # Control Outlets (5 ,1).........5
386 telnet_answer(ssh, "Enter Request :", "5")
390 telnet_answer(ssh, "DS-RPC>", "Reboot %d" % port)
391 except ExceptionNotFound, msg:
392 # one machine is configured to ask for a username,
393 # even after login...
394 print "msg: %s" % msg
395 ssh.write(username + "\r\n")
396 telnet_answer(ssh, "DS-RPC>", "Reboot %d" % port)
399 # Reboot Outlet N (Y/N)?
401 telnet_answer(ssh, "(Y/N)?", "N")
403 telnet_answer(ssh, "(Y/N)?", "Y")
404 telnet_answer(ssh, "DS-RPC>", "")
410 except socket.error, err:
413 traceback.print_exc()
420 return errno.ETIMEDOUT
421 except Exception, err:
424 traceback.print_exc()
431 return "baytech error: check password"
433 ### rebooting european BlackBox PSE boxes
434 # Thierry Parmentelat - May 11 2005
435 # tested on 4-ports models known as PSE505-FR
436 # uses http to POST a data 'P<port>=r'
437 # relies on basic authentication within http1.0
438 # first curl-based script was
439 # curl --http1.0 --basic --user <username>:<password> --data P<port>=r \
440 # http://<hostname>:<http_port>/cmd.html && echo OK
442 def bbpse_reboot (pcu_ip,username,password,port_in_pcu,http_port, dryrun):
446 url = "http://%s:%d/cmd.html" % (pcu_ip,http_port)
447 data= "P%d=r" % port_in_pcu
449 logger.debug("POSTing '%s' on %s" % (data,url))
451 authinfo = urllib2.HTTPPasswordMgrWithDefaultRealm()
452 uri = "%s:%d" % (pcu_ip,http_port)
453 authinfo.add_password (None, uri, username, password)
454 authhandler = urllib2.HTTPBasicAuthHandler( authinfo )
456 opener = urllib2.build_opener(authhandler)
457 urllib2.install_opener(opener)
463 f = urllib2.urlopen(url,data)
470 except urllib2.URLError,err:
471 logger.info('Could not open http connection', err)
474 ### rebooting x10toggle based systems addressed by port
475 # Marc E. Fiuczynski - May 31 2005
476 # tested on 4-ports models known as PSE505-FR
477 # uses ssh and password to login to an account
478 # that will cause the system to be powercycled.
480 def x10toggle_reboot(ip, username, password, port, dryrun):
485 ssh = pyssh.Ssh(username, ip)
489 telnet_answer(ssh, "password:", password)
493 telnet_answer(ssh, "x10toggle>", "A%d" % port)
501 except Exception, err:
508 return errno.ETIMEDOUT
510 ### rebooting Dell systems via RAC card
511 # Marc E. Fiuczynski - June 01 2005
512 # tested with David Lowenthal's itchy/scratchy nodes at UGA
515 def runcmd(command, args, username, password, timeout = None):
518 result_ready = threading.Condition()
522 result_ready.acquire()
526 result_ready.notify()
527 result_ready.release()
529 def do_command(command, username, password):
532 # Popen4 is a popen-type class that combines stdout and stderr
533 p = popen2.Popen4(command)
535 # read all output data
536 p.tochild.write("%s\n" % username)
537 p.tochild.write("%s\n" % password)
539 data = p.fromchild.read()
542 # might get interrupted by a signal in poll() or waitpid()
545 set_result((retval, data))
548 if ex.errno == errno.EINTR:
551 except Exception, ex:
555 command = " ".join([command] + args)
557 worker = threading.Thread(target = do_command, args = (command, username, password, ))
558 worker.setDaemon(True)
559 result_ready.acquire()
561 result_ready.wait(timeout)
564 raise Exception, "command timed-out: '%s'" % command
566 result_ready.release()
569 if isinstance(result, Exception):
572 (retval, data) = result
573 if os.WIFEXITED(retval) and os.WEXITSTATUS(retval) == 0:
576 out = "system command ('%s') " % command
577 if os.WIFEXITED(retval):
578 out += "failed, rc = %d" % os.WEXITSTATUS(retval)
580 out += "killed by signal %d" % os.WTERMSIG(retval)
582 out += "; output follows:\n" + data
585 def racadm_reboot(ip, username, password, port, dryrun):
589 cmd = "/usr/sbin/racadm"
592 output = runcmd(cmd, ["-r %s -i serveraction powercycle" % ip],
595 output = "dryrun of racadm command"
597 logger.debug("runcmd returned without output %s" % output)
602 except Exception, err:
603 logger.debug("runcmd raised exception %s" % err)
606 return errno.ETIMEDOUT
609 if pcu['hostname'] is not None and pcu['hostname'] is not "":
610 return pcu['hostname']
611 elif pcu['ip'] is not None and pcu['ip'] is not "":
616 def get_pcu_values(pcu_id):
617 # TODO: obviously, this shouldn't be loaded each time...
619 fb =soltesz.dbLoad("findbadpcus")
622 values = fb['nodes']["id_%s" % pcu_id]['values']
628 def reboot_new(nodename, continue_probe, dryrun):
630 pcu = plc.getpcu(nodename)
634 values = get_pcu_values(pcu['pcu_id'])
639 logger.debug("Trying PCU %s %s" % (pcu['hostname'], pcu['model']))
641 # DataProbe iPal (many sites)
642 if continue_probe and values['model'].find("Dataprobe IP-41x/IP-81x") >= 0:
643 if values['portstatus']['23'] == "open":
644 rb_ret = ipal_reboot(pcu_name(values),
649 rb_ret = "Unsupported_Port"
652 # APC Masterswitch (Berkeley)
653 elif continue_probe and values['model'].find("APC AP79xx/Masterswitch") >= 0:
654 if values['portstatus']['22'] == "open" or \
655 values['portstatus']['23'] == "open":
656 rb_ret = apc_reboot(pcu_name(values),
660 values['portstatus'],
663 rb_ret = "Unsupported_Port"
665 elif continue_probe and values['model'].find("Baytech DS4-RPC") >= 0:
666 if values['portstatus']['22'] == "open":
667 rb_ret = baytech_reboot(pcu_name(values),
673 rb_ret = "Unsupported_Port"
677 elif continue_probe and values['model'].find("HP iLO") >= 0:
678 if values['portstatus']['22'] == "open":
679 rb_ret = ilo_reboot(pcu_name(values),
684 rb_ret = "Unsupported_Port"
687 elif continue_probe and values['model'].find("Dell RAC") >= 0:
688 if values['portstatus']['22'] == "open":
689 rb_ret = drac_reboot(pcu_name(values),
694 rb_ret = "Unsupported_Port"
697 # BlackBox PSExxx-xx (e.g. PSE505-FR)
698 elif continue_probe and \
699 (values['model'].find("BlackBox PS5xx") >= 0 or
700 values['model'].find("ePowerSwitch 1/4/8x") >=0 ):
701 if values['portstatus']['80'] == "open":
702 rb_ret = bbpse_reboot(pcu_name(values),
709 rb_ret = "Unsupported_PCU"
712 elif continue_probe and values['protocol'] == "ssh" and \
713 values['model'] == "x10toggle":
714 rb_ret = x10toggle_reboot(pcu_name(values),
720 elif continue_probe and values['protocol'] == "racadm" and \
721 values['model'] == "RAC":
722 rb_ret = racadm_reboot(pcu_name(values),
728 rb_ret = "Unsupported_PCU"
730 elif continue_probe == False:
731 if 'portstatus' in values:
744 # Returns true if rebooted via PCU
745 def reboot(nodename, dryrun):
746 pcu = plc.getpcu(nodename)
748 plc.nodePOD(nodename)
751 logger.debug("Trying PCU %s %s" % (pcu['hostname'], pcu['model']))
753 # APC Masterswitch (Berkeley)
754 if pcu['model'] == "APC Masterswitch":
755 err = apc_reboot(pcu['ip'], pcu['username'],pcu['password'],
756 pcu[nodename], pcu['protocol'], dryrun)
758 # DataProbe iPal (many sites)
759 elif pcu['protocol'] == "telnet" and pcu['model'].find("IP-4") >= 0:
760 err = ipal_reboot(pcu['ip'],pcu['password'], pcu[nodename], dryrun)
763 elif pcu['protocol'] == "ssh" and \
764 (pcu['model'].find("Baytech") >= 0 or pcu['model'].find("DS4") >= 0):
765 err = baytech_reboot(pcu['ip'], pcu['username'],pcu['password'], pcu[nodename], dryrun)
767 # BlackBox PSExxx-xx (e.g. PSE505-FR)
768 elif pcu['protocol'] == "http" and (pcu['model'] == "bbpse"):
769 err = bbpse_reboot(pcu['ip'], pcu['username'], pcu['password'], pcu[nodename],80, dryrun)
772 elif pcu['protocol'] == "ssh" and (pcu['model'] == "x10toggle"):
773 err = x10toggle_reboot(pcu['ip'], pcu['username'],pcu['password'], pcu[nodename], dryrun)
776 elif pcu['protocol'] == "racadm" and (pcu['model'] == "RAC"):
777 err = racadm_reboot(pcu['ip'], pcu['username'],pcu['password'], pcu_[nodename], dryrun)
779 # Unknown or unsupported
781 err = errno.EPROTONOSUPPORT
785 #def get_suggested(suggestion_id,db):
787 # sql= """select node_id,pcu_id from nodes where suggestion = %d """\
790 # nodes = db.query(sql).dictresult()
791 # except pg.ProgrammingError, err:
792 # print( "Database error for query: %s\n%s" % (sql,err) )
796 #def get_pcu_info(node_id,pcu_id,db):
797 # sql= """select port_number from pcu_ports where node_id = %d and pcu_id = %d """\
800 # port_number = db.query(sql).dictresult()
801 # except pg.ProgrammingError, err:
802 # print( "Database error for query: %s\n%s" % (sql,err) )
805 # sql= """select * from pcu where pcu_id = %d """\
808 # pcu = db.query(sql).dictresult()
809 # except pg.ProgrammingError, err:
810 # print( "Database error for query: %s\n%s" % (sql,err) )
813 # result = {'node_id':node_id,'pcu_id':pcu_id,'port_number':port_number[0]['port_number'],
814 # 'ip':pcu[0]['ip'],'username':pcu[0]['username'],'password':pcu[0]['password'],\
815 # 'model':pcu[0]['model'],'protocol':pcu[0]['protocol'],'hostname':pcu[0]['hostname']}
819 #def add_plc_event(node_id,err,db):
820 # site_id = plc_db_utils.get_site_from_node_id(node_id,db)
821 # message = "PCU reboot by monitor-msgs@planet-lab.org: %s" % os.strerror(err)
823 # sql = """insert into events (event_class_id,message,person_id,node_id,site_id) values """\
824 # """(%d,'%s',%d,%d,%d)""" % (NODE_POWER_CONTROL,message,MONITOR_USER_ID,node_id,site_id)
829 # except pg.ProgrammingError, err:
830 # print( "Database error for: %s\n%s" % (sql,err) )
835 logger.setLevel(logging.DEBUG)
836 ch = logging.StreamHandler()
837 ch.setLevel(logging.DEBUG)
838 formatter = logging.Formatter('LOGGER - %(message)s')
839 ch.setFormatter(formatter)
840 logger.addHandler(ch)
844 reboot("planetlab2.cs.uchicago.edu")
845 reboot("alice.cs.princeton.edu")
846 except Exception, err:
848 # used later for pretty printing
849 # pp = pprint.PrettyPrinter(indent=2)
854 # plc_db = plc_dbs.open_plc_db_write()
855 # mon_db = plc_dbs.open_mon_db()
857 # 5 = needs script reboot - fix this later
858 # nodes = get_suggested(5,mon_db)
862 # pcu = get_pcu_info(row['node_id'],row['pcu_id'],plc_db)
863 # add_plc_event(row['node_id'],err,plc_db)
865 if __name__ == '__main__':
867 logger = logging.getLogger("monitor")