3 # Reboot specified nodes
9 import errno, time, traceback
11 import threading, popen2
17 plc_lock = threading.Lock()
# Prefer the telnetlib and pyssh copies shipped next to this script over any
# system-wide versions by putting their directories at the front of sys.path.
sys.path.insert(0, os.path.dirname(sys.argv[0]))
# os.path.join keeps the pyssh entry relative when the script is started
# without a directory component (dirname == ""); plain string concatenation
# produced the unrelated absolute path "/pyssh" in that case.
sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "pyssh"))
28 # Event class ID from pcu events
29 #NODE_POWER_CONTROL = 3
32 #MONITOR_USER_ID = 11142
35 logger = logging.getLogger("monitor")
# Error categories used by the PCU reboot helpers.  Each one is a plain
# Exception subclass with no extra state.


class ExceptionNotFound(Exception):
    """Raised by telnet_answer when the expected prompt is not in the output."""


class ExceptionPassword(Exception):
    """Password-related failure (not raised in the code visible here)."""


class ExceptionTimeout(Exception):
    """Timeout-related failure (not raised in the code visible here)."""


class ExceptionPrompt(Exception):
    """Prompt-related failure (not raised in the code visible here)."""


class ExceptionPort(Exception):
    """Port-related failure (not raised in the code visible here)."""
def telnet_answer(telnet, expected, buffer):
    """Wait for a prompt on *telnet*, then answer it.

    Reads from *telnet* with ``read_until(expected, TELNET_TIMEOUT)`` and,
    if *expected* is present in what was read, writes *buffer* followed by
    CRLF.

    telnet   -- object offering ``read_until(text, timeout)`` and
                ``write(text)`` (callers in this file pass telnetlib.Telnet
                and pyssh.Ssh instances).
    expected -- prompt text that must appear in the output.
    buffer   -- reply to send once the prompt has been seen.

    Raises ExceptionNotFound when *expected* is not in the output read.
    """
    output = telnet.read_until(expected, TELNET_TIMEOUT)
    if expected not in output:
        # Call-form raise is valid on both Python 2 and Python 3, unlike the
        # old "raise Class, message" statement.
        raise ExceptionNotFound("'%s' not found" % expected)
    telnet.write(buffer + "\r\n")
57 # PCU has model, host, preferred-port, user, passwd,
# Constructor: remembers the verbosity flag and the protocol on the instance.
# NOTE(review): `dryrun` is accepted but is not stored on any line visible
# in this listing -- confirm against the full file.
60 def __init__(self, protocol, verbose, dryrun):
61 self.verbose = verbose
62 self.protocol = protocol
# Read from `telnet` until `expected` (bounded by TELNET_TIMEOUT), raise
# ExceptionNotFound if the text read does not contain `expected`, otherwise
# send `buffer` followed by CRLF.
# NOTE(review): this repeats the module-level telnet_answer statement for
# statement and takes no `self` although it sits between __init__ and _run --
# confirm whether it is meant to be a method or can reuse the module function.
65 def telnet_answer(telnet, expected, buffer):
68 output = telnet.read_until(expected, TELNET_TIMEOUT)
70 # logger.debug(output)
71 if output.find(expected) == -1:
72 raise ExceptionNotFound, "'%s' not found" % expected
74 telnet.write(buffer + "\r\n")
76 def _run(self, host, user, passwd, node_port, protocols):
# Power-cycle outlet `port` of an iPal unit at `ip` over telnet.
#   ip       -- address handed to telnetlib.Telnet
#   password -- sent in reply to the "Password >" prompt
#   port     -- outlet number, sent as "P<port>" at the "Enter >" prompt
#   dryrun   -- not referenced on the lines visible here
# Visible return values: errno.ECONNRESET after EOFError, errno.ETIMEDOUT
# after socket.error.  The return for the generic Exception handler and for
# the success path is not visible in this listing.
84 def ipal_reboot(ip, password, port, dryrun):
93 #print "lock acquired"
96 #telnet = telnetlib.Telnet(ip) # , timeout=TELNET_TIMEOUT)
97 telnet = telnetlib.Telnet(ip, timeout=TELNET_TIMEOUT)
100 # traceback.print_exc()
# `verbose` is a module-level name defined outside the visible lines.
103 telnet.set_debuglevel(verbose)
105 # XXX Some iPals require you to hit Enter a few times first
106 telnet_answer(telnet, "Password >", "\r\n\r\n")
# Log in.
109 telnet_answer(telnet, "Password >", password)
# "P<n>" is the command sent for outlet n.
113 telnet_answer(telnet, "Enter >", "P%d" % port)
# Wait for the menu prompt again before finishing.
115 telnet.read_until("Enter >", TELNET_TIMEOUT)
120 #print "lock released"
# Connection closed by the peer.
124 except EOFError, err:
126 logger.debug("ipal_reboot: EOF")
130 traceback.print_exc()
131 #print "lock released"
133 return errno.ECONNRESET
134 except socket.error, err:
135 logger.debug("ipal_reboot: Socket Error")
138 traceback.print_exc()
140 return errno.ETIMEDOUT
# Catch-all, e.g. ExceptionNotFound raised by telnet_answer.
142 except Exception, err:
144 logger.debug("ipal_reboot: Exception")
149 traceback.print_exc()
150 #print "lock released"
155 def apc_reboot(ip, username, password, port, protocol, dryrun):
161 #if "ssh" in protocol:
162 if "22" in protocol and protocol['22'] == "open":
163 transport = pyssh.Ssh(username, ip)
166 telnet_answer(transport, "password:", password)
167 #elif "telnet" in protocol:
168 elif "23" in protocol and protocol['23'] == "open":
169 transport = telnetlib.Telnet(ip, timeout=TELNET_TIMEOUT)
170 #transport = telnetlib.Telnet(ip)
171 transport.set_debuglevel(verbose)
173 telnet_answer(transport, "User Name", username)
174 telnet_answer(transport, "Password", password)
176 logger.debug("Unknown protocol %s" %protocol)
177 raise "Closed protocol ports!"
186 telnet_answer(transport, "\r\n> ", "1")
188 # 1- Phase Monitor/Configuration
189 # 2- Outlet Restriction Configuration
190 # 3- Outlet Control/Config
191 # 4- Power Supply Status
193 # 3- Outlet Control/Config
194 telnet_answer(transport, "\r\n> ", "3")
201 telnet_answer(transport, "\r\n> ", str(port))
204 # 2- Configure Outlet
207 telnet_answer(transport, "\r\n> ", "1")
211 # 3- Immediate Reboot
217 # 3- Immediate Reboot
218 telnet_answer(transport, "\r\n> ", "3")
221 telnet_answer(transport,
222 "Enter 'YES' to continue or <ENTER> to cancel", "YES\r\n")
223 telnet_answer(transport,
224 "Press <ENTER> to continue...", "")
230 except EOFError, err:
235 return errno.ECONNRESET
236 except socket.error, err:
239 return errno.ETIMEDOUT
241 except Exception, err:
243 traceback.print_exc()
248 return "apc error: check password"
# Power-cycle a machine through its DRAC at `ip` over ssh (pyssh.Ssh).
#   ip, username, password -- login data; the shell prompt waited for is
#                             "[<username>]#"
#   dryrun -- not referenced on the lines visible here
# NOTE(review): both "getsysinfo" and "serveraction powercycle" appear below;
# presumably the missing lines choose one of them based on `dryrun` -- confirm.
# Visible return values: errno.ETIMEDOUT after socket.error and the string
# "drac error: check password" (its guarding condition is not visible).
250 def drac_reboot(ip, username, password, dryrun):
254 ssh = pyssh.Ssh(username, ip)
# `verbose` is a module-level name defined outside the visible lines.
255 ssh.set_debuglevel(verbose)
259 telnet_answer(ssh, "password:", password)
262 print "reset or power"
264 telnet_answer(ssh, "[%s]#" % username, "getsysinfo")
267 telnet_answer(ssh, "[%s]#" % username, "serveraction powercycle")
# Leave the DRAC shell.
270 telnet_answer(ssh, "[%s]#" % username, "exit")
277 except socket.error, err:
280 traceback.print_exc()
287 return errno.ETIMEDOUT
288 except Exception, err:
291 traceback.print_exc()
298 return "drac error: check password"
# Reset a machine through its HP iLO at `ip` over ssh (pyssh.Ssh).
#   ip, username, password -- login data
#   dryrun -- not referenced on the lines visible here
# After login the code changes to "system1" and issues commands at the
# "</system1>hpiLO->" prompt.
# NOTE(review): both "POWER" and "reset" appear below; presumably the missing
# lines choose one of them based on `dryrun` -- confirm.
# Visible return values: errno.ETIMEDOUT after socket.error and the string
# "ilo error: check password" (its guarding condition is not visible).
300 def ilo_reboot(ip, username, password, dryrun):
306 ssh = pyssh.Ssh(username, ip)
307 ssh.set_debuglevel(verbose)
311 telnet_answer(ssh, "password:", password)
313 # User:vici logged-in to ILOUSE701N7N4.CS.Princeton.EDU(128.112.154.171)
314 # iLO Advanced 1.26 at 10:01:40 Nov 17 2006
315 # Server Name: USE701N7N400
320 telnet_answer(ssh, "</>hpiLO->", "cd system1")
322 # Reboot Outlet N (Y/N)?
323 print "reset or power"
325 telnet_answer(ssh, "</system1>hpiLO->", "POWER")
328 telnet_answer(ssh, "</system1>hpiLO->", "reset")
# Leave the iLO shell.
331 telnet_answer(ssh, "</system1>hpiLO->", "exit")
338 except socket.error, err:
341 traceback.print_exc()
348 return errno.ETIMEDOUT
349 except Exception, err:
352 traceback.print_exc()
359 return "ilo error: check password"
# Reboot outlet `port` of a Baytech unit at `ip` over ssh (pyssh.Ssh).
#   ip, username, password -- login data
#   port   -- outlet number, sent as "Reboot <port>" at the "DS-RPC>" prompt
#   dryrun -- not referenced on the lines visible here
# NOTE(review): the "(Y/N)?" confirmation is answered with both "N" and "Y"
# below; presumably the missing lines pick one based on `dryrun` -- confirm.
# Visible return values: errno.ETIMEDOUT after socket.error and the string
# "baytech error: check password" (its guarding condition is not visible).
361 def baytech_reboot(ip, username, password, port, dryrun):
368 ssh = pyssh.Ssh(username, ip)
369 ssh.set_debuglevel(verbose)
373 telnet_answer(ssh, "password:", password)
375 # PL1 comm output (2 ,1).........1
376 # PL2 comm output (2 ,2).........2
377 # PL3 comm output (2 ,3).........3
378 # no machine (2 ,4).........4
379 # Control Outlets (5 ,1).........5
380 # Logout..........................T
382 # Control Outlets (5 ,1).........5
383 telnet_answer(ssh, "Enter Request :", "5")
387 telnet_answer(ssh, "DS-RPC>", "Reboot %d" % port)
# Fallback when the "DS-RPC>" prompt was not seen: send the username, then
# retry the reboot command.
388 except ExceptionNotFound, msg:
389 # one machine is configured to ask for a username,
390 # even after login...
391 print "msg: %s" % msg
392 ssh.write(username + "\r\n")
393 telnet_answer(ssh, "DS-RPC>", "Reboot %d" % port)
396 # Reboot Outlet N (Y/N)?
398 telnet_answer(ssh, "(Y/N)?", "N")
400 telnet_answer(ssh, "(Y/N)?", "Y")
# Wait for the prompt to come back; an empty reply just sends CRLF.
401 telnet_answer(ssh, "DS-RPC>", "")
407 except socket.error, err:
410 traceback.print_exc()
417 return errno.ETIMEDOUT
418 except Exception, err:
421 traceback.print_exc()
428 return "baytech error: check password"
430 ### rebooting european BlackBox PSE boxes
431 # Thierry Parmentelat - May 11 2005
432 # tested on 4-port models known as PSE505-FR
433 # uses http to POST a data 'P<port>=r'
434 # relies on basic authentication within http1.0
435 # first curl-based script was
436 # curl --http1.0 --basic --user <username>:<password> --data P<port>=r \
437 # http://<hostname>:<http_port>/cmd.html && echo OK
439 def bbpse_reboot (pcu_ip,username,password,port_in_pcu,http_port, dryrun):
443 url = "http://%s:%d/cmd.html" % (pcu_ip,http_port)
444 data= "P%d=r" % port_in_pcu
446 logger.debug("POSTing '%s' on %s" % (data,url))
448 authinfo = urllib2.HTTPPasswordMgrWithDefaultRealm()
449 uri = "%s:%d" % (pcu_ip,http_port)
450 authinfo.add_password (None, uri, username, password)
451 authhandler = urllib2.HTTPBasicAuthHandler( authinfo )
453 opener = urllib2.build_opener(authhandler)
454 urllib2.install_opener(opener)
460 f = urllib2.urlopen(url,data)
467 except urllib2.URLError,err:
468 logger.info('Could not open http connection', err)
471 ### rebooting x10toggle based systems addressed by port
472 # Marc E. Fiuczynski - May 31 2005
473 # tested on 4-port models known as PSE505-FR
474 # uses ssh and password to login to an account
475 # that will cause the system to be powercycled.
# Power-cycle the system on `port` through an x10toggle account over ssh.
#   ip, username, password -- ssh login data
#   port   -- sent as "A<port>" at the "x10toggle>" prompt
#   dryrun -- not referenced on the lines visible here
# Any Exception leads to the handler below; errno.ETIMEDOUT is the only
# return value visible in this listing.
477 def x10toggle_reboot(ip, username, password, port, dryrun):
482 ssh = pyssh.Ssh(username, ip)
486 telnet_answer(ssh, "password:", password)
490 telnet_answer(ssh, "x10toggle>", "A%d" % port)
498 except Exception, err:
505 return errno.ETIMEDOUT
507 ### rebooting Dell systems via RAC card
508 # Marc E. Fiuczynski - June 01 2005
509 # tested with David Lowenthal's itchy/scratchy nodes at UGA
# Run an external command in a worker thread, feeding `username` and
# `password` to its stdin, and wait up to `timeout` seconds for the result.
#   command -- program to run
#   args    -- list of argument strings; joined to `command` with spaces
#   timeout -- passed to Condition.wait(); None waits without limit
# The worker hands either a (retval, data) tuple or an Exception instance
# back to the caller through a threading.Condition.
# Raises Exception("command timed-out: ...") -- presumably when no result
# arrived within `timeout`; the guarding test is not visible in this listing.
# NOTE(review): the command is assembled into one string, which
# popen2.Popen4 runs through the shell; arguments are not quoted.
512 def runcmd(command, args, username, password, timeout = None):
515 result_ready = threading.Condition()
# Fragment of a helper that publishes the result and wakes the waiting
# caller (its def line is not visible in this listing).
519 result_ready.acquire()
523 result_ready.notify()
524 result_ready.release()
# Worker body: executes the command and reports back.
526 def do_command(command, username, password):
529 # Popen4 is a popen-type class that combines stdout and stderr
530 p = popen2.Popen4(command)
532 # read all output data
# The credentials go to the child's stdin, one per line.
533 p.tochild.write("%s\n" % username)
534 p.tochild.write("%s\n" % password)
536 data = p.fromchild.read()
539 # might get interrupted by a signal in poll() or waitpid()
542 set_result((retval, data))
545 if ex.errno == errno.EINTR:
548 except Exception, ex:
552 command = " ".join([command] + args)
554 worker = threading.Thread(target = do_command, args = (command, username, password, ))
# Daemon thread: it does not keep the interpreter alive on exit.
555 worker.setDaemon(True)
556 result_ready.acquire()
558 result_ready.wait(timeout)
561 raise Exception, "command timed-out: '%s'" % command
563 result_ready.release()
# The worker reports failures by passing an Exception instance as result.
566 if isinstance(result, Exception):
569 (retval, data) = result
# retval is decoded with os.WIFEXITED / os.WEXITSTATUS / os.WTERMSIG, i.e. it
# is treated as a wait()-style status word.
570 if os.WIFEXITED(retval) and os.WEXITSTATUS(retval) == 0:
# Build a description of the failure, including the captured output.
573 out = "system command ('%s') " % command
574 if os.WIFEXITED(retval):
575 out += "failed, rc = %d" % os.WEXITSTATUS(retval)
577 out += "killed by signal %d" % os.WTERMSIG(retval)
579 out += "; output follows:\n" + data
# Power-cycle a Dell machine by running /usr/sbin/racadm through runcmd with
# "-r <ip> -i serveraction powercycle".
#   ip -- address of the remote access controller, placed after "-r"
#   username, password, port -- accepted by the signature; the runcmd call is
#                               continued on a line not visible in this listing
#   dryrun -- NOTE(review): presumably selects the "dryrun of racadm command"
#             placeholder output instead of calling runcmd -- confirm.
# errno.ETIMEDOUT is the only return value visible in this listing.
582 def racadm_reboot(ip, username, password, port, dryrun):
586 cmd = "/usr/sbin/racadm"
589 output = runcmd(cmd, ["-r %s -i serveraction powercycle" % ip],
592 output = "dryrun of racadm command"
594 logger.debug("runcmd returned without output %s" % output)
599 except Exception, err:
600 logger.debug("runcmd raised exception %s" % err)
603 return errno.ETIMEDOUT
606 if pcu['hostname'] is not None and pcu['hostname'] is not "":
607 return pcu['hostname']
608 elif pcu['ip'] is not None and pcu['ip'] is not "":
# Look up the stored record for PCU `pcu_id`: loads "findbadpcus" through
# soltesz.dbLoad and reads fb['nodes']["id_<pcu_id>"]['values'].
# What happens when the key is absent, and what is returned, is on lines not
# visible in this listing.
613 def get_pcu_values(pcu_id):
614 # TODO: obviously, this shouldn't be loaded each time...
616 fb =soltesz.dbLoad("findbadpcus")
619 values = fb['nodes']["id_%s" % pcu_id]['values']
# Reboot `nodename` through its PCU, choosing the driver from the PCU record
# returned by get_pcu_values: values['model'] selects the vendor branch and
# values['portstatus'] tells which service ports are "open".
#   nodename       -- node whose PCU is looked up with plc.getpcu
#   continue_probe -- every vendor branch requires it to be true; the last
#                     branch handles continue_probe == False
#   dryrun         -- not referenced on the lines visible here
# The outcome is collected in rb_ret: a driver's return value, or
# "Unsupported_Port" / "Unsupported_PCU".
# NOTE(review): the driver calls use a `reboot.` prefix although a function
# named reboot is defined in this same listing -- confirm what `reboot`
# refers to here.
# NOTE(review): values['portstatus'][...] is indexed directly in most
# branches, while the final branch first checks 'portstatus' in values --
# confirm the key is always present.
625 def reboot_new(nodename, continue_probe, dryrun):
627 pcu = plc.getpcu(nodename)
631 values = get_pcu_values(pcu['pcu_id'])
636 logger.debug("Trying PCU %s %s" % (pcu['hostname'], pcu['model']))
638 # DataProbe iPal (many sites)
639 if continue_probe and values['model'].find("Dataprobe IP-41x/IP-81x") >= 0:
640 if values['portstatus']['23'] == "open":
641 rb_ret = reboot.ipal_reboot(pcu_name(values),
646 rb_ret = "Unsupported_Port"
649 # APC Masterswitch (Berkeley)
650 elif continue_probe and values['model'].find("APC AP79xx/Masterswitch") >= 0:
651 if values['portstatus']['22'] == "open" or \
652 values['portstatus']['23'] == "open":
653 rb_ret = reboot.apc_reboot(pcu_name(values),
# The whole port-status mapping is handed to apc_reboot.
657 values['portstatus'],
660 rb_ret = "Unsupported_Port"
662 elif continue_probe and values['model'].find("Baytech DS4-RPC") >= 0:
663 if values['portstatus']['22'] == "open":
664 rb_ret = reboot.baytech_reboot(pcu_name(values),
670 rb_ret = "Unsupported_Port"
674 elif continue_probe and values['model'].find("HP iLO") >= 0:
675 if values['portstatus']['22'] == "open":
676 rb_ret = reboot.ilo_reboot(pcu_name(values),
681 rb_ret = "Unsupported_Port"
684 elif continue_probe and values['model'].find("Dell RAC") >= 0:
685 if values['portstatus']['22'] == "open":
686 rb_ret = reboot.drac_reboot(pcu_name(values),
691 rb_ret = "Unsupported_Port"
694 # BlackBox PSExxx-xx (e.g. PSE505-FR)
695 elif continue_probe and \
696 (values['model'].find("BlackBox PS5xx") >= 0 or
697 values['model'].find("ePowerSwitch 1/4/8x") >=0 ):
698 if values['portstatus']['80'] == "open":
699 rb_ret = reboot.bbpse_reboot(pcu_name(values),
# NOTE(review): the other vendor branches report "Unsupported_Port" at this
# position -- confirm whether "Unsupported_PCU" is intended here.
706 rb_ret = "Unsupported_PCU"
709 elif continue_probe and values['protocol'] == "ssh" and \
710 values['model'] == "x10toggle":
711 rb_ret = reboot.x10toggle_reboot(pcu_name(values),
717 elif continue_probe and values['protocol'] == "racadm" and \
718 values['model'] == "RAC":
719 rb_ret = reboot.racadm_reboot(pcu_name(values),
725 rb_ret = "Unsupported_PCU"
727 elif continue_probe == False:
728 if 'portstatus' in values:
741 # Returns true if rebooted via PCU
742 def reboot(nodename, dryrun):
743 pcu = plc.getpcu(nodename)
745 plc.nodePOD(nodename)
748 logger.debug("Trying PCU %s %s" % (pcu['hostname'], pcu['model']))
750 # APC Masterswitch (Berkeley)
751 if pcu['model'] == "APC Masterswitch":
752 err = apc_reboot(pcu['ip'], pcu['username'],pcu['password'],
753 pcu[nodename], pcu['protocol'], dryrun)
755 # DataProbe iPal (many sites)
756 elif pcu['protocol'] == "telnet" and pcu['model'].find("IP-4") >= 0:
757 err = ipal_reboot(pcu['ip'],pcu['password'], pcu[nodename], dryrun)
760 elif pcu['protocol'] == "ssh" and \
761 (pcu['model'].find("Baytech") >= 0 or pcu['model'].find("DS4") >= 0):
762 err = baytech_reboot(pcu['ip'], pcu['username'],pcu['password'], pcu[nodename], dryrun)
764 # BlackBox PSExxx-xx (e.g. PSE505-FR)
765 elif pcu['protocol'] == "http" and (pcu['model'] == "bbpse"):
766 err = bbpse_reboot(pcu['ip'], pcu['username'], pcu['password'], pcu[nodename],80, dryrun)
769 elif pcu['protocol'] == "ssh" and (pcu['model'] == "x10toggle"):
770 err = x10toggle_reboot(pcu['ip'], pcu['username'],pcu['password'], pcu[nodename], dryrun)
773 elif pcu['protocol'] == "racadm" and (pcu['model'] == "RAC"):
774 err = racadm_reboot(pcu['ip'], pcu['username'],pcu['password'], pcu_[nodename], dryrun)
776 # Unknown or unsupported
778 err = errno.EPROTONOSUPPORT
782 #def get_suggested(suggestion_id,db):
784 # sql= """select node_id,pcu_id from nodes where suggestion = %d """\
787 # nodes = db.query(sql).dictresult()
788 # except pg.ProgrammingError, err:
789 # print( "Database error for query: %s\n%s" % (sql,err) )
793 #def get_pcu_info(node_id,pcu_id,db):
794 # sql= """select port_number from pcu_ports where node_id = %d and pcu_id = %d """\
797 # port_number = db.query(sql).dictresult()
798 # except pg.ProgrammingError, err:
799 # print( "Database error for query: %s\n%s" % (sql,err) )
802 # sql= """select * from pcu where pcu_id = %d """\
805 # pcu = db.query(sql).dictresult()
806 # except pg.ProgrammingError, err:
807 # print( "Database error for query: %s\n%s" % (sql,err) )
810 # result = {'node_id':node_id,'pcu_id':pcu_id,'port_number':port_number[0]['port_number'],
811 # 'ip':pcu[0]['ip'],'username':pcu[0]['username'],'password':pcu[0]['password'],\
812 # 'model':pcu[0]['model'],'protocol':pcu[0]['protocol'],'hostname':pcu[0]['hostname']}
816 #def add_plc_event(node_id,err,db):
817 # site_id = plc_db_utils.get_site_from_node_id(node_id,db)
818 # message = "PCU reboot by monitor-msgs@planet-lab.org: %s" % os.strerror(err)
820 # sql = """insert into events (event_class_id,message,person_id,node_id,site_id) values """\
821 # """(%d,'%s',%d,%d,%d)""" % (NODE_POWER_CONTROL,message,MONITOR_USER_ID,node_id,site_id)
826 # except pg.ProgrammingError, err:
827 # print( "Database error for: %s\n%s" % (sql,err) )
# Script body (its def line is not visible in this listing): send DEBUG-level
# records of the module logger to a stream handler, then try to reboot two
# hard-coded nodes.
832 logger.setLevel(logging.DEBUG)
833 ch = logging.StreamHandler()
834 ch.setLevel(logging.DEBUG)
# Each record is printed as "LOGGER - <message>".
835 formatter = logging.Formatter('LOGGER - %(message)s')
836 ch.setFormatter(formatter)
837 logger.addHandler(ch)
# NOTE(review): reboot is defined as reboot(nodename, dryrun) in this file,
# but is called here with a single argument, which raises TypeError -- decide
# which dryrun value is intended and pass it.
841 reboot("planetlab2.cs.uchicago.edu")
842 reboot("alice.cs.princeton.edu")
843 except Exception, err:
845 # used later for pretty printing
846 # pp = pprint.PrettyPrinter(indent=2)
851 # plc_db = plc_dbs.open_plc_db_write()
852 # mon_db = plc_dbs.open_mon_db()
854 # 5 = needs script reboot - fix this later
855 # nodes = get_suggested(5,mon_db)
859 # pcu = get_pcu_info(row['node_id'],row['pcu_id'],plc_db)
860 # add_plc_event(row['node_id'],err,plc_db)
# Entry point when the file is executed as a script: fetch the "monitor"
# logger (the same name used for the module-level logger).  The remaining
# lines of this block are not visible in this listing.
862 if __name__ == '__main__':
864 logger = logging.getLogger("monitor")