3 # Reboot specified nodes
9 import errno, time, traceback
11 import threading, popen2
17 plc_lock = threading.Lock()
19 # Use our versions of telnetlib and pyssh
20 sys.path.insert(0, os.path.dirname(sys.argv[0]))
22 sys.path.insert(0, os.path.dirname(sys.argv[0]) + "/pyssh")
28 # Event class ID from pcu events
29 #NODE_POWER_CONTROL = 3
32 #MONITOR_USER_ID = 11142
35 logger = logging.getLogger("monitor")
# Exception types for the PCU reboot helpers.
# ExceptionNotFound is raised by telnet_answer() when the expected text is
# missing from the output it read; baytech_reboot() catches it to retry.
39 class ExceptionNotFound(Exception): pass
# NOTE(review): the four classes below are not raised in the code visible
# here; their meaning is presumed from their names only — confirm usage.
40 class ExceptionPassword(Exception): pass
41 class ExceptionTimeout(Exception): pass
42 class ExceptionPrompt(Exception): pass
43 class ExceptionPort(Exception): pass
def telnet_answer(telnet, expected, buffer):
    """Wait for a prompt on a connection and answer it.

    Reads from ``telnet`` with ``read_until(expected, TELNET_TIMEOUT)`` and,
    when the expected text is present in what was read, writes ``buffer``
    followed by CR LF.

    telnet   -- connection object providing read_until() and write()
                (callers pass telnetlib.Telnet and pyssh.Ssh objects)
    expected -- prompt text to wait for
    buffer   -- reply to send once the prompt has been seen

    Raises ExceptionNotFound when ``expected`` is not in the output read.
    """
    output = telnet.read_until(expected, TELNET_TIMEOUT)
    # logger.debug(output)
    if expected not in output:
        # Call form of raise: valid on both Python 2 and Python 3, unlike
        # the old "raise Class, message" statement.
        raise ExceptionNotFound("'%s' not found" % expected)
    telnet.write(buffer + "\r\n")
57 # PCU has model, host, preferred-port, user, passwd,
# Constructor: stores the verbose flag and the protocol on the instance.
# NOTE(review): `dryrun` is accepted but no line visible here stores or uses
# it — confirm against the full file.
60 def __init__(self, protocol, verbose, dryrun):
61 self.verbose = verbose
62 self.protocol = protocol
# Reads from `telnet` until `expected` (bounded by TELNET_TIMEOUT), raises
# ExceptionNotFound if `expected` is absent from the output, and writes
# `buffer` followed by CR LF.
# NOTE(review): this definition follows __init__(self, ...) yet takes no
# `self` parameter; if it is meant to be an instance method, calls through an
# instance would bind the instance to `telnet` — confirm intent.
65 def telnet_answer(telnet, expected, buffer):
68 output = telnet.read_until(expected, TELNET_TIMEOUT)
70 # logger.debug(output)
71 if output.find(expected) == -1:
72 raise ExceptionNotFound, "'%s' not found" % expected
74 telnet.write(buffer + "\r\n")
76 def _run(self, host, user, passwd, node_port, protocols):
# Power-cycle one outlet of a Dataprobe iPal over telnet (reboot_new() calls
# this for "Dataprobe IP-41x/IP-81x" models).
# Visible flow: open a telnet session to `ip`, answer the "Password >" prompt
# (first with blank lines, then with `password`), send "P<port>" at the
# "Enter >" prompt and wait for the prompt to come back.
# Returns errno.ECONNRESET after EOFError and errno.ETIMEDOUT after
# socket.error.
# NOTE(review): the handling of `dryrun` and the return values of the success
# and generic-Exception paths are not in the lines visible here.
84 def ipal_reboot(ip, password, port, dryrun):
93 #print "lock acquired"
96 #telnet = telnetlib.Telnet(ip) # , timeout=TELNET_TIMEOUT)
97 telnet = telnetlib.Telnet(ip, timeout=TELNET_TIMEOUT)
100 # traceback.print_exc()
103 telnet.set_debuglevel(verbose)
105 # XXX Some iPals require you to hit Enter a few times first
106 telnet_answer(telnet, "Password >", "\r\n\r\n")
109 telnet_answer(telnet, "Password >", password)
111 # XXX Some iPals require you to hit Enter a few times first
112 telnet.write("\r\n\r\n")
# "P<n>" is the command sent for outlet number <n>.
116 telnet_answer(telnet, "Enter >", "P%d" % port)
118 telnet.read_until("Enter >", TELNET_TIMEOUT)
123 #print "lock released"
# Connection closed by the device.
127 except EOFError, err:
129 logger.debug("ipal_reboot: EOF")
133 traceback.print_exc()
134 #print "lock released"
136 return errno.ECONNRESET
137 except socket.error, err:
138 logger.debug("ipal_reboot: Socket Error")
141 traceback.print_exc()
143 return errno.ETIMEDOUT
145 except Exception, err:
147 logger.debug("ipal_reboot: Exception")
152 traceback.print_exc()
153 #print "lock released"
# Power-cycle an outlet on an APC power switch by driving its text menu.
# Logs in over ssh (pyssh.Ssh) when protocol['22'] == "open", otherwise over
# telnet when protocol['23'] == "open"; `protocol` is therefore a mapping of
# port-number strings to status strings.
# The visible lines answer the "\r\n> " menu prompts with 1, 3, <port>, 1, 3
# and then confirm with "YES".
# Returns errno.ECONNRESET after EOFError, errno.ETIMEDOUT after socket.error
# and the string "apc error: check password" after any other exception.
# NOTE(review): `dryrun` handling and the success return value are not in the
# lines visible here.
157 def apc_reboot_original(ip, username, password, port, protocol, dryrun):
162 # TODO: I may need to differentiate between different models of APC
164 # for instance, the original code didn't work for:
165 # planetdev03.fm.intel.com
166 # American Power Conversion
167 # Network Management Card AOS v3.3.0
168 # (c) Copyright 2005 All Rights Reserved
169 # Rack PDU APP v3.3.1
173 #if "ssh" in protocol:
174 if "22" in protocol and protocol['22'] == "open":
175 transport = pyssh.Ssh(username, ip)
178 telnet_answer(transport, "password:", password)
179 #elif "telnet" in protocol:
180 elif "23" in protocol and protocol['23'] == "open":
181 transport = telnetlib.Telnet(ip, timeout=TELNET_TIMEOUT)
182 #transport = telnetlib.Telnet(ip)
183 transport.set_debuglevel(verbose)
185 telnet_answer(transport, "User Name", username)
186 telnet_answer(transport, "Password", password)
188 logger.debug("Unknown protocol %s" %protocol)
# NOTE(review): raising a plain string is rejected by Python 2.6+ (TypeError);
# this should raise an Exception instance instead.
189 raise "Closed protocol ports!"
198 telnet_answer(transport, "\r\n> ", "1")
200 # 1- Phase Monitor/Configuration
201 # 2- Outlet Restriction Configuration
202 # 3- Outlet Control/Config
203 # 4- Power Supply Status
205 # 3- Outlet Control/Config
206 #telnet_answer(transport, "\r\n> ", "2")
207 #telnet_answer(transport, "\r\n> ", "1")
209 # 3- Outlet Control/Config
210 telnet_answer(transport, "\r\n> ", "3")
# Select the outlet by its number.
217 telnet_answer(transport, "\r\n> ", str(port))
220 # 2- Configure Outlet
223 telnet_answer(transport, "\r\n> ", "1")
227 # 3- Immediate Reboot
233 # 3- Immediate Reboot
234 telnet_answer(transport, "\r\n> ", "3")
237 telnet_answer(transport,
238 "Enter 'YES' to continue or <ENTER> to cancel", "YES\r\n")
239 telnet_answer(transport,
240 "Press <ENTER> to continue...", "")
246 except EOFError, err:
251 return errno.ECONNRESET
252 except socket.error, err:
255 return errno.ETIMEDOUT
257 except Exception, err:
259 traceback.print_exc()
264 return "apc error: check password"
# Power-cycle an outlet on an APC power switch by driving its text menu.
# Same login logic as apc_reboot_original() (ssh when protocol['22'] is
# "open", else telnet when protocol['23'] is "open"), but with a different
# menu path: the visible lines answer the "\r\n> " prompts with
# 1, 2, 1, <port>, 1, 3 and then confirm with "YES".
# Returns errno.ECONNRESET after EOFError and errno.ETIMEDOUT after
# socket.error; the line following the generic Exception handler returns the
# result of apc_reboot_original() with the same arguments.
# NOTE(review): `dryrun` handling and the success return value are not in the
# lines visible here.
266 def apc_reboot(ip, username, password, port, protocol, dryrun):
271 # TODO: I may need to differentiate between different models of APC
273 # for instance, the original code didn't work for:
274 # planetdev03.fm.intel.com
275 # American Power Conversion
276 # Network Management Card AOS v3.3.0
277 # (c) Copyright 2005 All Rights Reserved
278 # Rack PDU APP v3.3.1
282 #if "ssh" in protocol:
283 if "22" in protocol and protocol['22'] == "open":
284 transport = pyssh.Ssh(username, ip)
287 telnet_answer(transport, "password:", password)
288 #elif "telnet" in protocol:
289 elif "23" in protocol and protocol['23'] == "open":
290 transport = telnetlib.Telnet(ip, timeout=TELNET_TIMEOUT)
291 #transport = telnetlib.Telnet(ip)
292 transport.set_debuglevel(verbose)
294 telnet_answer(transport, "User Name", username)
295 telnet_answer(transport, "Password", password)
297 logger.debug("Unknown protocol %s" %protocol)
# NOTE(review): raising a plain string is rejected by Python 2.6+ (TypeError);
# this should raise an Exception instance instead.
298 raise "Closed protocol ports!"
307 telnet_answer(transport, "\r\n> ", "1")
309 # 1- Phase Monitor/Configuration
310 # 2- Outlet Restriction Configuration
311 # 3- Outlet Control/Config
312 # 4- Power Supply Status
314 # 3- Outlet Control/Config
315 telnet_answer(transport, "\r\n> ", "2")
316 telnet_answer(transport, "\r\n> ", "1")
318 # 3- Outlet Control/Config
319 #telnet_answer(transport, "\r\n> ", "3")
# Select the outlet by its number.
326 telnet_answer(transport, "\r\n> ", str(port))
329 # 2- Configure Outlet
332 telnet_answer(transport, "\r\n> ", "1")
336 # 3- Immediate Reboot
342 # 3- Immediate Reboot
343 telnet_answer(transport, "\r\n> ", "3")
346 telnet_answer(transport,
347 "Enter 'YES' to continue or <ENTER> to cancel", "YES\r\n")
348 telnet_answer(transport,
349 "Press <ENTER> to continue...", "")
355 except EOFError, err:
360 return errno.ECONNRESET
361 except socket.error, err:
364 return errno.ETIMEDOUT
366 except Exception, err:
368 traceback.print_exc()
# Fall back to the older menu sequence.
373 return apc_reboot_original(ip, username, password, port, protocol, dryrun)
# Power-cycle a machine through its Dell remote access controller over ssh
# (reboot_new() calls this for "Dell RAC" models).
# Logs in with pyssh.Ssh, then at the "[<username>]#" prompt the visible lines
# send "getsysinfo", "serveraction powercycle" and "exit".
# Returns errno.ETIMEDOUT after socket.error and the string
# "drac error: check password" after any other exception.
# NOTE(review): which of the commands run under `dryrun`, and the success
# return value, are not in the lines visible here.
375 def drac_reboot(ip, username, password, dryrun):
379 ssh = pyssh.Ssh(username, ip)
380 ssh.set_debuglevel(verbose)
384 telnet_answer(ssh, "password:", password)
# NOTE(review): unconditional debug print to stdout; the rest of the function
# reports through `logger`/traceback.
387 print "reset or power"
389 telnet_answer(ssh, "[%s]#" % username, "getsysinfo")
392 telnet_answer(ssh, "[%s]#" % username, "serveraction powercycle")
395 telnet_answer(ssh, "[%s]#" % username, "exit")
402 except socket.error, err:
405 traceback.print_exc()
412 return errno.ETIMEDOUT
413 except Exception, err:
416 traceback.print_exc()
423 return "drac error: check password"
# Reset a machine through its HP iLO management processor over ssh
# (reboot_new() calls this for "HP iLO" models).
# Logs in with pyssh.Ssh, changes to "system1" at the "</>hpiLO->" prompt,
# then at the "</system1>hpiLO->" prompt the visible lines send "POWER",
# "reset" and "exit".
# Returns errno.ETIMEDOUT after socket.error and the string
# "ilo error: check password" after any other exception.
# NOTE(review): which of the commands run under `dryrun`, and the success
# return value, are not in the lines visible here.
425 def ilo_reboot(ip, username, password, dryrun):
431 ssh = pyssh.Ssh(username, ip)
432 ssh.set_debuglevel(verbose)
436 telnet_answer(ssh, "password:", password)
438 # User:vici logged-in to ILOUSE701N7N4.CS.Princeton.EDU(128.112.154.171)
439 # iLO Advanced 1.26 at 10:01:40 Nov 17 2006
440 # Server Name: USE701N7N400
445 telnet_answer(ssh, "</>hpiLO->", "cd system1")
447 # Reboot Outlet N (Y/N)?
# NOTE(review): unconditional debug print to stdout.
448 print "reset or power"
450 telnet_answer(ssh, "</system1>hpiLO->", "POWER")
453 telnet_answer(ssh, "</system1>hpiLO->", "reset")
456 telnet_answer(ssh, "</system1>hpiLO->", "exit")
463 except socket.error, err:
466 traceback.print_exc()
473 return errno.ETIMEDOUT
474 except Exception, err:
477 traceback.print_exc()
484 return "ilo error: check password"
# Reboot one outlet of a Baytech power controller over ssh (reboot_new()
# calls this for "Baytech DS4-RPC" models).
# Logs in with pyssh.Ssh, picks menu entry "5" (Control Outlets) at
# "Enter Request :", sends "Reboot <port>" at the "DS-RPC>" prompt and answers
# the "(Y/N)?" confirmation.
# Returns errno.ETIMEDOUT after socket.error and the string
# "baytech error: check password" after any other exception.
# NOTE(review): both an "N" and a "Y" answer to "(Y/N)?" are visible; the
# condition choosing between them (presumably `dryrun`) and the success
# return value are not in the lines visible here.
486 def baytech_reboot(ip, username, password, port, dryrun):
493 ssh = pyssh.Ssh(username, ip)
494 ssh.set_debuglevel(verbose)
498 telnet_answer(ssh, "password:", password)
500 # PL1 comm output (2 ,1).........1
501 # PL2 comm output (2 ,2).........2
502 # PL3 comm output (2 ,3).........3
503 # no machine (2 ,4).........4
504 # Control Outlets (5 ,1).........5
505 # Logout..........................T
507 # Control Outlets (5 ,1).........5
508 telnet_answer(ssh, "Enter Request :", "5")
512 telnet_answer(ssh, "DS-RPC>", "Reboot %d" % port)
# The prompt was not found: send the username and retry the command once.
513 except ExceptionNotFound, msg:
514 # one machine is configured to ask for a username,
515 # even after login...
516 print "msg: %s" % msg
517 ssh.write(username + "\r\n")
518 telnet_answer(ssh, "DS-RPC>", "Reboot %d" % port)
521 # Reboot Outlet N (Y/N)?
523 telnet_answer(ssh, "(Y/N)?", "N")
525 telnet_answer(ssh, "(Y/N)?", "Y")
526 telnet_answer(ssh, "DS-RPC>", "")
532 except socket.error, err:
535 traceback.print_exc()
542 return errno.ETIMEDOUT
543 except Exception, err:
546 traceback.print_exc()
553 return "baytech error: check password"
555 ### rebooting european BlackBox PSE boxes
556 # Thierry Parmentelat - May 11 2005
557 # tested on 4-ports models known as PSE505-FR
558 # uses HTTP to POST the data 'P<port>=r'
559 # relies on basic authentication within http1.0
560 # first curl-based script was
561 # curl --http1.0 --basic --user <username>:<password> --data P<port>=r \
562 # http://<hostname>:<http_port>/cmd.html && echo OK
# Reboot one port of a BlackBox PSE power switch through its web interface.
# POSTs "P<port_in_pcu>=r" to http://<pcu_ip>:<http_port>/cmd.html using HTTP
# basic authentication with `username`/`password`.
# NOTE(review): `dryrun` handling and all return values are not in the lines
# visible here.
564 def bbpse_reboot (pcu_ip,username,password,port_in_pcu,http_port, dryrun):
568 url = "http://%s:%d/cmd.html" % (pcu_ip,http_port)
569 data= "P%d=r" % port_in_pcu
571 logger.debug("POSTing '%s' on %s" % (data,url))
# Register the credentials for any realm (None) on host:port.
573 authinfo = urllib2.HTTPPasswordMgrWithDefaultRealm()
574 uri = "%s:%d" % (pcu_ip,http_port)
575 authinfo.add_password (None, uri, username, password)
576 authhandler = urllib2.HTTPBasicAuthHandler( authinfo )
# install_opener() replaces the process-wide default opener used by
# urllib2.urlopen().
578 opener = urllib2.build_opener(authhandler)
579 urllib2.install_opener(opener)
# Passing `data` makes this a POST request.
585 f = urllib2.urlopen(url,data)
592 except urllib2.URLError,err:
# NOTE(review): `err` is passed as a format argument but the message has no
# %s placeholder, so the logging module fails to format this record; the
# message should contain a placeholder for `err`.
593 logger.info('Could not open http connection', err)
596 ### rebooting x10toggle based systems addressed by port
597 # Marc E. Fiuczynski - May 31 2005
598 # tested on 4-ports models known as PSE505-FR
599 # uses ssh and password to login to an account
600 # that will cause the system to be powercycled.
# Power-cycle a node through an x10toggle account reached over ssh.
# Logs in with pyssh.Ssh and sends "A<port>" at the "x10toggle>" prompt.
# Returns errno.ETIMEDOUT after any exception.
# NOTE(review): `dryrun` handling and the success return value are not in the
# lines visible here.
602 def x10toggle_reboot(ip, username, password, port, dryrun):
607 ssh = pyssh.Ssh(username, ip)
611 telnet_answer(ssh, "password:", password)
615 telnet_answer(ssh, "x10toggle>", "A%d" % port)
623 except Exception, err:
630 return errno.ETIMEDOUT
632 ### rebooting Dell systems via RAC card
633 # Marc E. Fiuczynski - June 01 2005
634 # tested with David Lowenthal's itchy/scratchy nodes at UGA
# Run an external command in a worker thread, feeding it a username and a
# password on stdin, and wait up to `timeout` for it to finish.
#   command  -- program to run
#   args     -- list of argument strings, joined to `command` with spaces
#   username, password -- written to the child's stdin, one per line
#   timeout  -- passed to Condition.wait(); None waits without a limit
# Raises Exception("command timed-out: ...") on the timeout path; for a
# non-zero exit or a signal the visible lines build a message containing the
# status and the captured output.
# NOTE(review): the definition of set_result(), the reaping of the child and
# the final return/raise statements are not in the lines visible here.
637 def runcmd(command, args, username, password, timeout = None):
# Condition used to hand the worker's result back to the calling thread.
640 result_ready = threading.Condition()
644 result_ready.acquire()
648 result_ready.notify()
649 result_ready.release()
# Worker body: runs the command and reports (retval, data) or an exception.
651 def do_command(command, username, password):
654 # Popen4 is a popen-type class that combines stdout and stderr
655 p = popen2.Popen4(command)
657 # read all output data
658 p.tochild.write("%s\n" % username)
659 p.tochild.write("%s\n" % password)
661 data = p.fromchild.read()
664 # might get interrupted by a signal in poll() or waitpid()
667 set_result((retval, data))
670 if ex.errno == errno.EINTR:
673 except Exception, ex:
# NOTE(review): the command line is assembled as a single string handed to
# popen2, so arguments containing spaces or shell metacharacters are not
# protected.
677 command = " ".join([command] + args)
679 worker = threading.Thread(target = do_command, args = (command, username, password, ))
# Daemon thread: it does not keep the process alive if the command hangs.
680 worker.setDaemon(True)
681 result_ready.acquire()
683 result_ready.wait(timeout)
686 raise Exception, "command timed-out: '%s'" % command
688 result_ready.release()
# The worker may have reported an exception instead of a (retval, data) pair.
691 if isinstance(result, Exception):
694 (retval, data) = result
# `retval` is a wait()-style status word, decoded with the os.W* helpers.
695 if os.WIFEXITED(retval) and os.WEXITSTATUS(retval) == 0:
698 out = "system command ('%s') " % command
699 if os.WIFEXITED(retval):
700 out += "failed, rc = %d" % os.WEXITSTATUS(retval)
702 out += "killed by signal %d" % os.WTERMSIG(retval)
704 out += "; output follows:\n" + data
# Power-cycle a Dell machine by running /usr/sbin/racadm through runcmd()
# with the arguments "-r <ip> -i serveraction powercycle".
# Returns errno.ETIMEDOUT after runcmd() raises.
# NOTE(review): `port` is not used in the lines visible here; the remaining
# runcmd() arguments, the `dryrun` condition and the success return value
# are not visible either.
707 def racadm_reboot(ip, username, password, port, dryrun):
711 cmd = "/usr/sbin/racadm"
714 output = runcmd(cmd, ["-r %s -i serveraction powercycle" % ip],
717 output = "dryrun of racadm command"
719 logger.debug("runcmd returned without output %s" % output)
724 except Exception, err:
725 logger.debug("runcmd raised exception %s" % err)
728 return errno.ETIMEDOUT
# Prefer the PCU's hostname; fall back to its ip when the hostname is unset.
# (Presumably the body of pcu_name(), which reboot_new() calls — the def line
# is not visible here.)
# NOTE(review): `is not ""` compares object identity, not string equality, so
# an empty string is not reliably detected; `!= ""` is the correct test.
731 if pcu['hostname'] is not None and pcu['hostname'] is not "":
732 return pcu['hostname']
733 elif pcu['ip'] is not None and pcu['ip'] is not "":
# Look up the stored record for one PCU.
# Loads the "findbadpcus" data set with soltesz.dbLoad() and reads
# fb['nodes']["id_<pcu_id>"]['values'].
# NOTE(review): behaviour for an unknown pcu_id and the return statement are
# not in the lines visible here.
738 def get_pcu_values(pcu_id):
739 # TODO: obviously, this shouldn't be loaded each time...
741 fb =soltesz.dbLoad("findbadpcus")
744 values = fb['nodes']["id_%s" % pcu_id]['values']
def check_open_port(values, port_list):
    """Return True if any port in *port_list* is reported open for a PCU.

    values    -- mapping that may contain a 'portstatus' entry, itself a
                 mapping of port -> status string
    port_list -- ports to look for, e.g. ['22', '23']

    A port counts only when it is present in values['portstatus'] and its
    status is exactly "open". Returns False when 'portstatus' is missing
    or when no listed port is open.
    """
    if 'portstatus' not in values:
        return False
    portstatus = values['portstatus']
    return any(port in portstatus and portstatus[port] == "open"
               for port in port_list)
# Reboot a node through its PCU, choosing the driver from the PCU record.
# Fetches the PCU for `nodename` with plc.getpcu(), loads its stored values
# with get_pcu_values(), then dispatches on values['model'] (and, for the last
# two drivers, values['protocol']). Each driver runs only when
# `continue_probe` is true and, where checked, the required port is reported
# open by check_open_port(); otherwise rb_ret is set to an "Unsupported_*"
# string.
# NOTE(review): the driver call arguments after pcu_name(values), the
# `continue_probe == False` branch body and the return statement are not in
# the lines visible here.
763 def reboot_new(nodename, continue_probe, dryrun):
765 pcu = plc.getpcu(nodename)
769 values = get_pcu_values(pcu['pcu_id'])
774 logger.debug("Trying PCU %s %s" % (pcu['hostname'], pcu['model']))
776 # DataProbe iPal (many sites)
777 if continue_probe and values['model'].find("Dataprobe IP-41x/IP-81x") >= 0:
778 if check_open_port(values, ['23']):
779 rb_ret = ipal_reboot(pcu_name(values),
784 rb_ret = "Unsupported_Port"
787 # APC Masterswitch (Berkeley)
788 elif continue_probe and values['model'].find("APC AP79xx/Masterswitch") >= 0:
789 if check_open_port(values, ['22', '23']):
790 rb_ret = apc_reboot(pcu_name(values),
# apc_reboot() picks ssh or telnet itself from the port-status mapping.
794 values['portstatus'],
797 rb_ret = "Unsupported_Port"
799 elif continue_probe and values['model'].find("Baytech DS4-RPC") >= 0:
800 if check_open_port(values, ['22']):
801 rb_ret = baytech_reboot(pcu_name(values),
807 rb_ret = "Unsupported_Port"
811 elif continue_probe and values['model'].find("HP iLO") >= 0:
812 if check_open_port(values, ['22']):
813 rb_ret = ilo_reboot(pcu_name(values),
818 rb_ret = "Unsupported_Port"
821 elif continue_probe and values['model'].find("Dell RAC") >= 0:
822 if check_open_port(values, ['22']):
823 rb_ret = drac_reboot(pcu_name(values),
828 rb_ret = "Unsupported_Port"
831 # BlackBox PSExxx-xx (e.g. PSE505-FR)
832 elif continue_probe and \
833 (values['model'].find("BlackBox PS5xx") >= 0 or
834 values['model'].find("ePowerSwitch 1/4/8x") >=0 ):
835 if check_open_port(values, ['80']):
836 rb_ret = bbpse_reboot(pcu_name(values),
# NOTE(review): the other port checks report "Unsupported_Port" here; this
# branch appears to report "Unsupported_PCU" for a closed port — confirm.
843 rb_ret = "Unsupported_PCU"
846 elif continue_probe and values['protocol'] == "ssh" and \
847 values['model'] == "x10toggle":
848 rb_ret = x10toggle_reboot(pcu_name(values),
854 elif continue_probe and values['protocol'] == "racadm" and \
855 values['model'] == "RAC":
856 rb_ret = racadm_reboot(pcu_name(values),
862 rb_ret = "Unsupported_PCU"
864 elif continue_probe == False:
865 if 'portstatus' in values:
878 # Returns true if rebooted via PCU
# Reboot a node through its PCU using the record returned by plc.getpcu().
# Dispatches on pcu['model'] and pcu['protocol']; the outlet/port for the node
# is read from pcu[nodename]. Unknown combinations set err to
# errno.EPROTONOSUPPORT.
# NOTE(review): the condition guarding plc.nodePOD(nodename) and the return
# statements are not in the lines visible here.
879 def reboot(nodename, dryrun):
880 pcu = plc.getpcu(nodename)
882 plc.nodePOD(nodename)
885 logger.debug("Trying PCU %s %s" % (pcu['hostname'], pcu['model']))
887 # APC Masterswitch (Berkeley)
888 if pcu['model'] == "APC Masterswitch":
# NOTE(review): apc_reboot() indexes its `protocol` argument as a mapping
# ('22'/'23' -> status); other branches here compare pcu['protocol'] to
# strings such as "telnet" — confirm the type passed.
889 err = apc_reboot(pcu['ip'], pcu['username'],pcu['password'],
890 pcu[nodename], pcu['protocol'], dryrun)
892 # DataProbe iPal (many sites)
893 elif pcu['protocol'] == "telnet" and pcu['model'].find("IP-4") >= 0:
894 err = ipal_reboot(pcu['ip'],pcu['password'], pcu[nodename], dryrun)
897 elif pcu['protocol'] == "ssh" and \
898 (pcu['model'].find("Baytech") >= 0 or pcu['model'].find("DS4") >= 0):
899 err = baytech_reboot(pcu['ip'], pcu['username'],pcu['password'], pcu[nodename], dryrun)
901 # BlackBox PSExxx-xx (e.g. PSE505-FR)
902 elif pcu['protocol'] == "http" and (pcu['model'] == "bbpse"):
903 err = bbpse_reboot(pcu['ip'], pcu['username'], pcu['password'], pcu[nodename],80, dryrun)
906 elif pcu['protocol'] == "ssh" and (pcu['model'] == "x10toggle"):
907 err = x10toggle_reboot(pcu['ip'], pcu['username'],pcu['password'], pcu[nodename], dryrun)
910 elif pcu['protocol'] == "racadm" and (pcu['model'] == "RAC"):
# NOTE(review): `pcu_` is not defined anywhere visible; this looks like a typo
# for `pcu[nodename]` and would raise NameError when this branch runs.
911 err = racadm_reboot(pcu['ip'], pcu['username'],pcu['password'], pcu_[nodename], dryrun)
913 # Unknown or unsupported
915 err = errno.EPROTONOSUPPORT
919 #def get_suggested(suggestion_id,db):
921 # sql= """select node_id,pcu_id from nodes where suggestion = %d """\
924 # nodes = db.query(sql).dictresult()
925 # except pg.ProgrammingError, err:
926 # print( "Database error for query: %s\n%s" % (sql,err) )
930 #def get_pcu_info(node_id,pcu_id,db):
931 # sql= """select port_number from pcu_ports where node_id = %d and pcu_id = %d """\
934 # port_number = db.query(sql).dictresult()
935 # except pg.ProgrammingError, err:
936 # print( "Database error for query: %s\n%s" % (sql,err) )
939 # sql= """select * from pcu where pcu_id = %d """\
942 # pcu = db.query(sql).dictresult()
943 # except pg.ProgrammingError, err:
944 # print( "Database error for query: %s\n%s" % (sql,err) )
947 # result = {'node_id':node_id,'pcu_id':pcu_id,'port_number':port_number[0]['port_number'],
948 # 'ip':pcu[0]['ip'],'username':pcu[0]['username'],'password':pcu[0]['password'],\
949 # 'model':pcu[0]['model'],'protocol':pcu[0]['protocol'],'hostname':pcu[0]['hostname']}
953 #def add_plc_event(node_id,err,db):
954 # site_id = plc_db_utils.get_site_from_node_id(node_id,db)
955 # message = "PCU reboot by monitor-msgs@planet-lab.org: %s" % os.strerror(err)
957 # sql = """insert into events (event_class_id,message,person_id,node_id,site_id) values """\
958 # """(%d,'%s',%d,%d,%d)""" % (NODE_POWER_CONTROL,message,MONITOR_USER_ID,node_id,site_id)
963 # except pg.ProgrammingError, err:
964 # print( "Database error for: %s\n%s" % (sql,err) )
# Send all "monitor" log records at DEBUG level and above to a stream handler
# (logging.StreamHandler() with no argument writes to stderr), prefixed with
# "LOGGER - ".
969 logger.setLevel(logging.DEBUG)
970 ch = logging.StreamHandler()
971 ch.setLevel(logging.DEBUG)
972 formatter = logging.Formatter('LOGGER - %(message)s')
973 ch.setFormatter(formatter)
974 logger.addHandler(ch)
# NOTE(review): reboot() is defined as reboot(nodename, dryrun); these calls
# pass only the node name and would raise TypeError.
978 reboot("planetlab2.cs.uchicago.edu")
979 reboot("alice.cs.princeton.edu")
980 except Exception, err:
982 # used later for pretty printing
983 # pp = pprint.PrettyPrinter(indent=2)
988 # plc_db = plc_dbs.open_plc_db_write()
989 # mon_db = plc_dbs.open_mon_db()
991 # 5 = needs script reboot - fix this later
992 # nodes = get_suggested(5,mon_db)
996 # pcu = get_pcu_info(row['node_id'],row['pcu_id'],plc_db)
997 # add_plc_event(row['node_id'],err,plc_db)
999 if __name__ == '__main__':
1001 logger = logging.getLogger("monitor")