Reboot.py:
[monitor.git] / reboot.py
1 #!/usr/bin/python
2 #
3 # Reboot specified nodes
4 #
5
6 import getpass, getopt
7 import os, sys
8 import xml, xmlrpclib
9 import errno, time, traceback
10 import urllib2
11 import threading, popen2
12 import array, struct
13 #from socket import *
14 import socket
15 import plc
16
17 plc_lock = threading.Lock()
18
# Use our versions of telnetlib and pyssh: both are looked up relative to
# the directory of the script that was launched (sys.argv[0]).
_script_dir = os.path.dirname(sys.argv[0])
sys.path.insert(0, _script_dir)
import telnetlib
# os.path.join keeps the entry relative when _script_dir is '' (script
# started from its own directory); plain string concatenation produced the
# absolute path "/pyssh" in that case.
sys.path.insert(0, os.path.join(_script_dir, "pyssh"))
import pyssh
24
25 # Timeouts in seconds
26 TELNET_TIMEOUT = 30
27
28 # Event class ID from pcu events
29 #NODE_POWER_CONTROL = 3
30
31 # Monitor user ID
32 #MONITOR_USER_ID = 11142
33
34 import logging
35 logger = logging.getLogger("monitor")
36 verbose = 0
37 #dryrun = 0;
38
class ExceptionNotFound(Exception):
    """Raised by telnet_answer when the expected text is not in the output."""


class ExceptionPassword(Exception):
    """Not raised in the visible code; presumably a rejected password -- TODO confirm."""


class ExceptionTimeout(Exception):
    """Not raised in the visible code; presumably a device timeout -- TODO confirm."""


class ExceptionPrompt(Exception):
    """Not raised in the visible code; presumably an unexpected prompt -- TODO confirm."""


class ExceptionPort(Exception):
    """Not raised in the visible code; presumably a bad/closed port -- TODO confirm."""
44
def telnet_answer(telnet, expected, buffer):
    """Wait for a prompt on a connection and answer it.

    Reads from `telnet` with read_until(), passing TELNET_TIMEOUT as the
    timeout, and then writes `buffer` followed by CRLF.

    telnet   -- any object offering read_until(expected, timeout) and
                write(data); both telnetlib and pyssh sessions are passed
                in by the callers in this file.
    expected -- text that must appear in what was read.
    buffer   -- reply to send once the prompt has been seen.

    Raises ExceptionNotFound when the text read does not contain
    `expected`.
    """
    received = telnet.read_until(expected, TELNET_TIMEOUT)
    if expected not in received:
        raise ExceptionNotFound("'%s' not found" % expected)
    telnet.write(buffer + "\r\n")
55
56
57 # PCU has model, host, preferred-port, user, passwd, 
58
class PCUExpect:
    """Skeleton for an expect-style PCU driver.

    Only stores its configuration; run() is a no-op here and is presumably
    meant to be overridden -- TODO confirm, no subclass is visible in this
    file.
    """

    def __init__(self, protocol, verbose, dryrun):
        """Remember the protocol, verbosity level and dry-run flag."""
        self.verbose = verbose
        self.protocol = protocol
        self.dryrun = dryrun

    # The original definition took (telnet, expected, buffer) without `self`
    # and without a decorator, so it could not be called through an instance
    # (or, on Python 2, through the class).  It never touches instance
    # state, so it is exposed as a static method.
    @staticmethod
    def telnet_answer(telnet, expected, buffer):
        """Wait for `expected` on `telnet`, then send `buffer` plus CRLF.

        Raises ExceptionNotFound when the text read does not contain
        `expected`.
        """
        output = telnet.read_until(expected, TELNET_TIMEOUT)
        if output.find(expected) == -1:
            raise ExceptionNotFound("'%s' not found" % expected)
        telnet.write(buffer + "\r\n")

    def _run(self, host, user, passwd, node_port, protocols):
        """Invoke run(); the connection arguments are accepted but unused."""
        self.run()

    def run(self):
        """Do nothing (placeholder)."""
        pass
81                 
82         
83
def ipal_reboot(ip, password, port, dryrun):
    """Pulse relay `port` of a DataProbe iPal over telnet.

    ip       -- address of the iPal.
    password -- password sent at the "Password >" prompt.
    port     -- relay number; sent as the command "P<port>".
    dryrun   -- when true, log in but skip the pulse command.

    Returns 0 on success, errno.ECONNRESET when the connection hits EOF,
    errno.ETIMEDOUT on socket.error, and the string "ipal error" for any
    other exception.  The connection, if one was opened, is always closed
    before returning.
    """
    telnet = None

    try:
        telnet = telnetlib.Telnet(ip, timeout=TELNET_TIMEOUT)
        telnet.set_debuglevel(verbose)

        # XXX Some iPals require you to hit Enter a few times first
        telnet_answer(telnet, "Password >", "\r\n\r\n")

        # Login
        telnet_answer(telnet, "Password >", password)

        # P# - Pulse relay
        if not dryrun:
            telnet_answer(telnet, "Enter >", "P%d" % port)

        # Wait for the menu prompt to come back before hanging up.
        telnet.read_until("Enter >", TELNET_TIMEOUT)
        return 0

    except EOFError as err:
        if verbose:
            logger.debug("ipal_reboot: EOF")
            logger.debug(err)
        traceback.print_exc()
        return errno.ECONNRESET

    except socket.error as err:
        logger.debug("ipal_reboot: Socket Error")
        logger.debug(err)
        traceback.print_exc()
        return errno.ETIMEDOUT

    except Exception as err:
        if verbose:
            logger.debug("ipal_reboot: Exception")
            logger.debug(err)
        traceback.print_exc()
        return "ipal error"

    finally:
        # Single cleanup point: the EOF handler used to call close() even
        # when no connection existed, and the socket.error handler never
        # closed the connection at all.
        if telnet is not None:
            telnet.close()
153
154
def apc_reboot(ip, username, password, port, protocol, dryrun):
    """Reboot outlet `port` of an APC unit through its text menus.

    ip, username, password -- address and credentials of the unit.
    port     -- outlet number, typed into the outlet menu.
    protocol -- container indexed by port-number strings; ssh is used when
                protocol['22'] == "open", otherwise telnet when
                protocol['23'] == "open".
    dryrun   -- when true, walk the menus up to "Immediate Reboot" but do
                not answer the confirmation prompt.

    Returns 0 on success, errno.ECONNRESET on EOFError, errno.ETIMEDOUT on
    socket.error, and the string "apc error" for any other failure
    (including neither port being open).  The connection, if one was
    opened, is always closed before returning.
    """
    transport = None

    try:
        if "22" in protocol and protocol['22'] == "open":
            transport = pyssh.Ssh(username, ip)
            transport.open()
            # Login
            telnet_answer(transport, "password:", password)
        elif "23" in protocol and protocol['23'] == "open":
            transport = telnetlib.Telnet(ip, timeout=TELNET_TIMEOUT)
            transport.set_debuglevel(verbose)
            # Login
            telnet_answer(transport, "User Name", username)
            telnet_answer(transport, "Password", password)
        else:
            logger.debug("Unknown protocol %s" % protocol)
            # Was a string exception (raise "..."), which is itself a
            # TypeError on Python >= 2.6 and is not caught by
            # `except Exception` on older interpreters.
            raise ExceptionPort("Closed protocol ports!")

        # Each entry answers one "\r\n> " menu prompt, in order.
        menu_path = (
            "1",        # main menu:    1- Device Manager
            "3",        # device menu:  3- Outlet Control/Config
            str(port),  # outlet list:  n- Outlet n
            "1",        # outlet menu:  1- Control Outlet
            "3",        # control menu: 3- Immediate Reboot
        )
        for choice in menu_path:
            telnet_answer(transport, "\r\n> ", choice)

        if not dryrun:
            telnet_answer(transport,
                          "Enter 'YES' to continue or <ENTER> to cancel",
                          "YES\r\n")
            telnet_answer(transport,
                          "Press <ENTER> to continue...", "")

        return 0

    except EOFError as err:
        if verbose:
            logger.debug(err)
        return errno.ECONNRESET

    except socket.error as err:
        if verbose:
            logger.debug(err)
        return errno.ETIMEDOUT

    except Exception as err:
        traceback.print_exc()
        if verbose:
            logger.debug(err)
        return "apc error"

    finally:
        # Single cleanup point; the socket.error path used to leave the
        # connection open.
        if transport is not None:
            transport.close()
249
def drac_reboot(ip, username, password, dryrun):
    """Power-cycle a machine through its DRAC command line over ssh.

    Logs in as `username`, waits for the "[<username>]#" prompt and sends
    "serveraction powercycle"; with `dryrun` true it sends "getsysinfo"
    instead.  Progress markers are printed to stdout.

    Returns 0 on success, errno.ETIMEDOUT on socket.error, and the string
    "drac error" for any other exception.
    """
    ssh = None
    prompt = "[%s]#" % username

    try:
        ssh = pyssh.Ssh(username, ip)
        ssh.set_debuglevel(verbose)
        ssh.open()

        # Login
        print("password")
        telnet_answer(ssh, "password:", password)

        print("reset or power")
        if dryrun:
            # Harmless query instead of the real power cycle.
            telnet_answer(ssh, prompt, "getsysinfo")
        else:
            # Reset this machine
            telnet_answer(ssh, prompt, "serveraction powercycle")

        print("exit")
        telnet_answer(ssh, prompt, "exit")

        # Close
        print("close")
        ssh.close()
        return 0

    except Exception as err:
        # One handler for both failure kinds: the two original handlers
        # were identical apart from the value returned.
        print("exception")
        traceback.print_exc()
        if verbose:
            logger.debug(err)
        if ssh is not None:
            output = ssh.close()
            if verbose:
                # Log what close() returned (as x10toggle_reboot does);
                # the original logged `err` a second time here.
                logger.debug(output)
        if isinstance(err, socket.error):
            return errno.ETIMEDOUT
        return "drac error"
299
def ilo_reboot(ip, username, password, dryrun):
    """Reset a machine through its HP iLO command line over ssh.

    Logs in as `username`, changes to "system1" and sends "reset"; with
    `dryrun` true it sends "POWER" instead.  Progress markers are printed
    to stdout.

    Returns 0 on success, errno.ETIMEDOUT on socket.error, and the string
    "ilo error" for any other exception.
    """
    ssh = None
    system_prompt = "</system1>hpiLO->"

    try:
        ssh = pyssh.Ssh(username, ip)
        ssh.set_debuglevel(verbose)
        ssh.open()

        # Login
        print("password")
        telnet_answer(ssh, "password:", password)

        # After the login banner the iLO shows the prompt "</>hpiLO-> ".
        print("cd system1")
        telnet_answer(ssh, "</>hpiLO->", "cd system1")

        print("reset or power")
        if dryrun:
            telnet_answer(ssh, system_prompt, "POWER")
        else:
            # Reset this machine
            telnet_answer(ssh, system_prompt, "reset")

        print("exit")
        telnet_answer(ssh, system_prompt, "exit")

        # Close
        print("close")
        ssh.close()
        return 0

    except Exception as err:
        # One handler for both failure kinds: the two original handlers
        # were identical apart from the value returned.
        print("exception")
        traceback.print_exc()
        if verbose:
            logger.debug(err)
        if ssh is not None:
            output = ssh.close()
            if verbose:
                # Log what close() returned (as x10toggle_reboot does);
                # the original logged `err` a second time here.
                logger.debug(output)
        if isinstance(err, socket.error):
            return errno.ETIMEDOUT
        return "ilo error"
360
def baytech_reboot(ip, username, password, port, dryrun):
    """Reboot outlet `port` of a BayTech unit through its ssh menus.

    Logs in, selects menu entry "5" (Control Outlets), sends
    "Reboot <port>" at the "DS-RPC>" prompt and answers the "(Y/N)?"
    confirmation with "Y", or with "N" when `dryrun` is true.

    Returns 0 on success, errno.ETIMEDOUT on socket.error, and the string
    "baytech error" for any other exception.
    """
    ssh = None

    try:
        ssh = pyssh.Ssh(username, ip)
        ssh.set_debuglevel(verbose)
        ssh.open()

        # Login
        telnet_answer(ssh, "password:", password)

        # Main menu: Control Outlets  (5 ,1).........5
        telnet_answer(ssh, "Enter Request :", "5")

        # Reboot N
        reboot_cmd = "Reboot %d" % port
        try:
            telnet_answer(ssh, "DS-RPC>", reboot_cmd)
        except ExceptionNotFound as msg:
            # one machine is configured to ask for a username,
            # even after login...
            print("msg: %s" % msg)
            ssh.write(username + "\r\n")
            telnet_answer(ssh, "DS-RPC>", reboot_cmd)

        # Reboot Outlet  N        (Y/N)?
        if dryrun:
            telnet_answer(ssh, "(Y/N)?", "N")
        else:
            telnet_answer(ssh, "(Y/N)?", "Y")
        telnet_answer(ssh, "DS-RPC>", "")

        # Close
        ssh.close()
        return 0

    except Exception as err:
        # One handler for both failure kinds: the two original handlers
        # were identical apart from the value returned.
        print("exception")
        traceback.print_exc()
        if verbose:
            logger.debug(err)
        if ssh is not None:
            output = ssh.close()
            if verbose:
                # Log what close() returned (as x10toggle_reboot does);
                # the original logged `err` a second time here.
                logger.debug(output)
        if isinstance(err, socket.error):
            return errno.ETIMEDOUT
        return "baytech error"
429
430 ### rebooting european BlackBox PSE boxes
431 # Thierry Parmentelat - May 11 2005
432 # tested on 4-ports models known as PSE505-FR
433 # uses http to POST a data 'P<port>=r'
434 # relies on basic authentication within http1.0
435 # first curl-based script was
436 # curl --http1.0 --basic --user <username>:<password> --data P<port>=r \
437 #       http://<hostname>:<http_port>/cmd.html && echo OK
438
def bbpse_reboot(pcu_ip, username, password, port_in_pcu, http_port, dryrun):
    """Reboot one port of a BlackBox PSE unit with an HTTP POST.

    POSTs the data "P<port_in_pcu>=r" to
    http://<pcu_ip>:<http_port>/cmd.html using HTTP basic authentication.
    The authenticating opener is installed as urllib2's global opener.

    dryrun -- when true, everything is prepared but nothing is sent.

    Returns 0 on success (or dry run) and the string "bbpse error" when
    urllib2 raises URLError.
    """
    url = "http://%s:%d/cmd.html" % (pcu_ip, http_port)
    data = "P%d=r" % port_in_pcu
    if verbose:
        logger.debug("POSTing '%s' on %s" % (data, url))

    authinfo = urllib2.HTTPPasswordMgrWithDefaultRealm()
    uri = "%s:%d" % (pcu_ip, http_port)
    authinfo.add_password(None, uri, username, password)
    authhandler = urllib2.HTTPBasicAuthHandler(authinfo)

    opener = urllib2.build_opener(authhandler)
    urllib2.install_opener(opener)

    if dryrun:
        return 0

    try:
        f = urllib2.urlopen(url, data)
        try:
            r = f.read()
        finally:
            # The response object was never closed before.
            f.close()
        if verbose:
            logger.debug(r)
        return 0

    except urllib2.URLError as err:
        # The message needs a placeholder for `err`; without one the
        # logging module reports a formatting error instead of the message.
        logger.info('Could not open http connection: %s', err)
        return "bbpse error"
470
471 ### rebooting x10toggle based systems addressed by port
472 # Marc E. Fiuczynski - May 31 2005
473 # tested on 4-ports models known as PSE505-FR
474 # uses ssh and password to login to an account
475 # that will cause the system to be powercycled.
476
def x10toggle_reboot(ip, username, password, port, dryrun):
    """Toggle power on `port` by logging in to an x10toggle account via ssh.

    After the password prompt, "A<port>" is sent at the "x10toggle>"
    prompt unless `dryrun` is true.

    Returns 0 on success and errno.ETIMEDOUT on any exception.
    """
    session = None

    def hang_up():
        # Close the session and, in verbose mode, log what close() returned.
        closing_output = session.close()
        if verbose:
            logger.debug(closing_output)

    try:
        session = pyssh.Ssh(username, ip)
        session.open()

        # Login
        telnet_answer(session, "password:", password)

        # Reboot
        if not dryrun:
            telnet_answer(session, "x10toggle>", "A%d" % port)

        # Close
        hang_up()
    except Exception as failure:
        if verbose:
            logger.debug(failure)
        if session:
            hang_up()
        return errno.ETIMEDOUT

    return 0
506
507 ### rebooting Dell systems via RAC card
508 # Marc E. Fiuczynski - June 01 2005
509 # tested with David Lowenthal's itchy/scratchy nodes at UGA
510 #
511
def runcmd(command, args, username, password, timeout = None):
    """Run a command in a worker thread, feeding it username and password.

    command  -- program to run; when `args` is non-empty the two are joined
                with spaces into one command string.
    args     -- list of argument strings (may be empty or None).
    username, password -- each written as one line to the child's stdin.
    timeout  -- seconds to wait for the worker; None waits indefinitely.

    Returns the child's combined stdout/stderr output when it exits with
    status 0.  Raises Exception when the wait times out or when the child
    fails or is killed by a signal, and re-raises any exception hit by the
    worker thread.
    """
    outcome = [None]
    done = threading.Condition()

    def publish(value):
        # Store the worker's result and wake up the waiting caller.
        done.acquire()
        try:
            outcome[0] = value
        finally:
            done.notify()
            done.release()

    def worker_main(command, username, password):
        try:
            # Popen4 is a popen-type class that combines stdout and stderr
            child = popen2.Popen4(command)

            # send the credentials, then read all output data
            child.tochild.write("%s\n" % username)
            child.tochild.write("%s\n" % password)
            child.tochild.close()
            captured = child.fromchild.read()

            while True:
                # might get interrupted by a signal in poll() or waitpid()
                try:
                    status = child.wait()
                    publish((status, captured))
                    break
                except OSError as wait_error:
                    if wait_error.errno != errno.EINTR:
                        raise wait_error
                    continue
        except Exception as failure:
            publish(failure)

    if args:
        command = " ".join([command] + args)

    worker = threading.Thread(target = worker_main,
                              args = (command, username, password))
    worker.setDaemon(True)

    # Hold the condition before starting the worker so that its notify()
    # cannot happen before this thread is waiting.
    done.acquire()
    worker.start()
    done.wait(timeout)
    try:
        if outcome[0] is None:
            raise Exception("command timed-out: '%s'" % command)
    finally:
        done.release()

    final = outcome[0]
    if isinstance(final, Exception):
        raise final

    status, captured = final
    exited = os.WIFEXITED(status)
    if exited and os.WEXITSTATUS(status) == 0:
        return captured

    report = "system command ('%s') " % command
    if exited:
        report += "failed, rc = %d" % os.WEXITSTATUS(status)
    else:
        report += "killed by signal %d" % os.WTERMSIG(status)
    if captured:
        report += "; output follows:\n" + captured
    raise Exception(report)
581
def racadm_reboot(ip, username, password, port, dryrun):
    """Power-cycle a machine with the local /usr/sbin/racadm tool.

    Runs "racadm -r <ip> -i serveraction powercycle" through runcmd(),
    which supplies `username` and `password` on stdin.  With `dryrun` true
    the tool is only checked for existence.  `port` is not used.

    Returns 0 on success and errno.ETIMEDOUT on any exception (including a
    missing racadm binary).
    """
    racadm = "/usr/sbin/racadm"

    try:
        # Raises if the tool is not installed.
        os.stat(racadm)

        if dryrun:
            output = "dryrun of racadm command"
        else:
            output = runcmd(racadm,
                            ["-r %s -i serveraction powercycle" % ip],
                            username, password)

        logger.debug("runcmd returned without output %s" % output)
        if verbose:
            logger.debug(output)
        return 0

    except Exception as err:
        logger.debug("runcmd raised exception %s" % err)
        if verbose:
            logger.debug(err)
        return errno.ETIMEDOUT
604
def reboot(nodename, dryrun):
    """Reboot `nodename` through its power control unit (PCU).

    Looks the PCU up with plc.getpcu() and dispatches on its 'model' and
    'protocol' fields to the matching *_reboot driver; the node's port on
    the PCU is read from pcu[nodename].

    dryrun -- passed through to the driver.

    Returns False when the node has no PCU (plc.nodePOD(nodename) is
    called instead) or when the PCU type is not supported, True once a
    driver has been invoked.

    NOTE(review): True is returned even when the driver reports a failure;
    the failure is only logged.  Callers that need "actually rebooted"
    semantics should be checked before tightening this.
    """
    pcu = plc.getpcu(nodename)
    if not pcu:
        plc.nodePOD(nodename)
        return False

    # Try the PCU first
    logger.debug("Trying PCU %s %s" % (pcu['hostname'], pcu['model']))

    model = pcu['model']
    protocol = pcu['protocol']

    # APC Masterswitch (Berkeley)
    if model == "APC Masterswitch":
        err = apc_reboot(pcu['ip'], pcu['username'], pcu['password'],
                         pcu[nodename], protocol, dryrun)

    # DataProbe iPal (many sites)
    elif protocol == "telnet" and model.find("IP-4") >= 0:
        err = ipal_reboot(pcu['ip'], pcu['password'], pcu[nodename], dryrun)

    # BayTech DS4-RPC
    elif protocol == "ssh" and \
            (model.find("Baytech") >= 0 or model.find("DS4") >= 0):
        err = baytech_reboot(pcu['ip'], pcu['username'], pcu['password'],
                             pcu[nodename], dryrun)

    # BlackBox PSExxx-xx (e.g. PSE505-FR)
    elif protocol == "http" and model == "bbpse":
        err = bbpse_reboot(pcu['ip'], pcu['username'], pcu['password'],
                           pcu[nodename], 80, dryrun)

    # x10toggle
    elif protocol == "ssh" and model == "x10toggle":
        err = x10toggle_reboot(pcu['ip'], pcu['username'], pcu['password'],
                               pcu[nodename], dryrun)

    # Dell RAC via racadm
    elif protocol == "racadm" and model == "RAC":
        # Was `pcu_[nodename]`, a NameError whenever this branch ran.
        err = racadm_reboot(pcu['ip'], pcu['username'], pcu['password'],
                            pcu[nodename], dryrun)

    # Unknown or unsupported
    else:
        logger.debug("Unsupported PCU %s %s" % (model, protocol))
        return False

    # The drivers return 0 on success and an errno value or an error
    # string otherwise; make failures visible instead of dropping them.
    if err != 0:
        logger.debug("PCU reboot of %s failed: %s" % (nodename, err))
    return True
645
646 #def get_suggested(suggestion_id,db):
647 #
648 #       sql= """select node_id,pcu_id from nodes where suggestion = %d """\
649 #                       % (suggestion_id)
650 #       try:
651 #               nodes = db.query(sql).dictresult()
652 #       except pg.ProgrammingError, err:
653 #               print( "Database error for query: %s\n%s" % (sql,err) )
654 #               sys.exit(1)
655 #       return nodes
656
657 #def get_pcu_info(node_id,pcu_id,db):
658 #       sql= """select port_number from pcu_ports where node_id = %d and pcu_id = %d """\
659 #                       % (node_id,pcu_id)
660 #       try:
661 #          port_number = db.query(sql).dictresult()
662 #       except pg.ProgrammingError, err:
663 #               print( "Database error for query: %s\n%s" % (sql,err) )
664 #               sys.exit(1)
665 #       
666 #       sql= """select * from pcu where pcu_id = %d """\
667 #                       % (pcu_id)
668 #       try:
669 #               pcu = db.query(sql).dictresult()
670 #       except pg.ProgrammingError, err:
671 #               print( "Database error for query: %s\n%s" % (sql,err) )
672 #               sys.exit(1)
673 #
674 #       result = {'node_id':node_id,'pcu_id':pcu_id,'port_number':port_number[0]['port_number'], 
675 #                         'ip':pcu[0]['ip'],'username':pcu[0]['username'],'password':pcu[0]['password'],\
676 #                         'model':pcu[0]['model'],'protocol':pcu[0]['protocol'],'hostname':pcu[0]['hostname']}
677 #
678 #       return result
679
680 #def add_plc_event(node_id,err,db):
681 #       site_id = plc_db_utils.get_site_from_node_id(node_id,db)
682 #       message = "PCU reboot by monitor-msgs@planet-lab.org: %s" % os.strerror(err)
683 #
684 #       sql = """insert into events (event_class_id,message,person_id,node_id,site_id) values """\
685 #                 """(%d,'%s',%d,%d,%d)""" % (NODE_POWER_CONTROL,message,MONITOR_USER_ID,node_id,site_id)
686 #       print sql
687 #
688 #       try:
689 #               db.query(sql)
690 #       except pg.ProgrammingError, err:
691 #               print( "Database error for: %s\n%s" % (sql,err) )
692 #               sys.exit(1)
693
694
def main():
    """Self-test: log to the console and dry-run a reboot of two nodes."""
    logger.setLevel(logging.DEBUG)
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    formatter = logging.Formatter('LOGGER - %(message)s')
    ch.setFormatter(formatter)
    logger.addHandler(ch)

    try:
        # reboot() requires a dryrun argument; the original calls omitted
        # it and therefore always failed with a TypeError.  Dry-run mode is
        # used so that running this file never power-cycles these nodes;
        # pass False deliberately to reboot them for real.
        reboot("planetlab2.cs.uchicago.edu", True)
        reboot("alice.cs.princeton.edu", True)
    except Exception as err:
        print(err)
709         # used later for pretty printing
710 #       pp = pprint.PrettyPrinter(indent=2)
711
712 #       user = "Monitor"
713 #       password = None
714
715 #       plc_db = plc_dbs.open_plc_db_write()
716 #       mon_db = plc_dbs.open_mon_db()
717
718         # 5 = needs script reboot - fix this later
719 #       nodes = get_suggested(5,mon_db)
720
721 #       for row in nodes:
722                 
723 #               pcu = get_pcu_info(row['node_id'],row['pcu_id'],plc_db)
724 #               add_plc_event(row['node_id'],err,plc_db)
725
if __name__ == "__main__":
    # Script entry point: bind plc and the "monitor" logger again at module
    # level, then run the self-test in main().
    import plc
    logger = logging.getLogger("monitor")
    main()