break out the functions that are needed by the monitor module for rebooting
[monitor.git] / monitor / reboot.py
1 #!/usr/bin/python
2 #
3 # Reboot specified nodes
4 #
5
6 import getpass, getopt
7 import os, sys
8 import xml, xmlrpclib
9 import errno, time, traceback
10 import urllib2
11 import urllib
12 import threading, popen2
13 import array, struct
14 import base64
15 from subprocess import PIPE, Popen
16 import pcucontrol.transports.ssh.pxssh as pxssh
17 import pcucontrol.transports.ssh.pexpect as pexpect
18 import socket
19
20
21
22 # Use our versions of telnetlib and pyssh
23 sys.path.insert(0, os.path.dirname(sys.argv[0]))
24 import pcucontrol.transports.telnetlib as telnetlib
25 sys.path.insert(0, os.path.dirname(sys.argv[0]) + "/pyssh")    
26 import pcucontrol.transports.pyssh as pyssh
27
28 from monitor import config
29 from monitor.util import command
30 from monitor.wrapper import plc
31
32
33 # Event class ID from pcu events
34 #NODE_POWER_CONTROL = 3
35
36 # Monitor user ID
37 #MONITOR_USER_ID = 11142
38
39 import logging
40 logger = logging.getLogger("monitor")
41 verbose = 1
42 #dryrun = 0;
43
class ExceptionNoTransport(Exception):
    """Raised by Transport when no underlying connection has been opened."""


class ExceptionNotFound(Exception):
    """Reported by PCUControl.catcherror() as a plain "error: ..." string."""


class ExceptionPassword(Exception):
    """Raised when the expected password prompt is not seen."""


class ExceptionTimeout(Exception):
    """Reported by PCUControl.catcherror() as a "Timeout exception"."""


class ExceptionPrompt(Exception):
    """Default error of Transport.ifThenSend() when a prompt is missing."""


class ExceptionSequence(Exception):
    """Reported by PCUControl.catcherror() as a "Sequence error"."""


class ExceptionReset(Exception):
    """Declared for PCU drivers; not raised or caught in this module."""


class ExceptionPort(Exception):
    """Raised when none of a PCU's supported ports is reported open."""


class ExceptionUsername(Exception):
    """Raised when the expected user-name prompt is not seen."""
53
54
55
56 # PCU has model, host, preferred-port, user, passwd, 
57
58 # This is an object derived directly from the PLCAPI DB fields
class PCU(object):
    """Attribute view of one PCU dictionary.

    Each required key of the dictionary becomes an instance attribute of the
    same name.  Keys outside the required set are ignored.
    """

    def __init__(self, plc_pcu_dict):
        """Copy the required fields out of `plc_pcu_dict`.

        Raises Exception naming the first required field that is missing;
        fields listed before it have already been set by then.
        """
        required = ('username', 'password', 'site_id',
                    'hostname', 'ip',
                    'pcu_id', 'model',
                    'node_ids', 'ports')
        for name in required:
            if name not in plc_pcu_dict:
                raise Exception("No such field %s in PCU object" % name)
            setattr(self, name, plc_pcu_dict[name])
69
70 # These are the convenience functions built around the PCU object.
class PCUModel(PCU):
    """PCU record plus convenience lookups (display name, node id -> port)."""

    def __init__(self, plc_pcu_dict):
        """Load the PCU fields and cache the preferred address in `self.host`."""
        PCU.__init__(self, plc_pcu_dict)
        self.host = self.pcu_name()

    def pcu_name(self):
        """Return the hostname if set, else the ip, else None.

        "Set" means neither None nor the empty string.  The comparison is by
        value: an identity test against "" only works when the interpreter
        happens to share the empty-string object, and fails for e.g. an
        empty unicode string.
        """
        for candidate in (self.hostname, self.ip):
            if candidate is not None and candidate != "":
                return candidate
        return None

    def nodeidToPort(self, node_id):
        """Return the port stored at the same index as `node_id`.

        `node_ids` and `ports` are treated as parallel lists.  Raises
        Exception when `node_id` is not listed.
        """
        for index, known_id in enumerate(self.node_ids):
            if known_id == node_id:
                return self.ports[index]

        # %s rather than %d: a non-numeric id must still produce this
        # exception instead of a TypeError from the format operator.
        raise Exception("No such Node ID: %s" % node_id)
91
92 # This class captures the observed pcu records from FindBadPCUs.py
class PCURecord:
    """Observed probe state of a PCU, copied from a FindBadPCUs record.

    Only 'port_status', 'dns_status' and 'entry_complete' are copied, each
    to an attribute of the same name.  A field absent from the record is
    skipped silently, so the attribute may not exist afterwards.
    """

    def __init__(self, pcu_record_dict):
        """Copy the known probe fields that are present in `pcu_record_dict`."""
        for field in ('port_status', 'dns_status', 'entry_complete'):
            if field in pcu_record_dict:
                # setattr() rather than self.__setattr__(): this is a
                # classic class, and its own instances have no __setattr__
                # attribute under Python 2.
                setattr(self, field, pcu_record_dict[field])
105
class Transport:
    """Uniform open/send/close wrapper over telnet, ssh and http connections.

    The class constants name the transport types and `porttypemap` maps a
    TCP port number to the type expected on it.  Only TELNET, SSH and HTTP
    are implemented by open(), close() and sendPassword(); the other types
    exist for the port mapping and make those methods raise.
    """

    TELNET = "telnet"
    SSH    = "ssh"
    HTTP   = "http"
    HTTPS  = "https"
    IPAL   = "ipal"
    DRAC   = "drac"
    AMT    = "amt"

    # Seconds passed to the telnet constructor and to every read_until().
    TELNET_TIMEOUT = 120

    porttypemap = {
        5869 : DRAC,
        22 : SSH,
        23 : TELNET,
        443 : HTTPS,
        80 : HTTP,
        9100 : IPAL,
        16992 : AMT,
    }

    def __init__(self, type, verbose):
        """Remember the transport `type` and debug level; nothing is opened."""
        self.type = type
        self.verbose = verbose
        self.transport = None

    def open(self, host, username=None, password=None, prompt="User Name"):
        """Open a connection to `host` and store it in `self.transport`.

        telnet: connects and, if `username` is given, waits for `prompt`
                and sends the user name (ExceptionUsername if not seen).
        ssh:    requires `username`; raises Exception when it is None.
        http:   builds a basic-auth opener for port 80 and sets `self.url`.

        Returns True.  Raises Exception for any other transport type.
        """
        transport = None

        if self.type == self.TELNET:
            transport = telnetlib.Telnet(host, timeout=self.TELNET_TIMEOUT)
            transport.set_debuglevel(self.verbose)
            if username is not None:
                # ifThenSend() talks through self.transport, so it has to
                # be assigned before the login prompt is handled.
                self.transport = transport
                self.ifThenSend(prompt, username, ExceptionUsername)

        elif self.type == self.SSH:
            if username is None:
                raise Exception("Username cannot be None for ssh transport.")
            transport = pyssh.Ssh(username, host)
            transport.set_debuglevel(self.verbose)
            transport.open()
            # TODO: have an ssh set_debuglevel() also...

        elif self.type == self.HTTP:
            # NOTE: this does not work for all web-based services...
            self.url = "http://%s:%d/" % (host, 80)
            uri = "%s:%d" % (host, 80)

            # create authinfo
            authinfo = urllib2.HTTPPasswordMgrWithDefaultRealm()
            authinfo.add_password(None, uri, username, password)
            authhandler = urllib2.HTTPBasicAuthHandler(authinfo)

            transport = urllib2.build_opener(authhandler)
        else:
            raise Exception("Unknown transport type: %s" % self.type)

        self.transport = transport
        return True

    def close(self):
        """Close the connection (if any) and forget it.

        Safe to call when nothing was opened.  Raises Exception for
        transport types other than telnet, ssh and http.
        """
        if self.type in (self.TELNET, self.SSH):
            # Guard against a failed or missing open(): there may be
            # nothing to close.
            if self.transport is not None:
                self.transport.close()
        elif self.type == self.HTTP:
            pass
        else:
            raise Exception("Unknown transport type %s" % self.type)
        self.transport = None

    def write(self, msg):
        """Alias for send()."""
        return self.send(msg)

    def send(self, msg):
        """Write `msg` to the connection; ExceptionNoTransport if not open."""
        if self.transport is None:
            raise ExceptionNoTransport("transport object is type None")

        return self.transport.write(msg)

    def sendPassword(self, password, prompt=None):
        """Wait for the password prompt and answer it.

        telnet waits for `prompt` (default "Password"), ssh for "password:",
        http does nothing.  Raises ExceptionPassword when the prompt does
        not appear and Exception for any other transport type.
        """
        if self.type == self.TELNET:
            if prompt is None:
                prompt = "Password"
            self.ifThenSend(prompt, password, ExceptionPassword)
        elif self.type == self.SSH:
            self.ifThenSend("password:", password, ExceptionPassword)
        elif self.type == self.HTTP:
            pass
        else:
            raise Exception("Unknown transport type: %s" % self.type)

    def sendHTTP(self, resource, data):
        """POST `data` to `self.url + resource`.

        Returns 0 on success, or the string "http transport error" when the
        request raises urllib2.URLError.
        """
        if self.verbose:
            print("POSTing '%s' to %s" % (data, self.url + resource))

        try:
            response = self.transport.open(self.url + resource, data)
            try:
                body = response.read()
            finally:
                # Release the connection even if read() fails.
                response.close()
            if self.verbose:
                print(body)

        except urllib2.URLError as err:
            # The message needs a placeholder for err; without one the
            # logging module reports a formatting error instead.
            logger.info('Could not open http connection: %s', err)
            return "http transport error"

        return 0

    def ifThenSend(self, expected, buffer, ErrorClass=ExceptionPrompt):
        """Wait for `expected`, then send `buffer` followed by CRLF.

        Raises ErrorClass when the text read within TELNET_TIMEOUT does not
        contain `expected`, and ExceptionNoTransport when not open.
        """
        if self.transport is None:
            raise ExceptionNoTransport("transport object is type None")

        output = self.transport.read_until(expected, self.TELNET_TIMEOUT)
        if output.find(expected) == -1:
            print("OUTPUT: --%s--" % output)
            raise ErrorClass("'%s' not found" % expected)

        self.transport.write(buffer + "\r\n")

    def ifElse(self, expected, ErrorClass):
        """Wait for `expected`; turn any read failure into ErrorClass.

        NOTE(review): the text that was read is not checked for `expected`
        here, unlike ifThenSend() - confirm that is intended.
        """
        try:
            self.transport.read_until(expected, self.TELNET_TIMEOUT)
        except Exception:
            # Not a bare except: Ctrl-C and SystemExit must propagate.
            raise ErrorClass("Could not find '%s' within timeout" % expected)
233
class PCUControl(PCUModel,PCURecord):

    """
        There are three cases:
            1) the pcu_record passed below includes port_status from an
                external probe.
            2) the external probe failed, and the values are empty
            3) this call is made independent of port_status.

        In the first case, the first open port is used.
        In the third case, the ports are tried in sequence.

        In this way, the port_status value serves only as an optimization,
        because closed ports are avoided.  The supported_ports value should
        order ports by their preferred usage.
    """

    # Ports this PCU model can be driven through, most preferred first.
    # Model classes override this.
    supported_ports = []

    def __init__(self, plc_pcu_record, verbose, ignored=None):
        """Load both the PLC fields and the probe fields from one record.

        `verbose` and `ignored` are accepted but not stored here.
        """
        PCUModel.__init__(self, plc_pcu_record)
        PCURecord.__init__(self, plc_pcu_record)

    def reboot(self, node_port, dryrun):
        """Try each usable port's run_<type> method until one returns 0.

        Candidates are `supported_ports` in their declared order.  When
        `port_status` is present and non-empty they are narrowed to the
        ports it reports as "open".

        Returns 0 on success, otherwise the result of the last attempt, or
        a "No implementation..." string when no attempt was made.  Raises
        ExceptionPort when port_status leaves no usable port.
        """
        port_list = self.supported_ports

        if hasattr(self, 'port_status') and self.port_status:
            # port_status keys are converted with int() before comparing
            # against supported_ports.
            open_ports = set(int(port)
                             for port, state in self.port_status.items()
                             if state == "open")
            # Filter supported_ports by the open set, not the reverse, so
            # the preference order of supported_ports is kept.
            port_list = [port for port in self.supported_ports
                         if port in open_ports]
            if not port_list:
                raise ExceptionPort("No Open Port: No transport from open ports")

        print(port_list)

        ret = "No implementation for open ports on selected PCU model"
        for port in port_list:
            if port not in Transport.porttypemap:
                continue

            transport_type = Transport.porttypemap[port]
            # NOTE(review): this is the module-level `verbose`, not the
            # value given to __init__ - confirm that is intended.
            self.transport = Transport(transport_type, verbose)

            handler_name = "run_%s" % transport_type
            print("checking for %s" % handler_name)
            if not hasattr(self, handler_name):
                continue

            print("found %s" % handler_name)
            ret = self.catcherror(getattr(self, handler_name), node_port, dryrun)
            if ret == 0: # NOTE: success!, so stop
                break

        return ret

    def run(self, node_port, dryrun):
        """ This function is to be defined by the specific PCU instance.  """
        raise Exception("This function is not implemented")

    #def reboot(self, node_port, dryrun):

    def catcherror(self, function, node_port, dryrun):
        """Call function(node_port, dryrun), mapping known errors to strings.

        Returns the function's own result, or a descriptive string for the
        transport, prompt, socket, urllib2 and EOF errors listed below.  Any
        other exception is reported through email_exception(self.host) and
        re-raised as a plain Exception carrying the original as argument.
        """
        try:
            return function(node_port, dryrun)
        except ExceptionNotFound as err:
            return "error: " + str(err)
        except ExceptionPassword as err:
            return "Password exception: " + str(err)
        except ExceptionTimeout as err:
            return "Timeout exception: " + str(err)
        except ExceptionUsername as err:
            return "No username prompt: " + str(err)
        except ExceptionSequence as err:
            return "Sequence error: " + str(err)
        except ExceptionPrompt as err:
            return "Prompt exception: " + str(err)
        except ExceptionNoTransport as err:
            return "No Transport: " + str(err)
        except ExceptionPort as err:
            return "No ports exception: " + str(err)
        except socket.error as err:
            return "socket error: timeout: " + str(err)
        # HTTPError must come before URLError: it is a subclass of it.
        except urllib2.HTTPError as err:
            return "HTTPError: " + str(err)
        except urllib2.URLError as err:
            return "URLError: " + str(err)
        except EOFError as err:
            self.transport.close()
            import traceback
            traceback.print_exc()
            return "EOF connection reset" + str(err)
        except Exception as err:
            from monitor.common import email_exception
            email_exception(self.host)
            raise Exception(err)
339
340 from pcucontrol.models import *
341
def pcu_name(pcu):
    """Return the PCU's hostname if set, else its ip, else None.

    `pcu` is a mapping with 'hostname' and 'ip' keys; a value counts as set
    when it is neither None nor the empty string.  Raises KeyError when a
    key that has to be consulted is missing.

    The emptiness test compares by value.  An identity test against ""
    depends on the interpreter sharing one empty-string object and lets an
    empty unicode string through as a valid name.
    """
    for key in ('hostname', 'ip'):
        value = pcu[key]
        if value is not None and value != "":
            return value
    return None
349
def get_pcu_values(pcu_id):
    """Return the latest FindbadPCURecord for `pcu_id` as a dict, or None.

    None is returned both when no record exists and when the lookup or the
    conversion fails; this is a best-effort read.
    """
    from monitor.database.info.model import FindbadPCURecord
    print("pcuid: %s" % pcu_id)
    try:
        pcurec = FindbadPCURecord.get_latest_by(plc_pcuid=pcu_id).first()
        if pcurec:
            return pcurec.to_dict()
        return None
    except Exception as err:
        # Still best-effort, but no longer silent, and Ctrl-C / SystemExit
        # are not swallowed as they were by a bare except.
        logger.debug("Could not load probe record for pcu %s: %s", pcu_id, err)
        return None
363
def reboot(nodename):
    """Reboot `nodename` through its PCU for real (not a dry run).

    Delegates to reboot_policy() and returns its True/False result.
    """
    outcome = reboot_policy(nodename, continue_probe=True, dryrun=False)
    return outcome
366
def reboot_str(nodename):
    """Reboot `nodename` and return the raw result of reboot_test_new().

    Returns False when the node has no PCU or no probe record exists for
    that PCU; otherwise whatever reboot_test_new() returns.  Always a real
    run (dryrun is False).
    """
    pcu = plc.getpcu(nodename)
    if not pcu:
        message = "no pcu for %s" % nodename
        logger.debug(message)
        print(message)
        return False # "%s has no pcu" % nodename

    values = get_pcu_values(pcu['pcu_id'])
    if values is None:
        message = "No values for pcu probe %s" % nodename
        logger.debug(message)
        print(message)
        return False #"no info for pcu_id %s" % pcu['pcu_id']

    # Try the PCU first
    logger.debug("Trying PCU %s %s" % (pcu['hostname'], pcu['model']))

    return reboot_test_new(nodename, values, verbose, False)
389         
def reboot_policy(nodename, continue_probe, dryrun):
    """Reboot `nodename` through its PCU and report success as a boolean.

    Returns True only when reboot_test_new() returns 0.  Returns False when
    the node has no PCU, when no probe record exists, or when the reboot
    attempt returns anything else (that value is printed).
    `continue_probe` is accepted but not used.
    """
    pcu = plc.getpcu(nodename)
    if not pcu:
        message = "no pcu for %s" % nodename
        logger.debug(message)
        print(message)
        return False # "%s has no pcu" % nodename

    values = get_pcu_values(pcu['pcu_id'])
    if values is None:
        message = "No values for pcu probe %s" % nodename
        logger.debug(message)
        print(message)
        return False #"no info for pcu_id %s" % pcu['pcu_id']

    # Try the PCU first
    logger.debug("Trying PCU %s %s" % (pcu['hostname'], pcu['model']))

    ret = reboot_test_new(nodename, values, verbose, dryrun)

    if ret == 0:
        print("return true")
        return True

    print(ret)
    return False
416
class Unknown(PCUControl):
    """Fallback PCU type that model_to_object() returns for unrecognized models.

    It declares every port that has an entry in Transport.porttypemap.
    """

    supported_ports = [
        22,     # ssh
        23,     # telnet
        80,     # http
        443,    # https
        5869,   # drac
        9100,   # ipal
        16992,  # amt
    ]
419
def model_to_object(modelname):
    """Return the PCU control class whose marker occurs in `modelname`.

    None maps to ManualPCU.  Otherwise the markers are tried in the order
    listed and the first one contained in `modelname` wins.  With no match
    the name is printed and Unknown is returned.
    """
    if modelname is None:
        return ManualPCU

    # Each class is wrapped in a lambda so its name is only looked up when
    # its marker actually matches.
    candidates = (
        ("AMT",                lambda: IntelAMT),
        ("BayTech",            lambda: BayTech),
        ("HPiLO",              lambda: HPiLO),
        ("IPAL",               lambda: IPAL),
        ("APC",                lambda: APCControl),
        ("DRAC",               lambda: DRAC),
        ("WTI",                lambda: WTIIPS4),
        ("ePowerSwitch",       lambda: ePowerSwitchNew),
        ("IPMI",               lambda: IPMI),
        ("BlackBoxPSMaverick", lambda: BlackBoxPSMaverick),
        ("PM211MIP",           lambda: PM211MIP),
        ("ManualPCU",          lambda: ManualPCU),
    )
    for marker, resolve in candidates:
        if marker in modelname:
            return resolve()

    print("UNKNOWN model %s" % modelname)
    return Unknown
450
def reboot_api(node, pcu):
    """Reboot `node` through `pcu` and return the result or an error string.

    `pcu['model']` must name a PCU control class available in this module.
    The node's port is the entry of `pcu['ports']` at the index where
    `node['node_id']` appears in `pcu['node_ids']`.

    Returns the value of the model's reboot() (real run, dryrun False),
    "No modelname in PCU record." when the model is empty, or str(error)
    for any exception raised along the way.
    """
    rb_ret = ""

    try:
        modelname = pcu['model']
        if modelname:
            # SECURITY: the model string comes from the PCU record, so it
            # is looked up by name instead of being passed to eval(), which
            # would execute any expression stored in that field.  An
            # unknown name raises the same NameError message as before.
            model_class = globals().get(modelname)
            if model_class is None:
                raise NameError("name '%s' is not defined" % modelname)
            # NOTE(review): `verbose` is the module-level setting.
            instance = model_class(pcu, verbose)
            # get pcu port
            position = pcu['node_ids'].index(node['node_id'])
            port = pcu['ports'][position]
            # reboot
            rb_ret = instance.reboot(port, False)
        else:
            rb_ret = "No modelname in PCU record."
        # TODO: how to handle the weird, georgetown pcus, the drac faults, and ilo faults
    except Exception as err:
        rb_ret = str(err)

    return rb_ret
471
def convert_oldmodelname_to_newmodelname(oldmodelname, pcu_id):
    """Translate a legacy PCU model name into a control class name.

    The legacy name is mapped through a fixed table; names not in the table
    pass through unchanged.  A hard-coded list of individual PCU ids then
    overrides the result regardless of the model name.
    """
    legacy_names = {
        'AP79xx': 'APCControl13p13',
        'Masterswitch': 'APCControl13p13',
        'DS4-RPC': 'BayTech',
        'IP-41x_IP-81x': 'IPAL',
        'DRAC3': 'DRAC',
        'DRAC4': 'DRAC',
        'ePowerSwitch': 'ePowerSwitchOld',
        'ilo2': 'HPiLO',
        'ilo1': 'HPiLO',
        'PM211-MIP': 'PM211MIP',
        'AMT2.5': 'IntelAMT',
        'AMT3.0': 'IntelAMT',
        'WTI_IPS-4': 'WTIIPS4',
        'unknown': 'ManualPCU',
        'DRAC5': 'DRAC',
        'ipmi': 'OpenIPMI',
        'bbsemaverick': 'BlackBoxPSMaverick',
        'manualadmin': 'ManualPCU',
    }

    # (pcu ids, forced class name); the first group containing pcu_id wins.
    per_pcu_overrides = (
        ((1102, 1163, 1055, 1111, 1231, 1113, 1127, 1128, 1148), 'APCControl12p3'),
        ((1110, 86), 'APCControl1p4'),
        ((1221, 1225, 1220, 1192), 'APCControl121p3'),
        ((1173, 1240, 47, 1363, 1405, 1401, 1372, 1371), 'APCControl121p1'),
        ((1056, 1237, 1052, 1209, 1002, 1008, 1013, 1022), 'BayTechCtrlC'),
        ((93,), 'BayTechRPC3NC'),
        ((1057,), 'BayTechCtrlCUnibe'),
        ((1012,), 'BayTechRPC16'),
        ((1089, 1071, 1046, 1035, 1118), 'ePowerSwitchNew'),
    )

    newmodelname = legacy_names.get(oldmodelname, oldmodelname)

    for pcu_ids, forced_name in per_pcu_overrides:
        if pcu_id in pcu_ids:
            newmodelname = forced_name
            break

    return newmodelname
519
def reboot_test_new(nodename, values, verbose, dryrun):
    """Instantiate the PCU model named in `values` and reboot `nodename`.

    `values` is the probe record dict.  If it has a 'plc_pcu_stats' entry,
    that dict is merged into `values` in place first.  The model name is
    normalized with convert_oldmodelname_to_newmodelname() and must name a
    PCU control class available in this module.  The port handed to
    reboot() is `values[nodename]`.

    Returns the model's reboot() result, "Not_Run" when there is no model
    name, or str(error) for ExceptionPort and for an unknown model name
    (NameError).  Other exceptions propagate.
    """
    rb_ret = ""
    if 'plc_pcu_stats' in values:
        values.update(values['plc_pcu_stats'])

    try:
        modelname = convert_oldmodelname_to_newmodelname(values['model'], values['pcu_id'])
        if modelname:
            # SECURITY: the model string originates in the PCU record, so
            # it is looked up by name instead of being passed to eval(),
            # which would execute any expression stored in that field.  An
            # unknown name raises the same NameError message as before.
            model_class = globals().get(modelname)
            if model_class is None:
                raise NameError("name '%s' is not defined" % modelname)
            controller = model_class(values, verbose)
            rb_ret = controller.reboot(values[nodename], dryrun)
        else:
            rb_ret = "Not_Run"
        # TODO: how to handle the weird, georgetown pcus, the drac faults, and ilo faults
    except (ExceptionPort, NameError) as err:
        rb_ret = str(err)

    return rb_ret
539
def main():
    """Command-line entry point: reboot every node named in sys.argv.

    The literal argument "test" is not a node name; its presence anywhere
    on the command line turns the whole run into a dry run.  Log output is
    echoed to the console at DEBUG level.  Any exception stops the loop,
    is printed with a traceback and reported through email_exception().
    """
    logger.setLevel(logging.DEBUG)
    console = logging.StreamHandler()
    console.setLevel(logging.DEBUG)
    console.setFormatter(logging.Formatter('LOGGER - %(message)s'))
    logger.addHandler(console)

    try:
        dryrun = "test" in sys.argv

        for node in sys.argv[1:]:
            if node != "test":
                print("Rebooting %s" % node)
                if reboot_policy(node, True, dryrun):
                    print("success")
                else:
                    print("failed")
    except Exception as err:
        import traceback; traceback.print_exc()
        from monitor.common import email_exception
        email_exception(node)
        print(err)
567
if __name__ == '__main__':
    logger = logging.getLogger("monitor")
    main()
    # Append the command line of this run to /tmp/rebootlog.  The with
    # block closes the file even if the write fails.
    with open("/tmp/rebootlog", 'a') as f:
        f.write("reboot %s\n" % sys.argv)