add email_exception() calls throughout code.
[monitor.git] / pcucontrol / reboot.py
1 #!/usr/bin/python
2 #
3 # Reboot specified nodes
4 #
5
6 import getpass, getopt
7 import os, sys
8 import xml, xmlrpclib
9 import errno, time, traceback
10 import urllib2
11 import urllib
12 import threading, popen2
13 import array, struct
14 from monitor.wrapper import plc
15 import base64
16 from subprocess import PIPE, Popen
17 import pcucontrol.transports.ssh.pxssh as pxssh
18 import pcucontrol.transports.ssh.pexpect as pexpect
19 import socket
20 from monitor.util import command
21
22
23 # Use our versions of telnetlib and pyssh
24 sys.path.insert(0, os.path.dirname(sys.argv[0]))
25 import pcucontrol.transports.telnetlib as telnetlib
26 sys.path.insert(0, os.path.dirname(sys.argv[0]) + "/pyssh")    
27 import pcucontrol.transports.pyssh as pyssh
28 from monitor import config
29
30
31 # Event class ID from pcu events
32 #NODE_POWER_CONTROL = 3
33
34 # Monitor user ID
35 #MONITOR_USER_ID = 11142
36
37 import logging
38 logger = logging.getLogger("monitor")
39 verbose = 1
40 #dryrun = 0;
41
class ExceptionNoTransport(Exception):
    """Raised when data is sent but no underlying transport object is set."""


class ExceptionNotFound(Exception):
    """Reported by PCUControl.catcherror() with the prefix "error: "."""


class ExceptionPassword(Exception):
    """Raised when the expected password prompt is not seen."""


class ExceptionTimeout(Exception):
    """Reported by PCUControl.catcherror() as a timeout."""


class ExceptionPrompt(Exception):
    """Default error raised by Transport.ifThenSend() for a missing prompt."""


class ExceptionSequence(Exception):
    """Reported by PCUControl.catcherror() as a sequence error."""


class ExceptionReset(Exception):
    """Reset-related failure (not raised or caught anywhere in this module)."""


class ExceptionPort(Exception):
    """Raised when none of the open ports is supported by the PCU model."""


class ExceptionUsername(Exception):
    """Raised when the expected user name prompt is not seen."""
51
52
53
54 # PCU has model, host, preferred-port, user, passwd, 
55
# This is an object derived directly from the PLCAPI DB fields
class PCU(object):
    """Attribute view of a PCU record dictionary.

    Each required key of the dictionary is copied onto the instance as an
    attribute of the same name.
    """

    def __init__(self, plc_pcu_dict):
        """Copy the required fields of *plc_pcu_dict* onto this instance.

        Raises Exception naming the first required field that is missing.
        """
        required_fields = (
            'username', 'password', 'site_id',
            'hostname', 'ip',
            'pcu_id', 'model',
            'node_ids', 'ports',
        )
        for name in required_fields:
            if name not in plc_pcu_dict:
                raise Exception("No such field %s in PCU object" % name)
            setattr(self, name, plc_pcu_dict[name])
67
# These are the convenience functions built around the PCU object.
class PCUModel(PCU):
    """PCU record with helpers for naming the device and finding node ports."""

    def __init__(self, plc_pcu_dict):
        """Load the PCU fields and cache the address to contact as ``self.host``."""
        PCU.__init__(self, plc_pcu_dict)
        self.host = self.pcu_name()

    def pcu_name(self):
        """Return the hostname if set, otherwise the ip, otherwise None.

        A value counts as "set" when it is neither None nor the empty string.
        """
        # Compare by value: identity checks against a string literal ("is not")
        # only work by accident of interning and fail for e.g. unicode u"".
        if self.hostname is not None and self.hostname != "":
            return self.hostname
        if self.ip is not None and self.ip != "":
            return self.ip
        return None

    def nodeidToPort(self, node_id):
        """Return the port at the same index as *node_id* in ``self.node_ids``.

        Raises Exception when *node_id* is not listed in ``self.node_ids``.
        """
        for index, known_id in enumerate(self.node_ids):
            if known_id == node_id:
                return self.ports[index]

        # %s (not %d) so that a non-numeric id still yields this error rather
        # than a TypeError while formatting the message.
        raise Exception("No such Node ID: %s" % (node_id,))
89
# This class captures the observed pcu records from FindBadPCUs.py
class PCURecord:
    """Attribute view of the probe-result fields of a PCU record dictionary."""

    def __init__(self, pcu_record_dict):
        """Copy the probe fields present in *pcu_record_dict* onto this instance.

        Fields that are absent are skipped rather than treated as an error, so
        callers must use hasattr() before reading them.
        """
        # The former special case renaming a "reboot" field to "reboot_str"
        # could never run because "reboot" is not among the copied fields.
        for field in ('port_status', 'dns_status', 'entry_complete'):
            if field in pcu_record_dict:
                setattr(self, field, pcu_record_dict[field])
103
class Transport:
    """Uniform wrapper around the telnet, ssh and http connections to a PCU.

    The class names further transport kinds (https, ipal, drac, amt) and maps
    well-known port numbers to them through ``porttypemap``, but open(),
    close() and sendPassword() only implement telnet, ssh and http; any other
    type makes them raise Exception.
    """

    TELNET = "telnet"
    SSH    = "ssh"
    HTTP   = "http"
    HTTPS  = "https"
    IPAL   = "ipal"
    DRAC   = "drac"
    AMT    = "amt"

    # Timeout passed to the telnet connection and to every read_until() call.
    TELNET_TIMEOUT = 120

    # Port number -> transport type name.
    porttypemap = {
        5869: DRAC,
        22: SSH,
        23: TELNET,
        443: HTTPS,
        80: HTTP,
        9100: IPAL,
        16992: AMT,
    }

    def __init__(self, type, verbose):
        """Remember the transport *type* and debug level; nothing is opened yet."""
        self.type = type
        self.verbose = verbose
        self.transport = None

    def open(self, host, username=None, password=None, prompt="User Name"):
        """Open the connection to *host* and return True.

        telnet: connects and, when *username* is given, waits for *prompt*
                and sends the user name (ExceptionUsername if not prompted).
        ssh:    requires *username*; raises Exception otherwise.
        http:   builds a basic-auth opener for http://host:80/ and stores the
                base URL in ``self.url``; no request is made here.

        Raises Exception for any other transport type.
        """
        transport = None

        if self.type == self.TELNET:
            transport = telnetlib.Telnet(host, timeout=self.TELNET_TIMEOUT)
            transport.set_debuglevel(self.verbose)
            if username is not None:
                # ifThenSend() works on self.transport, so publish it first.
                self.transport = transport
                self.ifThenSend(prompt, username, ExceptionUsername)

        elif self.type == self.SSH:
            if username is None:
                raise Exception("Username cannot be None for ssh transport.")
            transport = pyssh.Ssh(username, host)
            transport.set_debuglevel(self.verbose)
            transport.open()

        elif self.type == self.HTTP:
            # NOTE: this does not work for all web-based services...
            self.url = "http://%s:%d/" % (host, 80)
            uri = "%s:%d" % (host, 80)

            # create authinfo
            authinfo = urllib2.HTTPPasswordMgrWithDefaultRealm()
            authinfo.add_password(None, uri, username, password)
            authhandler = urllib2.HTTPBasicAuthHandler(authinfo)

            transport = urllib2.build_opener(authhandler)
        else:
            raise Exception("Unknown transport type: %s" % self.type)

        self.transport = transport
        return True

    def close(self):
        """Close the connection (if one is open) and forget it.

        Raises Exception for transport types other than telnet, ssh and http.
        """
        if self.type in (self.TELNET, self.SSH):
            # Tolerate close() before/without a successful open().
            if self.transport is not None:
                self.transport.close()
        elif self.type == self.HTTP:
            pass
        else:
            raise Exception("Unknown transport type %s" % self.type)
        self.transport = None

    def write(self, msg):
        """Alias for send()."""
        return self.send(msg)

    def send(self, msg):
        """Write *msg* to the open transport; ExceptionNoTransport if none."""
        if self.transport is None:
            raise ExceptionNoTransport("transport object is type None")

        return self.transport.write(msg)

    def sendPassword(self, password, prompt=None):
        """Wait for the password prompt and answer it with *password*.

        telnet waits for *prompt* (default "Password"), ssh always waits for
        "password:", http does nothing.  Raises ExceptionPassword when the
        prompt is not seen and Exception for other transport types.
        """
        if self.type == self.TELNET:
            expected = "Password" if prompt is None else prompt
            self.ifThenSend(expected, password, ExceptionPassword)
        elif self.type == self.SSH:
            self.ifThenSend("password:", password, ExceptionPassword)
        elif self.type == self.HTTP:
            pass
        else:
            raise Exception("Unknown transport type: %s" % self.type)

    def sendHTTP(self, resource, data):
        """Request ``self.url + resource`` with *data* and read the reply.

        Returns 0 on success and the string "http transport error" when the
        opener raises urllib2.URLError.
        """
        if self.verbose:
            print("POSTing '%s' to %s" % (data, self.url + resource))

        try:
            f = self.transport.open(self.url + resource, data)
            try:
                r = f.read()
            finally:
                # Release the connection even if reading fails.
                f.close()
            if self.verbose:
                print(r)

        except urllib2.URLError as err:
            # The error must be consumed by a placeholder; passing it as a
            # bare extra argument makes the logging call itself fail.
            logger.info('Could not open http connection: %s', err)
            return "http transport error"

        return 0

    def ifThenSend(self, expected, buffer, ErrorClass=ExceptionPrompt):
        """Wait for *expected*, then send *buffer* followed by CRLF.

        Raises *ErrorClass* when the text read does not contain *expected*,
        and ExceptionNoTransport when no transport is open.
        """
        if self.transport is None:
            raise ExceptionNoTransport("transport object is type None")

        output = self.transport.read_until(expected, self.TELNET_TIMEOUT)
        if output.find(expected) == -1:
            print("OUTPUT: --%s--" % output)
            raise ErrorClass("'%s' not found" % expected)
        self.transport.write(buffer + "\r\n")

    def ifElse(self, expected, ErrorClass):
        """Wait for *expected*; turn any error from the read into *ErrorClass*.

        NOTE(review): ifThenSend() checks the returned text for *expected*,
        which suggests read_until() returns (rather than raises) on timeout.
        If so, this method does not detect a missing prompt -- confirm
        against the bundled telnetlib before relying on it.
        """
        try:
            self.transport.read_until(expected, self.TELNET_TIMEOUT)
        except Exception:
            raise ErrorClass("Could not find '%s' within timeout" % expected)
231
class PCUControl(PCUModel, PCURecord):
    """Base class for model-specific PCU reboot drivers.

    Subclasses list the ports they can use in ``supported_ports`` (ordered by
    preference) and implement ``run_<transport>(node_port, dryrun)`` methods,
    where <transport> is a value of Transport.porttypemap (e.g. run_ssh).

    There are three cases:
        1) the pcu_record passed to the constructor includes port_status
           from an external probe.
        2) the external probe failed, and the values are empty
        3) this call is made independent of port_status.

    In the first case only the supported ports that the probe reported as
    "open" are tried; if there are none, ExceptionPort is raised.
    In the second and third cases all supported_ports are tried.

    In every case ports are tried in supported_ports order until one run_*
    method returns 0.  The port_status value therefore serves only as an
    optimization, because closed ports are avoided.
    """

    supported_ports = []

    def __init__(self, plc_pcu_record, verbose, ignored=None):
        """Load the PCU and probe fields from *plc_pcu_record*.

        *verbose* is the debug level handed to every Transport this object
        creates; *ignored* is accepted and not used.
        """
        PCUModel.__init__(self, plc_pcu_record)
        PCURecord.__init__(self, plc_pcu_record)
        self._verbose = verbose

    def reboot(self, node_port, dryrun):
        """Try each usable port until one run_* method reports success.

        Returns 0 on success, otherwise the result of the last attempt (an
        error string from catcherror(), or a default message when no port had
        an implementation).  Raises ExceptionPort when a probe result exists
        but none of the open ports is supported.
        """
        port_list = self.supported_ports

        if hasattr(self, 'port_status') and self.port_status:
            # Start from supported_ports (preference order) and keep only the
            # ports the probe saw open, so the try order is deterministic.
            open_ports = set(int(port)
                             for port, state in self.port_status.items()
                             if state == "open")
            port_list = [port for port in self.supported_ports
                         if port in open_ports]
            if not port_list:
                raise ExceptionPort("No Open Port: No transport from open ports")

        print(port_list)

        # Fall back to the module-level verbose for subclasses whose
        # __init__ does not chain to PCUControl.__init__.
        verbosity = getattr(self, '_verbose', verbose)

        ret = "No implementation for open ports on selected PCU model"
        for port in port_list:
            if port not in Transport.porttypemap:
                continue

            transport_type = Transport.porttypemap[port]
            self.transport = Transport(transport_type, verbosity)

            handler_name = "run_%s" % transport_type
            print("checking for run_%s" % transport_type)
            if not hasattr(self, handler_name):
                continue

            print("found run_%s" % transport_type)
            ret = self.catcherror(getattr(self, handler_name), node_port, dryrun)
            if ret == 0:  # NOTE: success!, so stop
                break

        return ret

    def run(self, node_port, dryrun):
        """ This function is to be defined by the specific PCU instance.  """
        raise Exception("This function is not implemented")

    def catcherror(self, function, node_port, dryrun):
        """Call ``function(node_port, dryrun)`` and translate known failures.

        Returns the function's result, or a descriptive error string for the
        expected failure types.  Any other exception is reported through
        email_exception() and re-raised wrapped in a plain Exception.
        """
        try:
            return function(node_port, dryrun)
        except ExceptionNotFound as err:
            return "error: " + str(err)
        except ExceptionPassword as err:
            return "Password exception: " + str(err)
        except ExceptionTimeout as err:
            return "Timeout exception: " + str(err)
        except ExceptionUsername as err:
            return "No username prompt: " + str(err)
        except ExceptionSequence as err:
            return "Sequence error: " + str(err)
        except ExceptionPrompt as err:
            return "Prompt exception: " + str(err)
        except ExceptionNoTransport as err:
            return "No Transport: " + str(err)
        except ExceptionPort as err:
            return "No ports exception: " + str(err)
        except socket.error as err:
            return "socket error: timeout: " + str(err)
        except urllib2.HTTPError as err:
            # Must precede URLError, of which HTTPError is a subclass.
            return "HTTPError: " + str(err)
        except urllib2.URLError as err:
            return "URLError: " + str(err)
        except EOFError as err:
            self.transport.close()
            import traceback
            traceback.print_exc()
            return "EOF connection reset" + str(err)
        except Exception as err:
            from monitor.common import email_exception
            email_exception(self.host)
            raise Exception(err)
337
338 from pcucontrol.models import *
339
def pcu_name(pcu):
    """Return the address to use for the PCU record dictionary *pcu*.

    Prefers pcu['hostname'], falls back to pcu['ip'], and returns None when
    both are None or the empty string.  Raises KeyError if a key is missing.
    """
    # Compare by value: "is not ''" tests object identity, which only works
    # by accident of string interning and fails for e.g. unicode u"".
    if pcu['hostname'] is not None and pcu['hostname'] != "":
        return pcu['hostname']
    if pcu['ip'] is not None and pcu['ip'] != "":
        return pcu['ip']
    return None
347
def get_pcu_values(pcu_id):
    """Return the latest FindbadPCURecord for *pcu_id* as a dict, or None.

    None is returned both when no record exists and when the lookup fails;
    a failed lookup is logged at debug level so it is not lost silently.
    """
    from monitor.database.info.model import FindbadPCURecord
    print("pcuid: %s" % pcu_id)
    try:
        pcurec = FindbadPCURecord.get_latest_by(plc_pcuid=pcu_id).first()
        values = pcurec.to_dict() if pcurec else None
    except Exception as err:
        # Best effort: the caller treats None as "no probe data".  Catching
        # Exception (not everything) lets KeyboardInterrupt/SystemExit through.
        logger.debug("Lookup of pcu record %s failed: %s" % (pcu_id, err))
        values = None

    return values
361
def reboot(nodename):
    """Reboot *nodename* for real (no dry run) and return reboot_policy()'s result."""
    return reboot_policy(nodename, continue_probe=True, dryrun=False)
364
def reboot_str(nodename):
    """Reboot *nodename* through its PCU and return the raw driver result.

    Returns False when the node has no PCU or no probe record is available;
    otherwise returns whatever reboot_test_new() returns (0 on success, an
    error string otherwise).  The reboot is never a dry run.
    """
    dryrun = False

    pcu = plc.getpcu(nodename)
    if not pcu:
        message = "no pcu for %s" % nodename
        logger.debug(message)
        print(message)
        return False  # "%s has no pcu" % nodename

    values = get_pcu_values(pcu['pcu_id'])
    if values is None:
        message = "No values for pcu probe %s" % nodename
        logger.debug(message)
        print(message)
        return False  # "no info for pcu_id %s" % pcu['pcu_id']

    # Try the PCU first
    logger.debug("Trying PCU %s %s" % (pcu['hostname'], pcu['model']))

    return reboot_test_new(nodename, values, verbose, dryrun)
387         
def reboot_policy(nodename, continue_probe, dryrun):
    """Reboot *nodename* through its PCU and report success as a boolean.

    *continue_probe* is accepted for interface compatibility and is not used
    here.  *dryrun* is passed through to the PCU driver.

    Returns True when reboot_test_new() returns 0; returns False when the
    node has no PCU, no probe record is available, or the driver returned
    anything else (which is printed).
    """
    pcu = plc.getpcu(nodename)
    if not pcu:
        message = "no pcu for %s" % nodename
        logger.debug(message)
        print(message)
        return False  # "%s has no pcu" % nodename

    values = get_pcu_values(pcu['pcu_id'])
    if values is None:
        message = "No values for pcu probe %s" % nodename
        logger.debug(message)
        print(message)
        return False  # "no info for pcu_id %s" % pcu['pcu_id']

    # Try the PCU first
    logger.debug("Trying PCU %s %s" % (pcu['hostname'], pcu['model']))

    ret = reboot_test_new(nodename, values, verbose, dryrun)

    if ret != 0:
        print(ret)
        return False

    print("return true")
    return True
414
class Unknown(PCUControl):
    """Driver returned by model_to_object() for unrecognised model names.

    It lists every port found in Transport.porttypemap and defines no run_*
    methods of its own.
    """

    supported_ports = [22, 23, 80, 443, 5869, 9100, 16992]
417
def model_to_object(modelname):
    """Return the PCU driver class whose keyword occurs in *modelname*.

    Matching is by substring and the first keyword that matches wins, so the
    order of the table below is significant.  None maps to ManualPCU; a name
    matching no keyword is printed and maps to Unknown.
    """
    if modelname is None:
        return ManualPCU

    # Each class is wrapped in a lambda so that its name is only resolved
    # when its keyword actually matches, exactly as in an if/elif chain.
    keyword_table = (
        ("AMT", lambda: IntelAMT),
        ("BayTech", lambda: BayTech),
        ("HPiLO", lambda: HPiLO),
        ("IPAL", lambda: IPAL),
        ("APC", lambda: APCControl),
        ("DRAC", lambda: DRAC),
        ("WTI", lambda: WTIIPS4),
        ("ePowerSwitch", lambda: ePowerSwitchNew),
        ("IPMI", lambda: IPMI),
        ("BlackBoxPSMaverick", lambda: BlackBoxPSMaverick),
        ("PM211MIP", lambda: PM211MIP),
        ("ManualPCU", lambda: ManualPCU),
    )
    for keyword, resolve in keyword_table:
        if keyword in modelname:
            return resolve()

    print("UNKNOWN model %s" % modelname)
    return Unknown
448
def reboot_api(node, pcu): #, verbose, dryrun):
    """Reboot *node* through *pcu* and return the driver's result.

    *node* must provide 'node_id'; *pcu* must provide 'model', 'node_ids' and
    'ports'.  The driver class is looked up by the name in pcu['model'] and
    asked to reboot the port paired with the node's id (never a dry run).

    Returns the driver's reboot() result, "No modelname in PCU record." when
    the model is empty, or the text of any exception raised along the way.
    """
    rb_ret = ""

    try:
        modelname = pcu['model']
        if modelname:
            # get object instance
            # NOTE(review): eval() executes whatever text is stored in the
            # PCU record's model field; if that field can be edited by
            # untrusted users this should become an explicit name lookup.
            instance = eval('%s(pcu, verbose)' % modelname)
            # get pcu port
            index = pcu['node_ids'].index(node['node_id'])
            port = pcu['ports'][index]
            # reboot
            rb_ret = instance.reboot(port, False)
        else:
            rb_ret = "No modelname in PCU record."
        # TODO: how to handle the weird, georgetown pcus, the drac faults, and ilo faults
    except Exception as err:
        rb_ret = str(err)

    return rb_ret
469
def convert_oldmodelname_to_newmodelname(oldmodelname, pcu_id):
    """Translate a legacy model name into the name of a driver class.

    Known legacy names are mapped through a fixed table and unknown names are
    passed through unchanged.  Afterwards, a set of specific pcu ids forces a
    particular driver name regardless of the model name.
    """
    legacy_names = {
        'AP79xx': 'APCControl13p13',
        'Masterswitch': 'APCControl13p13',
        'DS4-RPC': 'BayTech',
        'IP-41x_IP-81x': 'IPAL',
        'DRAC3': 'DRAC',
        'DRAC4': 'DRAC',
        'ePowerSwitch': 'ePowerSwitchOld',
        'ilo2': 'HPiLO',
        'ilo1': 'HPiLO',
        'PM211-MIP': 'PM211MIP',
        'AMT2.5': 'IntelAMT',
        'AMT3.0': 'IntelAMT',
        'WTI_IPS-4': 'WTIIPS4',
        'unknown': 'ManualPCU',
        'DRAC5': 'DRAC',
        'ipmi': 'OpenIPMI',
        'bbsemaverick': 'BlackBoxPSMaverick',
        'manualadmin': 'ManualPCU',
    }

    # (pcu ids, driver name) pairs; the first group containing pcu_id wins.
    per_pcu_overrides = (
        ((1102, 1163, 1055, 1111, 1231, 1113, 1127, 1128, 1148), 'APCControl12p3'),
        ((1110, 86), 'APCControl1p4'),
        ((1221, 1225, 1220, 1192), 'APCControl121p3'),
        ((1173, 1240, 47, 1363, 1405, 1401, 1372, 1371), 'APCControl121p1'),
        ((1056, 1237, 1052, 1209, 1002, 1008, 1013, 1022), 'BayTechCtrlC'),
        ((93,), 'BayTechRPC3NC'),
        ((1057,), 'BayTechCtrlCUnibe'),
        ((1012,), 'BayTechRPC16'),
        ((1089, 1071, 1046, 1035, 1118), 'ePowerSwitchNew'),
    )

    newmodelname = legacy_names.get(oldmodelname, oldmodelname)

    for pcu_ids, forced_name in per_pcu_overrides:
        if pcu_id in pcu_ids:
            newmodelname = forced_name
            break

    return newmodelname
517
def reboot_test_new(nodename, values, verbose, dryrun):
    """Instantiate the driver for the PCU described by *values* and reboot.

    *values* is the probe record dictionary; when it contains a
    'plc_pcu_stats' dictionary, that is merged into *values* in place (the
    caller's dictionary is modified).  The driver class name is derived from
    values['model'] and values['pcu_id'].

    Returns the driver's reboot() result, "Not_Run" when no model name could
    be determined, or the message of an ExceptionPort/NameError raised while
    doing so.  Other exceptions propagate to the caller.
    """
    rb_ret = ""
    if 'plc_pcu_stats' in values:
        values.update(values['plc_pcu_stats'])

    try:
        modelname = convert_oldmodelname_to_newmodelname(values['model'], values['pcu_id'])
        if modelname:
            # NOTE(review): eval() executes the (converted) model name taken
            # from the record; an unknown name surfaces as the NameError
            # handled below.  Consider an explicit lookup if the record can
            # hold untrusted text.
            pcu_driver = eval('%s(values, verbose)' % modelname)
            # NOTE(review): presumably values maps the node's name to its
            # port on the PCU -- verify against what the probe record holds.
            rb_ret = pcu_driver.reboot(values[nodename], dryrun)
        else:
            rb_ret = "Not_Run"
        # TODO: how to handle the weird, georgetown pcus, the drac faults, and ilo faults
    except (ExceptionPort, NameError) as err:
        rb_ret = str(err)

    return rb_ret
537
def main():
    """Command-line entry point: reboot every node named in sys.argv.

    The literal argument "test" is not treated as a node name; its presence
    anywhere in sys.argv turns every reboot into a dry run.  Failures are
    printed, emailed through email_exception() and otherwise swallowed.
    """
    logger.setLevel(logging.DEBUG)
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    formatter = logging.Formatter('LOGGER - %(message)s')
    ch.setFormatter(formatter)
    logger.addHandler(ch)

    # Defined up front so the exception handler can always refer to it, even
    # if the failure happens before the first loop iteration.
    node = None
    try:
        dryrun = "test" in sys.argv

        for node in sys.argv[1:]:
            if node == "test":
                continue

            print("Rebooting %s" % node)
            if reboot_policy(node, True, dryrun):
                print("success")
            else:
                print("failed")
    except Exception as err:
        import traceback
        traceback.print_exc()
        from monitor.common import email_exception
        email_exception(node)
        print(err)
565
if __name__ == '__main__':
    logger = logging.getLogger("monitor")
    main()
    # Record every invocation; "with" closes the file even if the write fails.
    with open("/tmp/rebootlog", 'a') as f:
        f.write("reboot %s\n" % sys.argv)