fixes for pcucontrol and DRAC control.
[monitor.git] / pcucontrol / reboot.py
1 #!/usr/bin/python
2 #
3 # Reboot specified nodes
4 #
5
6 import getpass, getopt
7 import os, sys
8 import xml, xmlrpclib
9 import errno, time, traceback
10 import urllib2
11 import urllib
12 import threading, popen2
13 import array, struct
14 from monitor.wrapper import plc
15 import base64
16 from subprocess import PIPE, Popen
17 import pcucontrol.transports.ssh.pxssh as pxssh
18 import pcucontrol.transports.ssh.pexpect as pexpect
19 import socket
20 from monitor.util import command
21
22
23 # Use our versions of telnetlib and pyssh
24 sys.path.insert(0, os.path.dirname(sys.argv[0]))
25 import pcucontrol.transports.telnetlib as telnetlib
26 sys.path.insert(0, os.path.dirname(sys.argv[0]) + "/pyssh")    
27 import pcucontrol.transports.pyssh as pyssh
28 from monitor import config
29
30
31 # Event class ID from pcu events
32 #NODE_POWER_CONTROL = 3
33
34 # Monitor user ID
35 #MONITOR_USER_ID = 11142
36
37 import logging
38 logger = logging.getLogger("monitor")
39 verbose = 1
40 #dryrun = 0;
41
class ExceptionNoTransport(Exception):
    """Raised when a transport operation is attempted with no open transport."""


class ExceptionNotFound(Exception):
    """Reported by PCUControl.catcherror as a generic "error: ..." string."""


class ExceptionPassword(Exception):
    """Raised when the expected password prompt does not show up."""


class ExceptionTimeout(Exception):
    """Reported by PCUControl.catcherror as a "Timeout exception"."""


class ExceptionPrompt(Exception):
    """Default error raised by Transport.ifThenSend for a missing prompt."""


class ExceptionSequence(Exception):
    """Reported by PCUControl.catcherror as a "Sequence error"."""


class ExceptionReset(Exception):
    """Reset failure; presumably raised by the PCU model implementations."""


class ExceptionPort(Exception):
    """Raised when none of the PCU's supported ports is open."""


class ExceptionUsername(Exception):
    """Raised when the expected username prompt does not show up."""
51
52
53
54 # PCU has model, host, preferred-port, user, passwd, 
55
# This is an object derived directly from the PLCAPI DB fields
class PCU(object):
    """Attribute-style view of a PCU record dictionary.

    Every required field of the dictionary is copied onto the instance as an
    attribute of the same name; keys that are not listed here are ignored.
    """

    def __init__(self, plc_pcu_dict):
        """Copy the required fields from *plc_pcu_dict* onto this object.

        Raises Exception as soon as a required field is missing; fields that
        precede the missing one have already been set at that point.
        """
        required = ('username', 'password', 'site_id',
                    'hostname', 'ip',
                    'pcu_id', 'model',
                    'node_ids', 'ports')
        for name in required:
            if name not in plc_pcu_dict:
                raise Exception("No such field %s in PCU object" % name)
            setattr(self, name, plc_pcu_dict[name])
67
# These are the convenience functions built around the PCU object.
class PCUModel(PCU):
    """PCU record plus convenience lookups (display name, node-to-port map)."""

    def __init__(self, plc_pcu_dict):
        """Load the PCU fields and cache the preferred address as ``host``."""
        PCU.__init__(self, plc_pcu_dict)
        self.host = self.pcu_name()

    def pcu_name(self):
        """Return the hostname if set, else the IP address, else None.

        Values are compared with ``!=`` rather than ``is not``: identity
        comparison against a string literal only works by accident of string
        interning and treats an empty unicode string as a valid name.
        """
        if self.hostname is not None and self.hostname != "":
            return self.hostname
        if self.ip is not None and self.ip != "":
            return self.ip
        return None

    def nodeidToPort(self, node_id):
        """Return the PCU port paired with *node_id*.

        ``node_ids`` and ``ports`` are parallel sequences; the port at the
        position of the first matching node id is returned.

        Raises Exception when *node_id* is not attached to this PCU.
        """
        for position, known_id in enumerate(self.node_ids):
            if node_id == known_id:
                return self.ports[position]

        # %s instead of %d so that a non-integer id still produces this
        # message instead of a TypeError from the format operation.
        raise Exception("No such Node ID: %s" % node_id)
89
90 # This class captures the observed pcu records from FindBadPCUs.py
class PCURecord:
    """Observed probe state of a PCU (port, DNS and completeness status)."""

    def __init__(self, pcu_record_dict):
        """Copy the known probe fields from *pcu_record_dict* onto this object.

        Fields missing from the dictionary are skipped silently, so the
        corresponding attributes may not exist on the instance.
        """
        # The former special case for a "reboot" field was unreachable because
        # that name is not part of this list, so it has been dropped.
        for field in ('port_status', 'dns_status', 'entry_complete'):
            if field in pcu_record_dict:
                # setattr() rather than self.__setattr__: a classic-class
                # instance has no __setattr__ attribute, so the method form
                # fails when PCURecord is instantiated directly on Python 2.
                setattr(self, field, pcu_record_dict[field])
103
104 class Transport:
105         TELNET = "telnet"
106         SSH    = "ssh"
107         HTTP   = "http"
108         HTTPS  = "https"
109         IPAL   = "ipal"
110         DRAC   = "drac"
111         AMT    = "amt"
112
113         TELNET_TIMEOUT = 120
114
115         porttypemap = {
116                         5869 : DRAC,
117                         22 : SSH,
118                         23 : TELNET,
119                         443 : HTTPS,
120                         80 :  HTTP,
121                         9100 : IPAL,
122                         16992 : AMT,
123                 }
124
125         def __init__(self, type, verbose):
126                 self.type = type
127                 self.verbose = verbose
128                 self.transport = None
129
130         def open(self, host, username=None, password=None, prompt="User Name"):
131                 transport = None
132
133                 if self.type == self.TELNET:
134                         transport = telnetlib.Telnet(host, timeout=self.TELNET_TIMEOUT)
135                         transport.set_debuglevel(self.verbose)
136                         if username is not None:
137                                 self.transport = transport
138                                 self.ifThenSend(prompt, username, ExceptionUsername)
139
140                 elif self.type == self.SSH:
141                         if username is not None:
142                                 transport = pyssh.Ssh(username, host)
143                                 transport.set_debuglevel(self.verbose)
144                                 transport.open()
145                                 # TODO: have an ssh set_debuglevel() also...
146                         else:
147                                 raise Exception("Username cannot be None for ssh transport.")
148                 elif self.type == self.HTTP:
149                         # NOTE: this does not work for all web-based services...
150                         self.url = "http://%s:%d/" % (host,80)
151                         uri = "%s:%d" % (host,80)
152
153                         # create authinfo
154                         authinfo = urllib2.HTTPPasswordMgrWithDefaultRealm()
155                         authinfo.add_password (None, uri, username, password)
156                         authhandler = urllib2.HTTPBasicAuthHandler( authinfo )
157
158                         transport = urllib2.build_opener(authhandler)
159                 else:
160                         raise Exception("Unknown transport type: %s" % self.type)
161
162                 self.transport = transport
163                 return True
164
165         def close(self):
166                 if self.type == self.TELNET:
167                         self.transport.close() 
168                 elif self.type == self.SSH:
169                         self.transport.close() 
170                 elif self.type == self.HTTP:
171                         pass
172                 else:
173                         raise Exception("Unknown transport type %s" % self.type)
174                 self.transport = None
175
176         def write(self, msg):
177                 return self.send(msg)
178
179         def send(self, msg):
180                 if self.transport == None:
181                         raise ExceptionNoTransport("transport object is type None")
182                         
183                 return self.transport.write(msg)
184
185         def sendPassword(self, password, prompt=None):
186                 if self.type == self.TELNET:
187                         if prompt == None:
188                                 self.ifThenSend("Password", password, ExceptionPassword)
189                         else:
190                                 self.ifThenSend(prompt, password, ExceptionPassword)
191                 elif self.type == self.SSH:
192                         self.ifThenSend("password:", password, ExceptionPassword)
193                 elif self.type == self.HTTP:
194                         pass
195                 else:
196                         raise Exception("Unknown transport type: %s" % self.type)
197
198         def sendHTTP(self, resource, data):
199                 if self.verbose:
200                         print "POSTing '%s' to %s" % (data,self.url + resource)
201
202                 try:
203                         f = self.transport.open(self.url + resource ,data)
204                         r = f.read()
205                         if self.verbose:
206                                 print r
207
208                 except urllib2.URLError,err:
209                         logger.info('Could not open http connection', err)
210                         return "http transport error"
211
212                 return 0
213
214         def ifThenSend(self, expected, buffer, ErrorClass=ExceptionPrompt):
215
216                 if self.transport != None:
217                         output = self.transport.read_until(expected, self.TELNET_TIMEOUT)
218                         if output.find(expected) == -1:
219                                 print "OUTPUT: --%s--" % output
220                                 raise ErrorClass, "'%s' not found" % expected
221                         else:
222                                 self.transport.write(buffer + "\r\n")
223                 else:
224                         raise ExceptionNoTransport("transport object is type None")
225
226         def ifElse(self, expected, ErrorClass):
227                 try:
228                         self.transport.read_until(expected, self.TELNET_TIMEOUT)
229                 except:
230                         raise ErrorClass("Could not find '%s' within timeout" % expected)
231
class PCUControl(PCUModel, PCURecord):
    """Base class for model-specific PCU drivers.

    A driver lists the ports it can use in ``supported_ports``, ordered by
    preference, and implements one ``run_<type>(node_port, dryrun)`` method
    per transport type it supports (see Transport.porttypemap).

    reboot() chooses the ports to try as follows:
        1) the record carries a non-empty ``port_status`` from an external
           probe: only the supported ports reported "open" are tried;
        2) otherwise (probe failed, values empty, or the call is made
           independently of port_status): every supported port is tried.

    In both cases the ports are tried in ``supported_ports`` order, so
    port_status serves only as an optimization that avoids closed ports.
    """

    supported_ports = []

    def __init__(self, plc_pcu_record, verbose, ignored=None):
        """Load the PCU fields and the probe fields from *plc_pcu_record*.

        NOTE(review): *verbose* and *ignored* are accepted but not stored;
        reboot() reads the module-level ``verbose`` instead - confirm intent.
        """
        PCUModel.__init__(self, plc_pcu_record)
        PCURecord.__init__(self, plc_pcu_record)

    def reboot(self, node_port, dryrun):
        """Try each candidate port until one ``run_<type>`` handler succeeds.

        Returns 0 on success, otherwise the result of the last attempt (or a
        default message when no handler exists for any candidate port).
        Raises ExceptionPort when port_status is known and no supported port
        is open.
        """
        port_list = self.supported_ports

        if hasattr(self, 'port_status') and self.port_status:
            open_ports = set(int(port)
                             for port, state in self.port_status.items()
                             if state == "open")
            # Start from supported_ports and keep the open ones, so the
            # preferred ordering is preserved instead of dict ordering.
            port_list = [port for port in self.supported_ports
                         if port in open_ports]
            if not port_list:
                raise ExceptionPort("No Open Port: No transport from open ports")

        print(port_list)

        ret = "No implementation for open ports on selected PCU model"
        for port in port_list:
            if port not in Transport.porttypemap:
                continue

            transport_type = Transport.porttypemap[port]
            self.transport = Transport(transport_type, verbose)

            print("checking for run_%s" % transport_type)
            if not hasattr(self, "run_%s" % transport_type):
                continue

            print("found run_%s" % transport_type)
            fxn = getattr(self, "run_%s" % transport_type)
            ret = self.catcherror(fxn, node_port, dryrun)
            if ret == 0:  # NOTE: success!, so stop
                break

        return ret

    def run(self, node_port, dryrun):
        """ This function is to be defined by the specific PCU instance.  """
        raise Exception("This function is not implemented")

    def catcherror(self, function, node_port, dryrun):
        """Call ``function(node_port, dryrun)`` and map known errors to text.

        Returns whatever *function* returns, or a descriptive string for the
        transport, prompt, socket and HTTP errors listed below.  Any other
        exception propagates to the caller.
        """
        try:
            return function(node_port, dryrun)
        except ExceptionNotFound as err:
            return "error: " + str(err)
        except ExceptionPassword as err:
            return "Password exception: " + str(err)
        except ExceptionTimeout as err:
            return "Timeout exception: " + str(err)
        except ExceptionUsername as err:
            return "No username prompt: " + str(err)
        except ExceptionSequence as err:
            return "Sequence error: " + str(err)
        except ExceptionPrompt as err:
            return "Prompt exception: " + str(err)
        except ExceptionNoTransport as err:
            return "No Transport: " + str(err)
        except ExceptionPort as err:
            return "No ports exception: " + str(err)
        except socket.error as err:
            return "socket error: timeout: " + str(err)
        except urllib2.HTTPError as err:
            # HTTPError must be handled before its base class URLError.
            return "HTTPError: " + str(err)
        except urllib2.URLError as err:
            return "URLError: " + str(err)
        except EOFError as err:
            self.transport.close()
            import traceback
            traceback.print_exc()
            return "EOF connection reset" + str(err)
333
334 from pcucontrol.models import *
335
def pcu_name(pcu):
    """Return the PCU's hostname if set, else its IP address, else None.

    *pcu* is a dictionary with 'hostname' and 'ip' keys; 'ip' is only read
    when the hostname is None or empty.

    Values are compared with ``!=`` rather than ``is not``: identity
    comparison against a string literal depends on string interning and lets
    an empty unicode (or otherwise non-interned) string through as a name.
    """
    for key in ('hostname', 'ip'):
        value = pcu[key]
        if value is not None and value != "":
            return value
    return None
343
def get_pcu_values(pcu_id):
    """Return the latest probe record for *pcu_id* as a dict, or None.

    None is returned both when no record exists and when the lookup fails;
    the lookup is best-effort, but the failure reason is now logged.
    """
    from monitor.database.info.model import FindbadPCURecord
    print("pcuid: %s" % pcu_id)
    try:
        pcurec = FindbadPCURecord.get_latest_by(plc_pcuid=pcu_id).first()
        if pcurec:
            values = pcurec.to_dict()
        else:
            values = None
    except Exception as err:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit are no longer swallowed by the lookup.
        logger.debug("pcu record lookup failed for %s: %s", pcu_id, err)
        values = None

    return values
357
def reboot(nodename):
    """Reboot *nodename* for real (no dry run); return True on success."""
    succeeded = reboot_policy(nodename, True, False)
    return succeeded
360
def reboot_str(nodename):
    """Reboot *nodename* through its PCU and return the raw result.

    Returns False when the node has no PCU or no probe values are available;
    otherwise returns whatever reboot_test_new() reports (0 on success, an
    error string otherwise).  The reboot is never a dry run.
    """
    pcu = plc.getpcu(nodename)
    if not pcu:
        message = "no pcu for %s" % nodename
        logger.debug(message)
        print(message)
        return False  # "%s has no pcu" % nodename

    values = get_pcu_values(pcu['pcu_id'])
    if values is None:
        message = "No values for pcu probe %s" % nodename
        logger.debug(message)
        print(message)
        return False  # "no info for pcu_id %s" % pcu['pcu_id']

    # Try the PCU first
    logger.debug("Trying PCU %s %s" % (pcu['hostname'], pcu['model']))

    return reboot_test_new(nodename, values, verbose, False)
383         
def reboot_policy(nodename, continue_probe, dryrun):
    """Reboot *nodename* through its PCU and return True on success.

    Returns False when the node has no PCU, when no probe values exist, or
    when the reboot attempt reports anything other than 0 (the result is
    printed in that case).  *continue_probe* is accepted but not used.
    """
    pcu = plc.getpcu(nodename)
    if not pcu:
        message = "no pcu for %s" % nodename
        logger.debug(message)
        print(message)
        return False  # "%s has no pcu" % nodename

    values = get_pcu_values(pcu['pcu_id'])
    if values is None:
        message = "No values for pcu probe %s" % nodename
        logger.debug(message)
        print(message)
        return False  # "no info for pcu_id %s" % pcu['pcu_id']

    # Try the PCU first
    logger.debug("Trying PCU %s %s" % (pcu['hostname'], pcu['model']))

    ret = reboot_test_new(nodename, values, verbose, dryrun)

    if ret == 0:
        print("return true")
        return True

    print(ret)
    return False
410
class Unknown(PCUControl):
    """Fallback driver returned by model_to_object() for unrecognised models."""

    supported_ports = [22, 23, 80, 443, 5869, 9100, 16992]
413
def model_to_object(modelname):
    """Map a PCU model name to the driver class that handles it.

    The first marker (in the order below) that occurs as a substring of
    *modelname* wins.  None maps to ManualPCU; a name matching no marker is
    reported on stdout and maps to Unknown.
    """
    if modelname is None:
        return ManualPCU

    # Each class is looked up lazily, only when its marker matches.
    dispatch = (
        ("AMT", lambda: IntelAMT),
        ("BayTech", lambda: BayTech),
        ("HPiLO", lambda: HPiLO),
        ("IPAL", lambda: IPAL),
        ("APC", lambda: APCControl),
        ("DRAC", lambda: DRAC),
        ("WTI", lambda: WTIIPS4),
        ("ePowerSwitch", lambda: ePowerSwitchNew),
        ("IPMI", lambda: IPMI),
        ("BlackBoxPSMaverick", lambda: BlackBoxPSMaverick),
        ("PM211MIP", lambda: PM211MIP),
        ("ManualPCU", lambda: ManualPCU),
    )
    for marker, resolve in dispatch:
        if marker in modelname:
            return resolve()

    print("UNKNOWN model %s" % modelname)
    return Unknown
444
def reboot_api(node, pcu): #, verbose, dryrun):
    """Reboot *node* through *pcu* and return the driver's result.

    The driver class is named by pcu['model']; the port is the entry of
    pcu['ports'] at the position of node['node_id'] in pcu['node_ids'].
    Any exception is converted to its string form and returned instead.
    """
    try:
        modelname = pcu['model']
        if not modelname:
            return "No modelname in PCU record."

        # NOTE(review): eval() on a model name taken from the PCU record
        # executes arbitrary expressions if that field is not trusted -
        # consider an explicit name-to-class lookup instead.
        instance = eval('%s(pcu, verbose)' % modelname)
        # get pcu port
        slot = pcu['node_ids'].index(node['node_id'])
        # reboot (never a dry run)
        return instance.reboot(pcu['ports'][slot], False)
        # TODO: how to handle the weird, georgetown pcus, the drac faults, and ilo faults
    except Exception as err:
        return str(err)
465
def convert_oldmodelname_to_newmodelname(oldmodelname, pcu_id):
    """Translate a legacy model name into the current driver class name.

    Names without a translation are passed through unchanged.  A per-PCU
    override keyed on *pcu_id* takes precedence over the translated name.
    """
    renamed = {
        'AP79xx': 'APCControl13p13',
        'Masterswitch': 'APCControl13p13',
        'DS4-RPC': 'BayTech',
        'IP-41x_IP-81x': 'IPAL',
        'DRAC3': 'DRAC',
        'DRAC4': 'DRAC',
        'ePowerSwitch': 'ePowerSwitchOld',
        'ilo2': 'HPiLO',
        'ilo1': 'HPiLO',
        'PM211-MIP': 'PM211MIP',
        'AMT2.5': 'IntelAMT',
        'AMT3.0': 'IntelAMT',
        'WTI_IPS-4': 'WTIIPS4',
        'unknown': 'ManualPCU',
        'DRAC5': 'DRAC',
        'ipmi': 'OpenIPMI',
        'bbsemaverick': 'BlackBoxPSMaverick',
        'manualadmin': 'ManualPCU',
    }

    # (pcu ids, forced model name); the first group containing pcu_id wins.
    overrides = (
        ((1102, 1163, 1055, 1111, 1231, 1113, 1127, 1128, 1148), 'APCControl12p3'),
        ((1110, 86), 'APCControl1p4'),
        ((1221, 1225, 1220, 1192), 'APCControl121p3'),
        ((1173, 1240, 47, 1363, 1405, 1401, 1372, 1371), 'APCControl121p1'),
        ((1056, 1237, 1052, 1209, 1002, 1008, 1013, 1022), 'BayTechCtrlC'),
        ((93,), 'BayTechRPC3NC'),
        ((1057,), 'BayTechCtrlCUnibe'),
        ((1012,), 'BayTechRPC16'),
        ((1089, 1071, 1046, 1035, 1118), 'ePowerSwitchNew'),
    )

    newmodelname = renamed.get(oldmodelname, oldmodelname)

    for pcu_ids, forced_name in overrides:
        if pcu_id in pcu_ids:
            newmodelname = forced_name
            break

    return newmodelname
513
def reboot_test_new(nodename, values, verbose, dryrun):
    """Instantiate the driver for the PCU in *values* and reboot *nodename*.

    *values* is updated in place with its own 'plc_pcu_stats' entry when
    present.  The port passed to the driver is values[nodename].  Returns
    the driver's result, "Not_Run" when no model name is available, or the
    message of an ExceptionPort / NameError raised along the way.
    """
    if 'plc_pcu_stats' in values:
        values.update(values['plc_pcu_stats'])

    try:
        modelname = convert_oldmodelname_to_newmodelname(values['model'], values['pcu_id'])
        if not modelname:
            return "Not_Run"

        # NOTE(review): eval() on a model name taken from the PCU record
        # executes arbitrary expressions if that field is not trusted -
        # consider an explicit name-to-class lookup instead.
        controller = eval('%s(values, verbose)' % modelname)
        return controller.reboot(values[nodename], dryrun)
        # TODO: how to handle the weird, georgetown pcus, the drac faults, and ilo faults
    except (ExceptionPort, NameError) as err:
        return str(err)
533
def main():
    """Reboot every node named on the command line.

    The literal argument "test" is not a node name: its presence anywhere on
    the command line turns every reboot into a dry run.  Log records are
    echoed to the console at DEBUG level.
    """
    logger.setLevel(logging.DEBUG)
    console = logging.StreamHandler()
    console.setLevel(logging.DEBUG)
    console.setFormatter(logging.Formatter('LOGGER - %(message)s'))
    logger.addHandler(console)

    try:
        dryrun = "test" in sys.argv

        for node in sys.argv[1:]:
            if node == "test":
                continue

            print("Rebooting %s" % node)
            outcome = "success" if reboot_policy(node, True, dryrun) else "failed"
            print(outcome)
    except Exception as err:
        import traceback
        traceback.print_exc()
        print(err)
559
# Script entry point: run the reboots, then append the invocation to a log.
if __name__ == '__main__':
    logger = logging.getLogger("monitor")
    main()
    # 'with' closes the log file even when the write fails.
    with open("/tmp/rebootlog", 'a') as f:
        f.write("reboot %s\n" % sys.argv)