3ad6438abf70676879e26c3851379253b73f2cef
[monitor.git] / pcucontrol / reboot.py
1 #!/usr/bin/python
2 #
3 # Reboot specified nodes
4 #
5
6 import getpass, getopt
7 import os, sys
8 import xml, xmlrpclib
9 import errno, time, traceback
10 import urllib2
11 import urllib
12 import threading, popen2
13 import array, struct
14 from monitor.wrapper import plc
15 import base64
16 from subprocess import PIPE, Popen
17 import pcucontrol.transports.ssh.pxssh as pxssh
18 import pcucontrol.transports.ssh.pexpect as pexpect
19 import socket
20 from monitor.util import command
21
22
23 # Use our versions of telnetlib and pyssh
24 sys.path.insert(0, os.path.dirname(sys.argv[0]))
25 import pcucontrol.transports.telnetlib as telnetlib
26 sys.path.insert(0, os.path.dirname(sys.argv[0]) + "/pyssh")    
27 import pcucontrol.transports.pyssh as pyssh
28 from monitor import config
29
30 from monitor.database.info.model import FindbadPCURecord
31
32 # Event class ID from pcu events
33 #NODE_POWER_CONTROL = 3
34
35 # Monitor user ID
36 #MONITOR_USER_ID = 11142
37
38 import logging
39 logger = logging.getLogger("monitor")
40 verbose = 1
41 #dryrun = 0;
42
# Error types used by the transports and PCU controllers in this module.
# PCUControl.catcherror() turns most of them into a status string.

class ExceptionNoTransport(Exception):
        """An operation was attempted while no transport object was open."""


class ExceptionNotFound(Exception):
        """Reported by catcherror() with the "error: " prefix."""


class ExceptionPassword(Exception):
        """The password prompt was not seen while logging in."""


class ExceptionTimeout(Exception):
        """Reported by catcherror() with the "Timeout exception: " prefix."""


class ExceptionPrompt(Exception):
        """An expected prompt was not seen (default error of ifThenSend)."""


class ExceptionSequence(Exception):
        """Reported by catcherror() with the "Sequence error: " prefix."""


class ExceptionReset(Exception):
        """Declared for PCU models; not raised anywhere in this file."""


class ExceptionPort(Exception):
        """No usable port/transport is available for the PCU."""


class ExceptionUsername(Exception):
        """The user name prompt was not seen while logging in."""
52
53
54
55 # PCU has model, host, preferred-port, user, passwd, 
56
# This is an object derived directly from the PLCAPI DB fields
class PCU(object):
        """Attribute view of a PCU dictionary.

        Every required field of the dictionary is copied onto the instance as
        an attribute of the same name; keys that are not listed are ignored.
        """

        def __init__(self, plc_pcu_dict):
                """Copy the required fields from plc_pcu_dict.

                Raises Exception naming the first required field that is
                missing.  Fields listed before the missing one have already
                been set at that point.
                """
                required = ('username', 'password', 'site_id',
                            'hostname', 'ip',
                            'pcu_id', 'model',
                            'node_ids', 'ports')
                for name in required:
                        if name not in plc_pcu_dict:
                                raise Exception("No such field %s in PCU object" % name)
                        setattr(self, name, plc_pcu_dict[name])
68
# These are the convenience functions built around the PCU object.
class PCUModel(PCU):
        """PCU with convenience lookups; also sets self.host from pcu_name()."""

        def __init__(self, plc_pcu_dict):
                """Load the PCU fields and cache the preferred address in self.host."""
                PCU.__init__(self, plc_pcu_dict)
                self.host = self.pcu_name()

        def pcu_name(self):
                """Return the hostname if set, else the ip, else None.

                A value counts as "set" when it is neither None nor an empty
                string.  The comparison is by value (!=) rather than identity,
                so empty strings that are not the interned "" literal (for
                example an empty unicode string) are treated as empty too.
                """
                for candidate in (self.hostname, self.ip):
                        if candidate is not None and candidate != "":
                                return candidate
                return None

        def nodeidToPort(self, node_id):
                """Return the entry of self.ports at the position of node_id in self.node_ids.

                Raises Exception when node_id is not in self.node_ids.
                """
                for position, known_id in enumerate(self.node_ids):
                        if known_id == node_id:
                                return self.ports[position]

                # %s instead of %d: a non-numeric node_id must still produce
                # this error rather than a TypeError from the format operator.
                raise Exception("No such Node ID: %s" % node_id)
90
91 # This class captures the observed pcu records from FindBadPCUs.py
class PCURecord:
        """Observed probe results for a PCU, copied onto the instance.

        Only 'port_status', 'dns_status' and 'entry_complete' are read from
        the dictionary.  A field that is absent is skipped silently, so the
        corresponding attribute may not exist; callers check with hasattr().
        """

        def __init__(self, pcu_record_dict):
                """Copy whichever of the known probe fields are present."""
                # The former special case for a "reboot" field could never
                # trigger because "reboot" is not among the fields iterated
                # here, so it has been removed.
                for field in ('port_status', 'dns_status', 'entry_complete'):
                        if field in pcu_record_dict:
                                setattr(self, field, pcu_record_dict[field])
104
class Transport:
        """One interface over the telnet, ssh and http connections to a PCU.

        The transport type is one of the string constants below.  open() and
        close() implement TELNET, SSH and HTTP only; the other constants are
        used by porttypemap to name the remaining ports.
        """
        TELNET = "telnet"
        SSH    = "ssh"
        HTTP   = "http"
        HTTPS  = "https"
        IPAL   = "ipal"
        DRAC   = "drac"
        AMT    = "amt"

        # Timeout handed to the telnet connection and to every read_until().
        TELNET_TIMEOUT = 120

        # TCP port number -> transport type used on that port.
        porttypemap = {
                        5869 : DRAC,
                        22 : SSH,
                        23 : TELNET,
                        443 : HTTPS,
                        80 :  HTTP,
                        9100 : IPAL,
                        16992 : AMT,
                }

        def __init__(self, type, verbose):
                """Remember the transport type and verbosity; nothing is opened yet."""
                self.type = type
                self.verbose = verbose
                self.transport = None

        def open(self, host, username=None, password=None, prompt="User Name"):
                """Open the connection to host and store it in self.transport.

                TELNET: connects and, when username is given, answers `prompt`
                with it (ExceptionUsername if the prompt is not seen).
                SSH: requires a username; raises Exception otherwise.
                HTTP: builds a basic-auth opener for http://host:80/ and sets
                self.url; no request is made here.
                Any other type raises Exception.  Returns True on success.
                """
                transport = None

                if self.type == self.TELNET:
                        transport = telnetlib.Telnet(host, timeout=self.TELNET_TIMEOUT)
                        transport.set_debuglevel(self.verbose)
                        if username is not None:
                                # Publish the connection before the login
                                # exchange so close() can still reach it if
                                # the prompt never arrives.
                                self.transport = transport
                                self.transport.ifThenSend(prompt, username, ExceptionUsername)

                elif self.type == self.SSH:
                        if username is None:
                                raise Exception("Username cannot be None for ssh transport.")
                        transport = pyssh.Ssh(username, host)
                        transport.set_debuglevel(self.verbose)
                        transport.open()
                        # TODO: have an ssh set_debuglevel() also...

                elif self.type == self.HTTP:
                        # NOTE: this does not work for all web-based services...
                        self.url = "http://%s:%d/" % (host,80)
                        uri = "%s:%d" % (host,80)

                        # create authinfo
                        authinfo = urllib2.HTTPPasswordMgrWithDefaultRealm()
                        authinfo.add_password (None, uri, username, password)
                        authhandler = urllib2.HTTPBasicAuthHandler( authinfo )

                        transport = urllib2.build_opener(authhandler)
                else:
                        raise Exception("Unknown transport type: %s" % self.type)

                self.transport = transport
                return True

        def close(self):
                """Close the connection (a no-op for HTTP) and forget it.

                Safe to call when nothing was opened.  Raises Exception for
                transport types that open() does not implement.
                """
                if self.type in (self.TELNET, self.SSH):
                        # open() may have failed before a connection existed.
                        if self.transport is not None:
                                self.transport.close()
                elif self.type == self.HTTP:
                        pass
                else:
                        raise Exception("Unknown transport type %s" % self.type)
                self.transport = None

        def write(self, msg):
                """Alias for send()."""
                return self.send(msg)

        def send(self, msg):
                """Write msg to the open connection; ExceptionNoTransport if none."""
                if self.transport is None:
                        raise ExceptionNoTransport("transport object is type None")

                return self.transport.write(msg)

        def sendPassword(self, password, prompt=None):
                """Wait for the password prompt and answer it.

                TELNET waits for `prompt` (default "Password"), SSH always
                waits for "password:", HTTP does nothing.  Raises
                ExceptionPassword when the prompt is not seen and Exception
                for other transport types.
                """
                if self.type == self.TELNET:
                        if prompt is None:
                                prompt = "Password"
                        self.ifThenSend(prompt, password, ExceptionPassword)
                elif self.type == self.SSH:
                        self.ifThenSend("password:", password, ExceptionPassword)
                elif self.type == self.HTTP:
                        pass
                else:
                        raise Exception("Unknown transport type: %s" % self.type)

        def sendHTTP(self, resource, data):
                """POST data to self.url + resource through the opener.

                Returns 0 on success, or the string "http transport error"
                when urllib2 reports a URLError.
                """
                target = self.url + resource
                if self.verbose:
                        print("POSTing '%s' to %s" % (data, target))

                try:
                        response = self.transport.open(target, data)
                        try:
                                body = response.read()
                        finally:
                                # Release the connection even if read() fails.
                                response.close()
                        if self.verbose:
                                print(body)

                except urllib2.URLError:
                        # sys.exc_info() keeps this valid on every Python 2.x
                        # and 3.x; the message needs a %s for the argument.
                        logger.info('Could not open http connection: %s', sys.exc_info()[1])
                        return "http transport error"

                return 0

        def ifThenSend(self, expected, buffer, ErrorClass=ExceptionPrompt):
                """Wait for `expected`, then send `buffer` followed by CRLF.

                Raises ErrorClass when the text read does not contain
                `expected`, and ExceptionNoTransport when nothing is open.
                """
                if self.transport is None:
                        raise ExceptionNoTransport("transport object is type None")

                output = self.transport.read_until(expected, self.TELNET_TIMEOUT)
                if output.find(expected) == -1:
                        print("OUTPUT: --%s--" % output)
                        raise ErrorClass("'%s' not found" % expected)

                self.transport.write(buffer + "\r\n")

        def ifElse(self, expected, ErrorClass):
                """Read until `expected`; turn any read failure into ErrorClass."""
                try:
                        self.transport.read_until(expected, self.TELNET_TIMEOUT)
                except Exception:
                        # Exception rather than a bare except so that
                        # KeyboardInterrupt/SystemExit are not converted.
                        raise ErrorClass("Could not find '%s' within timeout" % expected)
232
class PCUControl(PCUModel,PCURecord):
        """
                Base class for the model-specific PCU controllers.

                There are three cases:
                        1) the pcu_record passed below includes port_status from an
                                external probe.
                        2) the external probe failed, and the values are empty
                        3) this call is made independent of port_status.

                In the first case, the first open port is used.
                In the third case, the ports are tried in sequence.

                In this way, the port_status value serves only as an optimization,
                because closed ports are avoided.  The supported_ports value should
                order ports by their preferred usage.

                A subclass provides one run_<transport type> method per
                transport it supports (for example run_ssh); reboot() looks
                these up by name.
        """

        supported_ports = []

        # (exception class, prefix of the status string), checked in this
        # order by catcherror().  HTTPError must stay ahead of URLError
        # because it is a subclass of it.
        _error_prefixes = (
                (ExceptionNotFound, "error: "),
                (ExceptionPassword, "Password exception: "),
                (ExceptionTimeout, "Timeout exception: "),
                (ExceptionUsername, "No username prompt: "),
                (ExceptionSequence, "Sequence error: "),
                (ExceptionPrompt, "Prompt exception: "),
                (ExceptionNoTransport, "No Transport: "),
                (ExceptionPort, "No ports exception: "),
                (socket.error, "socket error: timeout: "),
                (urllib2.HTTPError, "HTTPError: "),
                (urllib2.URLError, "URLError: "),
        )

        def __init__(self, plc_pcu_record, verbose, ignored=None):
                """Load PCU and probe fields from plc_pcu_record.

                verbose is kept on the instance; `ignored` is accepted only for
                call compatibility and is not used.
                """
                PCUModel.__init__(self, plc_pcu_record)
                PCURecord.__init__(self, plc_pcu_record)
                # catcherror() and reboot() read self.verbose, so it has to be
                # stored here.
                self.verbose = verbose

        def reboot(self, node_port, dryrun):
                """Try each candidate port until one run_<type> method returns 0.

                Candidate ports are the ports marked "open" in port_status when
                that attribute exists and is non-empty, otherwise
                supported_ports.  Returns 0 on success, else the result of the
                last attempt, or "could not run" if no attempt was possible.
                Raises ExceptionPort when port_status lists no open port.
                """
                if hasattr(self, 'port_status') and self.port_status:
                        port_list = [ int(port) for port in self.port_status.keys()
                                                if self.port_status[port] == "open" ]
                        if not port_list:
                                raise ExceptionPort("Unsupported Port: No transport from open ports")
                else:
                        port_list = self.supported_ports

                print(port_list)

                ret = "could not run"
                for port in port_list:
                        if port not in Transport.porttypemap:
                                continue

                        transport_type = Transport.porttypemap[port]
                        self.transport = Transport(transport_type, self.verbose)

                        handler = getattr(self, "run_%s" % transport_type, None)
                        if handler is None:
                                # this model has no handler for that transport
                                continue

                        ret = self.catcherror(handler, node_port, dryrun)
                        if ret == 0: # NOTE: success!, so stop
                                break

                return ret

        def run(self, node_port, dryrun):
                """ This function is to be defined by the specific PCU instance.  """
                raise Exception("This function is not implemented")

        #def reboot(self, node_port, dryrun):

        def catcherror(self, function, node_port, dryrun):
                """Call function(node_port, dryrun) and map known errors to strings.

                Returns whatever function returns.  When it raises one of the
                exceptions in _error_prefixes or EOFError, returns a
                descriptive string instead; any other exception propagates.
                """
                try:
                        return function(node_port, dryrun)
                except EOFError:
                        # sys.exc_info() is used instead of binding the
                        # exception in the except clause so the code parses on
                        # every Python 2.x and 3.x.
                        err = sys.exc_info()[1]
                        if self.verbose:
                                logger.debug("reboot: EOF")
                                logger.debug(err)
                        try:
                                self.transport.close()
                        except Exception:
                                # Closing is best effort here: the connection is
                                # already gone and the caller expects a status
                                # string, not a second exception.
                                logger.debug("reboot: could not close transport: %s", sys.exc_info()[1])
                        traceback.print_exc()
                        return "EOF connection reset" + str(err)
                except Exception:
                        err = sys.exc_info()[1]
                        for error_class, prefix in self._error_prefixes:
                                if isinstance(err, error_class):
                                        return prefix + str(err)
                        # not one of the errors reported as a string
                        raise
327
328 from pcucontrol.models import *
329
def pcu_name(pcu):
        """Return pcu['hostname'] if set, else pcu['ip'], else None.

        A value counts as "set" when it is neither None nor an empty string.
        The comparison is by value (!=), not identity, so empty strings that
        are not the interned "" literal (for example an empty unicode string)
        are treated as empty too.  'ip' is only looked up when 'hostname' is
        not usable.
        """
        for key in ('hostname', 'ip'):
                value = pcu[key]
                if value is not None and value != "":
                        return value
        return None
337
def get_pcu_values(pcu_id):
        """Return the latest FindbadPCURecord for pcu_id as a dict, or None.

        None is returned both when no record exists and when the lookup
        fails; a failed lookup is logged at debug level.
        """
        print("pcuid: %s" % pcu_id)
        try:
                pcurec = FindbadPCURecord.get_latest_by(plc_pcuid=pcu_id).first()
                if pcurec:
                        return pcurec.to_dict()
        except Exception:
                # Still best effort, but Exception (not a bare except) lets
                # KeyboardInterrupt/SystemExit through, and the reason is
                # recorded instead of being dropped.
                logger.debug("could not load probe record for pcu %s: %s", pcu_id, sys.exc_info()[1])

        return None
350
def reboot(nodename):
        """Reboot nodename for real: reboot_policy with continue_probe=True, dryrun=False."""
        rebooted = reboot_policy(nodename, True, False)
        return rebooted
353
def reboot_str(nodename):
        """Reboot nodename through its PCU and return the raw result.

        Returns False when the node has no PCU or no probe record exists for
        it; otherwise returns whatever reboot_test_new() returns (0 on
        success, a descriptive string otherwise).  Never a dry run.
        """
        pcu = plc.getpcu(nodename)
        if not pcu:
                message = "no pcu for %s" % nodename
                logger.debug(message)
                print(message)
                return False # "%s has no pcu" % nodename

        values = get_pcu_values(pcu['pcu_id'])
        if values is None:
                message = "No values for pcu probe %s" % nodename
                logger.debug(message)
                print(message)
                return False #"no info for pcu_id %s" % pcu['pcu_id']

        # Try the PCU first
        logger.debug("Trying PCU %s %s" % (pcu['hostname'], pcu['model']))

        return reboot_test_new(nodename, values, verbose, False)
376         
def reboot_policy(nodename, continue_probe, dryrun):
        """Reboot nodename through its PCU and report success as a boolean.

        Returns True only when reboot_test_new() returns 0.  Returns False
        when the node has no PCU, when no probe record exists, or when the
        attempt returned anything else (that result is printed).
        continue_probe is accepted but not used.
        """
        pcu = plc.getpcu(nodename)
        if not pcu:
                missing_pcu = "no pcu for %s" % nodename
                logger.debug(missing_pcu)
                print(missing_pcu)
                return False # "%s has no pcu" % nodename

        values = get_pcu_values(pcu['pcu_id'])
        if values is None:
                missing_values = "No values for pcu probe %s" % nodename
                logger.debug(missing_values)
                print(missing_values)
                return False #"no info for pcu_id %s" % pcu['pcu_id']

        # Try the PCU first
        logger.debug("Trying PCU %s %s" % (pcu['hostname'], pcu['model']))

        outcome = reboot_test_new(nodename, values, verbose, dryrun)

        if outcome == 0:
                print("return true")
                return True

        print(outcome)
        return False
403
class Unknown(PCUControl):
        """Controller returned by model_to_object() for unrecognised model names."""

        supported_ports = [22, 23, 80, 443, 5869, 9100, 16992]
406
def model_to_object(modelname):
        """Return the controller class whose keyword occurs in modelname.

        The keywords are tested in the order written below and the first
        match wins.  None maps to ManualPCU; a name matching no keyword is
        printed and mapped to Unknown.
        """
        if modelname is None:
                return ManualPCU

        # Each class name is only looked up when its keyword matches.
        if "AMT" in modelname:
                return IntelAMT
        if "BayTech" in modelname:
                return BayTech
        if "HPiLO" in modelname:
                return HPiLO
        if "IPAL" in modelname:
                return IPAL
        if "APC" in modelname:
                return APCControl
        if "DRAC" in modelname:
                return DRAC
        if "WTI" in modelname:
                return WTIIPS4
        if "ePowerSwitch" in modelname:
                return ePowerSwitchNew
        if "IPMI" in modelname:
                return IPMI
        if "BlackBoxPSMaverick" in modelname:
                return BlackBoxPSMaverick
        if "PM211MIP" in modelname:
                return PM211MIP
        if "ManualPCU" in modelname:
                return ManualPCU

        print("UNKNOWN model %s"%modelname)
        return Unknown
437
def reboot_api(node, pcu): #, verbose, dryrun):
        """Reboot `node` through `pcu` and return the controller's result.

        The controller class is named by pcu['model'] and the port is the
        entry of pcu['ports'] at the position of node['node_id'] in
        pcu['node_ids'].  Any exception is returned as its string form; an
        empty model yields "No modelname in PCU record.".
        """
        try:
                modelname = pcu['model']
                if not modelname:
                        return "No modelname in PCU record."

                # NOTE(review): eval() instantiates whatever expression the
                # model field contains; the field must come from a trusted
                # source.  The expression refers to the local `pcu` and the
                # module-level `verbose` by name.
                instance = eval('%s(pcu, verbose)' % modelname)
                # port of this node on the pcu
                slot = pcu['node_ids'].index(node['node_id'])
                return instance.reboot(pcu['ports'][slot], False)
                # TODO: how to handle the weird, georgetown pcus, the drac faults, and ilo faults
        except Exception:
                return str(sys.exc_info()[1])
458
def reboot_test_new(nodename, values, verbose, dryrun):
        """Instantiate the controller named by values['model'] and reboot.

        values is the probe record dict; entries of its 'plc_pcu_stats'
        sub-dict, when present, are merged over the top-level entries.  The
        merge is done on a copy, so the caller's dict is left untouched.  The
        port handed to the controller is values[nodename].

        Returns the controller's reboot() result, "Not_Run" when no model is
        set, or the message of an ExceptionPort raised by the controller.
        Other exceptions propagate.
        """
        values = dict(values)
        stats = values.get('plc_pcu_stats')
        if stats:
                values.update(stats)

        try:
                modelname = values['model']
                if not modelname:
                        return "Not_Run"

                # NOTE(review): eval() instantiates whatever expression the
                # model field contains; the field must come from a trusted
                # source.  The expression refers to the locals `values` and
                # `verbose` by name, so those names must not change.
                controller = eval('%s(values, verbose, ["22", "23", "80", "443", "9100", "16992", "5869"])' % modelname)
                return controller.reboot(values[nodename], dryrun)
                # TODO: how to handle the weird, georgetown pcus, the drac faults, and ilo faults
        except ExceptionPort:
                # sys.exc_info() instead of binding the exception in the
                # except clause keeps this valid on Python 2.x and 3.x.
                return str(sys.exc_info()[1])
476
def main():
        """Reboot every node named on the command line.

        The word "test" anywhere in the arguments turns the whole run into a
        dry run and is not treated as a node name.  Debug logging is sent to
        a stream handler, and "success" or "failed" is printed per node.
        """
        logger.setLevel(logging.DEBUG)
        console = logging.StreamHandler()
        console.setLevel(logging.DEBUG)
        console.setFormatter(logging.Formatter('LOGGER - %(message)s'))
        logger.addHandler(console)

        try:
                dryrun = "test" in sys.argv

                for node in sys.argv[1:]:
                        if node == "test":
                                continue

                        print("Rebooting %s" % node)
                        if reboot_policy(node, True, dryrun):
                                print("success")
                        else:
                                print("failed")
        except Exception:
                # report the failure and end the run without a non-zero exit
                traceback.print_exc()
                print(sys.exc_info()[1])
500                 import traceback; traceback.print_exc()
501                 print err
502
if __name__ == "__main__":
        # Run as a script: rebind the "monitor" logger, then process argv.
        logger = logging.getLogger("monitor")
        main()