added sitelist option for find* scripts.
[monitor.git] / pcucontrol / reboot.py
1 #!/usr/bin/python
2 #
3 # Reboot specified nodes
4 #
5
6 import getpass, getopt
7 import os, sys
8 import xml, xmlrpclib
9 import errno, time, traceback
10 import urllib2
11 import urllib
12 import threading, popen2
13 import array, struct
14 from monitor.wrapper import plc
15 import base64
16 from subprocess import PIPE, Popen
17 import pcucontrol.transports.ssh.pxssh as pxssh
18 import pcucontrol.transports.ssh.pexpect as pexpect
19 import socket
20 from monitor.util import command
21
22
23 # Use our versions of telnetlib and pyssh
24 sys.path.insert(0, os.path.dirname(sys.argv[0]))
25 import pcucontrol.transports.telnetlib as telnetlib
26 sys.path.insert(0, os.path.dirname(sys.argv[0]) + "/pyssh")    
27 import pcucontrol.transports.pyssh as pyssh
28 from monitor import config
29
30
31 # Event class ID from pcu events
32 #NODE_POWER_CONTROL = 3
33
34 # Monitor user ID
35 #MONITOR_USER_ID = 11142
36
37 import logging
38 logger = logging.getLogger("monitor")
39 verbose = 1
40 #dryrun = 0;
41
class ExceptionNoTransport(Exception):
        """Raised when a send/expect is attempted while no transport is open."""

class ExceptionNotFound(Exception):
        """Reported by PCUControl.catcherror as "error: ..."; not raised in this file."""

class ExceptionPassword(Exception):
        """Raised when the expected password prompt does not appear."""

class ExceptionTimeout(Exception):
        """Reported by PCUControl.catcherror as a timeout; not raised in this file."""

class ExceptionPrompt(Exception):
        """Default error raised by Transport.ifThenSend when a prompt is missing."""

class ExceptionSequence(Exception):
        """Reported by PCUControl.catcherror as a sequence error; not raised in this file."""

class ExceptionReset(Exception):
        """Not referenced elsewhere in this file; presumably used by model drivers."""

class ExceptionPort(Exception):
        """Raised when a port probe exists but reports no open port."""

class ExceptionUsername(Exception):
        """Raised when the expected username prompt does not appear."""
51
52
53
54 # PCU has model, host, preferred-port, user, passwd, 
55
56 # This is an object derived directly from the PLCAPI DB fields
class PCU(object):
        """Plain record exposing the PLCAPI fields of one PCU as attributes."""

        def __init__(self, plc_pcu_dict):
                """Copy every required field from plc_pcu_dict onto the instance.

                Raises Exception naming the first required field that is
                absent.  Keys other than the required ones are ignored.
                """
                required = ('username', 'password', 'site_id',
                            'hostname', 'ip',
                            'pcu_id', 'model',
                            'node_ids', 'ports')
                for name in required:
                        if name not in plc_pcu_dict:
                                raise Exception("No such field %s in PCU object" % name)
                        setattr(self, name, plc_pcu_dict[name])
67
68 # These are the convenience functions built around the PCU object.
class PCUModel(PCU):
        """PCU record plus small lookup helpers (display name, node -> port)."""

        def __init__(self, plc_pcu_dict):
                """Load the PCU fields and cache the preferred address as self.host."""
                PCU.__init__(self, plc_pcu_dict)
                self.host = self.pcu_name()

        def pcu_name(self):
                """Return the hostname if set, otherwise the ip, otherwise None.

                A value counts as unset when it is None or the empty string.
                """
                for candidate in (self.hostname, self.ip):
                        # Compare by value: "is not ''" tests object identity and
                        # only works when the interpreter happens to share the
                        # empty-string object.
                        if candidate is not None and candidate != "":
                                return candidate
                return None

        def nodeidToPort(self, node_id):
                """Return the entry of self.ports paired with node_id in self.node_ids.

                Raises Exception when node_id is not attached to this PCU.
                """
                try:
                        position = self.node_ids.index(node_id)
                except ValueError:
                        # %s rather than %d so a non-integer id still yields this
                        # message instead of a TypeError from the formatting.
                        raise Exception("No such Node ID: %s" % node_id)
                return self.ports[position]
89
90 # This class captures the observed pcu records from FindBadPCUs.py
class PCURecord:
        """Observed probe results for a PCU (records produced by FindBadPCUs.py)."""

        def __init__(self, pcu_record_dict):
                """Copy the probe fields that are present in pcu_record_dict.

                Only 'port_status', 'dns_status' and 'entry_complete' are
                copied.  A missing field is skipped silently, so callers must
                use hasattr() before reading one.
                """
                # The earlier special case for a "reboot" field could never run
                # because "reboot" is not one of the fields iterated here.
                for field in ('port_status', 'dns_status', 'entry_complete'):
                        if field in pcu_record_dict:
                                setattr(self, field, pcu_record_dict[field])
103
class Transport:
        """One connection to a PCU over telnet, ssh or http.

        The class constants name every transport type that a port can map to
        (see porttypemap), but open()/close()/sendPassword() only implement
        TELNET, SSH and HTTP; any other type raises Exception there.
        """
        TELNET = "telnet"
        SSH    = "ssh"
        HTTP   = "http"
        HTTPS  = "https"
        IPAL   = "ipal"
        DRAC   = "drac"
        AMT    = "amt"

        # Seconds to wait for a telnet connection and for each expected prompt.
        TELNET_TIMEOUT = 120

        # TCP port number -> transport type used to talk to that port.
        porttypemap = {
                        5869 : DRAC,
                        22 : SSH,
                        23 : TELNET,
                        443 : HTTPS,
                        80 :  HTTP,
                        9100 : IPAL,
                        16992 : AMT,
                }

        def __init__(self, type, verbose):
                """Remember the transport type and verbosity; nothing is opened yet."""
                self.type = type
                self.verbose = verbose
                self.transport = None

        def open(self, host, username=None, password=None, prompt="User Name"):
                """Open the underlying connection to host and return True.

                TELNET: connect, and when username is given wait for prompt and
                        send the username.
                SSH:    username is mandatory; password is not used here.
                HTTP:   build a basic-auth opener for host:80 and record
                        self.url; no request is made yet.
                Raises Exception for any other transport type.
                """
                transport = None

                if self.type == self.TELNET:
                        transport = telnetlib.Telnet(host, timeout=self.TELNET_TIMEOUT)
                        transport.set_debuglevel(self.verbose)
                        if username is not None:
                                self.transport = transport
                                # NOTE(review): this calls ifThenSend on the telnet
                                # object, not on this wrapper; it relies on the
                                # bundled telnetlib providing that method - confirm.
                                self.transport.ifThenSend(prompt, username, ExceptionUsername)

                elif self.type == self.SSH:
                        if username is None:
                                raise Exception("Username cannot be None for ssh transport.")
                        transport = pyssh.Ssh(username, host)
                        transport.set_debuglevel(self.verbose)
                        transport.open()
                        # TODO: have an ssh set_debuglevel() also...
                elif self.type == self.HTTP:
                        # NOTE: this does not work for all web-based services...
                        self.url = "http://%s:%d/" % (host,80)
                        uri = "%s:%d" % (host,80)

                        # create authinfo
                        authinfo = urllib2.HTTPPasswordMgrWithDefaultRealm()
                        authinfo.add_password (None, uri, username, password)
                        authhandler = urllib2.HTTPBasicAuthHandler( authinfo )

                        transport = urllib2.build_opener(authhandler)
                else:
                        raise Exception("Unknown transport type: %s" % self.type)

                self.transport = transport
                return True

        def close(self):
                """Close the connection, if any, and forget it.

                Safe to call when nothing is open.  Raises Exception for
                transport types other than TELNET, SSH and HTTP.
                """
                if self.type in (self.TELNET, self.SSH):
                        # Guard against close() before open() or a second close().
                        if self.transport is not None:
                                self.transport.close()
                elif self.type == self.HTTP:
                        pass
                else:
                        raise Exception("Unknown transport type %s" % self.type)
                self.transport = None

        def write(self, msg):
                """Alias for send()."""
                return self.send(msg)

        def send(self, msg):
                """Write msg to the open connection.

                Raises ExceptionNoTransport when nothing is open.
                """
                if self.transport is None:
                        raise ExceptionNoTransport("transport object is type None")

                return self.transport.write(msg)

        def sendPassword(self, password, prompt=None):
                """Wait for the password prompt and answer it.

                TELNET waits for prompt (default "Password"), SSH waits for
                "password:", HTTP does nothing because credentials were handed
                to the opener in open().  Raises ExceptionPassword when the
                prompt is not seen, Exception for other transport types.
                """
                if self.type == self.TELNET:
                        if prompt is None:
                                prompt = "Password"
                        self.ifThenSend(prompt, password, ExceptionPassword)
                elif self.type == self.SSH:
                        self.ifThenSend("password:", password, ExceptionPassword)
                elif self.type == self.HTTP:
                        pass
                else:
                        raise Exception("Unknown transport type: %s" % self.type)

        def sendHTTP(self, resource, data):
                """POST data to self.url + resource.

                Returns 0 on success, or the string "http transport error" when
                the request fails with urllib2.URLError.
                """
                target = self.url + resource
                if self.verbose:
                        print("POSTing '%s' to %s" % (data, target))

                try:
                        response = self.transport.open(target, data)
                        try:
                                body = response.read()
                        finally:
                                # Release the connection even when read() fails.
                                response.close()
                        if self.verbose:
                                print(body)

                except urllib2.URLError as err:
                        # The message needs a placeholder: passing err as a bare
                        # extra argument makes logging fail to format the record.
                        logger.info('Could not open http connection: %s', err)
                        return "http transport error"

                return 0

        def ifThenSend(self, expected, buffer, ErrorClass=ExceptionPrompt):
                """Wait for expected, then send buffer followed by CR LF.

                Raises ErrorClass when expected is not in what was read within
                TELNET_TIMEOUT, ExceptionNoTransport when nothing is open.
                """
                if self.transport is None:
                        raise ExceptionNoTransport("transport object is type None")

                output = self.transport.read_until(expected, self.TELNET_TIMEOUT)
                if output.find(expected) == -1:
                        print("OUTPUT: --%s--" % output)
                        raise ErrorClass("'%s' not found" % expected)
                self.transport.write(buffer + "\r\n")

        def ifElse(self, expected, ErrorClass):
                """Wait for expected; turn a failing read into ErrorClass.

                NOTE(review): only an exception from read_until is treated as a
                failure, and the returned text is not inspected.  The standard
                telnetlib returns partial data on timeout instead of raising;
                whether the bundled variant raises is not visible here.
                """
                try:
                        self.transport.read_until(expected, self.TELNET_TIMEOUT)
                except Exception:
                        # Exception, not a bare except, so Ctrl-C and SystemExit
                        # are not converted into a prompt error.
                        raise ErrorClass("Could not find '%s' within timeout" % expected)
231
class PCUControl(PCUModel,PCURecord):

        """
                Base class for model-specific PCU drivers.

                There are three cases:
                        1) the pcu_record passed below includes port_status from an
                                external probe.
                        2) the external probe failed, and the values are empty
                        3) this call is made independent of port_status.

                In the first case only the ports reported "open" are tried.
                In the other cases the ports in supported_ports are tried in
                sequence.

                In this way, the port_status value serves only as an optimization,
                because closed ports are avoided.  The supported_ports value should
                order ports by their preferred usage.

                A driver supports a transport by defining run_<type>(node_port,
                dryrun), where <type> is a value of Transport.porttypemap.
        """

        supported_ports = []

        def __init__(self, plc_pcu_record, verbose, ignored=None):
                """Load PCU and probe fields from plc_pcu_record.

                verbose is kept on the instance; ignored is accepted and unused.
                """
                PCUModel.__init__(self, plc_pcu_record)
                PCURecord.__init__(self, plc_pcu_record)
                # catcherror() reads self.verbose and reboot() passes it to the
                # Transport, so it has to be stored here.
                self.verbose = verbose

        def reboot(self, node_port, dryrun):
                """Try each candidate port until one run_<type> method returns 0.

                Returns 0 on success, otherwise the result of the last attempt,
                or "could not run" when no candidate port had a usable
                run_<type> method.  Raises ExceptionPort when a port probe
                exists but lists no open port.
                """
                port_status = getattr(self, 'port_status', None)
                if port_status:
                        port_list = [ int(port) for port, state in port_status.items()
                                                if state == "open" ]
                        if not port_list:
                                raise ExceptionPort("Unsupported Port: No transport from open ports")
                else:
                        port_list = self.supported_ports

                print(port_list)

                ret = "could not run"
                for port in port_list:
                        transport_type = Transport.porttypemap.get(port)
                        if transport_type is None:
                                continue

                        self.transport = Transport(transport_type, self.verbose)

                        runner = getattr(self, "run_%s" % transport_type, None)
                        if runner is None:
                                continue

                        ret = self.catcherror(runner, node_port, dryrun)
                        if ret == 0: # NOTE: success!, so stop
                                break

                return ret

        def run(self, node_port, dryrun):
                """ This function is to be defined by the specific PCU instance.  """
                raise NotImplementedError("This function is not implemented")

        def catcherror(self, function, node_port, dryrun):
                """Call function(node_port, dryrun), mapping known errors to strings.

                Returns whatever function returns, or a descriptive string for
                the exception types listed below.  Any other exception
                propagates to the caller.
                """
                # (exception class, message prefix), matched top to bottom.
                # HTTPError must stay ahead of URLError, its base class.
                handled = (
                        (ExceptionNotFound, "error: "),
                        (ExceptionPassword, "Password exception: "),
                        (ExceptionTimeout, "Timeout exception: "),
                        (ExceptionUsername, "No username prompt: "),
                        (ExceptionSequence, "Sequence error: "),
                        (ExceptionPrompt, "Prompt exception: "),
                        (ExceptionNoTransport, "No Transport: "),
                        (ExceptionPort, "No ports exception: "),
                        (socket.error, "socket error: timeout: "),
                        (urllib2.HTTPError, "HTTPError: "),
                        (urllib2.URLError, "URLError: "),
                )
                handled_types = tuple(cls for cls, _ in handled)

                try:
                        return function(node_port, dryrun)
                except EOFError as err:
                        if self.verbose:
                                logger.debug("reboot: EOF")
                                logger.debug(err)
                        # Print the traceback before cleanup so it still refers
                        # to the EOF and not to a failure inside close().
                        traceback.print_exc()
                        transport = getattr(self, "transport", None)
                        if transport is not None:
                                try:
                                        transport.close()
                                except Exception as close_err:
                                        # Cleanup is best effort; the caller still
                                        # gets the EOF result below.
                                        logger.debug("reboot: closing transport failed: %s", close_err)
                        return "EOF connection reset" + str(err)
                except handled_types as err:
                        for cls, prefix in handled:
                                if isinstance(err, cls):
                                        return prefix + str(err)
                        raise
326
327 from pcucontrol.models import *
328
def pcu_name(pcu):
        """Return pcu['hostname'] if set, otherwise pcu['ip'], otherwise None.

        A value counts as unset when it is None or the empty string.  The
        'ip' key is only read when the hostname is unset.
        """
        for key in ('hostname', 'ip'):
                value = pcu[key]
                # Compare by value: "is not ''" tests object identity, which is
                # not a reliable emptiness check for strings.
                if value is not None and value != "":
                        return value
        return None
336
def get_pcu_values(pcu_id):
        """Return the latest probe record for pcu_id as a dict, or None.

        None is returned both when no record exists and when the lookup
        fails; a failure is logged at debug level.
        """
        from monitor.database.info.model import FindbadPCURecord
        print("pcuid: %s" % pcu_id)
        try:
                pcurec = FindbadPCURecord.get_latest_by(plc_pcuid=pcu_id).first()
                if pcurec:
                        return pcurec.to_dict()
        except Exception as err:
                # Still best effort, but no longer swallows KeyboardInterrupt or
                # SystemExit, and the reason is recorded.
                logger.debug("could not load probe record for pcu %s: %s", pcu_id, err)

        return None
350
def reboot(nodename):
        """Reboot nodename for real (dryrun off); True on success, else False."""
        return reboot_policy(nodename, continue_probe=True, dryrun=False)
353
def reboot_str(nodename):
        """Reboot nodename through its PCU and return the raw driver result.

        Returns False when the node has no PCU or no probe record exists;
        otherwise returns whatever reboot_test_new() returns.
        """
        dryrun = False

        pcu = plc.getpcu(nodename)
        if not pcu:
                message = "no pcu for %s" % nodename
                logger.debug(message)
                print(message)
                return False # "%s has no pcu" % nodename

        values = get_pcu_values(pcu['pcu_id'])
        if values is None:
                message = "No values for pcu probe %s" % nodename
                logger.debug(message)
                print(message)
                return False #"no info for pcu_id %s" % pcu['pcu_id']

        # Try the PCU first
        logger.debug("Trying PCU %s %s" % (pcu['hostname'], pcu['model']))

        return reboot_test_new(nodename, values, verbose, dryrun)
376         
def reboot_policy(nodename, continue_probe, dryrun):
        """Reboot nodename through its PCU; True on success, False otherwise.

        continue_probe is accepted but not used.  False is also returned when
        the node has no PCU or no probe record exists.
        """
        pcu = plc.getpcu(nodename)
        if not pcu:
                message = "no pcu for %s" % nodename
                logger.debug(message)
                print(message)
                return False # "%s has no pcu" % nodename

        values = get_pcu_values(pcu['pcu_id'])
        if values is None:
                message = "No values for pcu probe %s" % nodename
                logger.debug(message)
                print(message)
                return False #"no info for pcu_id %s" % pcu['pcu_id']

        # Try the PCU first
        logger.debug("Trying PCU %s %s" % (pcu['hostname'], pcu['model']))

        outcome = reboot_test_new(nodename, values, verbose, dryrun)

        # The driver reports success as 0; anything else describes the failure.
        if outcome == 0:
                print("return true")
                return True

        print(outcome)
        return False
403
class Unknown(PCUControl):
        """Driver returned by model_to_object() for unrecognised model names."""

        # Ports tried, in this order, when no port probe is available.
        supported_ports = [22, 23, 80, 443, 5869, 9100, 16992]
406
def model_to_object(modelname):
        """Map a PCU model name to its driver class by substring match.

        None maps to ManualPCU.  Otherwise the first marker below that
        occurs in modelname decides; a name matching no marker is printed
        and mapped to Unknown.
        """
        if modelname is None:
                return ManualPCU

        # Checked in order.  The lambdas delay the class lookup until a
        # marker matches, so only the selected name has to be defined.
        markers = (
                ("AMT", lambda: IntelAMT),
                ("BayTech", lambda: BayTech),
                ("HPiLO", lambda: HPiLO),
                ("IPAL", lambda: IPAL),
                ("APC", lambda: APCControl),
                ("DRAC", lambda: DRAC),
                ("WTI", lambda: WTIIPS4),
                ("ePowerSwitch", lambda: ePowerSwitchNew),
                ("IPMI", lambda: IPMI),
                ("BlackBoxPSMaverick", lambda: BlackBoxPSMaverick),
                ("PM211MIP", lambda: PM211MIP),
                ("ManualPCU", lambda: ManualPCU),
        )
        for marker, resolve in markers:
                if marker in modelname:
                        return resolve()

        print("UNKNOWN model %s"%modelname)
        return Unknown
437
def reboot_api(node, pcu): #, verbose, dryrun):
        """Reboot node through pcu and return the driver result.

        Returns the value of the driver's reboot() call, a fixed message when
        the PCU record has no model, or str(err) for any exception raised on
        the way (missing keys, unknown model class, node not on this PCU).
        """
        try:
                modelname = pcu['model']
                if not modelname:
                        return "No modelname in PCU record."

                # SECURITY(review): eval() runs the model string as Python code.
                # If PCU records can be edited by untrusted parties this is
                # arbitrary code execution - consider resolving the class by
                # name from a fixed table instead.
                instance = eval('%s(pcu, verbose)' % modelname)

                # The port sits at the same position as the node id.
                position = pcu['node_ids'].index(node['node_id'])
                port = pcu['ports'][position]

                return instance.reboot(port, False)
                # TODO: how to handle the weird, georgetown pcus, the drac faults, and ilo faults
        except Exception as err:
                return str(err)
458
def reboot_test_new(nodename, values, verbose, dryrun):
        """Instantiate the driver named by values['model'] and reboot nodename.

        values is modified in place: the entries of values['plc_pcu_stats'],
        when present, are merged into it first.  The port handed to the driver
        is values[nodename].  Returns the driver result, "Not_Run" when no
        model is set, or str(err) for ExceptionPort / NameError.
        """
        if 'plc_pcu_stats' in values:
                values.update(values['plc_pcu_stats'])

        try:
                modelname = values['model']
                if not modelname:
                        return "Not_Run"

                # SECURITY(review): eval() runs the model string as Python code.
                # If probe records can be influenced by untrusted parties this
                # is arbitrary code execution - consider a fixed name table.
                controller = eval('%s(values, verbose, ["22", "23", "80", "443", "9100", "16992", "5869"])' % modelname)
                return controller.reboot(values[nodename], dryrun)
                # TODO: how to handle the weird, georgetown pcus, the drac faults, and ilo faults
        except (ExceptionPort, NameError) as err:
                return str(err)
478
def main():
        """Reboot every node named on the command line.

        The literal argument "test" is not a node name: its presence turns
        on dryrun for all nodes.  Prints "success" or "failed" per node; any
        exception is printed with its traceback and ends the run.
        """
        # Echo all "monitor" log records to the console.
        logger.setLevel(logging.DEBUG)
        console = logging.StreamHandler()
        console.setLevel(logging.DEBUG)
        console.setFormatter(logging.Formatter('LOGGER - %(message)s'))
        logger.addHandler(console)

        try:
                dryrun = "test" in sys.argv

                for node in sys.argv[1:]:
                        if node == "test":
                                continue

                        print("Rebooting %s" % node)
                        succeeded = reboot_policy(node, True, dryrun)
                        print("success" if succeeded else "failed")
        except Exception as err:
                import traceback; traceback.print_exc()
                print(err)
504
if __name__ == '__main__':
        logger = logging.getLogger("monitor")
        main()
        # Append a record of this invocation.  "with" closes the file even
        # when the write fails.
        with open("/tmp/rebootlog", 'a') as f:
                f.write("reboot %s\n" % sys.argv)