fixes for pcucontrol and DRAC control.
author: Stephen Soltesz <soltesz@cs.princeton.edu>
Tue, 10 Mar 2009 20:07:06 +0000 (20:07 +0000)
committer: Stephen Soltesz <soltesz@cs.princeton.edu>
Tue, 10 Mar 2009 20:07:06 +0000 (20:07 +0000)
added temporary old-to-new name mapping in reboot.py
added sitelist option to findbadpcu.py
many other miscellaneous fixes.

13 files changed:
bootman.py
findbadpcu.py
getsshkeys.py
monitor/common.py
monitor/model.py
monitor/policy.py
monitor/scanapi.py
nodequery.py
pcucontrol/models/APCControl.py
pcucontrol/models/BayTech.py
pcucontrol/models/DRAC.py
pcucontrol/models/ePowerSwitch.py
pcucontrol/reboot.py

diff --git a/bootman.py b/bootman.py
index 22201cb..67ce675 100755 (executable)
@@ -731,7 +731,7 @@ def reboot(hostname, config=None, forced_action=None):
                        args = {}
                        args['hostname'] = hostname
                        args['bmlog'] = conn.get_bootmanager_log().read()
-                       m = PersistMessage(hostname,  mailtxt.plnode_network[0] % args,  mailtxt.plnode_cfg[1] % args, 
+                       m = PersistMessage(hostname,  mailtxt.plnode_cfg[0] % args,  mailtxt.plnode_cfg[1] % args, 
                                                                True, db='nodenet_persistmessages')
                        loginbase = plc.siteId(hostname)
                        emails = plc.getTechEmails(loginbase)
diff --git a/findbadpcu.py b/findbadpcu.py
index 815a77e..b63a96a 100755 (executable)
@@ -164,6 +164,7 @@ if __name__ == '__main__':
                                                pcuid=None,
                                                pcuselect=None,
                                                site=None,
+                                               sitelist=None,
                                                dbname="findbadpcus", 
                                                cachenodes=False,
                                                cachecalls=True,
@@ -173,6 +174,8 @@ if __name__ == '__main__':
                                                help="Provide the input file for the node list")
        parser.add_option("", "--site", dest="site", metavar="FILE", 
                                                help="Get all pcus associated with the given site's nodes")
+       parser.add_option("", "--sitelist", dest="sitelist", metavar="FILE", 
+                                               help="Get all pcus associated with the given site's nodes")
        parser.add_option("", "--pcuselect", dest="pcuselect", metavar="FILE", 
                                                help="Query string to apply to the findbad pcus")
        parser.add_option("", "--pcuid", dest="pcuid", metavar="id", 
diff --git a/getsshkeys.py b/getsshkeys.py
index 68d2945..d362c94 100755 (executable)
@@ -18,7 +18,7 @@ except:
 args = {}
 args['known_hosts'] =  os.environ['HOME'] + os.sep + ".ssh" + os.sep + "known_hosts"
 try:
-       import config
+       from monitor import config
        args['XMLRPC_SERVER'] = config.API_SERVER
 except:
        args['XMLRPC_SERVER'] = 'https://boot.planet-lab.org/PLCAPI/'
diff --git a/monitor/common.py b/monitor/common.py
index 051cd61..65b82b8 100644 (file)
@@ -8,7 +8,7 @@ from monitor import database
 from monitor.wrapper import plc, plccache
 
 from datetime import datetime 
-from monitor.model import PersistFlags
+from monitor.model import PersistFlags, Message
 
 esc = struct.pack('i', 27)
 RED    = esc + "[1;31m"
@@ -211,4 +211,11 @@ def get_nodeset(config):
                l_nodes = node_select(config.nodeselect, node_list, None)
 
        return l_nodes
-       
+
+def email_exception():
+       from monitor import config
+       import traceback
+       msg=traceback.format_exc()
+       m=Message("exception running monitor", msg, False)
+       m.send([config.cc_email])
+       return
diff --git a/monitor/model.py b/monitor/model.py
index b4db483..2f2f5e3 100755 (executable)
@@ -527,6 +527,8 @@ class Record(object):
                else:
                        print "takeAction: increasing penalty for %s"%self.hostname
                        pp.increase()
+
+               print "takeAction: applying penalty to %s as index %s"% (self.hostname, index)
                pp.index = index
                pp.apply(self.hostname)
                pp.save()
diff --git a/monitor/policy.py b/monitor/policy.py
index c23e7de..4574de7 100644 (file)
@@ -171,10 +171,11 @@ class MonitorMergeDiagnoseSendEscellate:
 
                        #### APPLY PENALTY
                        if ( record.data['take_action'] and diag['Squeeze'] ): 
-                               print "action: taking action"
+                               print "action: taking squeeze action"
                                record.takeAction(record.data['penalty_level'])
                                del diag['Squeeze']
                        if diag.getFlag('BackOff'):
+                               print "action: taking backoff action"
                                record.takeAction(0)
                                del diag['BackOff']
 
diff --git a/monitor/scanapi.py b/monitor/scanapi.py
index 194ab40..d9e17b5 100644 (file)
@@ -494,7 +494,7 @@ class ScanPCU(ScanInterface):
 
 
                        ######  DRY RUN  ############################
-                       if 'node_ids' in values['plc_pcu_stats'] and \
+                       if continue_probe and 'node_ids' in values['plc_pcu_stats'] and \
                                len(values['plc_pcu_stats']['node_ids']) > 0:
                                rb_ret = reboot.reboot_test_new(values['plc_pcu_stats']['nodenames'][0], 
                                                                                                values, 1, True)
@@ -510,7 +510,8 @@ class ScanPCU(ScanInterface):
                        print "____________________________________"
                        errors['traceback'] = traceback.format_exc()
                        print errors['traceback']
-                       values['reboot_trial_status'] = errors['traceback']
+                       values['reboot_trial_status'] = str(errors['traceback'])
+                       print values
 
                values['entry_complete']=" ".join(values['entry_complete'])
 
diff --git a/nodequery.py b/nodequery.py
index dfe3f95..781e841 100755 (executable)
@@ -270,6 +270,8 @@ def pcu_select(str_query, nodelist=None):
                fbquery = FindbadNodeRecord.get_all_latest()
                fb_nodelist = [ n.hostname for n in fbquery ]
        if True:
+               # NOTE: this doesn't work when there are only a few records current.
+               # pcu_select should apply to all pcus globally, not just the most recent records.
                fbpcuquery = FindbadPCURecord.get_all_latest()
                fbpcu_list = [ p.plc_pcuid for p in fbpcuquery ]
 
diff --git a/pcucontrol/models/APCControl.py b/pcucontrol/models/APCControl.py
index 62f5f6f..59cc649 100644 (file)
@@ -6,7 +6,7 @@ class APCControl(PCUControl):
 
        def run(self, node_port, dryrun):
                print "RUNNING!!!!!!!!!!!!"
-               if self.type == Transport.HTTPS or self.type == Transport.HTTP:
+               if self.transport.type == Transport.HTTPS or self.type == Transport.HTTP:
                        print "APC via http...."
                        return self.run_http_or_https(node_port, dryrun)
                else:
@@ -58,9 +58,9 @@ class APCControl(PCUControl):
 
                else:
                        # TODO: also send message for https, since that doesn't work this way...
-                       if self.type == Transport.HTTPS:
+                       if self.transport.type == Transport.HTTPS:
                                cmd = self.get_https_cmd()
-                       elif self.type == Transport.HTTP:
+                       elif self.transport.type == Transport.HTTP:
                                cmd = self.get_http_cmd()
                        else:
                                raise ExceptionNoTransport("Unsupported transport for http command")
@@ -118,12 +118,12 @@ class APCControl(PCUControl):
                # NOTE: we may need to return software version, no model version to
                #               know which file to request on the server.
 
-               if self.type == Transport.HTTP:
+               if self.transport.type == Transport.HTTP:
                        cmd = """curl -s --anyauth --user '%s:%s' http://%s/about.htm """ + \
                                  """ | sed -e "s/<[^>]*>//g" -e "s/&nbsp;//g" -e "/^$/d" """ + \
                                  """ | grep -E "AP[[:digit:]]+" """
                                  #""" | grep -E "v[[:digit:]].*" """
-               elif self.type == Transport.HTTPS:
+               elif self.transport.type == Transport.HTTPS:
                        cmd = """curl -s --insecure --user '%s:%s' https://%s/about.htm """ + \
                                  """ | sed -e "s/<[^>]*>//g" -e "s/&nbsp;//g" -e "/^$/d" """ + \
                                  """ | grep -E "AP[[:digit:]]+" """
@@ -138,10 +138,10 @@ class APCControl(PCUControl):
 
        def logout(self):
                # NOTE: log out again, to allow other uses to access the machine.
-               if self.type == Transport.HTTP:
+               if self.transport.type == Transport.HTTP:
                        cmd = """curl -s --anyauth --user '%s:%s' http://%s/logout.htm """ + \
                                  """ | grep -E '^[^<]+' """
-               elif self.type == Transport.HTTPS:
+               elif self.transport.type == Transport.HTTPS:
                        cmd = """curl -s --insecure --user '%s:%s' http://%s/logout.htm """ + \
                                  """ | grep -E '^[^<]+' """
                else:
diff --git a/pcucontrol/models/BayTech.py b/pcucontrol/models/BayTech.py
index 83de3a5..065cc28 100644 (file)
@@ -1,6 +1,7 @@
 from pcucontrol.reboot import *
 
 class BayTechRPC3NC(PCUControl):
+       supported_ports = [22,23]
        def run_telnet(self, node_port, dryrun):
                return self.run_ssh(node_port, dryrun)
 
@@ -22,6 +23,7 @@ class BayTechRPC3NC(PCUControl):
                return 0
 
 class BayTechRPC16(PCUControl):
+       supported_ports = [22,23]
        def run_telnet(self, node_port, dryrun):
                return self.run_ssh(node_port, dryrun)
        def run_ssh(self, node_port, dryrun):
@@ -48,6 +50,7 @@ class BayTechCtrlCUnibe(PCUControl):
                indefinitely, unless you send a Ctrl-C after the password.  No idea
                why.
        """
+       supported_ports = [22]
        def run_ssh(self, node_port, dryrun):
                print "BayTechCtrlC %s" % self.host
 
@@ -69,9 +72,11 @@ class BayTechCtrlCUnibe(PCUControl):
                        if index == 0:
                                print "3"
                                s.send("3\r\n")
+                               time.sleep(5)
                                index = s.expect(["DS-RPC>", "Enter user name:"])
                                if index == 1:
                                        s.send(self.username + "\r\n")
+                                       time.sleep(5)
                                        index = s.expect(["DS-RPC>"])
 
                                if index == 0:
@@ -112,6 +117,7 @@ class BayTechCtrlC(PCUControl):
                indefinitely, unless you send a Ctrl-C after the password.  No idea
                why.
        """
+       supported_ports = [22]
        def run_ssh(self, node_port, dryrun):
                print "BayTechCtrlC %s" % self.host
 
diff --git a/pcucontrol/models/DRAC.py b/pcucontrol/models/DRAC.py
index e7c030a..0213201 100644 (file)
@@ -16,7 +16,7 @@ class DRAC(PCUControl):
                                                original_prompts="Dell", login_timeout=Transport.TELNET_TIMEOUT):
                        raise ExceptionPassword("Invalid Password")
 
-               print "logging in..."
+               print "logging in... %s" % self.host
                s.send("\r\n\r\n")
                try:
                        # Testing Reboot ?
@@ -155,4 +155,4 @@ def racadm_reboot(host, username, password, port, dryrun):
                logger.debug("runcmd raised exception %s" % err)
                if verbose:
                        logger.debug(err)
-               return err
+               return str(err)
diff --git a/pcucontrol/models/ePowerSwitch.py b/pcucontrol/models/ePowerSwitch.py
index 7650689..edff5cc 100644 (file)
@@ -50,14 +50,14 @@ class ePowerSwitchNew(PCUControl):
                                req.add_header("Authorization", authheader)
                                # add data to handler,
                                f = urllib2.urlopen(req, data)
-                               if self.verbose: print f.read()
+                               if self.transport.verbose: print f.read()
                        except:
                                import traceback; traceback.print_exc()
 
                                # fetch url one more time on cmd.html, econtrol.html or whatever.
                                # pass
                else:
-                       if self.verbose: print f.read()
+                       if self.transport.verbose: print f.read()
 
                return 0
 
@@ -74,12 +74,12 @@ class ePowerSwitchOld(PCUControl):
                # NOTE: it doesn't seem to matter whether this authinfo is here or not.
                transport = urllib2.build_opener(authinfo)
                f = transport.open(self.url)
-               if self.verbose: print f.read()
+               if self.transport.verbose: print f.read()
 
                if not dryrun:
                        transport = urllib2.build_opener(authhandler)
                        f = transport.open(self.url + "cmd.html", "P%d=r" % node_port)
-                       if self.verbose: print f.read()
+                       if self.transport.verbose: print f.read()
 
                self.transport.close()
                return 0
@@ -103,12 +103,12 @@ class ePowerSwitchOld(PCUControl):
                # NOTE: it doesn't seem to matter whether this authinfo is here or not.
                transport = urllib2.build_opener()
                f = transport.open(self.url + "elogin.html", "pwd=%s" % self.password)
-               if self.verbose: print f.read()
+               if self.transport.verbose: print f.read()
 
                if not dryrun:
                        transport = urllib2.build_opener(authhandler)
                        f = transport.open(self.url + "econtrol.html", "P%d=r" % node_port)
-                       if self.verbose: print f.read()
+                       if self.transport.verbose: print f.read()
 
                #       data= "P%d=r" % node_port
                #self.open(self.host, self.username, self.password)
diff --git a/pcucontrol/reboot.py b/pcucontrol/reboot.py
index 9d171a2..2361d89 100755 (executable)
@@ -135,7 +135,7 @@ class Transport:
                        transport.set_debuglevel(self.verbose)
                        if username is not None:
                                self.transport = transport
-                               self.transport.ifThenSend(prompt, username, ExceptionUsername)
+                               self.ifThenSend(prompt, username, ExceptionUsername)
 
                elif self.type == self.SSH:
                        if username is not None:
@@ -255,17 +255,25 @@ class PCUControl(PCUModel,PCURecord):
        def reboot(self, node_port, dryrun):
 
                port_list = []
+               # There are two sources of potential ports.  Those that are open and
+               # those that are part of the PCU's supported_ports.  
+               #  I think we should start with supported_ports and then filter that
+               #  by the open ports.
+
+               port_list = self.supported_ports
+
                if hasattr(self, 'port_status') and self.port_status:
+                       # get out the open ports
                        port_list = filter(lambda x: self.port_status[x] == "open" , self.port_status.keys())
                        port_list = [ int(x) for x in port_list ]
+                       # take only the open ports that are supported_ports
+                       port_list = filter(lambda x: x in self.supported_ports, port_list)
                        if port_list == []:
-                               raise ExceptionPort("Unsupported Port: No transport from open ports")
-               else:
-                       port_list = self.supported_ports
+                               raise ExceptionPort("No Open Port: No transport from open ports")
 
                print port_list
 
-               ret = "could not run"
+               ret = "No implementation for open ports on selected PCU model"
                for port in port_list:
                        if port not in Transport.porttypemap:
                                continue
@@ -273,7 +281,9 @@ class PCUControl(PCUModel,PCURecord):
                        type = Transport.porttypemap[port]
                        self.transport = Transport(type, verbose)
 
+                       print "checking for run_%s" % type
                        if hasattr(self, "run_%s" % type):
+                               print "found run_%s" % type
                                fxn = getattr(self, "run_%s" % type)
                                ret = self.catcherror(fxn, node_port, dryrun)
                                if ret == 0: # NOTE: success!, so stop
@@ -316,9 +326,6 @@ class PCUControl(PCUModel,PCURecord):
                except urllib2.URLError, err:
                        return "URLError: " + str(err)
                except EOFError, err:
-                       if self.verbose:
-                               logger.debug("reboot: EOF")
-                               logger.debug(err)
                        self.transport.close()
                        import traceback
                        traceback.print_exc()
@@ -456,15 +463,63 @@ def reboot_api(node, pcu): #, verbose, dryrun):
 
        return rb_ret
 
+def convert_oldmodelname_to_newmodelname(oldmodelname, pcu_id):
+       newmodelname = None
+       update = {      'AP79xx' : 'APCControl13p13',
+                               'Masterswitch' : 'APCControl13p13',
+                               'DS4-RPC' : 'BayTech',
+                               'IP-41x_IP-81x' : 'IPAL',
+                               'DRAC3' : 'DRAC',
+                               'DRAC4' : 'DRAC',
+                               'ePowerSwitch' : 'ePowerSwitchOld',
+                               'ilo2' : 'HPiLO',
+                               'ilo1' : 'HPiLO',
+                               'PM211-MIP' : 'PM211MIP',
+                               'AMT2.5' : 'IntelAMT',
+                               'AMT3.0' : 'IntelAMT',
+                               'WTI_IPS-4' : 'WTIIPS4',
+                               'unknown'  : 'ManualPCU',
+                               'DRAC5' : 'DRAC',
+                               'ipmi'  : 'OpenIPMI',
+                               'bbsemaverick' : 'BlackBoxPSMaverick',
+                               'manualadmin'  : 'ManualPCU',
+       }
+
+       if oldmodelname in update:
+               newmodelname = update[oldmodelname]
+       else:
+               newmodelname = oldmodelname
+
+       if pcu_id in [1102,1163,1055,1111,1231,1113,1127,1128,1148]:
+               newmodelname = 'APCControl12p3'
+       elif pcu_id in [1110,86]:
+               newmodelname = 'APCControl1p4'
+       elif pcu_id in [1221,1225,1220,1192]:
+               newmodelname = 'APCControl121p3'
+       elif pcu_id in [1173,1240,47,1363,1405,1401,1372,1371]:
+               newmodelname = 'APCControl121p1'
+       elif pcu_id in [1056,1237,1052,1209,1002,1008,1013,1022]:
+               newmodelname = 'BayTechCtrlC'
+       elif pcu_id in [93]:
+               newmodelname = 'BayTechRPC3NC'
+       elif pcu_id in [1057]:
+               newmodelname = 'BayTechCtrlCUnibe'
+       elif pcu_id in [1012]:
+               newmodelname = 'BayTechRPC16'
+       elif pcu_id in [1089, 1071, 1046, 1035, 1118]:
+               newmodelname = 'ePowerSwitchNew'
+
+       return newmodelname
+
 def reboot_test_new(nodename, values, verbose, dryrun):
        rb_ret = ""
        if 'plc_pcu_stats' in values:
                values.update(values['plc_pcu_stats'])
 
        try:
-               modelname = values['model']
+               modelname = convert_oldmodelname_to_newmodelname(values['model'], values['pcu_id'])
                if modelname:
-                       object = eval('%s(values, verbose, ["22", "23", "80", "443", "9100", "16992", "5869"])' % modelname)
+                       object = eval('%s(values, verbose)' % modelname)
                        rb_ret = object.reboot(values[nodename], dryrun)
                else:
                        rb_ret =  "Not_Run"