From: Stephen Soltesz Date: Tue, 10 Mar 2009 20:07:06 +0000 (+0000) Subject: fixes for pcucontrol and DRAC control. X-Git-Tag: Monitor-2.0-2~2 X-Git-Url: http://git.onelab.eu/?p=monitor.git;a=commitdiff_plain;h=d40ab05dde79f4698691e06e064170a09db98351 fixes for pcucontrol and DRAC control. added temporary old-to-new name mapping in reboot.py added sitelist option to findbadpcu.py many fixes otherwise. --- diff --git a/bootman.py b/bootman.py index 22201cb..67ce675 100755 --- a/bootman.py +++ b/bootman.py @@ -731,7 +731,7 @@ def reboot(hostname, config=None, forced_action=None): args = {} args['hostname'] = hostname args['bmlog'] = conn.get_bootmanager_log().read() - m = PersistMessage(hostname, mailtxt.plnode_network[0] % args, mailtxt.plnode_cfg[1] % args, + m = PersistMessage(hostname, mailtxt.plnode_cfg[0] % args, mailtxt.plnode_cfg[1] % args, True, db='nodenet_persistmessages') loginbase = plc.siteId(hostname) emails = plc.getTechEmails(loginbase) diff --git a/findbadpcu.py b/findbadpcu.py index 815a77e..b63a96a 100755 --- a/findbadpcu.py +++ b/findbadpcu.py @@ -164,6 +164,7 @@ if __name__ == '__main__': pcuid=None, pcuselect=None, site=None, + sitelist=None, dbname="findbadpcus", cachenodes=False, cachecalls=True, @@ -173,6 +174,8 @@ if __name__ == '__main__': help="Provide the input file for the node list") parser.add_option("", "--site", dest="site", metavar="FILE", help="Get all pcus associated with the given site's nodes") + parser.add_option("", "--sitelist", dest="sitelist", metavar="FILE", + help="Get all pcus associated with the given site's nodes") parser.add_option("", "--pcuselect", dest="pcuselect", metavar="FILE", help="Query string to apply to the findbad pcus") parser.add_option("", "--pcuid", dest="pcuid", metavar="id", diff --git a/getsshkeys.py b/getsshkeys.py index 68d2945..d362c94 100755 --- a/getsshkeys.py +++ b/getsshkeys.py @@ -18,7 +18,7 @@ except: args = {} args['known_hosts'] = os.environ['HOME'] + os.sep + ".ssh" + os.sep + "known_hosts" try: - import config + from monitor import config args['XMLRPC_SERVER'] = config.API_SERVER except: args['XMLRPC_SERVER'] = 'https://boot.planet-lab.org/PLCAPI/' diff --git a/monitor/common.py b/monitor/common.py index 051cd61..65b82b8 100644 --- a/monitor/common.py +++ b/monitor/common.py @@ -8,7 +8,7 @@ from monitor import database from monitor.wrapper import plc, plccache from datetime import datetime -from monitor.model import PersistFlags +from monitor.model import PersistFlags, Message esc = struct.pack('i', 27) RED = esc + "[1;31m" @@ -211,4 +211,11 @@ def get_nodeset(config): l_nodes = node_select(config.nodeselect, node_list, None) return l_nodes - + +def email_exception(): + from monitor import config + import traceback + msg=traceback.format_exc() + m=Message("exception running monitor", msg, False) + m.send([config.cc_email]) + return diff --git a/monitor/model.py b/monitor/model.py index b4db483..2f2f5e3 100755 --- a/monitor/model.py +++ b/monitor/model.py @@ -527,6 +527,8 @@ class Record(object): else: print "takeAction: increasing penalty for %s"%self.hostname pp.increase() + + print "takeAction: applying penalty to %s as index %s"% (self.hostname, index) pp.index = index pp.apply(self.hostname) pp.save() diff --git a/monitor/policy.py b/monitor/policy.py index c23e7de..4574de7 100644 --- a/monitor/policy.py +++ b/monitor/policy.py @@ -171,10 +171,11 @@ class MonitorMergeDiagnoseSendEscellate: #### APPLY PENALTY if ( record.data['take_action'] and diag['Squeeze'] ): - print "action: taking action" + print "action: taking squeeze action" record.takeAction(record.data['penalty_level']) del diag['Squeeze'] if diag.getFlag('BackOff'): + print "action: taking backoff action" record.takeAction(0) del diag['BackOff'] diff --git a/monitor/scanapi.py b/monitor/scanapi.py index 194ab40..d9e17b5 100644 --- a/monitor/scanapi.py +++ b/monitor/scanapi.py @@ -494,7 +494,7 @@ class ScanPCU(ScanInterface): ###### DRY RUN ############################ - if 'node_ids' in values['plc_pcu_stats'] and \ + if continue_probe and 'node_ids' in values['plc_pcu_stats'] and \ len(values['plc_pcu_stats']['node_ids']) > 0: rb_ret = reboot.reboot_test_new(values['plc_pcu_stats']['nodenames'][0], values, 1, True) @@ -510,7 +510,8 @@ class ScanPCU(ScanInterface): print "____________________________________" errors['traceback'] = traceback.format_exc() print errors['traceback'] - values['reboot_trial_status'] = errors['traceback'] + values['reboot_trial_status'] = str(errors['traceback']) + print values values['entry_complete']=" ".join(values['entry_complete']) diff --git a/nodequery.py b/nodequery.py index dfe3f95..781e841 100755 --- a/nodequery.py +++ b/nodequery.py @@ -270,6 +270,8 @@ def pcu_select(str_query, nodelist=None): fbquery = FindbadNodeRecord.get_all_latest() fb_nodelist = [ n.hostname for n in fbquery ] if True: + # NOTE: this doesn't work when there are only a few records current. + # pcu_select should apply to all pcus globally, not just the most recent records. fbpcuquery = FindbadPCURecord.get_all_latest() fbpcu_list = [ p.plc_pcuid for p in fbpcuquery ] diff --git a/pcucontrol/models/APCControl.py b/pcucontrol/models/APCControl.py index 62f5f6f..59cc649 100644 --- a/pcucontrol/models/APCControl.py +++ b/pcucontrol/models/APCControl.py @@ -6,7 +6,7 @@ class APCControl(PCUControl): def run(self, node_port, dryrun): print "RUNNING!!!!!!!!!!!!" - if self.type == Transport.HTTPS or self.type == Transport.HTTP: + if self.transport.type == Transport.HTTPS or self.type == Transport.HTTP: print "APC via http...." return self.run_http_or_https(node_port, dryrun) else: @@ -58,9 +58,9 @@ class APCControl(PCUControl): else: # TODO: also send message for https, since that doesn't work this way... - if self.type == Transport.HTTPS: + if self.transport.type == Transport.HTTPS: cmd = self.get_https_cmd() - elif self.type == Transport.HTTP: + elif self.transport.type == Transport.HTTP: cmd = self.get_http_cmd() else: raise ExceptionNoTransport("Unsupported transport for http command") @@ -118,12 +118,12 @@ class APCControl(PCUControl): # NOTE: we may need to return software version, no model version to # know which file to request on the server. - if self.type == Transport.HTTP: + if self.transport.type == Transport.HTTP: cmd = """curl -s --anyauth --user '%s:%s' http://%s/about.htm """ + \ """ | sed -e "s/<[^>]*>//g" -e "s/ //g" -e "/^$/d" """ + \ """ | grep -E "AP[[:digit:]]+" """ #""" | grep -E "v[[:digit:]].*" """ - elif self.type == Transport.HTTPS: + elif self.transport.type == Transport.HTTPS: cmd = """curl -s --insecure --user '%s:%s' https://%s/about.htm """ + \ """ | sed -e "s/<[^>]*>//g" -e "s/ //g" -e "/^$/d" """ + \ """ | grep -E "AP[[:digit:]]+" """ @@ -138,10 +138,10 @@ class APCControl(PCUControl): def logout(self): # NOTE: log out again, to allow other uses to access the machine. - if self.type == Transport.HTTP: + if self.transport.type == Transport.HTTP: cmd = """curl -s --anyauth --user '%s:%s' http://%s/logout.htm """ + \ """ | grep -E '^[^<]+' """ - elif self.type == Transport.HTTPS: + elif self.transport.type == Transport.HTTPS: cmd = """curl -s --insecure --user '%s:%s' http://%s/logout.htm """ + \ """ | grep -E '^[^<]+' """ else: diff --git a/pcucontrol/models/BayTech.py b/pcucontrol/models/BayTech.py index 83de3a5..065cc28 100644 --- a/pcucontrol/models/BayTech.py +++ b/pcucontrol/models/BayTech.py @@ -1,6 +1,7 @@ from pcucontrol.reboot import * class BayTechRPC3NC(PCUControl): + supported_ports = [22,23] def run_telnet(self, node_port, dryrun): return self.run_ssh(node_port, dryrun) @@ -22,6 +23,7 @@ class BayTechRPC3NC(PCUControl): return 0 class BayTechRPC16(PCUControl): + supported_ports = [22,23] def run_telnet(self, node_port, dryrun): return self.run_ssh(node_port, dryrun) def run_ssh(self, node_port, dryrun): @@ -48,6 +50,7 @@ class BayTechCtrlCUnibe(PCUControl): indefinitely, unless you send a Ctrl-C after the password. No idea why. """ + supported_ports = [22] def run_ssh(self, node_port, dryrun): print "BayTechCtrlC %s" % self.host @@ -69,9 +72,11 @@ class BayTechCtrlCUnibe(PCUControl): if index == 0: print "3" s.send("3\r\n") + time.sleep(5) index = s.expect(["DS-RPC>", "Enter user name:"]) if index == 1: s.send(self.username + "\r\n") + time.sleep(5) index = s.expect(["DS-RPC>"]) if index == 0: @@ -112,6 +117,7 @@ class BayTechCtrlC(PCUControl): indefinitely, unless you send a Ctrl-C after the password. No idea why. """ + supported_ports = [22] def run_ssh(self, node_port, dryrun): print "BayTechCtrlC %s" % self.host diff --git a/pcucontrol/models/DRAC.py b/pcucontrol/models/DRAC.py index e7c030a..0213201 100644 --- a/pcucontrol/models/DRAC.py +++ b/pcucontrol/models/DRAC.py @@ -16,7 +16,7 @@ class DRAC(PCUControl): original_prompts="Dell", login_timeout=Transport.TELNET_TIMEOUT): raise ExceptionPassword("Invalid Password") - print "logging in..." + print "logging in... %s" % self.host s.send("\r\n\r\n") try: # Testing Reboot ? @@ -155,4 +155,4 @@ def racadm_reboot(host, username, password, port, dryrun): logger.debug("runcmd raised exception %s" % err) if verbose: logger.debug(err) - return err + return str(err) diff --git a/pcucontrol/models/ePowerSwitch.py b/pcucontrol/models/ePowerSwitch.py index 7650689..edff5cc 100644 --- a/pcucontrol/models/ePowerSwitch.py +++ b/pcucontrol/models/ePowerSwitch.py @@ -50,14 +50,14 @@ class ePowerSwitchNew(PCUControl): req.add_header("Authorization", authheader) # add data to handler, f = urllib2.urlopen(req, data) - if self.verbose: print f.read() + if self.transport.verbose: print f.read() except: import traceback; traceback.print_exc() # fetch url one more time on cmd.html, econtrol.html or whatever. # pass else: - if self.verbose: print f.read() + if self.transport.verbose: print f.read() return 0 @@ -74,12 +74,12 @@ class ePowerSwitchOld(PCUControl): # NOTE: it doesn't seem to matter whether this authinfo is here or not. transport = urllib2.build_opener(authinfo) f = transport.open(self.url) - if self.verbose: print f.read() + if self.transport.verbose: print f.read() if not dryrun: transport = urllib2.build_opener(authhandler) f = transport.open(self.url + "cmd.html", "P%d=r" % node_port) - if self.verbose: print f.read() + if self.transport.verbose: print f.read() self.transport.close() return 0 @@ -103,12 +103,12 @@ class ePowerSwitchOld(PCUControl): # NOTE: it doesn't seem to matter whether this authinfo is here or not. transport = urllib2.build_opener() f = transport.open(self.url + "elogin.html", "pwd=%s" % self.password) - if self.verbose: print f.read() + if self.transport.verbose: print f.read() if not dryrun: transport = urllib2.build_opener(authhandler) f = transport.open(self.url + "econtrol.html", "P%d=r" % node_port) - if self.verbose: print f.read() + if self.transport.verbose: print f.read() # data= "P%d=r" % node_port #self.open(self.host, self.username, self.password) diff --git a/pcucontrol/reboot.py b/pcucontrol/reboot.py index 9d171a2..2361d89 100755 --- a/pcucontrol/reboot.py +++ b/pcucontrol/reboot.py @@ -135,7 +135,7 @@ class Transport: transport.set_debuglevel(self.verbose) if username is not None: self.transport = transport - self.transport.ifThenSend(prompt, username, ExceptionUsername) + self.ifThenSend(prompt, username, ExceptionUsername) elif self.type == self.SSH: if username is not None: @@ -255,17 +255,25 @@ class PCUControl(PCUModel,PCURecord): def reboot(self, node_port, dryrun): port_list = [] + # There are two sources of potential ports. Those that are open and + # those that are part of the PCU's supported_ports. + # I think we should start with supported_ports and then filter that + # by the open ports. + + port_list = self.supported_ports + if hasattr(self, 'port_status') and self.port_status: + # get out the open ports port_list = filter(lambda x: self.port_status[x] == "open" , self.port_status.keys()) port_list = [ int(x) for x in port_list ] + # take only the open ports that are supported_ports + port_list = filter(lambda x: x in self.supported_ports, port_list) if port_list == []: - raise ExceptionPort("Unsupported Port: No transport from open ports") - else: - port_list = self.supported_ports + raise ExceptionPort("No Open Port: No transport from open ports") print port_list - ret = "could not run" + ret = "No implementation for open ports on selected PCU model" for port in port_list: if port not in Transport.porttypemap: continue @@ -273,7 +281,9 @@ class PCUControl(PCUModel,PCURecord): type = Transport.porttypemap[port] self.transport = Transport(type, verbose) + print "checking for run_%s" % type if hasattr(self, "run_%s" % type): + print "found run_%s" % type fxn = getattr(self, "run_%s" % type) ret = self.catcherror(fxn, node_port, dryrun) if ret == 0: # NOTE: success!, so stop @@ -316,9 +326,6 @@ class PCUControl(PCUModel,PCURecord): except urllib2.URLError, err: return "URLError: " + str(err) except EOFError, err: - if self.verbose: - logger.debug("reboot: EOF") - logger.debug(err) self.transport.close() import traceback traceback.print_exc() @@ -456,15 +463,63 @@ def reboot_api(node, pcu): #, verbose, dryrun): return rb_ret +def convert_oldmodelname_to_newmodelname(oldmodelname, pcu_id): + newmodelname = None + update = { 'AP79xx' : 'APCControl13p13', + 'Masterswitch' : 'APCControl13p13', + 'DS4-RPC' : 'BayTech', + 'IP-41x_IP-81x' : 'IPAL', + 'DRAC3' : 'DRAC', + 'DRAC4' : 'DRAC', + 'ePowerSwitch' : 'ePowerSwitchOld', + 'ilo2' : 'HPiLO', + 'ilo1' : 'HPiLO', + 'PM211-MIP' : 'PM211MIP', + 'AMT2.5' : 'IntelAMT', + 'AMT3.0' : 'IntelAMT', + 'WTI_IPS-4' : 'WTIIPS4', + 'unknown' : 'ManualPCU', + 'DRAC5' : 'DRAC', + 'ipmi' : 'OpenIPMI', + 'bbsemaverick' : 'BlackBoxPSMaverick', + 'manualadmin' : 'ManualPCU', + } + + if oldmodelname in update: + newmodelname = update[oldmodelname] + else: + newmodelname = oldmodelname + + if pcu_id in [1102,1163,1055,1111,1231,1113,1127,1128,1148]: + newmodelname = 'APCControl12p3' + elif pcu_id in [1110,86]: + newmodelname = 'APCControl1p4' + elif pcu_id in [1221,1225,1220,1192]: + newmodelname = 'APCControl121p3' + elif pcu_id in [1173,1240,47,1363,1405,1401,1372,1371]: + newmodelname = 'APCControl121p1' + elif pcu_id in [1056,1237,1052,1209,1002,1008,1013,1022]: + newmodelname = 'BayTechCtrlC' + elif pcu_id in [93]: + newmodelname = 'BayTechRPC3NC' + elif pcu_id in [1057]: + newmodelname = 'BayTechCtrlCUnibe' + elif pcu_id in [1012]: + newmodelname = 'BayTechRPC16' + elif pcu_id in [1089, 1071, 1046, 1035, 1118]: + newmodelname = 'ePowerSwitchNew' + + return newmodelname + def reboot_test_new(nodename, values, verbose, dryrun): rb_ret = "" if 'plc_pcu_stats' in values: values.update(values['plc_pcu_stats']) try: - modelname = values['model'] + modelname = convert_oldmodelname_to_newmodelname(values['model'], values['pcu_id']) if modelname: - object = eval('%s(values, verbose, ["22", "23", "80", "443", "9100", "16992", "5869"])' % modelname) + object = eval('%s(values, verbose)' % modelname) rb_ret = object.reboot(values[nodename], dryrun) else: rb_ret = "Not_Run"