From 982bd1d331390521823fa079e19e6e1d591e756a Mon Sep 17 00:00:00 2001 From: Stephen Soltesz Date: Thu, 28 May 2009 15:16:27 +0000 Subject: [PATCH] merge recent v2 changes; migrating to v3 --- RunlevelAgent.py | 1 + getsshkeys.py | 12 ++++++------ monitor/bootman.py | 28 ++++++++++++++++++---------- monitor/common.py | 1 + monitor/database/info/plc.py | 17 +++++++++++++++++ monitor/getsshkeys.py | 12 ++++++------ monitor/nodeconfig.py | 2 +- monitor/scanapi.py | 7 +++++-- monitor/wrapper/emailTxt.py | 2 +- monitor/wrapper/plc.py | 4 ++-- monitor/wrapper/plccache.py | 23 +++++++++++++++++------ nagios/plc2nagios.py | 4 ++-- nodeaction.py | 2 +- nodeconfig.py | 2 +- policy.py | 2 +- prep_power_users.py | 4 ++-- testapi.py | 2 +- tests/bwlimit.py | 4 ++-- tests/nodenetwork.py | 16 ++++++++-------- zabbix/zabbixsync.py | 4 ++-- 20 files changed, 95 insertions(+), 54 deletions(-) diff --git a/RunlevelAgent.py b/RunlevelAgent.py index 739678d..cf1c8bf 100644 --- a/RunlevelAgent.py +++ b/RunlevelAgent.py @@ -82,6 +82,7 @@ def main(): session_str=f.read().strip() api = PLC(Auth(session=session_str), api_server_url) # NOTE: should we rely on bootmanager for this functionality? + # TODO: handle dns failure here. api.AuthCheck() try: diff --git a/getsshkeys.py b/getsshkeys.py index d362c94..7932ab3 100755 --- a/getsshkeys.py +++ b/getsshkeys.py @@ -135,27 +135,27 @@ class SSHKnownHosts: if type(host) == type(""): host = [host] # get the node(s) info - nodes = self.api.GetNodes(self.auth,host,["hostname","ssh_rsa_key","nodenetwork_ids"]) + nodes = self.api.GetNodes(self.auth,host,["hostname","ssh_rsa_key","interface_ids"]) # for each node's node network, update the self.nodenetworks cache nodenetworks = [] for node in nodes: - for net in node["nodenetwork_ids"]: + for net in node["interface_ids"]: nodenetworks.append(net) - plcnodenetworks = self.api.GetNodeNetworks(self.auth,nodenetworks,["nodenetwork_id","ip"]) + plcnodenetworks = self.api.GetInterfaces(self.auth,nodenetworks,["interface_id","ip"]) for n in plcnodenetworks: - self.nodenetworks[n["nodenetwork_id"]]=n + self.nodenetworks[n["interface_id"]]=n return nodes def _record_from_node(self, node, nokey_list=None): host = node['hostname'] key = node['ssh_rsa_key'] - nodenetworks = node['nodenetwork_ids'] + nodenetworks = node['interface_ids'] if len(nodenetworks)==0: return (host, None, None, None) - # the [0] subscript to node['nodenetwork_ids'] means + # the [0] subscript to node['interface_ids'] means # that this function wont work with multihomed nodes l_nw = self.nodenetworks.get(nodenetworks[0],None) if l_nw is None: return (host, None, None, None) diff --git a/monitor/bootman.py b/monitor/bootman.py index effd750..531f883 100755 --- a/monitor/bootman.py +++ b/monitor/bootman.py @@ -42,6 +42,8 @@ api = plc.getAuthAPI() fb = None +class ExceptionDoubleSSHError(Exception): pass + class NodeConnection: def __init__(self, connection, node, config): self.node = node @@ -248,7 +250,7 @@ class PlanetLabSession: if ret != 0: print "\tFAILED TWICE" #sys.exit(1) - raise Exception("Failed twice trying to login with updated ssh host key") + raise ExceptionDoubleSSHError("Failed twice trying to login with updated ssh host key") t1 = time.time() # KILL any already running servers. @@ -346,9 +348,11 @@ class DebugInterface: self.session = PlanetLabSession(self.hostname, False, True) else: self.session = PlanetLabSession(self.hostname, config.nosetup, config.verbose) - except Exception, e: + except ExceptionDoubleSSHError, e: msg = "ERROR setting up session for %s" % self.hostname print msg + return False + except Exception, e: traceback.print_exc() email_exception(msg) return False @@ -361,6 +365,10 @@ class DebugInterface: try: time.sleep(self.session.timeout*5) conn = self.session.get_connection(config) + except EOFError: + # failed twice... no need to report this really, it's just in a + # weird state... + return False except: traceback.print_exc() email_exception(self.hostname) @@ -399,7 +407,7 @@ class DebugInterface: ]: sequences.update({n : "restart_bootmanager_boot"}) - # conn.restart_bootmanager('rins') + # conn.restart_bootmanager('reinstall') for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done", "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done", "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done", @@ -428,7 +436,7 @@ class DebugInterface: # repair_node_keys sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"}) - # conn.restart_node('rins') + # conn.restart_node('reinstall') for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done", "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done", "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done", @@ -642,7 +650,7 @@ def restore(sitehist, hostname, config=None, forced_action=None): if type(conn) == type(False): return False #if forced_action == "reboot": - # conn.restart_node('rins') + # conn.restart_node('reinstall') # return True boot_state = conn.get_boot_state() @@ -731,16 +739,16 @@ def restore(sitehist, hostname, config=None, forced_action=None): conn.restart_bootmanager('boot') elif sequences[s] == "restart_bootmanager_rins": print "...Restarting BootManager.py on %s "%hostname - conn.restart_bootmanager('rins') + conn.restart_bootmanager('reinstall') elif sequences[s] == "restart_node_rins": - conn.restart_node('rins') + conn.restart_node('reinstall') elif sequences[s] == "restart_node_boot": conn.restart_node('boot') elif sequences[s] == "repair_node_keys": if conn.compare_and_repair_nodekeys(): # the keys either are in sync or were forced in sync. # so try to reboot the node again. - conn.restart_bootmanager('rins') + conn.restart_bootmanager('reinstall') pass else: # there was some failure to synchronize the keys. @@ -813,7 +821,7 @@ def restore(sitehist, hostname, config=None, forced_action=None): args = {} try: node = plccache.GetNodeByName(hostname) - net = api.GetNodeNetworks(node['nodenetwork_ids'])[0] + net = api.GetInterfaces(node['interface_ids'])[0] except: email_exception() print traceback.print_exc() @@ -824,7 +832,7 @@ def restore(sitehist, hostname, config=None, forced_action=None): args['hostname'] = hostname args['network_config'] = nodenet_str - args['nodenetwork_id'] = net['nodenetwork_id'] + args['interface_id'] = net['interface_id'] sitehist.sendMessage('baddns_notice', **args) diff --git a/monitor/common.py b/monitor/common.py index d082dbb..372913a 100644 --- a/monitor/common.py +++ b/monitor/common.py @@ -78,6 +78,7 @@ def color_boot_state(l): elif l == "down": return red(l) elif l == "boot": return green(l) elif l == "rins": return blue(l) + elif l == "reinstall": return blue(l) else: return l diff --git a/monitor/database/info/plc.py b/monitor/database/info/plc.py index 0847057..bfbfde4 100644 --- a/monitor/database/info/plc.py +++ b/monitor/database/info/plc.py @@ -31,3 +31,20 @@ class PlcPCU(Entity): plc_pcu_stats = Field(PickleType,default=None) acts_as_versioned(ignore=['date_checked']) + +class PlcPCU2(Entity): + pcu_id = Field(Integer,primary_key=True) + date_checked = Field(DateTime,default=datetime.now) + + site_id = Field(Integer, default=0) + protocol = Field(String, default=None) + node_ids = Field(PickleType,default=None) + ports = Field(PickleType,default=None) + hostname = Field(String, default=None) + ip = Field(String, default=None) + username = Field(String, default=None) + password = Field(String, default=None) + model = Field(String, default=None) + notes = Field(String, default=None) + + acts_as_versioned(ignore=['date_checked']) diff --git a/monitor/getsshkeys.py b/monitor/getsshkeys.py index 1068c5f..686252b 100755 --- a/monitor/getsshkeys.py +++ b/monitor/getsshkeys.py @@ -135,27 +135,27 @@ class SSHKnownHosts: if type(host) == type(""): host = [host] # get the node(s) info - nodes = self.api.GetNodes(self.auth,host,["hostname","ssh_rsa_key","nodenetwork_ids"]) + nodes = self.api.GetNodes(self.auth,host,["hostname","ssh_rsa_key","interface_ids"]) # for each node's node network, update the self.nodenetworks cache nodenetworks = [] for node in nodes: - for net in node["nodenetwork_ids"]: + for net in node["interface_ids"]: nodenetworks.append(net) - plcnodenetworks = self.api.GetNodeNetworks(self.auth,nodenetworks,["nodenetwork_id","ip"]) + plcnodenetworks = self.api.GetInterfaces(self.auth,nodenetworks,["interface_id","ip"]) for n in plcnodenetworks: - self.nodenetworks[n["nodenetwork_id"]]=n + self.nodenetworks[n["interface_id"]]=n return nodes def _record_from_node(self, node, nokey_list=None): host = node['hostname'] key = node['ssh_rsa_key'] - nodenetworks = node['nodenetwork_ids'] + nodenetworks = node['interface_ids'] if len(nodenetworks)==0: return (host, None, None, None) - # the [0] subscript to node['nodenetwork_ids'] means + # the [0] subscript to node['interface_ids'] means # that this function wont work with multihomed nodes l_nw = self.nodenetworks.get(nodenetworks[0],None) if l_nw is None: return (host, None, None, None) diff --git a/monitor/nodeconfig.py b/monitor/nodeconfig.py index 6a23fb7..bb66176 100755 --- a/monitor/nodeconfig.py +++ b/monitor/nodeconfig.py @@ -39,7 +39,7 @@ def main(): try: n = api.GetNodes(node)[0] #print n - net = api.GetNodeNetworks(n['nodenetwork_ids'])[0] + net = api.GetInterfaces(n['interface_ids'])[0] #print net node_keys = ['boot_state', 'key', 'last_updated', 'last_contact'] diff --git a/monitor/scanapi.py b/monitor/scanapi.py index f7939e6..667c504 100644 --- a/monitor/scanapi.py +++ b/monitor/scanapi.py @@ -157,7 +157,7 @@ class ScanInterface(object): except: print "ERROR:" - email_exception(nodename) + email_exception(str(nodename)) print traceback.print_exc() pass @@ -204,13 +204,14 @@ class ScanNodeInternal(ScanInterface): echo ' "bmlog":"'`ls /tmp/bm.log`'",' echo ' "bootcd_version":"'`cat /mnt/cdrom/bootme/ID`'",' echo ' "nm_status":"'`ps ax | grep nm.py | grep -v grep`'",' - echo ' "fs_status":"'`touch /var/log/monitor 2>&1`'",' + echo ' "fs_status":"'`touch /var/log/monitor 2>&1 ; if [ -d /vservers/ ] ; then touch /vservers/monitor.log 2>&1 ; fi ; grep proc /proc/mounts | grep ro,`'",' echo ' "dns_status":"'`host boot.planet-lab.org 2>&1`'",' echo ' "princeton_comon_dir":"'`ls -d /vservers/princeton_comon`'",' ID=`grep princeton_comon /etc/passwd | awk -F : '{if ( $3 > 500 ) { print $3}}'` echo ' "princeton_comon_running":"'`ls -d /proc/virtual/$ID`'",' echo ' "princeton_comon_procs":"'`vps ax | grep $ID | grep -v grep | wc -l`'",' + echo ' "rpm_version":"'`rpm -q NodeManager`'",' echo "}" EOF """) @@ -225,6 +226,7 @@ EOF """) 'nm_status' : '', 'fs_status' : '', 'dns_status' : '', + 'rpm_version' : '', 'princeton_comon_dir' : "", 'princeton_comon_running' : "", 'princeton_comon_procs' : "", 'ssh_portused' : None}) @@ -232,6 +234,7 @@ EOF """) print traceback.print_exc() sys.exit(1) + print "RPMVERSION: %s %s" % (nodename, values['rpm_version']) ### RUN SSH ###################### b_getbootcd_id = True diff --git a/monitor/wrapper/emailTxt.py b/monitor/wrapper/emailTxt.py index b50be5b..c90cf5e 100644 --- a/monitor/wrapper/emailTxt.py +++ b/monitor/wrapper/emailTxt.py @@ -644,7 +644,7 @@ To help us return this machine to running order, please verify that the register You may update the node's network information at the link below: - https://www.planet-lab.org/db/nodes/node_networks.php?id=%(nodenetwork_id)s + https://www.planet-lab.org/db/nodes/node_networks.php?id=%(interface_id)s If you have any questions, please feel free to contact us at PlanetLab Support (support@planet-lab.org). diff --git a/monitor/wrapper/plc.py b/monitor/wrapper/plc.py index 2f0f19d..8f70c1f 100644 --- a/monitor/wrapper/plc.py +++ b/monitor/wrapper/plc.py @@ -301,14 +301,14 @@ def getSiteNodes2(loginbase): def getNodeNetworks(filter=None): api = xmlrpclib.Server(auth.server, verbose=False, allow_none=True) - nodenetworks = api.GetNodeNetworks(auth.auth, filter, None) + nodenetworks = api.GetInterfaces(auth.auth, filter, None) return nodenetworks def getNodes(filter=None, fields=None): api = xmlrpclib.Server(auth.server, verbose=False, allow_none=True) nodes = api.GetNodes(auth.auth, filter, fields) #['boot_state', 'hostname', - #'site_id', 'date_created', 'node_id', 'version', 'nodenetwork_ids', + #'site_id', 'date_created', 'node_id', 'version', 'interface_ids', #'last_updated', 'peer_node_id', 'ssh_rsa_key' ]) return nodes diff --git a/monitor/wrapper/plccache.py b/monitor/wrapper/plccache.py index dc62d0d..f92fa85 100755 --- a/monitor/wrapper/plccache.py +++ b/monitor/wrapper/plccache.py @@ -78,8 +78,16 @@ def init(): l_nodes = [ s.plc_node_stats for s in dbnodes ] print "plcpcu" - dbpcus = PlcPCU.query.all() - l_pcus = [ s.plc_pcu_stats for s in dbpcus ] + dbpcus = PlcPCU2.query.all() + l_pcus = [] + for s in dbpcus: + pcu = {} + for k in ['username', 'protocol', 'node_ids', 'ip', + 'pcu_id', 'hostname', 'site_id', 'notes', + 'model', 'password', 'ports']: + pcu[k] = getattr(s, k) + l_pcus.append(pcu) + #l_pcus = [ s.plc_pcu_stats for s in dbpcus ] print "dsites_from_lsites" (d_sites,id2lb) = dsites_from_lsites(l_sites) @@ -139,7 +147,7 @@ def sync(): l_nodes = plc.api.GetNodes({'peer_id':None}, ['hostname', 'node_id', 'ports', 'site_id', 'boot_state', 'version', 'last_updated', 'date_created', 'key', - 'last_contact', 'pcu_ids', 'nodenetwork_ids']) + 'last_contact', 'pcu_ids', 'interface_ids']) l_pcus = plc.api.GetPCUs() print "sync sites" @@ -154,10 +162,13 @@ def sync(): print "sync pcus" for pcu in l_pcus: - dbpcu = PlcPCU.findby_or_create(pcu_id=pcu['pcu_id']) + dbpcu = PlcPCU2.findby_or_create(pcu_id=pcu['pcu_id']) dbpcu.date_checked = datetime.now() - dbpcu.plc_pcu_stats = pcu - deleteExtra(l_pcus, PlcPCU, 'pcu_id', 'pcu_id') + for key in pcu.keys(): + print "setting %s = %s" % (key, pcu[key]) + setattr(dbpcu, key, pcu[key]) + + deleteExtra(l_pcus, PlcPCU2, 'pcu_id', 'pcu_id') deleteExtra(l_pcus, HistoryPCURecord, 'plc_pcuid', 'pcu_id') deleteExtra(l_pcus, FindbadPCURecord, 'plc_pcuid', 'pcu_id') session.flush() diff --git a/nagios/plc2nagios.py b/nagios/plc2nagios.py index d7da592..de74d42 100755 --- a/nagios/plc2nagios.py +++ b/nagios/plc2nagios.py @@ -97,10 +97,10 @@ for site in plcdb: for node in nodes: hn = node['hostname'] - if len(node['nodenetwork_ids']) == 0: + if len(node['interface_ids']) == 0: continue - ip = netid2ip[node['nodenetwork_ids'][0]] + ip = netid2ip[node['interface_ids'][0]] if lon_x is not -1 and lat_y is not -1: coords="%s,%s" % (lon_x, lat_y) diff --git a/nodeaction.py b/nodeaction.py index a5bd3ea..ad58279 100755 --- a/nodeaction.py +++ b/nodeaction.py @@ -30,7 +30,7 @@ for node in config.args: if config.rins: print "Setting %s to rins" % node - api.UpdateNode(node, {'boot_state' : 'rins'}) + api.UpdateNode(node, {'boot_state' : 'reinstall'}) if config.backoff: print "Enabling Slices & Slice Creation for %s" % node diff --git a/nodeconfig.py b/nodeconfig.py index 6a23fb7..bb66176 100755 --- a/nodeconfig.py +++ b/nodeconfig.py @@ -39,7 +39,7 @@ def main(): try: n = api.GetNodes(node)[0] #print n - net = api.GetNodeNetworks(n['nodenetwork_ids'])[0] + net = api.GetInterfaces(n['interface_ids'])[0] #print net node_keys = ['boot_state', 'key', 'last_updated', 'last_contact'] diff --git a/policy.py b/policy.py index 7ce85db..11ec4a7 100755 --- a/policy.py +++ b/policy.py @@ -33,7 +33,7 @@ api = plc.getAuthAPI() def logic(): - plc.nodeBootState(host, 'rins') + plc.nodeBootState(host, 'reinstall') node_end_record(host) def main(hostnames, sitenames): diff --git a/prep_power_users.py b/prep_power_users.py index 8e0219e..01094ac 100755 --- a/prep_power_users.py +++ b/prep_power_users.py @@ -134,9 +134,9 @@ for email in email_list: 'model': node['model'], 'hostname' : node['hostname'], 'version' : node['version']}) - nnets = api.GetNodeNetworks(node['nodenetwork_ids']) + nnets = api.GetInterfaces(node['interface_ids']) for nnet in nnets: - del nnet['nodenetwork_id'] + del nnet['interface_id'] del nnet['nodenetwork_setting_ids'] api06.AddNodeNetwork(id, nnet) print "" diff --git a/testapi.py b/testapi.py index d60effb..530677f 100755 --- a/testapi.py +++ b/testapi.py @@ -13,7 +13,7 @@ try: site_nodes = api.GetNodes(site['node_ids']) site_people = api.GetPersons(site['person_ids']) for node in site_nodes: - network = api.GetNodeNetworks(node['nodenetwork_ids']) + network = api.GetInterfaces(node['interface_ids']) print "ok" except: sys.stderr.write(traceback.format_exc()) diff --git a/tests/bwlimit.py b/tests/bwlimit.py index 6b93156..850ad53 100755 --- a/tests/bwlimit.py +++ b/tests/bwlimit.py @@ -19,8 +19,8 @@ def main(): for h in d_nodes: host = d_nodes[h] - for nw_id in host['nodenetwork_ids']: - l_nw = plc.getNodeNetworks({'nodenetwork_id': host['nodenetwork_ids']}) + for nw_id in host['interface_ids']: + l_nw = plc.getNodeNetworks({'interface_id': host['interface_ids']}) bwlimit[h] = [] for nw in l_nw: if nw['bwlimit'] != None and nw['bwlimit'] < 500000: diff --git a/tests/nodenetwork.py b/tests/nodenetwork.py index 5c1a439..baa64cd 100755 --- a/tests/nodenetwork.py +++ b/tests/nodenetwork.py @@ -8,32 +8,32 @@ import util.file if len(sys.argv[1:]) > 0: for host in sys.argv[1:]: n = api.GetNodes(host)[0] - nn = api.GetNodeNetworks(n['nodenetwork_ids']) + nn = api.GetInterfaces(n['interface_ids']) for nodenet in nn: - nnet2 = api.GetNodeNetworks({'ip': nodenet['ip']}) + nnet2 = api.GetInterfaces({'ip': nodenet['ip']}) print "len of nn entries with ip: %s == %s " % ( nodenet['ip'], len(nnet2) ) for nn2 in nnet2: n2 = api.GetNodes(nn2['node_id']) - print "\t%d node is attached to nodenetwork %s" % ( len(n2), nn2['nodenetwork_id'] ) + print "\t%d node is attached to nodenetwork %s" % ( len(n2), nn2['interface_id'] ) if len(n2) != 0 : n2 = n2[0] print #print "host %s : %s" % (n2['hostname'], n2['node_id']) else: pass - #print nn2['nodenetwork_id'] - #api.DeleteNodeNetwork(nn2['nodenetwork_id']) + #print nn2['interface_id'] + #api.DeleteNodeNetwork(nn2['interface_id']) else: nnids = util.file.getListFromFile('nnids.txt') nnids = [ int(i) for i in nnids] for id in nnids: - nnet2 = api.GetNodeNetworks(id) + nnet2 = api.GetInterfaces(id) for nn2 in nnet2: n2 = api.GetNodes(nn2['node_id']) if len(n2) == 0 : - print "\t%d node is attached to nodenetwork %s %s" % ( len(n2), nn2['nodenetwork_id'] , nn2['ip']), + print "\t%d node is attached to nodenetwork %s %s" % ( len(n2), nn2['interface_id'] , nn2['ip']), - netlist = api.GetNodeNetworks({'ip' : nn2['ip']}) + netlist = api.GetInterfaces({'ip' : nn2['ip']}) if len(netlist) != 1: node_len = len([ n['node_id'] for n in netlist]) print "\t but, ip %s is used by %s nodenetwork entries" % (nn2['ip'], node_len) diff --git a/zabbix/zabbixsync.py b/zabbix/zabbixsync.py index aaee4ff..407d801 100755 --- a/zabbix/zabbixsync.py +++ b/zabbix/zabbixsync.py @@ -22,8 +22,8 @@ def get_site_iplist(loginbase): # TODO: if it is, then we need to break up the discovery rule. ip_list = "" for node in node_list: - if len(node['nodenetwork_ids']) > 0: - ip = netid2ip[node['nodenetwork_ids'][0]] + if len(node['interface_ids']) > 0: + ip = netid2ip[node['interface_ids'][0]] if len(ip_list) > 0: ip_list += "," ip_list += ip -- 2.43.0