merge recent v2 changes; migrating to v3 2.0 2.0
author Stephen Soltesz <soltesz@cs.princeton.edu>
Thu, 28 May 2009 15:16:27 +0000 (15:16 +0000)
committer Stephen Soltesz <soltesz@cs.princeton.edu>
Thu, 28 May 2009 15:16:27 +0000 (15:16 +0000)
20 files changed:
RunlevelAgent.py
getsshkeys.py
monitor/bootman.py
monitor/common.py
monitor/database/info/plc.py
monitor/getsshkeys.py
monitor/nodeconfig.py
monitor/scanapi.py
monitor/wrapper/emailTxt.py
monitor/wrapper/plc.py
monitor/wrapper/plccache.py
nagios/plc2nagios.py
nodeaction.py
nodeconfig.py
policy.py
prep_power_users.py
testapi.py
tests/bwlimit.py
tests/nodenetwork.py
zabbix/zabbixsync.py

index 739678d..cf1c8bf 100644 (file)
@@ -82,6 +82,7 @@ def main():
     session_str=f.read().strip()
     api = PLC(Auth(session=session_str), api_server_url)
     # NOTE: should we rely on bootmanager for this functionality?
+       # TODO: handle dns failure here.
     api.AuthCheck()
 
     try:
index d362c94..7932ab3 100755 (executable)
@@ -135,27 +135,27 @@ class SSHKnownHosts:
                if type(host) == type(""): host = [host]
 
                # get the node(s) info
-               nodes = self.api.GetNodes(self.auth,host,["hostname","ssh_rsa_key","nodenetwork_ids"])
+               nodes = self.api.GetNodes(self.auth,host,["hostname","ssh_rsa_key","interface_ids"])
 
                # for each node's node network, update the self.nodenetworks cache
                nodenetworks = []
                for node in nodes:
-                       for net in node["nodenetwork_ids"]:
+                       for net in node["interface_ids"]:
                                nodenetworks.append(net)
 
-               plcnodenetworks = self.api.GetNodeNetworks(self.auth,nodenetworks,["nodenetwork_id","ip"])
+               plcnodenetworks = self.api.GetInterfaces(self.auth,nodenetworks,["interface_id","ip"])
                for n in plcnodenetworks:
-                       self.nodenetworks[n["nodenetwork_id"]]=n
+                       self.nodenetworks[n["interface_id"]]=n
                return nodes
 
        def _record_from_node(self, node, nokey_list=None):
                host = node['hostname']
                key = node['ssh_rsa_key']
 
-               nodenetworks = node['nodenetwork_ids']
+               nodenetworks = node['interface_ids']
                if len(nodenetworks)==0: return (host, None, None, None)
 
-               # the [0] subscript to node['nodenetwork_ids'] means
+               # the [0] subscript to node['interface_ids'] means
                # that this function wont work with multihomed nodes
                l_nw = self.nodenetworks.get(nodenetworks[0],None)
                if l_nw is None: return (host, None, None, None)
index effd750..531f883 100755 (executable)
@@ -42,6 +42,8 @@ api = plc.getAuthAPI()
 fb = None
 
 
+class ExceptionDoubleSSHError(Exception): pass
+
 class NodeConnection:
        def __init__(self, connection, node, config):
                self.node = node
@@ -248,7 +250,7 @@ class PlanetLabSession:
                        if ret != 0:
                                print "\tFAILED TWICE"
                                #sys.exit(1)
-                               raise Exception("Failed twice trying to login with updated ssh host key")
+                               raise ExceptionDoubleSSHError("Failed twice trying to login with updated ssh host key")
 
                t1 = time.time()
                # KILL any already running servers.
@@ -346,9 +348,11 @@ class DebugInterface:
                                self.session = PlanetLabSession(self.hostname, False, True)
                        else:
                                self.session = PlanetLabSession(self.hostname, config.nosetup, config.verbose)
-               except Exception, e:
+               except ExceptionDoubleSSHError, e:
                        msg = "ERROR setting up session for %s" % self.hostname
                        print msg
+                       return False
+               except Exception, e:
                        traceback.print_exc()
                        email_exception(msg)
                        return False
@@ -361,6 +365,10 @@ class DebugInterface:
                        try:
                                time.sleep(self.session.timeout*5)
                                conn = self.session.get_connection(config)
+                       except EOFError:
+                               # failed twice... no need to report this really, it's just in a
+                               # weird state...
+                               return False
                        except:
                                traceback.print_exc()
                                email_exception(self.hostname)
@@ -399,7 +407,7 @@ class DebugInterface:
                                ]:
                        sequences.update({n : "restart_bootmanager_boot"})
 
-               #       conn.restart_bootmanager('rins')
+               #       conn.restart_bootmanager('reinstall')
                for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
                                "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
                                "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
@@ -428,7 +436,7 @@ class DebugInterface:
                # repair_node_keys
                sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"})
 
-               #   conn.restart_node('rins')
+               #   conn.restart_node('reinstall')
                for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
                                "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
                                "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
@@ -642,7 +650,7 @@ def restore(sitehist, hostname, config=None, forced_action=None):
        if type(conn) == type(False): return False
 
        #if forced_action == "reboot":
-       #       conn.restart_node('rins')
+       #       conn.restart_node('reinstall')
        #       return True
 
        boot_state = conn.get_boot_state()
@@ -731,16 +739,16 @@ def restore(sitehist, hostname, config=None, forced_action=None):
                        conn.restart_bootmanager('boot')
                elif sequences[s] == "restart_bootmanager_rins":
                        print "...Restarting BootManager.py on %s "%hostname 
-                       conn.restart_bootmanager('rins')
+                       conn.restart_bootmanager('reinstall')
                elif sequences[s] == "restart_node_rins":
-                       conn.restart_node('rins')
+                       conn.restart_node('reinstall')
                elif sequences[s] == "restart_node_boot":
                        conn.restart_node('boot')
                elif sequences[s] == "repair_node_keys":
                        if conn.compare_and_repair_nodekeys():
                                # the keys either are in sync or were forced in sync.
                                # so try to reboot the node again.
-                               conn.restart_bootmanager('rins')
+                               conn.restart_bootmanager('reinstall')
                                pass
                        else:
                                # there was some failure to synchronize the keys.
@@ -813,7 +821,7 @@ def restore(sitehist, hostname, config=None, forced_action=None):
                                args = {}
                                try:
                                        node = plccache.GetNodeByName(hostname)
-                                       net = api.GetNodeNetworks(node['nodenetwork_ids'])[0]
+                                       net = api.GetInterfaces(node['interface_ids'])[0]
                                except:
                                        email_exception()
                                        print traceback.print_exc()
@@ -824,7 +832,7 @@ def restore(sitehist, hostname, config=None, forced_action=None):
 
                                args['hostname'] = hostname
                                args['network_config'] = nodenet_str
-                               args['nodenetwork_id'] = net['nodenetwork_id']
+                               args['interface_id'] = net['interface_id']
 
                                sitehist.sendMessage('baddns_notice', **args)
 
index d082dbb..372913a 100644 (file)
@@ -78,6 +78,7 @@ def color_boot_state(l):
        elif  l == "down": return red(l)
        elif  l == "boot": return green(l)
        elif  l == "rins": return blue(l)
+       elif  l == "reinstall": return blue(l)
        else:
                return l
 
index 0847057..bfbfde4 100644 (file)
@@ -31,3 +31,20 @@ class PlcPCU(Entity):
 
        plc_pcu_stats = Field(PickleType,default=None)
        acts_as_versioned(ignore=['date_checked'])
+
+class PlcPCU2(Entity):
+       pcu_id = Field(Integer,primary_key=True)
+       date_checked = Field(DateTime,default=datetime.now)
+
+       site_id = Field(Integer, default=0)
+       protocol = Field(String, default=None)
+       node_ids = Field(PickleType,default=None)
+       ports = Field(PickleType,default=None)
+       hostname = Field(String, default=None)
+       ip = Field(String, default=None)
+       username = Field(String, default=None)
+       password = Field(String, default=None)
+       model = Field(String, default=None)
+       notes = Field(String, default=None)
+
+       acts_as_versioned(ignore=['date_checked'])
index 1068c5f..686252b 100755 (executable)
@@ -135,27 +135,27 @@ class SSHKnownHosts:
                if type(host) == type(""): host = [host]
 
                # get the node(s) info
-               nodes = self.api.GetNodes(self.auth,host,["hostname","ssh_rsa_key","nodenetwork_ids"])
+               nodes = self.api.GetNodes(self.auth,host,["hostname","ssh_rsa_key","interface_ids"])
 
                # for each node's node network, update the self.nodenetworks cache
                nodenetworks = []
                for node in nodes:
-                       for net in node["nodenetwork_ids"]:
+                       for net in node["interface_ids"]:
                                nodenetworks.append(net)
 
-               plcnodenetworks = self.api.GetNodeNetworks(self.auth,nodenetworks,["nodenetwork_id","ip"])
+               plcnodenetworks = self.api.GetInterfaces(self.auth,nodenetworks,["interface_id","ip"])
                for n in plcnodenetworks:
-                       self.nodenetworks[n["nodenetwork_id"]]=n
+                       self.nodenetworks[n["interface_id"]]=n
                return nodes
 
        def _record_from_node(self, node, nokey_list=None):
                host = node['hostname']
                key = node['ssh_rsa_key']
 
-               nodenetworks = node['nodenetwork_ids']
+               nodenetworks = node['interface_ids']
                if len(nodenetworks)==0: return (host, None, None, None)
 
-               # the [0] subscript to node['nodenetwork_ids'] means
+               # the [0] subscript to node['interface_ids'] means
                # that this function wont work with multihomed nodes
                l_nw = self.nodenetworks.get(nodenetworks[0],None)
                if l_nw is None: return (host, None, None, None)
index 6a23fb7..bb66176 100755 (executable)
@@ -39,7 +39,7 @@ def main():
                try:
                        n = api.GetNodes(node)[0]
                        #print n
-                       net = api.GetNodeNetworks(n['nodenetwork_ids'])[0]
+                       net = api.GetInterfaces(n['interface_ids'])[0]
                        #print net
 
                        node_keys = ['boot_state', 'key', 'last_updated', 'last_contact']
index f7939e6..667c504 100644 (file)
@@ -157,7 +157,7 @@ class ScanInterface(object):
 
                except:
                        print "ERROR:"
-                       email_exception(nodename)
+                       email_exception(str(nodename))
                        print traceback.print_exc()
                        pass
 
@@ -204,13 +204,14 @@ class ScanNodeInternal(ScanInterface):
                                                echo '  "bmlog":"'`ls /tmp/bm.log`'",'
                                                echo '  "bootcd_version":"'`cat /mnt/cdrom/bootme/ID`'",'
                                                echo '  "nm_status":"'`ps ax | grep nm.py | grep -v grep`'",'
-                                               echo '  "fs_status":"'`touch /var/log/monitor 2>&1`'",'
+                                               echo '  "fs_status":"'`touch /var/log/monitor 2>&1 ; if [ -d /vservers/ ] ; then touch /vservers/monitor.log 2>&1 ; fi ; grep proc /proc/mounts | grep ro,`'",'
                                                echo '  "dns_status":"'`host boot.planet-lab.org 2>&1`'",'
                                                echo '  "princeton_comon_dir":"'`ls -d /vservers/princeton_comon`'",'
 
                                                ID=`grep princeton_comon /etc/passwd | awk -F : '{if ( $3 > 500 ) { print $3}}'` 
                                                echo '  "princeton_comon_running":"'`ls -d /proc/virtual/$ID`'",'
                                                echo '  "princeton_comon_procs":"'`vps ax | grep $ID | grep -v grep | wc -l`'",'
+                                               echo '  "rpm_version":"'`rpm -q NodeManager`'",'
                                                echo "}"
 EOF                            """)
                                        
@@ -225,6 +226,7 @@ EOF                         """)
                                                                                'nm_status' : '', 
                                                                                'fs_status' : '',
                                                                                'dns_status' : '',
+                                                                               'rpm_version' : '',
                                                                                'princeton_comon_dir' : "", 
                                                                                'princeton_comon_running' : "", 
                                                                                'princeton_comon_procs' : "", 'ssh_portused' : None})
@@ -232,6 +234,7 @@ EOF                         """)
                                print traceback.print_exc()
                                sys.exit(1)
 
+                       print "RPMVERSION: %s %s" % (nodename, values['rpm_version'])
                        ### RUN SSH ######################
                        b_getbootcd_id = True
 
index b50be5b..c90cf5e 100644 (file)
@@ -644,7 +644,7 @@ To help us return this machine to running order, please verify that the register
 
 You may update the node's network information at the link below:
 
-    https://www.planet-lab.org/db/nodes/node_networks.php?id=%(nodenetwork_id)s
+    https://www.planet-lab.org/db/nodes/node_networks.php?id=%(interface_id)s
 
 If you have any questions, please feel free to contact us at PlanetLab Support (support@planet-lab.org).
 
index 2f0f19d..8f70c1f 100644 (file)
@@ -301,14 +301,14 @@ def getSiteNodes2(loginbase):
 
 def getNodeNetworks(filter=None):
        api = xmlrpclib.Server(auth.server, verbose=False, allow_none=True)
-       nodenetworks = api.GetNodeNetworks(auth.auth, filter, None)
+       nodenetworks = api.GetInterfaces(auth.auth, filter, None)
        return nodenetworks
 
 def getNodes(filter=None, fields=None):
        api = xmlrpclib.Server(auth.server, verbose=False, allow_none=True)
        nodes = api.GetNodes(auth.auth, filter, fields) 
                        #['boot_state', 'hostname', 
-                       #'site_id', 'date_created', 'node_id', 'version', 'nodenetwork_ids',
+                       #'site_id', 'date_created', 'node_id', 'version', 'interface_ids',
                        #'last_updated', 'peer_node_id', 'ssh_rsa_key' ])
        return nodes
 
index dc62d0d..f92fa85 100755 (executable)
@@ -78,8 +78,16 @@ def init():
        l_nodes = [ s.plc_node_stats for s in dbnodes ]
 
        print "plcpcu"
-       dbpcus = PlcPCU.query.all()
-       l_pcus = [ s.plc_pcu_stats for s in dbpcus ]
+       dbpcus = PlcPCU2.query.all()
+       l_pcus = []
+       for s in dbpcus:
+               pcu = {}
+               for k in ['username', 'protocol', 'node_ids', 'ip', 
+                                 'pcu_id', 'hostname', 'site_id', 'notes', 
+                                 'model', 'password', 'ports']:
+                       pcu[k] = getattr(s, k)
+               l_pcus.append(pcu)
+       #l_pcus = [ s.plc_pcu_stats for s in dbpcus ]
 
        print "dsites_from_lsites"
        (d_sites,id2lb) = dsites_from_lsites(l_sites)
@@ -139,7 +147,7 @@ def sync():
        l_nodes = plc.api.GetNodes({'peer_id':None}, 
                                                ['hostname', 'node_id', 'ports', 'site_id', 'boot_state',
                                                 'version', 'last_updated', 'date_created', 'key',
-                                                'last_contact', 'pcu_ids', 'nodenetwork_ids'])
+                                                'last_contact', 'pcu_ids', 'interface_ids'])
        l_pcus = plc.api.GetPCUs()
 
        print "sync sites"
@@ -154,10 +162,13 @@ def sync():
 
        print "sync pcus"
        for pcu in l_pcus:
-               dbpcu = PlcPCU.findby_or_create(pcu_id=pcu['pcu_id'])
+               dbpcu = PlcPCU2.findby_or_create(pcu_id=pcu['pcu_id'])
                dbpcu.date_checked = datetime.now()
-               dbpcu.plc_pcu_stats = pcu
-       deleteExtra(l_pcus, PlcPCU, 'pcu_id', 'pcu_id')
+               for key in pcu.keys():
+                       print "setting %s  = %s" % (key, pcu[key])
+                       setattr(dbpcu, key, pcu[key])
+
+       deleteExtra(l_pcus, PlcPCU2, 'pcu_id', 'pcu_id')
        deleteExtra(l_pcus, HistoryPCURecord, 'plc_pcuid', 'pcu_id')
        deleteExtra(l_pcus, FindbadPCURecord, 'plc_pcuid', 'pcu_id')
        session.flush()
index d7da592..de74d42 100755 (executable)
@@ -97,10 +97,10 @@ for site in plcdb:
 
        for node in nodes:
                hn = node['hostname']
-               if len(node['nodenetwork_ids']) == 0:
+               if len(node['interface_ids']) == 0:
                        continue
 
-               ip = netid2ip[node['nodenetwork_ids'][0]]
+               ip = netid2ip[node['interface_ids'][0]]
 
                if lon_x is not -1 and lat_y is not -1:
                        coords="%s,%s" % (lon_x, lat_y)
index a5bd3ea..ad58279 100755 (executable)
@@ -30,7 +30,7 @@ for node in config.args:
 
        if config.rins:
                print "Setting %s to rins" % node
-               api.UpdateNode(node, {'boot_state' : 'rins'})
+               api.UpdateNode(node, {'boot_state' : 'reinstall'})
 
        if config.backoff:
                print "Enabling Slices & Slice Creation for %s" % node
index 6a23fb7..bb66176 100755 (executable)
@@ -39,7 +39,7 @@ def main():
                try:
                        n = api.GetNodes(node)[0]
                        #print n
-                       net = api.GetNodeNetworks(n['nodenetwork_ids'])[0]
+                       net = api.GetInterfaces(n['interface_ids'])[0]
                        #print net
 
                        node_keys = ['boot_state', 'key', 'last_updated', 'last_contact']
index 7ce85db..11ec4a7 100755 (executable)
--- a/policy.py
+++ b/policy.py
@@ -33,7 +33,7 @@ api = plc.getAuthAPI()
 
 def logic():
 
-       plc.nodeBootState(host, 'rins')
+       plc.nodeBootState(host, 'reinstall')
        node_end_record(host)
 
 def main(hostnames, sitenames):
index 8e0219e..01094ac 100755 (executable)
@@ -134,9 +134,9 @@ for email in email_list:
                                                                          'model': node['model'],
                                                                          'hostname' : node['hostname'],
                                                                          'version' : node['version']})
-                               nnets = api.GetNodeNetworks(node['nodenetwork_ids'])
+                               nnets = api.GetInterfaces(node['interface_ids'])
                                for nnet in nnets:
-                                       del nnet['nodenetwork_id']
+                                       del nnet['interface_id']
                                        del nnet['nodenetwork_setting_ids']
                                        api06.AddNodeNetwork(id, nnet) 
     print ""
index d60effb..530677f 100755 (executable)
@@ -13,7 +13,7 @@ try:
        site_nodes = api.GetNodes(site['node_ids'])
        site_people = api.GetPersons(site['person_ids'])
        for node in site_nodes:
-               network = api.GetNodeNetworks(node['nodenetwork_ids'])
+               network = api.GetInterfaces(node['interface_ids'])
        print "ok"
 except:
        sys.stderr.write(traceback.format_exc())
index 6b93156..850ad53 100755 (executable)
@@ -19,8 +19,8 @@ def main():
 
        for h in d_nodes:
                host = d_nodes[h]
-               for nw_id in host['nodenetwork_ids']:
-                       l_nw = plc.getNodeNetworks({'nodenetwork_id': host['nodenetwork_ids']})
+               for nw_id in host['interface_ids']:
+                       l_nw = plc.getNodeNetworks({'interface_id': host['interface_ids']})
                        bwlimit[h] = []
                        for nw in l_nw:
                                if nw['bwlimit'] != None and nw['bwlimit'] < 500000:
index 5c1a439..baa64cd 100755 (executable)
@@ -8,32 +8,32 @@ import util.file
 if len(sys.argv[1:]) > 0:
        for host in sys.argv[1:]:
                n = api.GetNodes(host)[0]
-               nn = api.GetNodeNetworks(n['nodenetwork_ids'])
+               nn = api.GetInterfaces(n['interface_ids'])
                for nodenet in nn:
-                       nnet2 = api.GetNodeNetworks({'ip': nodenet['ip']})
+                       nnet2 = api.GetInterfaces({'ip': nodenet['ip']})
                        print "len of nn entries with ip: %s == %s " % ( nodenet['ip'], len(nnet2) )
                        for nn2 in nnet2:
                                n2 = api.GetNodes(nn2['node_id'])
-                               print "\t%d node is attached to nodenetwork %s" % ( len(n2), nn2['nodenetwork_id'] )
+                               print "\t%d node is attached to nodenetwork %s" % ( len(n2), nn2['interface_id'] )
                                if len(n2) != 0 :
                                        n2 = n2[0]
                                        print
                                        #print "host %s : %s" % (n2['hostname'], n2['node_id'])
                                else:
                                        pass
-                                       #print nn2['nodenetwork_id']
-                                       #api.DeleteNodeNetwork(nn2['nodenetwork_id'])
+                                       #print nn2['interface_id']
+                                       #api.DeleteNodeNetwork(nn2['interface_id'])
 else:
        nnids = util.file.getListFromFile('nnids.txt')
        nnids = [ int(i) for i in nnids]
        for id in nnids:
-               nnet2 = api.GetNodeNetworks(id)
+               nnet2 = api.GetInterfaces(id)
                for nn2 in nnet2:
                        n2 = api.GetNodes(nn2['node_id'])
                        if len(n2) == 0 :
-                               print "\t%d node is attached to nodenetwork %s %s" % ( len(n2), nn2['nodenetwork_id'] , nn2['ip']),
+                               print "\t%d node is attached to nodenetwork %s %s" % ( len(n2), nn2['interface_id'] , nn2['ip']),
 
-                               netlist = api.GetNodeNetworks({'ip' : nn2['ip']})
+                               netlist = api.GetInterfaces({'ip' : nn2['ip']})
                                if len(netlist) != 1:
                                        node_len = len([ n['node_id'] for n in netlist])
                                        print "\t but, ip %s is used by %s nodenetwork entries" % (nn2['ip'], node_len)
index aaee4ff..407d801 100755 (executable)
@@ -22,8 +22,8 @@ def get_site_iplist(loginbase):
        # TODO: if it is, then we need to break up the discovery rule.
        ip_list = ""
        for node in node_list:
-               if len(node['nodenetwork_ids']) > 0:
-                       ip = netid2ip[node['nodenetwork_ids'][0]]
+               if len(node['interface_ids']) > 0:
+                       ip = netid2ip[node['interface_ids'][0]]
                        if len(ip_list) > 0: ip_list += ","
                        ip_list += ip