Added a check for a bad DNS configuration on the node, which prevents the bootmanager from booting.
author Stephen Soltesz <soltesz@cs.princeton.edu>
Mon, 4 Aug 2008 15:32:45 +0000 (15:32 +0000)
committer Stephen Soltesz <soltesz@cs.princeton.edu>
Mon, 4 Aug 2008 15:32:45 +0000 (15:32 +0000)
added a findbad.py check after running grouprins in automate_pl03.sh

additional todos.

automate_pl03.sh
bootman.py
emailTxt.py
nodeconfig.py
todo

index 536914e..b535ccc 100755 (executable)
@@ -76,6 +76,7 @@ done
        --nodeselect 'state=DEBUG&&boot_state=dbg||state=DEBUG&&boot_state=boot' \
        --stopselect 'state=BOOT&&kernel=2.6.22.19-vs2.3.0.34.9.planetlab' \
        --reboot || :
+./findbad.py --increment --cachenodes --debug=0 --dbname="findbad" --nodeselect 'state=DEBUG&&boot_state=dbg||state=DEBUG&&boot_state=boot' || :
 
 # cache the RT db locally.
 python ./rt.py
index 2fd161c..0a75fac 100755 (executable)
@@ -23,7 +23,8 @@ import ssh.fdpexpect as fdpexpect
 import ssh.pexpect as pexpect
 from unified_model import *
 from emailTxt import mailtxt
-
+from nodeconfig import network_config_to_str
+import traceback
 import monitorconfig
 
 import signal
@@ -334,7 +335,7 @@ def reboot(hostname, config=None, forced_action=None):
        try:
                k = SSHKnownHosts(); k.update(node); k.write(); del k
        except:
-               import traceback; print traceback.print_exc()
+               print traceback.print_exc()
                return False
 
        try:
@@ -344,7 +345,7 @@ def reboot(hostname, config=None, forced_action=None):
                        session = PlanetLabSession(node, config.nosetup, config.verbose)
        except Exception, e:
                print "ERROR setting up session for %s" % hostname
-               import traceback; print traceback.print_exc()
+               print traceback.print_exc()
                print e
                return False
 
@@ -357,7 +358,7 @@ def reboot(hostname, config=None, forced_action=None):
                        time.sleep(session.timeout*4)
                        conn = session.get_connection(config)
                except:
-                       import traceback; print traceback.print_exc()
+                       print traceback.print_exc()
                        return False
                        
 
@@ -497,6 +498,7 @@ def reboot(hostname, config=None, forced_action=None):
                        ('noinstall'    , 'notinstalled'),
                        ('bziperror'    , 'bzip2: Data integrity error when decompressing.'),
                        ('noblockdev'   , "No block devices detected."),
+                       ('dnserror'     , 'Name or service not known'),
                        ('downloadfail' , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
                        ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
                        ('hardwarerequirefail' , 'Hardware requirements not met'),
@@ -542,6 +544,9 @@ def reboot(hostname, config=None, forced_action=None):
        for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
                        "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
                        "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done",
+
+                       "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done",
+
                        "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
                        "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
                        "bminit-cfg-auth-getplc-update-debug-done",
@@ -549,6 +554,7 @@ def reboot(hostname, config=None, forced_action=None):
                        "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
                        "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
                        "bminit-cfg-auth-protoerror-exception-update-debug-done",
+                       "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done",
                        "bminit-cfg-auth-getplc-implementerror-update-debug-done",
                        ]:
                sequences.update({n : "restart_bootmanager_boot"})
@@ -628,6 +634,9 @@ def reboot(hostname, config=None, forced_action=None):
        # broken_hardware_email
        sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"})
 
+       # bad_dns_email
+       sequences.update({"bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done" : "bad_dns_email"})
+
        flag_set = True
 
        
@@ -751,6 +760,29 @@ def reboot(hostname, config=None, forced_action=None):
                        m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
                        conn.set_nodestate('disable')
 
+               elif sequences[s] == "bad_dns_email":
+                       print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
+                       args = {}
+                       try:
+                               node = api.GetNodes(hostname)[0]
+                               net = api.GetNodeNetworks(node['nodenetwork_ids'])[0]
+                       except:
+                               print traceback.print_exc()
+                               # TODO: api error. skip email, b/c all info is not available,
+                               # flag_set will not be recorded.
+                               return False
+                       nodenet_str = network_config_to_str(net)
+
+                       args['hostname'] = hostname
+                       args['network_config'] = nodenet_str
+                       args['nodenetwork_id'] = net['nodenetwork_id']
+                       m = PersistMessage(hostname, mailtxt.baddns[0] % args,
+                                                                                mailtxt.baddns[1] % args, True, db='baddns_persistmessages')
+
+                       loginbase = plc.siteId(hostname)
+                       m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+                       conn.set_nodestate('disable')
+
        if flag_set:
                pflags.setRecentFlag(s)
                pflags.save() 
@@ -773,6 +805,8 @@ def main():
                                                help="Extra quiet output messages.")
        parser.add_option("", "--verbose", dest="verbose", action="store_true", 
                                                help="Extra debug output messages.")
+       parser.add_option("", "--nonet", dest="nonet", action="store_true", 
+                                               help="Do not setup the network, use existing log files to re-run a test pass.")
        parser.add_option("", "--collect", dest="collect", action="store_true", 
                                                help="No action, just collect dmesg, and bm.log")
        parser.add_option("", "--nosetup", dest="nosetup", action="store_true", 
index 137e4a9..c2e147f 100644 (file)
@@ -389,6 +389,8 @@ Thank you for your help,
 
        donation_nopcu = [ donation_nopcu_one, donation_nopcu_one, donation_nopcu_one ]
        donation_down = [ donation_down_one, donation_down_one, donation_down_one ]
+
+
        minimalhardware = ("""Hardware requirements not met on PlanetLab host %(hostname)s""", 
                                           """
 While trying to automatically recover this machine:
@@ -515,12 +517,24 @@ Thanks.
 """)
 
 
-       dns=("""Planetlab node %(hostname)s down.""", """As part of PlanetLab node monitoring, we noticed the DNS servers used by  %(hostname)s are not responding to queries.
+       baddns=("""Planetlab node down: broken DNS configuration for %(hostname)s""", 
+"""As part of PlanetLab node monitoring, we noticed the DNS servers used by the following machine(s) are not responding to queries.
 
-Please verify the DNS information used by the node is correct.  You can find directions on how to update the node's network information on the PlanetLab Technical Contacts Guid (http://www.planet-lab.org/doc/TechsGuide.php#id268898).
+    %(hostname)s 
 
-Thanks.
+The consequence of this is that the node cannot boot correctly, and is not a functioning part of the PlanetLab network.
+
+To help us return this machine to running order, please verify that the registered DNS servers in the node network configuration are correct.  
+
+%(network_config)s
 
+You may update the node's network information at the link below:
+
+    https://www.planet-lab.org/db/nodes/node_networks.php?id=%(nodenetwork_id)s
+
+If you have any questions, please feel free to contact us at PlanetLab Support (support@planet-lab.org).
+
+Thank you for your help,
   -- PlanetLab Central (support@planet-lab.org)
 """)
 
index 61d31f9..fa4a3eb 100755 (executable)
@@ -11,6 +11,16 @@ from sets import Set
 from nodecommon import *
 import database
 
+def network_config_to_str(net):
+
+       str = ""
+       static_keys = ['method', 'ip', 'gateway', 'network', 'broadcast', 'netmask', 'dns1', 'dns2', 'mac', 'is_primary']
+       for k in static_keys:
+               str += "%15s == %s\n" % (k, net[k])
+
+       return str
+       
+
 def main():
        from config import config
        fb = database.dbLoad("findbad")
@@ -43,9 +53,7 @@ def main():
                                else:
                                        print "%15s == %s" % (k, n[k])
 
-                       static_keys = ['method', 'ip', 'gateway', 'network', 'broadcast', 'netmask', 'dns1', 'dns2', 'mac', 'is_primary']
-                       for k in static_keys:
-                               print "%15s == %s" % (k, net[k])
+                       print network_config_to_str(net)
 
                        #for k in net.keys():
                        #       print k, "==" , net[k]
diff --git a/todo b/todo
index 09bdcbe..066e683 100644 (file)
--- a/todo
+++ b/todo
@@ -15,6 +15,7 @@ TODO:
 
    - convert plc and other files to use the new monitorconfig.py rather than
      auth, or plc.*
+   - need to alter all import 'auth' statements.
 
 Lower priority:
  * Add a more structured, 'automate' library of scripts and means of making