From: Stephen Soltesz Date: Mon, 4 Aug 2008 15:32:45 +0000 (+0000) Subject: Added a check for bad dns on the node that prevents bootmanager from booting. X-Git-Tag: Monitor-1.0-6~20 X-Git-Url: http://git.onelab.eu/?a=commitdiff_plain;h=bccc542ed6a8eec74fd6411976cca424a1158c75;hp=e85a9cdcb5f44d2a5299987f4c6a8adc50a7f4d7;p=monitor.git Added a check for bad dns on the node that prevents bootmanager from booting. added a findbad.py check after running grouprins in automate_pl03.sh additional todos. --- diff --git a/automate_pl03.sh b/automate_pl03.sh index 536914e..b535ccc 100755 --- a/automate_pl03.sh +++ b/automate_pl03.sh @@ -76,6 +76,7 @@ done --nodeselect 'state=DEBUG&&boot_state=dbg||state=DEBUG&&boot_state=boot' \ --stopselect 'state=BOOT&&kernel=2.6.22.19-vs2.3.0.34.9.planetlab' \ --reboot || : +./findbad.py --increment --cachenodes --debug=0 --dbname="findbad" --nodeselect 'state=DEBUG&&boot_state=dbg||state=DEBUG&&boot_state=boot' || : # cache the RT db locally. python ./rt.py diff --git a/bootman.py b/bootman.py index 2fd161c..0a75fac 100755 --- a/bootman.py +++ b/bootman.py @@ -23,7 +23,8 @@ import ssh.fdpexpect as fdpexpect import ssh.pexpect as pexpect from unified_model import * from emailTxt import mailtxt - +from nodeconfig import network_config_to_str +import traceback import monitorconfig import signal @@ -334,7 +335,7 @@ def reboot(hostname, config=None, forced_action=None): try: k = SSHKnownHosts(); k.update(node); k.write(); del k except: - import traceback; print traceback.print_exc() + print traceback.print_exc() return False try: @@ -344,7 +345,7 @@ def reboot(hostname, config=None, forced_action=None): session = PlanetLabSession(node, config.nosetup, config.verbose) except Exception, e: print "ERROR setting up session for %s" % hostname - import traceback; print traceback.print_exc() + print traceback.print_exc() print e return False @@ -357,7 +358,7 @@ def reboot(hostname, config=None, forced_action=None): time.sleep(session.timeout*4) conn = 
session.get_connection(config) except: - import traceback; print traceback.print_exc() + print traceback.print_exc() return False @@ -497,6 +498,7 @@ def reboot(hostname, config=None, forced_action=None): ('noinstall' , 'notinstalled'), ('bziperror' , 'bzip2: Data integrity error when decompressing.'), ('noblockdev' , "No block devices detected."), + ('dnserror' , 'Name or service not known'), ('downloadfail' , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'), ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'), ('hardwarerequirefail' , 'Hardware requirements not met'), @@ -542,6 +544,9 @@ def reboot(hostname, config=None, forced_action=None): for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done", "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done", "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done", + + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done", "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done", "bminit-cfg-auth-getplc-update-debug-done", @@ -549,6 +554,7 @@ def reboot(hostname, config=None, forced_action=None): "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done", "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done", "bminit-cfg-auth-protoerror-exception-update-debug-done", + "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done", 
"bminit-cfg-auth-getplc-implementerror-update-debug-done", ]: sequences.update({n : "restart_bootmanager_boot"}) @@ -628,6 +634,9 @@ def reboot(hostname, config=None, forced_action=None): # broken_hardware_email sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"}) + # bad_dns_email + sequences.update({"bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done" : "bad_dns_email"}) + flag_set = True @@ -751,6 +760,29 @@ def reboot(hostname, config=None, forced_action=None): m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase]) conn.set_nodestate('disable') + elif sequences[s] == "bad_dns_email": + print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname + args = {} + try: + node = api.GetNodes(hostname)[0] + net = api.GetNodeNetworks(node['nodenetwork_ids'])[0] + except: + print traceback.print_exc() + # TODO: api error. skip email, b/c all info is not available, + # flag_set will not be recorded. 
+ return False + nodenet_str = network_config_to_str(net) + + args['hostname'] = hostname + args['network_config'] = nodenet_str + args['nodenetwork_id'] = net['nodenetwork_id'] + m = PersistMessage(hostname, mailtxt.baddns[0] % args, + mailtxt.baddns[1] % args, True, db='baddns_persistmessages') + + loginbase = plc.siteId(hostname) + m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase]) + conn.set_nodestate('disable') + if flag_set: pflags.setRecentFlag(s) pflags.save() @@ -773,6 +805,8 @@ def main(): help="Extra quiet output messages.") parser.add_option("", "--verbose", dest="verbose", action="store_true", help="Extra debug output messages.") + parser.add_option("", "--nonet", dest="nonet", action="store_true", + help="Do not setup the network, use existing log files to re-run a test pass.") parser.add_option("", "--collect", dest="collect", action="store_true", help="No action, just collect dmesg, and bm.log") parser.add_option("", "--nosetup", dest="nosetup", action="store_true", diff --git a/emailTxt.py b/emailTxt.py index 137e4a9..c2e147f 100644 --- a/emailTxt.py +++ b/emailTxt.py @@ -389,6 +389,8 @@ Thank you for your help, donation_nopcu = [ donation_nopcu_one, donation_nopcu_one, donation_nopcu_one ] donation_down = [ donation_down_one, donation_down_one, donation_down_one ] + + minimalhardware = ("""Hardware requirements not met on PlanetLab host %(hostname)s""", """ While trying to automatically recover this machine: @@ -515,12 +517,24 @@ Thanks. """) - dns=("""Planetlab node %(hostname)s down.""", """As part of PlanetLab node monitoring, we noticed the DNS servers used by %(hostname)s are not responding to queries. + baddns=("""Planetlab node down: broken DNS configuration for %(hostname)s""", +"""As part of PlanetLab node monitoring, we noticed the DNS servers used by the following machine(s) are not responding to queries. -Please verify the DNS information used by the node is correct. 
You can find directions on how to update the node's network information on the PlanetLab Technical Contacts Guid (http://www.planet-lab.org/doc/TechsGuide.php#id268898). + %(hostname)s -Thanks. +The consequence of this is that the node cannot boot correctly, and is not a functioning part of the PlanetLab network. + +To help us return this machine to running order, please verify that the registered DNS servers in the node network configuration are correct. + +%(network_config)s +You may update the node's network information at the link below: + + https://www.planet-lab.org/db/nodes/node_networks.php?id=%(nodenetwork_id)s + +If you have any questions, please feel free to contact us at PlanetLab Support (support@planet-lab.org). + +Thank you for your help, -- PlanetLab Central (support@planet-lab.org) """) diff --git a/nodeconfig.py b/nodeconfig.py index 61d31f9..fa4a3eb 100755 --- a/nodeconfig.py +++ b/nodeconfig.py @@ -11,6 +11,16 @@ from sets import Set from nodecommon import * import database +def network_config_to_str(net): + + str = "" + static_keys = ['method', 'ip', 'gateway', 'network', 'broadcast', 'netmask', 'dns1', 'dns2', 'mac', 'is_primary'] + for k in static_keys: + str += "%15s == %s\n" % (k, net[k]) + + return str + + def main(): from config import config fb = database.dbLoad("findbad") @@ -43,9 +53,7 @@ def main(): else: print "%15s == %s" % (k, n[k]) - static_keys = ['method', 'ip', 'gateway', 'network', 'broadcast', 'netmask', 'dns1', 'dns2', 'mac', 'is_primary'] - for k in static_keys: - print "%15s == %s" % (k, net[k]) + print network_config_to_str(net) #for k in net.keys(): # print k, "==" , net[k] diff --git a/todo b/todo index 09bdcbe..066e683 100644 --- a/todo +++ b/todo @@ -15,6 +15,7 @@ TODO: - convert plc and other files to use the new monitorconfig.py rather than auth, or plc.* + - need to alter all import 'auth' statements. Lower priority: * Add a more structured, 'automate' library of scripts and means of making