From: Stephen Soltesz Date: Thu, 30 Apr 2009 21:10:11 +0000 (+0000) Subject: reduce false exceptions that should be ignored or handled better in the code. X-Git-Url: http://git.onelab.eu/?p=monitor.git;a=commitdiff_plain;h=a048ec7065ad812c5dfbe23efa8a4d67b04724af reduce false exceptions that should be ignored or handled better in the code. --- diff --git a/bootman.py b/bootman.py index c26335f..f8f6d48 100755 --- a/bootman.py +++ b/bootman.py @@ -26,6 +26,8 @@ from nodeconfig import network_config_to_str import traceback import config +class ExceptionDoubleSSHError(Exception): pass + import signal class Sopen(subprocess.Popen): def kill(self, signal = signal.SIGTERM): @@ -65,7 +67,8 @@ class NodeConnection: def get_bootmanager_log(self): download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node) - os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node)) + #os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node)) + os.system("cp log/bm.%s.log.gz log/bm.%s.log" % (self.node, self.node)) log = open("log/bm.%s.log" % self.node, 'r') return log @@ -232,7 +235,7 @@ class PlanetLabSession: if ret != 0: print "\tFAILED TWICE" #sys.exit(1) - raise Exception("Failed twice trying to login with updated ssh host key") + raise ExceptionDoubleSSHError("Failed twice trying to login with updated ssh host key") t1 = time.time() # KILL any already running servers. @@ -348,6 +351,10 @@ def reboot(hostname, config=None, forced_action=None): session = PlanetLabSession(node, False, True) else: session = PlanetLabSession(node, config.nosetup, config.verbose) + except ExceptionDoubleSSHError, e: + msg = "ERROR setting up session for %s" % hostname + print msg + return False except Exception, e: msg = "ERROR setting up session for %s" % hostname print msg @@ -365,10 +372,14 @@ def reboot(hostname, config=None, forced_action=None): try: time.sleep(session.timeout*4) conn = session.get_connection(config) + except EOFError: + # failed twice... no need to report this really, it's just in a + # weird state... + return False except: print traceback.print_exc() from nodecommon import email_exception - email_exception() + email_exception(node) return False if forced_action == "reboot": diff --git a/reboot.py b/reboot.py index 34b9b00..ba641c4 100755 --- a/reboot.py +++ b/reboot.py @@ -296,11 +296,12 @@ class IPAL(PCUControl): try: # TODO: make sleep backoff, before stopping. - time.sleep(4) + time.sleep(8) ret = s.recv(count, socket.MSG_DONTWAIT) except socket.error, e: if e[0] == errno.EAGAIN: - raise Exception(e[1]) + #raise Exception(e[1]) + raise ExceptionNotFound(e[1]) else: # TODO: not other exceptions. raise Exception(e) @@ -321,6 +322,8 @@ class IPAL(PCUControl): if e[0] == errno.ECONNREFUSED: # cannot connect to remote host raise Exception(e[1]) + elif e[0] == errno.ETIMEDOUT: + raise ExceptionTimeout(e[1]) else: # TODO: what other conditions are there? raise Exception(e)