add email_exception() calls throughout code.
author Stephen Soltesz <soltesz@cs.princeton.edu>
Tue, 10 Mar 2009 20:48:06 +0000 (20:48 +0000)
committer Stephen Soltesz <soltesz@cs.princeton.edu>
Tue, 10 Mar 2009 20:48:06 +0000 (20:48 +0000)
bootman.py
findbad.py
findbadpcu.py
grouprins.py
monitor/common.py
monitor/util/command.py
pcucontrol/reboot.py

index 67ce675..7ec552f 100755 (executable)
@@ -331,6 +331,8 @@ def reboot(hostname, config=None, forced_action=None):
        try:
                k = SSHKnownHosts(); k.update(node); k.write(); del k
        except:
+               from monitor.common import email_exception
+               email_exception()
                print traceback.print_exc()
                return False
 
@@ -340,8 +342,11 @@ def reboot(hostname, config=None, forced_action=None):
                else:
                        session = PlanetLabSession(node, config.nosetup, config.verbose)
        except Exception, e:
-               print "ERROR setting up session for %s" % hostname
+               msg = "ERROR setting up session for %s" % hostname
+               print msg
                print traceback.print_exc()
+               from monitor.common import email_exception
+               email_exception(msg)
                print e
                return False
 
@@ -355,6 +360,8 @@ def reboot(hostname, config=None, forced_action=None):
                        conn = session.get_connection(config)
                except:
                        print traceback.print_exc()
+                       from monitor.common import email_exception
+                       email_exception()
                        return False
 
        if forced_action == "reboot":
@@ -793,6 +800,8 @@ def reboot(hostname, config=None, forced_action=None):
                                node = api.GetNodes(hostname)[0]
                                net = api.GetNodeNetworks(node['nodenetwork_ids'])[0]
                        except:
+                               from monitor.common import email_exception
+                               email_exception()
                                print traceback.print_exc()
                                # TODO: api error. skip email, b/c all info is not available,
                                # flag_set will not be recorded.
index 7bb31a0..0d68845 100755 (executable)
@@ -175,6 +175,8 @@ if __name__ == '__main__':
                main()
        except Exception, err:
                print traceback.print_exc()
+               from monitor.common import email_exception
+               email_exception()
                print "Exception: %s" % err
                print "Saving data... exitting."
                sys.exit(0)
index b63a96a..7e84513 100755 (executable)
@@ -206,6 +206,8 @@ if __name__ == '__main__':
                time.sleep(1)
        except Exception, err:
                traceback.print_exc()
+               from monitor.common import email_exception
+               email_exception()
                print "Exception: %s" % err
                print "Saving data... exitting."
                sys.exit(0)
index ed6149d..433ecd3 100755 (executable)
@@ -73,6 +73,7 @@ class Reboot(object):
                                        return ret
 
                                except Exception,e:
+                                       email_exception()
                                        print traceback.print_exc(); print e
 
                                        # NOTE: this failure could be an implementation issue on
@@ -95,6 +96,7 @@ class Reboot(object):
                                        return ret
 
                                except Exception,e:
+                                       email_exception()
                                        print traceback.print_exc(); print e
 
                                        # NOTE: this failure could be an implementation issue on
@@ -138,6 +140,7 @@ class Reboot(object):
                try:
                        return mailmonitor.reboot(host)
                except Exception, e:
+                       email_exception(host)
                        print traceback.print_exc(); print e
                        return False
 
@@ -261,6 +264,7 @@ for host in hostnames:
                try:
                        node = api.GetNodes(host)[0]
                except:
+                       email_exception()
                        print traceback.print_exc(); 
                        print "FAILED GETNODES for host: %s" % host
                        continue
@@ -285,6 +289,7 @@ for host in hostnames:
                                        # todo: send thank you, etc.
                                        mailmonitor.reboot(host)
                                except Exception, e:
+                                       email_exception()
                                        print traceback.print_exc(); print e
 
                                continue
@@ -355,6 +360,7 @@ for host in hostnames:
                print "Killed by interrupt"
                sys.exit(0)
        except:
+               email_exception()
                print traceback.print_exc();
                print "Continuing..."
 
index 65b82b8..8bddae1 100644 (file)
@@ -212,10 +212,13 @@ def get_nodeset(config):
 
        return l_nodes
 
-def email_exception():
-       from monitor import config
-       import traceback
-       msg=traceback.format_exc()
-       m=Message("exception running monitor", msg, False)
-       m.send([config.cc_email])
-       return
+def email_exception(content=None):
+    import config
+    from unified_model import Message
+    import traceback
+    msg=traceback.format_exc()
+    if content:
+        msg = content + "\n" + msg
+    m=Message("exception running monitor", msg, False)
+    m.send([config.cc_email])
+    return
index da7ddae..e3e81ca 100644 (file)
@@ -37,6 +37,9 @@ class CMD:
                except ExceptionTimeout:
                        print traceback.print_exc()
                        return ("", "SCRIPTTIMEOUT")
+               except:
+                       from monitor.common import email_exception
+                       email_exception()
                        
        def system(self, cmd, timeout=COMMAND_TIMEOUT*2):
                (o,e) = self.run(cmd, timeout)
index 2361d89..b762d89 100755 (executable)
@@ -330,6 +330,10 @@ class PCUControl(PCUModel,PCURecord):
                        import traceback
                        traceback.print_exc()
                        return "EOF connection reset" + str(err)
+               except Exception, err:
+                       from monitor.common import email_exception
+                       email_exception(self.host)
+                       raise Exception(err)
 
 from pcucontrol.models import *
 
@@ -555,6 +559,8 @@ def main():
                                print "failed"
        except Exception, err:
                import traceback; traceback.print_exc()
+               from monitor.common import email_exception
+               email_exception(node)
                print err
 
 if __name__ == '__main__':