add email_exception() to all except: statements.
author Stephen Soltesz <soltesz@cs.princeton.edu>
Tue, 10 Mar 2009 20:25:50 +0000 (20:25 +0000)
committer Stephen Soltesz <soltesz@cs.princeton.edu>
Tue, 10 Mar 2009 20:25:50 +0000 (20:25 +0000)
21 files changed:
bootman.py
clean_policy.py
database.py
findbad.py
findbadpcu.py
grouprins.py
moncommands.py
monitor/database.py
monitor_policy.py
nodebad.py
nodecommon.py
nodeconfig.py
nodehistory.py
pcubad.py
policy.py
reboot.py
showlatlon.py
sitebad.py
soltesz.py
testapi.py
unified_model.py

index fb5cf5d..f3ecf72 100755 (executable)
@@ -338,6 +338,8 @@ def reboot(hostname, config=None, forced_action=None):
        try:
                k = SSHKnownHosts(); k.update(node); k.write(); del k
        except:
+               from nodecommon import email_exception
+               email_exception()
                print traceback.print_exc()
                return False
 
@@ -347,8 +349,11 @@ def reboot(hostname, config=None, forced_action=None):
                else:
                        session = PlanetLabSession(node, config.nosetup, config.verbose)
        except Exception, e:
-               print "ERROR setting up session for %s" % hostname
+               msg = "ERROR setting up session for %s" % hostname
+               print msg
                print traceback.print_exc()
+               from nodecommon import email_exception
+               email_exception(msg)
                print e
                return False
 
@@ -362,6 +367,8 @@ def reboot(hostname, config=None, forced_action=None):
                        conn = session.get_connection(config)
                except:
                        print traceback.print_exc()
+                       from nodecommon import email_exception
+                       email_exception()
                        return False
 
        if forced_action == "reboot":
@@ -736,7 +743,7 @@ def reboot(hostname, config=None, forced_action=None):
                        args = {}
                        args['hostname'] = hostname
                        args['bmlog'] = conn.get_bootmanager_log().read()
-                       m = PersistMessage(hostname,  mailtxt.plnode_network[0] % args,  mailtxt.plnode_cfg[1] % args, 
+                       m = PersistMessage(hostname,  mailtxt.plnode_cfg[0] % args,  mailtxt.plnode_cfg[1] % args, 
                                                                True, db='nodenet_persistmessages')
                        loginbase = plc.siteId(hostname)
                        emails = plc.getTechEmails(loginbase)
@@ -798,6 +805,8 @@ def reboot(hostname, config=None, forced_action=None):
                                node = api.GetNodes(hostname)[0]
                                net = api.GetNodeNetworks(node['nodenetwork_ids'])[0]
                        except:
+                               from nodecommon import email_exception
+                               email_exception()
                                print traceback.print_exc()
                                # TODO: api error. skip email, b/c all info is not available,
                                # flag_set will not be recorded.
index 34099be..f447c95 100644 (file)
@@ -92,8 +92,10 @@ class MonitorMergeDiagnoseSendEscellate:
                        actnode.update(fbnode)
                        actnode['ticket_id'] = ""
                        actnode['prev_category'] = "ERROR" 
+                       actnode['prev_state'] = "DOWN" 
                else:
                        actnode['prev_category']= actnode['category']
+                       actnode['prev_state']   = actnode['state']
                        actnode['comonstats']   = fbnode['comonstats']
                        actnode['category']             = fbnode['category']
                        actnode['state']                = fbnode['state']
@@ -115,6 +117,10 @@ class MonitorMergeDiagnoseSendEscellate:
                actnode= self.getActionRecord()
                actrec = self.mergeRecord(fbnode, actnode)
                record = Record(self.hostname, actrec)
+               #print record
+               #print actrec
+               #print record.data['time']
+               #print time.time() - record.data['time']
                diag   = self.diagnose(record)
                if self.act and diag is not None:
                        self.action(record,diag)
@@ -208,11 +214,12 @@ class MonitorMergeDiagnoseSendEscellate:
                                        record.data['ticket_id'] = message.rt.ticket_id
 
                        if ( record.data['takeaction'] and diag.getFlag('Squeeze') ): 
-                               print "action: taking action"
+                               print "action: taking squeeze action"
                                record.takeAction(record.data['action-level'])
                                diag.resetFlag('Squeeze')
                                diag.save()
                        if diag.getFlag('BackOff'):
+                               print "action: taking backoff action"
                                record.takeAction(0)
                                diag.resetFlag('BackOff')
                                diag.save()
index b9fc10d..254a5b5 100644 (file)
@@ -110,8 +110,6 @@ class SPickle:
                                raise Exception, "No such file %s" % name
                                
 
-               #import traceback
-               #print traceback.print_stack()
                #print "loading %s" % self.__file(name, type)
                #sys.stderr.write("-----------------------------\n")
                f = open(self.__file(name, type), 'r')
index 2aabe01..630f1c5 100755 (executable)
@@ -81,6 +81,8 @@ EOF                   """)
                                                                'princeton_comon_procs' : '', 'sshport' : None})
        except:
                print traceback.print_exc()
+               from nodecommon import email_exception
+               email_exception()
                sys.exit(1)
 
        ### RUN SSH ######################
@@ -203,6 +205,8 @@ EOF                 """)
        except:
                b_except = True
                traceback.print_exc()
+               from nodecommon import email_exception
+               email_exception()
 
        plc_lock.release()
        if b_except: return (None, None)
@@ -240,6 +244,8 @@ EOF                 """)
        except:
                b_except = True
                traceback.print_exc()
+               from nodecommon import email_exception
+               email_exception()
 
        plc_lock.release()
        if b_except: return (None, None)
@@ -397,6 +403,8 @@ if __name__ == '__main__':
                main()
        except Exception, err:
                print traceback.print_exc()
+               from nodecommon import email_exception
+               email_exception()
                print "Exception: %s" % err
                print "Saving data... exitting."
                database.dbDump(config.dbname, externalState)
index ca65344..114c48b 100755 (executable)
@@ -85,6 +85,8 @@ def get_pcu(pcuname):
                                        l_pcu = i
                except:
                        traceback.print_exc()
+                       from nodecommon import email_exception
+                       email_exception()
                        l_pcu = None
 
        plc_lock.release()
@@ -103,6 +105,8 @@ def get_nodes(node_ids):
                                        l_node.append(n)
                except:
                        traceback.print_exc()
+                       from nodecommon import email_exception
+                       email_exception()
                        l_node = None
 
        plc_lock.release()
@@ -160,6 +164,8 @@ def get_plc_site_values(site_id):
                                        break
                except:
                        traceback.print_exc()
+                       from nodecommon import email_exception
+                       email_exception()
                        values = None
 
        plc_lock.release()
@@ -198,6 +204,8 @@ def collectPingAndSSH(pcuname, cohash):
                except:
                        b_except = True
                        traceback.print_exc()
+                       from nodecommon import email_exception
+                       email_exception()
                        continue_probe = False
 
                if b_except or not continue_probe: return (None, None, None)
@@ -461,6 +469,8 @@ if __name__ == '__main__':
                time.sleep(1)
        except Exception, err:
                traceback.print_exc()
+               from nodecommon import email_exception
+               email_exception()
                print "Exception: %s" % err
                print "Saving data... exitting."
                database.dbDump(config.dbname, externalState)
index cfefc6a..97ba05b 100755 (executable)
@@ -75,6 +75,7 @@ class Reboot(object):
                                        return ret
 
                                except Exception,e:
+                                       email_exception()
                                        print traceback.print_exc(); print e
 
                                        # NOTE: this failure could be an implementation issue on
@@ -97,6 +98,7 @@ class Reboot(object):
                                        return ret
 
                                except Exception,e:
+                                       email_exception()
                                        print traceback.print_exc(); print e
 
                                        # NOTE: this failure could be an implementation issue on
@@ -140,6 +142,7 @@ class Reboot(object):
                try:
                        return mailmonitor.reboot(host)
                except Exception, e:
+                       email_exception(host)
                        print traceback.print_exc(); print e
                        return False
 
@@ -262,6 +265,7 @@ for host in hostnames:
                try:
                        node = api.GetNodes(host)[0]
                except:
+                       email_exception()
                        print traceback.print_exc(); 
                        print "FAILED GETNODES for host: %s" % host
                        continue
@@ -286,6 +290,7 @@ for host in hostnames:
                                        # todo: send thank you, etc.
                                        mailmonitor.reboot(host)
                                except Exception, e:
+                                       email_exception()
                                        print traceback.print_exc(); print e
 
                                continue
@@ -356,6 +361,7 @@ for host in hostnames:
                print "Killed by interrupt"
                sys.exit(0)
        except:
+               email_exception()
                print traceback.print_exc();
                print "Continuing..."
 
index bda2389..50d31e2 100644 (file)
@@ -35,6 +35,9 @@ class CMD:
                except ExceptionTimeout:
                        import traceback; print traceback.print_exc()
                        return ("", "SCRIPTTIMEOUT")
+               except:
+                       from nodecommon import email_exception
+                       email_exception()
                        
        def system(self, cmd, timeout=COMMAND_TIMEOUT*2):
                (o,e) = self.run(cmd, timeout)
index 3b5bd65..88fd88f 100644 (file)
@@ -111,8 +111,6 @@ class SPickle:
                                raise Exception, "No such file %s" % name
                                
 
-               #import traceback
-               #print traceback.print_stack()
                #print "loading %s" % self.__file(name, type)
                #sys.stderr.write("-----------------------------\n")
                f = open(self.__file(name, type), 'r')
index 45242ea..5db440f 100644 (file)
@@ -281,6 +281,8 @@ class Diagnose:
                        print "----------------"
                        import traceback
                        print traceback.print_exc()
+                       from nodecommon import email_exception
+                       email_exception()
                        print err
                        #if config.policysavedb:
                        sys.exit(1)
@@ -884,6 +886,8 @@ class Action:
                        print "----------------"
                        import traceback
                        print traceback.print_exc()
+                       from nodecommon import email_exception
+                       email_exception()
                        print err
                        if config.policysavedb:
                                print "Saving Databases... act_all"
@@ -970,6 +974,8 @@ class Action:
                        print "exception on message:"
                        import traceback
                        print traceback.print_exc()
+                       from nodecommon import email_exception
+                       email_exception()
                        print message
 
                return ticket_id
index 0130c3e..d9b6b4c 100755 (executable)
@@ -33,6 +33,7 @@ def main(config):
        l_plcnodes = database.dbLoad("l_plcnodes")
 
        l_nodes = get_nodeset(config)
+       print len(l_nodes)
        #if config.node:
        #       l_nodes = [config.node]
        ##else:
@@ -57,6 +58,9 @@ def checkAndRecordState(l_nodes, l_plcnodes):
                        externalState['nodes'][nodename]['values'] = values
                        externalState['nodes'][nodename]['round'] = global_round
                else:
+                       pf = PersistFlags(nodename, 1, db='node_persistflags')
+                       print "%d %35s %s since %s" % (count, nodename, pf.status, pf.last_changed)
+                       del pf
                        count += 1
 
                if count % 20 == 0:
@@ -150,6 +154,8 @@ if __name__ == '__main__':
        except Exception, err:
                import traceback
                print traceback.print_exc()
+               from nodecommon import email_exception
+               email_exception()
                print "Exception: %s" % err
                print "Saving data... exitting."
                database.dbDump(config.dbname, externalState)
index cbbc2c4..4882420 100644 (file)
@@ -198,3 +198,13 @@ def get_nodeset(config):
 
        return l_nodes
        
+def email_exception(content=None):
+       import config
+       from unified_model import Message
+       import traceback
+       msg=traceback.format_exc() 
+       if content:
+               msg = content + "\n" + msg
+       m=Message("exception running monitor", msg, False)
+       m.send([config.cc_email])
+       return
index 2327ec0..ce644e6 100755 (executable)
@@ -58,6 +58,8 @@ def main():
                except:
                        print "Error with %s" % node
                        import traceback; print traceback.print_exc()
+                       from nodecommon import email_exception
+                       email_exception()
                        pass
 
        # commands:
index e554e0a..f68d7b9 100755 (executable)
@@ -35,7 +35,8 @@ def get_filefromglob(d, str):
 def fb_print_nodeinfo(fbnode, verbose, date=None):
        if verbose: print "              state |  ssh  |  pcu  | bootcd | category | kernel"
        if 'checked' in fbnode:
-               print "%11.11s " % diff_time(fbnode['checked']),
+               if date: print date,
+               #print "%11.11s " % diff_time(fbnode['checked']),
        else:
                if date: print date,
                else: print "Unknown",
@@ -124,7 +125,6 @@ def main():
                except KeyboardInterrupt:
                        sys.exit(1)
                except:
-                       #import traceback; print traceback.print_exc()
                        print d.strftime("%Y-%m-%d"), "No record"
 
                d = d + tdelta
index c782b9a..008ecd8 100755 (executable)
--- a/pcubad.py
+++ b/pcubad.py
@@ -155,6 +155,8 @@ if __name__ == '__main__':
        except Exception, err:
                import traceback
                print traceback.print_exc()
+               from nodecommon import email_exception
+               email_exception()
                print "Exception: %s" % err
                print "Saving data... exitting."
                database.dbDump(config.dbname, externalState)
index 26187dd..a782a9d 100644 (file)
--- a/policy.py
+++ b/policy.py
@@ -295,6 +295,8 @@ class Diagnose(Thread):
                        print "----------------"
                        import traceback
                        print traceback.print_exc()
+                       from nodecommon import email_exception
+                       email_exception()
                        print err
                        #if config.policysavedb:
                        sys.exit(1)
index 8efebae..ba75d78 100755 (executable)
--- a/reboot.py
+++ b/reboot.py
@@ -275,6 +275,10 @@ class PCUControl(Transport,PCUModel,PCURecord):
                        import traceback
                        traceback.print_exc()
                        return "EOF connection reset" + str(err)
+               except:
+                       from nodecommon import email_exception
+                       email_exception()
+                       raise Exception('unknown')
                
 class IPAL(PCUControl):
        """ 
@@ -666,6 +670,13 @@ class BayTechAU(PCUControl):
 
 class BayTechGeorgeTown(PCUControl):
        def run(self, node_port, dryrun):
+               # this initial open/close is to prevent things from raising an
+               # exception.  the pcu always is weird during the first connection, and
+               # even if it's not, what does it matter to open a second connection
+               # right away?
+               self.open(self.host, self.username, None, "Enter user name:")
+               self.close()
+               time.sleep(1)
                self.open(self.host, self.username, None, "Enter user name:")
                self.sendPassword(self.password, "Enter Password:")
 
@@ -919,6 +930,8 @@ class ePowerSwitchGood(PCUControl):
                                if self.verbose: print f.read()
                        except:
                                import traceback; traceback.print_exc()
+                               from nodecommon import email_exception
+                               email_exception()
 
                                # fetch url one more time on cmd.html, econtrol.html or whatever.
                                # pass
@@ -1397,6 +1410,8 @@ def main():
                                print "failed"
        except Exception, err:
                import traceback; traceback.print_exc()
+               from nodecommon import email_exception
+               email_exception()
                print err
 
 if __name__ == '__main__':
index 4289e3d..aa09416 100755 (executable)
@@ -12,7 +12,6 @@ import comon
 from nodecommon import color_pcu_state, datetime_fromstr
 from nodehistory import get_filefromglob
 import time
-import traceback
 
 # region
 # total
@@ -150,7 +149,6 @@ def main():
                                                'hardware' : gethardwarequality(hostname, fb),
                                                'pcuok' : color_pcu_state(fb['nodes'][hostname]['values']) }
                                        #except:
-                                       #       print traceback.print_exc()
                                        #       print args
                                        #       print fb['nodes'][hostname]['values']
                                        results.append("%(cc)7s %(status)8s %(hardware)8s %(pcuok)8s %(site)15s %(host)42s " % args)
index f55a4d3..ecf4067 100755 (executable)
@@ -55,6 +55,9 @@ def checkAndRecordState(l_sites, l_plcsites):
                        externalState['sites'][sitename]['values'] = values
                        externalState['sites'][sitename]['round'] = global_round
                else:
+                       pf = PersistFlags(sitename, 1, db=config.dbpfname )
+                       print "%d noinc %15s slices(%2s) nodes(%2s) up(%2s) %s" % (count, sitename, pf.slices_used, 
+                                                                               pf.nodes_total, pf.nodes_up, pf.status)
                        count += 1
 
                if count % 20 == 0:
@@ -88,7 +91,7 @@ def collectStatusAndState(sitename, l_plcsites):
                return None
 
        if sitename in lb2hn:
-               pf = PersistFlags(sitename, 1, db='site_persistflags')
+               pf = PersistFlags(sitename, 1, db=config.dbpfname )
 
                if not pf.checkattr('last_changed'):
                        pf.last_changed = time.time()
@@ -123,7 +126,7 @@ if __name__ == '__main__':
 
        parser = parsermodule.getParser()
        parser.set_defaults(filename=None, node=None, site=None, nodeselect=False, nodegroup=None, 
-                                               increment=False, dbname="sitebad", cachenodes=False)
+                                               increment=False, dbname="sitebad", dbpfname="site_persistflags", cachenodes=False)
        parser.add_option("", "--site", dest="site", metavar="login_base", 
                                                help="Provide a single site to operate on")
        parser.add_option("", "--sitelist", dest="sitelist", metavar="file.list", 
@@ -131,6 +134,8 @@ if __name__ == '__main__':
 
        parser.add_option("", "--dbname", dest="dbname", metavar="FILE", 
                                                help="Specify the name of the database to which the information is saved")
+       parser.add_option("", "--dbpfname", dest="dbpfname", metavar="FILE", 
+                                               help="Specify the persistflags db name")
        parser.add_option("-i", "--increment", action="store_true", dest="increment", 
                                                help="Increment round number to force refresh or retry")
        config = parsermodule.parse_args(parser)
@@ -140,6 +145,8 @@ if __name__ == '__main__':
        except Exception, err:
                import traceback
                print traceback.print_exc()
+               from nodecommon import email_exception
+               email_exception()
                print "Exception: %s" % err
                print "Saving data... exitting."
                database.dbDump(config.dbname, externalState)
index 6fc714f..846a8f6 100644 (file)
@@ -184,6 +184,8 @@ class CMD:
                        return CMD.run(self,cmd,timeout)
                except ExceptionTimeout:
                        import traceback; print traceback.print_exc()
+                       from nodecommon import email_exception
+                       email_exception()
                        return ("", "SCRIPTTIMEOUT")
                        
        def system(self, cmd, timeout=COMMAND_TIMEOUT*2):
index 4876fe6..ca62990 100755 (executable)
@@ -15,5 +15,7 @@ try:
                network = api.GetNodeNetworks(node['nodenetwork_ids'])
        print "ok"
 except:
-       sys.stderr.write(traceback.print_exc())
+       sys.stderr.write(traceback.format_exc())
+       from nodecommon import email_exception
+       email_exception()
        print "fail"
index 97b0bb7..df4024e 100755 (executable)
@@ -40,7 +40,7 @@ def cmpCategoryVal(v1, v2):
        if v1 == 'ALPHA': v1 = "PROD"
        if v2 == 'ALPHA': v2 = "PROD"
        #map = array_to_priority_map([ None, 'PROD', 'ALPHA', 'OLDBOOTCD', 'UNKNOWN', 'FORCED', 'ERROR', ])
-       map = array_to_priority_map([ None, 'ALPHA', 'PROD', 'OLDBOOTCD', 'UNKNOWN', 'FORCED', 'ERROR', ])
+       map = array_to_priority_map([ None, 'ALPHA', 'PROD', 'OLDPROD', 'OLDBOOTCD', 'UNKNOWN', 'FORCED', 'ERROR', ])
        return cmpValMap(v1,v2,map)
 
 
@@ -355,7 +355,7 @@ class PersistSitePenalty(SitePenalty):
 
                #print pm
                if id in pm:
-                       print "Using existing object"
+                       print "PersistSitePenalty Using existing object"
                        obj = pm[id]
                else:
                        print "creating new object"
@@ -428,7 +428,11 @@ class Record(object):
        def severity(self):
                category = self.data['category']
                prev_category = self.data['prev_category']
-               #print "SEVERITY: ", category, prev_category
+               print "SEVERITY: ", category, prev_category
+               try:
+                       print "SEVERITY state: ", self.data['state'], self.data['prev_state']
+               except:
+                       print "SEVERITY state: unknown unknown"
                val = cmpCategoryVal(category, prev_category)
                return val 
 
@@ -514,6 +518,7 @@ class Record(object):
                else:
                        print "takeAction: increasing penalty for %s"%self.hostname
                        pp.increase()
+               print "takeAction: applying penalty to %s as index %s"% (self.hostname, index)
                pp.index = index
                pp.apply(self.hostname)
                pp.save()