From c9b0045bba8ab66adf5036f9cac7f37f476b9a69 Mon Sep 17 00:00:00 2001 From: Stephen Soltesz Date: Tue, 10 Mar 2009 20:25:50 +0000 Subject: [PATCH] add email_exception() to all except: statements. --- bootman.py | 13 +++++++++++-- clean_policy.py | 9 ++++++++- database.py | 2 -- findbad.py | 8 ++++++++ findbadpcu.py | 10 ++++++++++ grouprins.py | 6 ++++++ moncommands.py | 3 +++ monitor/database.py | 2 -- monitor_policy.py | 6 ++++++ nodebad.py | 6 ++++++ nodecommon.py | 10 ++++++++++ nodeconfig.py | 2 ++ nodehistory.py | 4 ++-- pcubad.py | 2 ++ policy.py | 2 ++ reboot.py | 15 +++++++++++++++ showlatlon.py | 2 -- sitebad.py | 11 +++++++++-- soltesz.py | 2 ++ testapi.py | 4 +++- unified_model.py | 11 ++++++++--- 21 files changed, 113 insertions(+), 17 deletions(-) diff --git a/bootman.py b/bootman.py index fb5cf5d..f3ecf72 100755 --- a/bootman.py +++ b/bootman.py @@ -338,6 +338,8 @@ def reboot(hostname, config=None, forced_action=None): try: k = SSHKnownHosts(); k.update(node); k.write(); del k except: + from nodecommon import email_exception + email_exception() print traceback.print_exc() return False @@ -347,8 +349,11 @@ def reboot(hostname, config=None, forced_action=None): else: session = PlanetLabSession(node, config.nosetup, config.verbose) except Exception, e: - print "ERROR setting up session for %s" % hostname + msg = "ERROR setting up session for %s" % hostname + print msg print traceback.print_exc() + from nodecommon import email_exception + email_exception(msg) print e return False @@ -362,6 +367,8 @@ def reboot(hostname, config=None, forced_action=None): conn = session.get_connection(config) except: print traceback.print_exc() + from nodecommon import email_exception + email_exception() return False if forced_action == "reboot": @@ -736,7 +743,7 @@ def reboot(hostname, config=None, forced_action=None): args = {} args['hostname'] = hostname args['bmlog'] = conn.get_bootmanager_log().read() - m = PersistMessage(hostname, mailtxt.plnode_network[0] % args, mailtxt.plnode_cfg[1] % args, + m = PersistMessage(hostname, mailtxt.plnode_cfg[0] % args, mailtxt.plnode_cfg[1] % args, True, db='nodenet_persistmessages') loginbase = plc.siteId(hostname) emails = plc.getTechEmails(loginbase) @@ -798,6 +805,8 @@ def reboot(hostname, config=None, forced_action=None): node = api.GetNodes(hostname)[0] net = api.GetNodeNetworks(node['nodenetwork_ids'])[0] except: + from nodecommon import email_exception + email_exception() print traceback.print_exc() # TODO: api error. skip email, b/c all info is not available, # flag_set will not be recorded. diff --git a/clean_policy.py b/clean_policy.py index 34099be..f447c95 100644 --- a/clean_policy.py +++ b/clean_policy.py @@ -92,8 +92,10 @@ class MonitorMergeDiagnoseSendEscellate: actnode.update(fbnode) actnode['ticket_id'] = "" actnode['prev_category'] = "ERROR" + actnode['prev_state'] = "DOWN" else: actnode['prev_category']= actnode['category'] + actnode['prev_state'] = actnode['state'] actnode['comonstats'] = fbnode['comonstats'] actnode['category'] = fbnode['category'] actnode['state'] = fbnode['state'] @@ -115,6 +117,10 @@ class MonitorMergeDiagnoseSendEscellate: actnode= self.getActionRecord() actrec = self.mergeRecord(fbnode, actnode) record = Record(self.hostname, actrec) + #print record + #print actrec + #print record.data['time'] + #print time.time() - record.data['time'] diag = self.diagnose(record) if self.act and diag is not None: self.action(record,diag) @@ -208,11 +214,12 @@ class MonitorMergeDiagnoseSendEscellate: record.data['ticket_id'] = message.rt.ticket_id if ( record.data['takeaction'] and diag.getFlag('Squeeze') ): - print "action: taking action" + print "action: taking squeeze action" record.takeAction(record.data['action-level']) diag.resetFlag('Squeeze') diag.save() if diag.getFlag('BackOff'): + print "action: taking backoff action" record.takeAction(0) diag.resetFlag('BackOff') diag.save() diff --git a/database.py b/database.py index b9fc10d..254a5b5 100644 --- a/database.py +++ b/database.py @@ -110,8 +110,6 @@ class SPickle: raise Exception, "No such file %s" % name - #import traceback - #print traceback.print_stack() #print "loading %s" % self.__file(name, type) #sys.stderr.write("-----------------------------\n") f = open(self.__file(name, type), 'r') diff --git a/findbad.py b/findbad.py index 2aabe01..630f1c5 100755 --- a/findbad.py +++ b/findbad.py @@ -81,6 +81,8 @@ EOF """) 'princeton_comon_procs' : '', 'sshport' : None}) except: print traceback.print_exc() + from nodecommon import email_exception + email_exception() sys.exit(1) ### RUN SSH ###################### @@ -203,6 +205,8 @@ EOF """) except: b_except = True traceback.print_exc() + from nodecommon import email_exception + email_exception() plc_lock.release() if b_except: return (None, None) @@ -240,6 +244,8 @@ EOF """) except: b_except = True traceback.print_exc() + from nodecommon import email_exception + email_exception() plc_lock.release() if b_except: return (None, None) @@ -397,6 +403,8 @@ if __name__ == '__main__': main() except Exception, err: print traceback.print_exc() + from nodecommon import email_exception + email_exception() print "Exception: %s" % err print "Saving data... exitting." database.dbDump(config.dbname, externalState) diff --git a/findbadpcu.py b/findbadpcu.py index ca65344..114c48b 100755 --- a/findbadpcu.py +++ b/findbadpcu.py @@ -85,6 +85,8 @@ def get_pcu(pcuname): l_pcu = i except: traceback.print_exc() + from nodecommon import email_exception + email_exception() l_pcu = None plc_lock.release() @@ -103,6 +105,8 @@ def get_nodes(node_ids): l_node.append(n) except: traceback.print_exc() + from nodecommon import email_exception + email_exception() l_node = None plc_lock.release() @@ -160,6 +164,8 @@ def get_plc_site_values(site_id): break except: traceback.print_exc() + from nodecommon import email_exception + email_exception() values = None plc_lock.release() @@ -198,6 +204,8 @@ def collectPingAndSSH(pcuname, cohash): except: b_except = True traceback.print_exc() + from nodecommon import email_exception + email_exception() continue_probe = False if b_except or not continue_probe: return (None, None, None) @@ -461,6 +469,8 @@ if __name__ == '__main__': time.sleep(1) except Exception, err: traceback.print_exc() + from nodecommon import email_exception + email_exception() print "Exception: %s" % err print "Saving data... exitting." database.dbDump(config.dbname, externalState) diff --git a/grouprins.py b/grouprins.py index cfefc6a..97ba05b 100755 --- a/grouprins.py +++ b/grouprins.py @@ -75,6 +75,7 @@ class Reboot(object): return ret except Exception,e: + email_exception() print traceback.print_exc(); print e # NOTE: this failure could be an implementation issue on @@ -97,6 +98,7 @@ class Reboot(object): return ret except Exception,e: + email_exception() print traceback.print_exc(); print e # NOTE: this failure could be an implementation issue on @@ -140,6 +142,7 @@ class Reboot(object): try: return mailmonitor.reboot(host) except Exception, e: + email_exception(host) print traceback.print_exc(); print e return False @@ -262,6 +265,7 @@ for host in hostnames: try: node = api.GetNodes(host)[0] except: + email_exception() print traceback.print_exc(); print "FAILED GETNODES for host: %s" % host continue @@ -286,6 +290,7 @@ for host in hostnames: # todo: send thank you, etc. mailmonitor.reboot(host) except Exception, e: + email_exception() print traceback.print_exc(); print e continue @@ -356,6 +361,7 @@ for host in hostnames: print "Killed by interrupt" sys.exit(0) except: + email_exception() print traceback.print_exc(); print "Continuing..." diff --git a/moncommands.py b/moncommands.py index bda2389..50d31e2 100644 --- a/moncommands.py +++ b/moncommands.py @@ -35,6 +35,9 @@ class CMD: except ExceptionTimeout: import traceback; print traceback.print_exc() return ("", "SCRIPTTIMEOUT") + except: + from nodecommon import email_exception + email_exception() def system(self, cmd, timeout=COMMAND_TIMEOUT*2): (o,e) = self.run(cmd, timeout) diff --git a/monitor/database.py b/monitor/database.py index 3b5bd65..88fd88f 100644 --- a/monitor/database.py +++ b/monitor/database.py @@ -111,8 +111,6 @@ class SPickle: raise Exception, "No such file %s" % name - #import traceback - #print traceback.print_stack() #print "loading %s" % self.__file(name, type) #sys.stderr.write("-----------------------------\n") f = open(self.__file(name, type), 'r') diff --git a/monitor_policy.py b/monitor_policy.py index 45242ea..5db440f 100644 --- a/monitor_policy.py +++ b/monitor_policy.py @@ -281,6 +281,8 @@ class Diagnose: print "----------------" import traceback print traceback.print_exc() + from nodecommon import email_exception + email_exception() print err #if config.policysavedb: sys.exit(1) @@ -884,6 +886,8 @@ class Action: print "----------------" import traceback print traceback.print_exc() + from nodecommon import email_exception + email_exception() print err if config.policysavedb: print "Saving Databases... act_all" @@ -970,6 +974,8 @@ class Action: print "exception on message:" import traceback print traceback.print_exc() + from nodecommon import email_exception + email_exception() print message return ticket_id diff --git a/nodebad.py b/nodebad.py index 0130c3e..d9b6b4c 100755 --- a/nodebad.py +++ b/nodebad.py @@ -33,6 +33,7 @@ def main(config): l_plcnodes = database.dbLoad("l_plcnodes") l_nodes = get_nodeset(config) + print len(l_nodes) #if config.node: # l_nodes = [config.node] ##else: @@ -57,6 +58,9 @@ def checkAndRecordState(l_nodes, l_plcnodes): externalState['nodes'][nodename]['values'] = values externalState['nodes'][nodename]['round'] = global_round else: + pf = PersistFlags(nodename, 1, db='node_persistflags') + print "%d %35s %s since %s" % (count, nodename, pf.status, pf.last_changed) + del pf count += 1 if count % 20 == 0: @@ -150,6 +154,8 @@ if __name__ == '__main__': except Exception, err: import traceback print traceback.print_exc() + from nodecommon import email_exception + email_exception() print "Exception: %s" % err print "Saving data... exitting." database.dbDump(config.dbname, externalState) diff --git a/nodecommon.py b/nodecommon.py index cbbc2c4..4882420 100644 --- a/nodecommon.py +++ b/nodecommon.py @@ -198,3 +198,13 @@ def get_nodeset(config): return l_nodes +def email_exception(content=None): + import config + from unified_model import Message + import traceback + msg=traceback.format_exc() + if content: + msg = content + "\n" + msg + m=Message("exception running monitor", msg, False) + m.send([config.cc_email]) + return diff --git a/nodeconfig.py b/nodeconfig.py index 2327ec0..ce644e6 100755 --- a/nodeconfig.py +++ b/nodeconfig.py @@ -58,6 +58,8 @@ def main(): except: print "Error with %s" % node import traceback; print traceback.print_exc() + from nodecommon import email_exception + email_exception() pass # commands: diff --git a/nodehistory.py b/nodehistory.py index e554e0a..f68d7b9 100755 --- a/nodehistory.py +++ b/nodehistory.py @@ -35,7 +35,8 @@ def get_filefromglob(d, str): def fb_print_nodeinfo(fbnode, verbose, date=None): if verbose: print " state | ssh | pcu | bootcd | category | kernel" if 'checked' in fbnode: - print "%11.11s " % diff_time(fbnode['checked']), + if date: print date, + #print "%11.11s " % diff_time(fbnode['checked']), else: if date: print date, else: print "Unknown", @@ -124,7 +125,6 @@ def main(): except KeyboardInterrupt: sys.exit(1) except: - #import traceback; print traceback.print_exc() print d.strftime("%Y-%m-%d"), "No record" d = d + tdelta diff --git a/pcubad.py b/pcubad.py index c782b9a..008ecd8 100755 --- a/pcubad.py +++ b/pcubad.py @@ -155,6 +155,8 @@ if __name__ == '__main__': except Exception, err: import traceback print traceback.print_exc() + from nodecommon import email_exception + email_exception() print "Exception: %s" % err print "Saving data... exitting." database.dbDump(config.dbname, externalState) diff --git a/policy.py b/policy.py index 26187dd..a782a9d 100644 --- a/policy.py +++ b/policy.py @@ -295,6 +295,8 @@ class Diagnose(Thread): print "----------------" import traceback print traceback.print_exc() + from nodecommon import email_exception + email_exception() print err #if config.policysavedb: sys.exit(1) diff --git a/reboot.py b/reboot.py index 8efebae..ba75d78 100755 --- a/reboot.py +++ b/reboot.py @@ -275,6 +275,10 @@ class PCUControl(Transport,PCUModel,PCURecord): import traceback traceback.print_exc() return "EOF connection reset" + str(err) + except: + from nodecommon import email_exception + email_exception() + raise Exception('unknown') class IPAL(PCUControl): """ @@ -666,6 +670,13 @@ class BayTechAU(PCUControl): class BayTechGeorgeTown(PCUControl): def run(self, node_port, dryrun): + # this initial open/close is to prevent things from raising an + # exception. the pcu always is weird during the first connection, and + # even if it's not, what does it matter to open a second connection + # right away? + self.open(self.host, self.username, None, "Enter user name:") + self.close() + time.sleep(1) self.open(self.host, self.username, None, "Enter user name:") self.sendPassword(self.password, "Enter Password:") @@ -919,6 +930,8 @@ class ePowerSwitchGood(PCUControl): if self.verbose: print f.read() except: import traceback; traceback.print_exc() + from nodecommon import email_exception + email_exception() # fetch url one more time on cmd.html, econtrol.html or whatever. # pass @@ -1397,6 +1410,8 @@ def main(): print "failed" except Exception, err: import traceback; traceback.print_exc() + from nodecommon import email_exception + email_exception() print err if __name__ == '__main__': diff --git a/showlatlon.py b/showlatlon.py index 4289e3d..aa09416 100755 --- a/showlatlon.py +++ b/showlatlon.py @@ -12,7 +12,6 @@ import comon from nodecommon import color_pcu_state, datetime_fromstr from nodehistory import get_filefromglob import time -import traceback # region # total @@ -150,7 +149,6 @@ def main(): 'hardware' : gethardwarequality(hostname, fb), 'pcuok' : color_pcu_state(fb['nodes'][hostname]['values']) } #except: - # print traceback.print_exc() # print args # print fb['nodes'][hostname]['values'] results.append("%(cc)7s %(status)8s %(hardware)8s %(pcuok)8s %(site)15s %(host)42s " % args) diff --git a/sitebad.py b/sitebad.py index f55a4d3..ecf4067 100755 --- a/sitebad.py +++ b/sitebad.py @@ -55,6 +55,9 @@ def checkAndRecordState(l_sites, l_plcsites): externalState['sites'][sitename]['values'] = values externalState['sites'][sitename]['round'] = global_round else: + pf = PersistFlags(sitename, 1, db=config.dbpfname ) + print "%d noinc %15s slices(%2s) nodes(%2s) up(%2s) %s" % (count, sitename, pf.slices_used, + pf.nodes_total, pf.nodes_up, pf.status) count += 1 if count % 20 == 0: @@ -88,7 +91,7 @@ def collectStatusAndState(sitename, l_plcsites): return None if sitename in lb2hn: - pf = PersistFlags(sitename, 1, db='site_persistflags') + pf = PersistFlags(sitename, 1, db=config.dbpfname ) if not pf.checkattr('last_changed'): pf.last_changed = time.time() @@ -123,7 +126,7 @@ if __name__ == '__main__': parser = parsermodule.getParser() parser.set_defaults(filename=None, node=None, site=None, nodeselect=False, nodegroup=None, - increment=False, dbname="sitebad", cachenodes=False) + increment=False, dbname="sitebad", dbpfname="site_persistflags", cachenodes=False) parser.add_option("", "--site", dest="site", metavar="login_base", help="Provide a single site to operate on") parser.add_option("", "--sitelist", dest="sitelist", metavar="file.list", @@ -131,6 +134,8 @@ if __name__ == '__main__': parser.add_option("", "--dbname", dest="dbname", metavar="FILE", help="Specify the name of the database to which the information is saved") + parser.add_option("", "--dbpfname", dest="dbpfname", metavar="FILE", + help="Specify the persistflags db name") parser.add_option("-i", "--increment", action="store_true", dest="increment", help="Increment round number to force refresh or retry") config = parsermodule.parse_args(parser) @@ -140,6 +145,8 @@ if __name__ == '__main__': except Exception, err: import traceback print traceback.print_exc() + from nodecommon import email_exception + email_exception() print "Exception: %s" % err print "Saving data... exitting." database.dbDump(config.dbname, externalState) diff --git a/soltesz.py b/soltesz.py index 6fc714f..846a8f6 100644 --- a/soltesz.py +++ b/soltesz.py @@ -184,6 +184,8 @@ class CMD: return CMD.run(self,cmd,timeout) except ExceptionTimeout: import traceback; print traceback.print_exc() + from nodecommon import email_exception + email_exception() return ("", "SCRIPTTIMEOUT") def system(self, cmd, timeout=COMMAND_TIMEOUT*2): diff --git a/testapi.py b/testapi.py index 4876fe6..ca62990 100755 --- a/testapi.py +++ b/testapi.py @@ -15,5 +15,7 @@ try: network = api.GetNodeNetworks(node['nodenetwork_ids']) print "ok" except: - sys.stderr.write(traceback.print_exc()) + sys.stderr.write(traceback.format_exc()) + from nodecommon import email_exception + email_exception() print "fail" diff --git a/unified_model.py b/unified_model.py index 97b0bb7..df4024e 100755 --- a/unified_model.py +++ b/unified_model.py @@ -40,7 +40,7 @@ def cmpCategoryVal(v1, v2): if v1 == 'ALPHA': v1 = "PROD" if v2 == 'ALPHA': v2 = "PROD" #map = array_to_priority_map([ None, 'PROD', 'ALPHA', 'OLDBOOTCD', 'UNKNOWN', 'FORCED', 'ERROR', ]) - map = array_to_priority_map([ None, 'ALPHA', 'PROD', 'OLDBOOTCD', 'UNKNOWN', 'FORCED', 'ERROR', ]) + map = array_to_priority_map([ None, 'ALPHA', 'PROD', 'OLDPROD', 'OLDBOOTCD', 'UNKNOWN', 'FORCED', 'ERROR', ]) return cmpValMap(v1,v2,map) @@ -355,7 +355,7 @@ class PersistSitePenalty(SitePenalty): #print pm if id in pm: - print "Using existing object" + print "PersistSitePenalty Using existing object" obj = pm[id] else: print "creating new object" @@ -428,7 +428,11 @@ class Record(object): def severity(self): category = self.data['category'] prev_category = self.data['prev_category'] - #print "SEVERITY: ", category, prev_category + print "SEVERITY: ", category, prev_category + try: + print "SEVERITY state: ", self.data['state'], self.data['prev_state'] + except: + print "SEVERITY state: unknown unknown" val = cmpCategoryVal(category, prev_category) return val @@ -514,6 +518,7 @@ class Record(object): else: print "takeAction: increasing penalty for %s"%self.hostname pp.increase() + print "takeAction: applying penalty to %s as index %s"% (self.hostname, index) pp.index = index pp.apply(self.hostname) pp.save() -- 2.47.0