Add email_exception() calls to all except: clauses.

author Stephen Soltesz <soltesz@cs.princeton.edu>

Tue, 10 Mar 2009 20:25:50 +0000 (20:25 +0000)

committer Stephen Soltesz <soltesz@cs.princeton.edu>

Tue, 10 Mar 2009 20:25:50 +0000 (20:25 +0000)
author Stephen Soltesz <soltesz@cs.princeton.edu>
Tue, 10 Mar 2009 20:25:50 +0000 (20:25 +0000)
committer Stephen Soltesz <soltesz@cs.princeton.edu>
Tue, 10 Mar 2009 20:25:50 +0000 (20:25 +0000)
diff --git a/bootman.py b/bootman.py

index fb5cf5d..f3ecf72 100755 (executable)
--- a/bootman.py
+++ b/bootman.py
@@ -338,6 +338,8 @@ def reboot(hostname, config=None, forced_action=None):
         try:
                 k = SSHKnownHosts(); k.update(node); k.write(); del k
         except:
+               from nodecommon import email_exception
+               email_exception()
                 print traceback.print_exc()
                 return False
  
@@ -347,8 +349,11 @@ def reboot(hostname, config=None, forced_action=None):
                 else:
                         session = PlanetLabSession(node, config.nosetup, config.verbose)
         except Exception, e:
-               print "ERROR setting up session for %s" % hostname
+               msg = "ERROR setting up session for %s" % hostname
+               print msg
                 print traceback.print_exc()
+               from nodecommon import email_exception
+               email_exception(msg)
                 print e
                 return False
  
@@ -362,6 +367,8 @@ def reboot(hostname, config=None, forced_action=None):
                         conn = session.get_connection(config)
                 except:
                         print traceback.print_exc()
+                       from nodecommon import email_exception
+                       email_exception()
                         return False
  
         if forced_action == "reboot":
@@ -736,7 +743,7 @@ def reboot(hostname, config=None, forced_action=None):
                         args = {}
                         args['hostname'] = hostname
                         args['bmlog'] = conn.get_bootmanager_log().read()
-                       m = PersistMessage(hostname,  mailtxt.plnode_network[0] % args,  mailtxt.plnode_cfg[1] % args, 
+                       m = PersistMessage(hostname,  mailtxt.plnode_cfg[0] % args,  mailtxt.plnode_cfg[1] % args, 
                                                                 True, db='nodenet_persistmessages')
                         loginbase = plc.siteId(hostname)
                         emails = plc.getTechEmails(loginbase)
@@ -798,6 +805,8 @@ def reboot(hostname, config=None, forced_action=None):
                                 node = api.GetNodes(hostname)[0]
                                 net = api.GetNodeNetworks(node['nodenetwork_ids'])[0]
                         except:
+                               from nodecommon import email_exception
+                               email_exception()
                                 print traceback.print_exc()
                                 # TODO: api error. skip email, b/c all info is not available,
                                 # flag_set will not be recorded.
diff --git a/clean_policy.py b/clean_policy.py

index 34099be..f447c95 100644 (file)
--- a/clean_policy.py
+++ b/clean_policy.py
@@ -92,8 +92,10 @@ class MonitorMergeDiagnoseSendEscellate:
                         actnode.update(fbnode)
                         actnode['ticket_id'] = ""
                         actnode['prev_category'] = "ERROR" 
+                       actnode['prev_state'] = "DOWN" 
                 else:
                         actnode['prev_category']= actnode['category']
+                       actnode['prev_state']   = actnode['state']
                         actnode['comonstats']   = fbnode['comonstats']
                         actnode['category']             = fbnode['category']
                         actnode['state']                = fbnode['state']
@@ -115,6 +117,10 @@ class MonitorMergeDiagnoseSendEscellate:
                 actnode= self.getActionRecord()
                 actrec = self.mergeRecord(fbnode, actnode)
                 record = Record(self.hostname, actrec)
+               #print record
+               #print actrec
+               #print record.data['time']
+               #print time.time() - record.data['time']
                 diag   = self.diagnose(record)
                 if self.act and diag is not None:
                         self.action(record,diag)
@@ -208,11 +214,12 @@ class MonitorMergeDiagnoseSendEscellate:
                                         record.data['ticket_id'] = message.rt.ticket_id
  
                         if ( record.data['takeaction'] and diag.getFlag('Squeeze') ): 
-                               print "action: taking action"
+                               print "action: taking squeeze action"
                                 record.takeAction(record.data['action-level'])
                                 diag.resetFlag('Squeeze')
                                 diag.save()
                         if diag.getFlag('BackOff'):
+                               print "action: taking backoff action"
                                 record.takeAction(0)
                                 diag.resetFlag('BackOff')
                                 diag.save()
diff --git a/database.py b/database.py

index b9fc10d..254a5b5 100644 (file)
--- a/database.py
+++ b/database.py
@@ -110,8 +110,6 @@ class SPickle:
                                 raise Exception, "No such file %s" % name
                                 
  
-               #import traceback
-               #print traceback.print_stack()
                 #print "loading %s" % self.__file(name, type)
                 #sys.stderr.write("-----------------------------\n")
                 f = open(self.__file(name, type), 'r')
diff --git a/findbad.py b/findbad.py

index 2aabe01..630f1c5 100755 (executable)
--- a/findbad.py
+++ b/findbad.py
@@ -81,6 +81,8 @@ EOF                   """)
                                                                 'princeton_comon_procs' : '', 'sshport' : None})
         except:
                 print traceback.print_exc()
+               from nodecommon import email_exception
+               email_exception()
                 sys.exit(1)
  
         ### RUN SSH ######################
@@ -203,6 +205,8 @@ EOF                 """)
         except:
                 b_except = True
                 traceback.print_exc()
+               from nodecommon import email_exception
+               email_exception()
  
         plc_lock.release()
         if b_except: return (None, None)
@@ -240,6 +244,8 @@ EOF                 """)
         except:
                 b_except = True
                 traceback.print_exc()
+               from nodecommon import email_exception
+               email_exception()
  
         plc_lock.release()
         if b_except: return (None, None)
@@ -397,6 +403,8 @@ if __name__ == '__main__':
                 main()
         except Exception, err:
                 print traceback.print_exc()
+               from nodecommon import email_exception
+               email_exception()
                 print "Exception: %s" % err
                 print "Saving data... exitting."
                 database.dbDump(config.dbname, externalState)
diff --git a/findbadpcu.py b/findbadpcu.py

index ca65344..114c48b 100755 (executable)
--- a/findbadpcu.py
+++ b/findbadpcu.py
@@ -85,6 +85,8 @@ def get_pcu(pcuname):
                                         l_pcu = i
                 except:
                         traceback.print_exc()
+                       from nodecommon import email_exception
+                       email_exception()
                         l_pcu = None
  
         plc_lock.release()
@@ -103,6 +105,8 @@ def get_nodes(node_ids):
                                         l_node.append(n)
                 except:
                         traceback.print_exc()
+                       from nodecommon import email_exception
+                       email_exception()
                         l_node = None
  
         plc_lock.release()
@@ -160,6 +164,8 @@ def get_plc_site_values(site_id):
                                         break
                 except:
                         traceback.print_exc()
+                       from nodecommon import email_exception
+                       email_exception()
                         values = None
  
         plc_lock.release()
@@ -198,6 +204,8 @@ def collectPingAndSSH(pcuname, cohash):
                 except:
                         b_except = True
                         traceback.print_exc()
+                       from nodecommon import email_exception
+                       email_exception()
                         continue_probe = False
  
                 if b_except or not continue_probe: return (None, None, None)
@@ -461,6 +469,8 @@ if __name__ == '__main__':
                 time.sleep(1)
         except Exception, err:
                 traceback.print_exc()
+               from nodecommon import email_exception
+               email_exception()
                 print "Exception: %s" % err
                 print "Saving data... exitting."
                 database.dbDump(config.dbname, externalState)
diff --git a/grouprins.py b/grouprins.py

index cfefc6a..97ba05b 100755 (executable)
--- a/grouprins.py
+++ b/grouprins.py
@@ -75,6 +75,7 @@ class Reboot(object):
                                         return ret
  
                                 except Exception,e:
+                                       email_exception()
                                         print traceback.print_exc(); print e
  
                                         # NOTE: this failure could be an implementation issue on
@@ -97,6 +98,7 @@ class Reboot(object):
                                         return ret
  
                                 except Exception,e:
+                                       email_exception()
                                         print traceback.print_exc(); print e
  
                                         # NOTE: this failure could be an implementation issue on
@@ -140,6 +142,7 @@ class Reboot(object):
                 try:
                         return mailmonitor.reboot(host)
                 except Exception, e:
+                       email_exception(host)
                         print traceback.print_exc(); print e
                         return False
  
@@ -262,6 +265,7 @@ for host in hostnames:
                 try:
                         node = api.GetNodes(host)[0]
                 except:
+                       email_exception()
                         print traceback.print_exc(); 
                         print "FAILED GETNODES for host: %s" % host
                         continue
@@ -286,6 +290,7 @@ for host in hostnames:
                                         # todo: send thank you, etc.
                                         mailmonitor.reboot(host)
                                 except Exception, e:
+                                       email_exception()
                                         print traceback.print_exc(); print e
  
                                 continue
@@ -356,6 +361,7 @@ for host in hostnames:
                 print "Killed by interrupt"
                 sys.exit(0)
         except:
+               email_exception()
                 print traceback.print_exc();
                 print "Continuing..."
  
diff --git a/moncommands.py b/moncommands.py

index bda2389..50d31e2 100644 (file)
--- a/moncommands.py
+++ b/moncommands.py
@@ -35,6 +35,9 @@ class CMD:
                 except ExceptionTimeout:
                         import traceback; print traceback.print_exc()
                         return ("", "SCRIPTTIMEOUT")
+               except:
+                       from nodecommon import email_exception
+                       email_exception()
                         
         def system(self, cmd, timeout=COMMAND_TIMEOUT*2):
                 (o,e) = self.run(cmd, timeout)
diff --git a/monitor/database.py b/monitor/database.py

index 3b5bd65..88fd88f 100644 (file)
--- a/monitor/database.py
+++ b/monitor/database.py
@@ -111,8 +111,6 @@ class SPickle:
                                 raise Exception, "No such file %s" % name
                                 
  
-               #import traceback
-               #print traceback.print_stack()
                 #print "loading %s" % self.__file(name, type)
                 #sys.stderr.write("-----------------------------\n")
                 f = open(self.__file(name, type), 'r')
diff --git a/monitor_policy.py b/monitor_policy.py

index 45242ea..5db440f 100644 (file)
--- a/monitor_policy.py
+++ b/monitor_policy.py
@@ -281,6 +281,8 @@ class Diagnose:
                         print "----------------"
                         import traceback
                         print traceback.print_exc()
+                       from nodecommon import email_exception
+                       email_exception()
                         print err
                         #if config.policysavedb:
                         sys.exit(1)
@@ -884,6 +886,8 @@ class Action:
                         print "----------------"
                         import traceback
                         print traceback.print_exc()
+                       from nodecommon import email_exception
+                       email_exception()
                         print err
                         if config.policysavedb:
                                 print "Saving Databases... act_all"
@@ -970,6 +974,8 @@ class Action:
                         print "exception on message:"
                         import traceback
                         print traceback.print_exc()
+                       from nodecommon import email_exception
+                       email_exception()
                         print message
  
                 return ticket_id
diff --git a/nodebad.py b/nodebad.py

index 0130c3e..d9b6b4c 100755 (executable)
--- a/nodebad.py
+++ b/nodebad.py
@@ -33,6 +33,7 @@ def main(config):
         l_plcnodes = database.dbLoad("l_plcnodes")
  
         l_nodes = get_nodeset(config)
+       print len(l_nodes)
         #if config.node:
         #       l_nodes = [config.node]
         ##else:
@@ -57,6 +58,9 @@ def checkAndRecordState(l_nodes, l_plcnodes):
                         externalState['nodes'][nodename]['values'] = values
                         externalState['nodes'][nodename]['round'] = global_round
                 else:
+                       pf = PersistFlags(nodename, 1, db='node_persistflags')
+                       print "%d %35s %s since %s" % (count, nodename, pf.status, pf.last_changed)
+                       del pf
                         count += 1
  
                 if count % 20 == 0:
@@ -150,6 +154,8 @@ if __name__ == '__main__':
         except Exception, err:
                 import traceback
                 print traceback.print_exc()
+               from nodecommon import email_exception
+               email_exception()
                 print "Exception: %s" % err
                 print "Saving data... exitting."
                 database.dbDump(config.dbname, externalState)
diff --git a/nodecommon.py b/nodecommon.py

index cbbc2c4..4882420 100644 (file)
--- a/nodecommon.py
+++ b/nodecommon.py
@@ -198,3 +198,13 @@ def get_nodeset(config):
  
         return l_nodes
         
+def email_exception(content=None):
+       import config
+       from unified_model import Message
+       import traceback
+       msg=traceback.format_exc() 
+       if content:
+               msg = content + "\n" + msg
+       m=Message("exception running monitor", msg, False)
+       m.send([config.cc_email])
+       return
diff --git a/nodeconfig.py b/nodeconfig.py

index 2327ec0..ce644e6 100755 (executable)
--- a/nodeconfig.py
+++ b/nodeconfig.py
@@ -58,6 +58,8 @@ def main():
                 except:
                         print "Error with %s" % node
                         import traceback; print traceback.print_exc()
+                       from nodecommon import email_exception
+                       email_exception()
                         pass
  
         # commands:
diff --git a/nodehistory.py b/nodehistory.py

index e554e0a..f68d7b9 100755 (executable)
--- a/nodehistory.py
+++ b/nodehistory.py
@@ -35,7 +35,8 @@ def get_filefromglob(d, str):
  def fb_print_nodeinfo(fbnode, verbose, date=None):
         if verbose: print "              state |  ssh  |  pcu  | bootcd | category | kernel"
         if 'checked' in fbnode:
-               print "%11.11s " % diff_time(fbnode['checked']),
+               if date: print date,
+               #print "%11.11s " % diff_time(fbnode['checked']),
         else:
                 if date: print date,
                 else: print "Unknown",
@@ -124,7 +125,6 @@ def main():
                 except KeyboardInterrupt:
                         sys.exit(1)
                 except:
-                       #import traceback; print traceback.print_exc()
                         print d.strftime("%Y-%m-%d"), "No record"
  
                 d = d + tdelta
diff --git a/pcubad.py b/pcubad.py

index c782b9a..008ecd8 100755 (executable)
--- a/pcubad.py
+++ b/pcubad.py
@@ -155,6 +155,8 @@ if __name__ == '__main__':
         except Exception, err:
                 import traceback
                 print traceback.print_exc()
+               from nodecommon import email_exception
+               email_exception()
                 print "Exception: %s" % err
                 print "Saving data... exitting."
                 database.dbDump(config.dbname, externalState)
diff --git a/policy.py b/policy.py

index 26187dd..a782a9d 100644 (file)
--- a/policy.py
+++ b/policy.py
@@ -295,6 +295,8 @@ class Diagnose(Thread):
                         print "----------------"
                         import traceback
                         print traceback.print_exc()
+                       from nodecommon import email_exception
+                       email_exception()
                         print err
                         #if config.policysavedb:
                         sys.exit(1)
diff --git a/reboot.py b/reboot.py

index 8efebae..ba75d78 100755 (executable)
--- a/reboot.py
+++ b/reboot.py
@@ -275,6 +275,10 @@ class PCUControl(Transport,PCUModel,PCURecord):
                         import traceback
                         traceback.print_exc()
                         return "EOF connection reset" + str(err)
+               except:
+                       from nodecommon import email_exception
+                       email_exception()
+                       raise Exception('unknown')
                 
  class IPAL(PCUControl):
         """ 
@@ -666,6 +670,13 @@ class BayTechAU(PCUControl):
  
  class BayTechGeorgeTown(PCUControl):
         def run(self, node_port, dryrun):
+               # this initial open/close is to prevent things from raising an
+               # exception.  the pcu always is weird during the first connection, and
+               # even if it's not, what does it matter to open a second connection
+               # right away?
+               self.open(self.host, self.username, None, "Enter user name:")
+               self.close()
+               time.sleep(1)
                 self.open(self.host, self.username, None, "Enter user name:")
                 self.sendPassword(self.password, "Enter Password:")
  
@@ -919,6 +930,8 @@ class ePowerSwitchGood(PCUControl):
                                 if self.verbose: print f.read()
                         except:
                                 import traceback; traceback.print_exc()
+                               from nodecommon import email_exception
+                               email_exception()
  
                                 # fetch url one more time on cmd.html, econtrol.html or whatever.
                                 # pass
@@ -1397,6 +1410,8 @@ def main():
                                 print "failed"
         except Exception, err:
                 import traceback; traceback.print_exc()
+               from nodecommon import email_exception
+               email_exception()
                 print err
  
  if __name__ == '__main__':
diff --git a/showlatlon.py b/showlatlon.py

index 4289e3d..aa09416 100755 (executable)
--- a/showlatlon.py
+++ b/showlatlon.py
@@ -12,7 +12,6 @@ import comon
  from nodecommon import color_pcu_state, datetime_fromstr
  from nodehistory import get_filefromglob
  import time
-import traceback
  
  # region
  # total
@@ -150,7 +149,6 @@ def main():
                                                 'hardware' : gethardwarequality(hostname, fb),
                                                 'pcuok' : color_pcu_state(fb['nodes'][hostname]['values']) }
                                         #except:
-                                       #       print traceback.print_exc()
                                         #       print args
                                         #       print fb['nodes'][hostname]['values']
                                         results.append("%(cc)7s %(status)8s %(hardware)8s %(pcuok)8s %(site)15s %(host)42s " % args)
diff --git a/sitebad.py b/sitebad.py

index f55a4d3..ecf4067 100755 (executable)
--- a/sitebad.py
+++ b/sitebad.py
@@ -55,6 +55,9 @@ def checkAndRecordState(l_sites, l_plcsites):
                         externalState['sites'][sitename]['values'] = values
                         externalState['sites'][sitename]['round'] = global_round
                 else:
+                       pf = PersistFlags(sitename, 1, db=config.dbpfname )
+                       print "%d noinc %15s slices(%2s) nodes(%2s) up(%2s) %s" % (count, sitename, pf.slices_used, 
+                                                                               pf.nodes_total, pf.nodes_up, pf.status)
                         count += 1
  
                 if count % 20 == 0:
@@ -88,7 +91,7 @@ def collectStatusAndState(sitename, l_plcsites):
                 return None
  
         if sitename in lb2hn:
-               pf = PersistFlags(sitename, 1, db='site_persistflags')
+               pf = PersistFlags(sitename, 1, db=config.dbpfname )
  
                 if not pf.checkattr('last_changed'):
                         pf.last_changed = time.time()
@@ -123,7 +126,7 @@ if __name__ == '__main__':
  
         parser = parsermodule.getParser()
         parser.set_defaults(filename=None, node=None, site=None, nodeselect=False, nodegroup=None, 
-                                               increment=False, dbname="sitebad", cachenodes=False)
+                                               increment=False, dbname="sitebad", dbpfname="site_persistflags", cachenodes=False)
         parser.add_option("", "--site", dest="site", metavar="login_base", 
                                                 help="Provide a single site to operate on")
         parser.add_option("", "--sitelist", dest="sitelist", metavar="file.list", 
@@ -131,6 +134,8 @@ if __name__ == '__main__':
  
         parser.add_option("", "--dbname", dest="dbname", metavar="FILE", 
                                                 help="Specify the name of the database to which the information is saved")
+       parser.add_option("", "--dbpfname", dest="dbpfname", metavar="FILE", 
+                                               help="Specify the persistflags db name")
         parser.add_option("-i", "--increment", action="store_true", dest="increment", 
                                                 help="Increment round number to force refresh or retry")
         config = parsermodule.parse_args(parser)
@@ -140,6 +145,8 @@ if __name__ == '__main__':
         except Exception, err:
                 import traceback
                 print traceback.print_exc()
+               from nodecommon import email_exception
+               email_exception()
                 print "Exception: %s" % err
                 print "Saving data... exitting."
                 database.dbDump(config.dbname, externalState)
diff --git a/soltesz.py b/soltesz.py

index 6fc714f..846a8f6 100644 (file)
--- a/soltesz.py
+++ b/soltesz.py
@@ -184,6 +184,8 @@ class CMD:
                         return CMD.run(self,cmd,timeout)
                 except ExceptionTimeout:
                         import traceback; print traceback.print_exc()
+                       from nodecommon import email_exception
+                       email_exception()
                         return ("", "SCRIPTTIMEOUT")
                         
         def system(self, cmd, timeout=COMMAND_TIMEOUT*2):
diff --git a/testapi.py b/testapi.py

index 4876fe6..ca62990 100755 (executable)
--- a/testapi.py
+++ b/testapi.py
@@ -15,5 +15,7 @@ try:
                 network = api.GetNodeNetworks(node['nodenetwork_ids'])
         print "ok"
  except:
-       sys.stderr.write(traceback.print_exc())
+       sys.stderr.write(traceback.format_exc())
+       from nodecommon import email_exception
+       email_exception()
         print "fail"
diff --git a/unified_model.py b/unified_model.py

index 97b0bb7..df4024e 100755 (executable)
--- a/unified_model.py
+++ b/unified_model.py
@@ -40,7 +40,7 @@ def cmpCategoryVal(v1, v2):
         if v1 == 'ALPHA': v1 = "PROD"
         if v2 == 'ALPHA': v2 = "PROD"
         #map = array_to_priority_map([ None, 'PROD', 'ALPHA', 'OLDBOOTCD', 'UNKNOWN', 'FORCED', 'ERROR', ])
-       map = array_to_priority_map([ None, 'ALPHA', 'PROD', 'OLDBOOTCD', 'UNKNOWN', 'FORCED', 'ERROR', ])
+       map = array_to_priority_map([ None, 'ALPHA', 'PROD', 'OLDPROD', 'OLDBOOTCD', 'UNKNOWN', 'FORCED', 'ERROR', ])
         return cmpValMap(v1,v2,map)
  
  
@@ -355,7 +355,7 @@ class PersistSitePenalty(SitePenalty):
  
                 #print pm
                 if id in pm:
-                       print "Using existing object"
+                       print "PersistSitePenalty Using existing object"
                         obj = pm[id]
                 else:
                         print "creating new object"
@@ -428,7 +428,11 @@ class Record(object):
         def severity(self):
                 category = self.data['category']
                 prev_category = self.data['prev_category']
-               #print "SEVERITY: ", category, prev_category
+               print "SEVERITY: ", category, prev_category
+               try:
+                       print "SEVERITY state: ", self.data['state'], self.data['prev_state']
+               except:
+                       print "SEVERITY state: unknown unknown"
                 val = cmpCategoryVal(category, prev_category)
                 return val 
  
@@ -514,6 +518,7 @@ class Record(object):
                 else:
                         print "takeAction: increasing penalty for %s"%self.hostname
                         pp.increase()
+               print "takeAction: applying penalty to %s as index %s"% (self.hostname, index)
                 pp.index = index
                 pp.apply(self.hostname)
                 pp.save()
author	Stephen Soltesz <soltesz@cs.princeton.edu>
	Tue, 10 Mar 2009 20:25:50 +0000 (20:25 +0000)
committer	Stephen Soltesz <soltesz@cs.princeton.edu>
	Tue, 10 Mar 2009 20:25:50 +0000 (20:25 +0000)
bootman.py		patch \| blob \| history
clean_policy.py		patch \| blob \| history
database.py		patch \| blob \| history
findbad.py		patch \| blob \| history
findbadpcu.py		patch \| blob \| history
grouprins.py		patch \| blob \| history
moncommands.py		patch \| blob \| history
monitor/database.py		patch \| blob \| history
monitor_policy.py		patch \| blob \| history
nodebad.py		patch \| blob \| history
nodecommon.py		patch \| blob \| history
nodeconfig.py		patch \| blob \| history
nodehistory.py		patch \| blob \| history
pcubad.py		patch \| blob \| history
policy.py		patch \| blob \| history
reboot.py		patch \| blob \| history
showlatlon.py		patch \| blob \| history
sitebad.py		patch \| blob \| history
soltesz.py		patch \| blob \| history
testapi.py		patch \| blob \| history
unified_model.py		patch \| blob \| history