moved found_within to common.py
author Stephen Soltesz <soltesz@cs.princeton.edu>
Fri, 27 Mar 2009 17:07:07 +0000 (17:07 +0000)
committer Stephen Soltesz <soltesz@cs.princeton.edu>
Fri, 27 Mar 2009 17:07:07 +0000 (17:07 +0000)
renamed email messages in emailTxt to reflect action types
updated findbad model to perform single-node queries correctly.
added node.status categories to nodelist.kid since this is the primary
difference between nodes now.

monitor/common.py
monitor/database/info/findbad.py
monitor/wrapper/emailTxt.py
nodebad.py
pcucontrol/util/command.py
web/MonitorWeb/monitorweb/controllers.py
web/MonitorWeb/monitorweb/templates/nodelist.kid

index 0f6dd40..aecd866 100644 (file)
@@ -238,3 +238,14 @@ def changed_greaterthan(last_changed, days):
                #print "last changed less than %s" % timedelta(days)
                return False
        
+def found_within(recent_actions, action_type, within):
+       for action in recent_actions:
+               if action_type == action.action_type and \
+                               datetime.now() - action.date_created < timedelta(within):
+                       # recent action of given type.
+                       #print "%s found_within %s in recent_actions from %s" % (action_type, timedelta(within), action.date_created)
+                       return True
+
+       print "%s NOT found_within %s in recent_actions" % (action_type, timedelta(within) )
+       return False
+       
index b437842..a5139eb 100644 (file)
@@ -94,7 +94,7 @@ class FindbadPCURecord(Entity):
 
        @classmethod
        def get_latest_by(cls, **kwargs):
-               return cls.query.filter_by(**kwargs)
+               return cls.query.filter_by(**kwargs).first()
 
 # ACCOUNTING
        date_checked = Field(DateTime)
index 385ac63..98c8856 100644 (file)
@@ -274,6 +274,17 @@ legend:
   2+ - all existing slices will be disabled.
        """)
 
+       newbootcd_notice=(""" Host %(hostname)s needs a new BootImage""", """
+As part of PlanetLab node monitoring, we noticed the following nodes have an out-dated BootCD: 
+
+    %(hostname)s  
+
+This usually implies that you need to update the BootCD and node configuration file stored on the read-only media (either the all-in-one ISO CD, floppy disk, or write-protected USB stick).
+
+Thank you for your help,
+  -- PlanetLab Central (support@planet-lab.org)
+""")
+
        nmreset =("""NM Reset at %(loginbase)s""",
        """
 Monitor restarted NM on the following machines:
@@ -361,10 +372,10 @@ Thank you very much for your help,
   -- PlanetLab Central (support@planet-lab.org)
 """)
 
-       newalphacd_one=(""" Planetlab nodes need a new BootCD: %(loginbase)s""", 
-"""As part of PlanetLab node monitoring, we noticed that your machines needs a new BootCD to fully support your hardware: 
+       newalphacd_notice=(""" New Boot Images for %(hostname)s""", 
+"""As part of PlanetLab node monitoring, we noticed that your machine needs a new BootCD to fully support your hardware: 
 
-%(hostname_list)s  
+%(hostname)s  
 
 To make this process as simple as possible, we have created All-in-One boot images that include the node configuration file.  
 
@@ -385,14 +396,14 @@ Thank you for your help,
        # TODO: need reminder versions for repeats...
        newdown=[newdown_one, newdown_two, newdown_three]
        newbootcd=[newbootcd_one, newbootcd_two, newbootcd_three]
-       newalphacd=[newalphacd_one, newalphacd_one, newalphacd_one]
+       #newalphacd=[newalphacd_one, newalphacd_one, newalphacd_one]
        newthankyou=[thankyou,thankyou,thankyou]
        pcuthankyou=[pcuthankyou_one,pcuthankyou_one,pcuthankyou_one]
        NMReset=[nmreset,nmreset,nmreset]
        pcutonodemapping=[pcutonodemapping_one, pcutonodemapping_one, pcutonodemapping_one]
        pcudown=[pcudown_one, pcudown_one, pcudown_one]
 
-       unknownsequence = ("""Unrecognized Error on PlanetLab host %(hostname)s""", 
+       unknownsequence_notice = ("""Unrecognized Error on PlanetLab host %(hostname)s""", 
                                           """
 While trying to automatically recover this machine:
 
@@ -478,7 +489,7 @@ Thank you for your help,
        donation_down = [ donation_down_one, donation_down_one, donation_down_one ]
 
 
-       minimalhardware = ("""Hardware requirements not met on PlanetLab host %(hostname)s""", 
+       minimalhardware_notice = ("""Hardware requirements not met on PlanetLab host %(hostname)s""", 
                                           """
 While trying to automatically recover this machine:
 
@@ -498,7 +509,7 @@ BootManager.log output follows:
 %(bmlog)s
 """      )
 
-       baddisk = ("""Bad Disk on PlanetLab node %(hostname)s""", 
+       baddisk_notice = ("""Bad Disk on PlanetLab node %(hostname)s""", 
                           """As part of PlanetLab node monitoring, we noticed %(hostname)s has a number of disk or media related I/O errors, that prevent it from either booting or reliably running as a PlanetLab node.
 
 Please verify the integrity of the disk, and order a replacement if needed.  If you need to schedule downtime for the node, please let us know at support@planet-lab.org. 
@@ -564,7 +575,7 @@ BootManager.log output follows:
 %(bmlog)s
 """)
 
-       plnode_cfg=(""" Please Update Configuration file for PlanetLab node %(hostname)s""", 
+       nodeconfig_notice=(""" Please Update Configuration file for PlanetLab node %(hostname)s""", 
 """As part of PlanetLab node monitoring, we noticed %(hostname)s has an out-dated plnode.txt file with no NODE_ID or a mis-matched HOSTNAME.  This can happen either due to an initial configuration failure at your site, with information entered into our database, or after a software upgrade.  To resolve the issue we require your assistance.  All that is needed is to visit:
 
        https://www.planet-lab.org/db/nodes/index.php?nodepattern=%(hostname)s
@@ -604,7 +615,7 @@ Thanks.
 """)
 
 
-       baddns=("""Planetlab node down: broken DNS configuration for %(hostname)s""", 
+       baddns_notice=("""Planetlab node down: broken DNS configuration for %(hostname)s""", 
 """As part of PlanetLab node monitoring, we noticed the DNS servers used by the following machine(s) are not responding to queries.
 
     %(hostname)s 
index a0490e4..46ca879 100755 (executable)
@@ -44,31 +44,47 @@ def check_node_state(rec, node):
                boot_state = "unknown"
                last_contact = None
 
+       if boot_state == 'disable': boot_state = 'disabled'
+       if boot_state == 'diag':        boot_state = 'diagnose'
+
        # NOTE: 'DOWN' and 'DEBUG'  are temporary states, so only need
        #                       'translations' into the node.status state
        #               'BOOT' is a permanent state, but we want it to have a bit of
        #                       hysteresis (less than 0.5 days)
 
-       #################################################################3
-       # "Translate" the findbad states into nodebad status.
+       #################################################################
+       # "Initialize" the findbad states into nodebad status if they are not already set
 
-       if node_state == 'DOWN' and ( node.status != 'offline' and node.status != 'down' ) and boot_state != 'disable' :
+       if node_state == 'DOWN' and ( node.status != 'offline' and node.status != 'down' ) and boot_state != 'disabled' :
                print "changed status from %s to offline" % node.status
                node.status = 'offline'
                node.last_changed = datetime.now()
 
-       if node_state == 'DEBUG' and node.status != 'monitordebug':
-               print "changed status from %s to monitordebug" % (node.status)
-               node.status = "monitordebug"
-               node.last_changed = datetime.now()
+       if node_state == 'DEBUG' and node.status != 'monitordebug' and \
+                                                                node.status != 'disabled' and \
+                                                                node.status != 'diagnose':
+               if boot_state != 'disabled' and boot_state != 'diagnose':
+
+                       print "changed status from %s to monitordebug" % (node.status)
+                       node.status = "monitordebug"
+                       node.last_changed = datetime.now()
+               else:
+                       print "changed status from %s to %s" % (node.status, boot_state)
+                       node.status = boot_state
+                       node.last_changed = datetime.now()
 
        if node_state == 'BOOT' and node.status != 'online' and node.status != 'good':
                print "changed status from %s to online" % node.status
                node.status = 'online'
                node.last_changed = datetime.now()
 
-       #################################################################3
+       #################################################################
        # Switch temporary hystersis states into their 'firm' states.
+       #         online -> good                after half a day
+       #         offline -> down               after two days
+       #         monitordebug -> down  after 30 days
+       #         diagnose -> monitordebug after 60 days
+       #         disabled -> down              after 60 days
 
        if node.status == 'online' and changed_greaterthan(node.last_changed, 0.5):
                print "changed status from %s to good" % node.status
@@ -80,11 +96,16 @@ def check_node_state(rec, node):
                node.status = 'down'
                # NOTE: do not reset last_changed, or you lose how long it's been down.
 
-       if node.status == 'monitordebug' and changed_greaterthan(node.last_changed, 14):
+       if node.status == 'monitordebug' and changed_greaterthan(node.last_changed, 30):
                print "changed status from %s to down" % node.status
                node.status = 'down'
                # NOTE: do not reset last_changed, or you lose how long it's been down.
-               #node.last_changed = datetime.now()
+
+       if node.status == 'diagnose' and changed_greaterthan(node.last_changed, 60):
+               print "changed status from %s to down" % node.status
+               # NOTE: change an admin mode back into monitordebug after two months.
+               node.status = 'monitordebug'
+               node.last_changed = datetime.now()
 
        # extreme cases of offline nodes
        if ( boot_state == 'disabled' or last_contact == None ) and \
index 899d667..47627b4 100644 (file)
@@ -197,6 +197,7 @@ class SSH(CMD):
        def run_noexcept2(self, cmd, timeout=COMMAND_TIMEOUT*2):
                cmd = "exec ssh -p %s %s %s@%s %s" % (self.port, self.__options_to_str(), 
                                                                        self.user, self.host, cmd)
+               #print cmd
                r = CMD.run_noexcept(self, cmd, timeout)
                self.ret = -1
 
index 1178aa1..0d4e703 100644 (file)
@@ -12,14 +12,15 @@ from monitor.database.zabbixapi.model import *
 from monitor.database.dborm import zab_session as session
 from monitor.database.dborm import zab_metadata as metadata
 
-from pcucontrol import reboot
+from monitor import reboot
+from monitor import scanapi
+
 from monitor.wrapper.plccache import plcdb_id2lb as site_id2lb
 from monitor.wrapper.plccache import plcdb_hn2lb as site_hn2lb
 from monitor.wrapper.plccache import plcdb_lb2hn as site_lb2hn
 
 from monitorweb.templates.links import *
 
-from monitor import scanapi
 
 
 def query_to_dict(query):
@@ -103,7 +104,7 @@ class NodeWidget(widgets.Widget):
 
 def prep_node_for_display(node):
        if node.plc_pcuid:
-               pcu = FindbadPCURecord.get_latest_by(plc_pcuid=node.plc_pcuid).first()
+               pcu = FindbadPCURecord.get_latest_by(plc_pcuid=node.plc_pcuid)
                if pcu:
                        node.pcu_status = pcu.reboot_trial_status
                        node.pcu_short_status = format_pcu_shortstatus(pcu)
@@ -168,40 +169,72 @@ class Root(controllers.RootController):
                return self.pcuview(None, hostname) # dict(nodequery=nodequery)
 
        @expose(template="monitorweb.templates.nodelist")
-       def node(self, filter='BOOT'):
+       def node(self, filter='boot'):
                import time
                fbquery = FindbadNodeRecord.get_all_latest()
                query = []
-               filtercount = {'DOWN' : 0, 'BOOT': 0, 'DEBUG' : 0, 'neverboot' : 0, 'pending' : 0, 'all' : 0, None : 0}
+               filtercount = {'down' : 0, 'boot': 0, 'debug' : 0, 'diagnose' : 0, 'disabled': 0, 
+                                               'neverboot' : 0, 'pending' : 0, 'all' : 0, None : 0}
                for node in fbquery:
                        # NOTE: reformat some fields.
                        prep_node_for_display(node)
 
-                       # NOTE: count filters
-                       if node.observed_status != 'DOWN':
-                               print node.hostname, node.observed_status
-                               filtercount[node.observed_status] += 1
-                       else:
+                       node.history.status
+
+                       if node.history.status in ['down', 'offline']:
                                if node.plc_node_stats and node.plc_node_stats['last_contact'] != None:
-                                       filtercount[node.observed_status] += 1
+                                       filtercount['down'] += 1
                                else:
                                        filtercount['neverboot'] += 1
+                       elif node.history.status in ['good', 'online']:
+                               filtercount['boot'] += 1
+                       elif node.history.status in ['debug', 'monitordebug']:
+                               filtercount['debug'] += 1
+                       else:
+                               filtercount[node.history.status] += 1
+                               
+                       ## NOTE: count filters
+                       #if node.observed_status != 'DOWN':
+                       #       print node.hostname, node.observed_status
+                       #       if node.observed_status == 'DEBUG':
+                       #               if node.plc_node_stats['boot_state'] in ['debug', 'diagnose', 'disabled']:
+                       #                       filtercount[node.plc_node_stats['boot_state']] += 1
+                       #               else:
+                       #                       filtercount['debug'] += 1
+                       #                       
+                       #       else:
+                       #               filtercount[node.observed_status] += 1
+                       #else:
+                       #       if node.plc_node_stats and node.plc_node_stats['last_contact'] != None:
+                       #               filtercount[node.observed_status] += 1
+                       #       else:
+                       #               filtercount['neverboot'] += 1
 
                        # NOTE: apply filter
-                       if filter == node.observed_status:
-                               if filter == "DOWN":
-                                       if node.plc_node_stats['last_contact'] != None:
-                                               query.append(node)
-                               else:
-                                       query.append(node)
-                       elif filter == "neverboot":
+                       if filter == "neverboot":
                                if not node.plc_node_stats or node.plc_node_stats['last_contact'] == None:
                                        query.append(node)
-                       elif filter == "pending":
-                               # TODO: look in message logs...
-                               pass
                        elif filter == "all":
                                query.append(node)
+                       elif filter == node.history.status:
+                               query.append(node)
+
+                       #if filter == node.observed_status:
+                       #       if filter == "DOWN":
+                       #               if node.plc_node_stats['last_contact'] != None:
+                       #                       query.append(node)
+                       #       else:
+                       #               query.append(node)
+                       #elif filter == "neverboot":
+                       #       if not node.plc_node_stats or node.plc_node_stats['last_contact'] == None:
+                       #               query.append(node)
+                       #elif filter == "pending":
+                       #       # TODO: look in message logs...
+                       #       pass
+                       #elif filter == node.plc_node_stats['boot_state']:
+                       #       query.append(node)
+                       #elif filter == "all":
+                       #       query.append(node)
                                
                widget = NodeWidget(template='monitorweb.templates.node_template')
                return dict(now=time.ctime(), query=query, fc=filtercount, nodewidget=widget)
@@ -222,7 +255,7 @@ class Root(controllers.RootController):
                                if 'pcuid' in val:
                                        pcuid = val['pcuid']
                                elif 'hostname' in val:
-                                       pcuid = FindbadNodeRecord.get_latest_by(hostname=val['hostname']).first().plc_pcuid
+                                       pcuid = FindbadNodeRecord.get_latest_by(hostname=val['hostname']).plc_pcuid
                                else:
                                        pcuid=None
                        else:
@@ -304,7 +337,7 @@ class Root(controllers.RootController):
                                        prep_node_for_display(node)
                                        nodequery += [node]
                                        if node.plc_pcuid:      # not None
-                                               pcu = FindbadPCURecord.get_latest_by(plc_pcuid=node.plc_pcuid).first()
+                                               pcu = FindbadPCURecord.get_latest_by(plc_pcuid=node.plc_pcuid)
                                                prep_pcu_for_display(pcu)
                                                pcus[node.plc_pcuid] = pcu
 
@@ -326,7 +359,6 @@ class Root(controllers.RootController):
                                        node = FindbadNodeRecord.get_latest_by(hostname=nodename)
                                        print "%s" % node.port_status
                                        print "%s" % node.to_dict()
-                                       print "%s" % len(q.all())
                                        if node:
                                                prep_node_for_display(node)
                                                nodequery += [node]
index 5b4e7c3..53bbe5b 100644 (file)
@@ -13,17 +13,19 @@ from links import *
        <table width="100%">
                <thead>
                        <tr>
-                               <th><a href="${link('node', filter='BOOT')}">Production(${fc['BOOT']})</a></th>
-                               <th><a href="${link('node', filter='DEBUG')}">Debug(${fc['DEBUG']})</a></th>
-                               <th><a href="${link('node', filter='DOWN')}">Down(${fc['DOWN']})</a></th>
+                               <th><a href="${link('node', filter='boot')}">Prod(${fc['boot']})</a></th>
+                               <th><a href="${link('node', filter='down')}">Down(${fc['down']})</a></th>
+                               <th><a href="${link('node', filter='monitordebug')}">Errors(${fc['debug']})</a></th>
+                               <th><a href="${link('node', filter='diagnose')}">Diagnose (${fc['diagnose']})</a></th>
+                               <th><a href="${link('node', filter='disabled')}">Disabled (${fc['disabled']})</a></th>
                                <th><a href="${link('node', filter='neverboot')}">Never Booted(${fc['neverboot']})</a></th>
-                               <th><a href="${link('node', filter='pending')}">Pending Reply(${fc['pending']})</a></th>
+                               <!--th><a href="${link('node', filter='pending')}">Pending Reply(${fc['pending']})</a></th-->
                                <th><a href="${link('node', filter='all')}">All</a></th>
                        </tr>
                </thead>
                <tbody>
                <tr>
-               <td colspan="5">
+               <td colspan="7">
                <table id="sortable_table" class="datagrid" border="1" width="100%">
                        <thead>
                                <tr>