moved found_within to common.py
author Stephen Soltesz <soltesz@cs.princeton.edu>
Fri, 27 Mar 2009 17:07:07 +0000 (17:07 +0000)
committer Stephen Soltesz <soltesz@cs.princeton.edu>
Fri, 27 Mar 2009 17:07:07 +0000 (17:07 +0000)
renamed email messages in emailTxt to reflect action types
updated findbad model to perform single-node queries correctly.
added node.status categories to nodelist.kid since this is the primary
difference between nodes now.

monitor/common.py
monitor/database/info/findbad.py
monitor/wrapper/emailTxt.py
nodebad.py
pcucontrol/util/command.py
web/MonitorWeb/monitorweb/controllers.py
web/MonitorWeb/monitorweb/templates/nodelist.kid

index 0f6dd40..aecd866 100644 (file)
@@ -238,3 +238,14 @@ def changed_greaterthan(last_changed, days):
                #print "last changed less than %s" % timedelta(days)
                return False
        
+def found_within(recent_actions, action_type, within):
+       for action in recent_actions:
+               if action_type == action.action_type and \
+                               datetime.now() - action.date_created < timedelta(within):
+                       # recent action of given type.
+                       #print "%s found_within %s in recent_actions from %s" % (action_type, timedelta(within), action.date_created)
+                       return True
+
+       print "%s NOT found_within %s in recent_actions" % (action_type, timedelta(within) )
+       return False
+       
index b437842..a5139eb 100644 (file)
@@ -94,7 +94,7 @@ class FindbadPCURecord(Entity):
 
        @classmethod
        def get_latest_by(cls, **kwargs):
-               return cls.query.filter_by(**kwargs)
+               return cls.query.filter_by(**kwargs).first()
 
 # ACCOUNTING
        date_checked = Field(DateTime)
index 385ac63..98c8856 100644 (file)
@@ -274,6 +274,17 @@ legend:
   2+ - all existing slices will be disabled.
        """)
 
+       newbootcd_notice=(""" Host %(hostname)s needs a new BootImage""", """
+As part of PlanetLab node monitoring, we noticed the following nodes have an out-dated BootCD: 
+
+    %(hostname)s  
+
+This usually implies that you need to update the BootCD and node configuration file stored on the read-only media (either the all-in-one ISO CD, floppy disk, or write-protected USB stick).
+
+Thank you for your help,
+  -- PlanetLab Central (support@planet-lab.org)
+""")
+
        nmreset =("""NM Reset at %(loginbase)s""",
        """
 Monitor restarted NM on the following machines:
@@ -361,10 +372,10 @@ Thank you very much for your help,
   -- PlanetLab Central (support@planet-lab.org)
 """)
 
-       newalphacd_one=(""" Planetlab nodes need a new BootCD: %(loginbase)s""", 
-"""As part of PlanetLab node monitoring, we noticed that your machines needs a new BootCD to fully support your hardware: 
+       newalphacd_notice=(""" New Boot Images for %(hostname)s""", 
+"""As part of PlanetLab node monitoring, we noticed that your machine needs a new BootCD to fully support your hardware: 
 
-%(hostname_list)s  
+%(hostname)s  
 
 To make this process as simple as possible, we have created All-in-One boot images that include the node configuration file.  
 
@@ -385,14 +396,14 @@ Thank you for your help,
        # TODO: need reminder versions for repeats...
        newdown=[newdown_one, newdown_two, newdown_three]
        newbootcd=[newbootcd_one, newbootcd_two, newbootcd_three]
-       newalphacd=[newalphacd_one, newalphacd_one, newalphacd_one]
+       #newalphacd=[newalphacd_one, newalphacd_one, newalphacd_one]
        newthankyou=[thankyou,thankyou,thankyou]
        pcuthankyou=[pcuthankyou_one,pcuthankyou_one,pcuthankyou_one]
        NMReset=[nmreset,nmreset,nmreset]
        pcutonodemapping=[pcutonodemapping_one, pcutonodemapping_one, pcutonodemapping_one]
        pcudown=[pcudown_one, pcudown_one, pcudown_one]
 
-       unknownsequence = ("""Unrecognized Error on PlanetLab host %(hostname)s""", 
+       unknownsequence_notice = ("""Unrecognized Error on PlanetLab host %(hostname)s""", 
                                           """
 While trying to automatically recover this machine:
 
@@ -478,7 +489,7 @@ Thank you for your help,
        donation_down = [ donation_down_one, donation_down_one, donation_down_one ]
 
 
-       minimalhardware = ("""Hardware requirements not met on PlanetLab host %(hostname)s""", 
+       minimalhardware_notice = ("""Hardware requirements not met on PlanetLab host %(hostname)s""", 
                                           """
 While trying to automatically recover this machine:
 
@@ -498,7 +509,7 @@ BootManager.log output follows:
 %(bmlog)s
 """      )
 
-       baddisk = ("""Bad Disk on PlanetLab node %(hostname)s""", 
+       baddisk_notice = ("""Bad Disk on PlanetLab node %(hostname)s""", 
                           """As part of PlanetLab node monitoring, we noticed %(hostname)s has a number of disk or media related I/O errors, that prevent it from either booting or reliably running as a PlanetLab node.
 
 Please verify the integrity of the disk, and order a replacement if needed.  If you need to schedule downtime for the node, please let us know at support@planet-lab.org. 
@@ -564,7 +575,7 @@ BootManager.log output follows:
 %(bmlog)s
 """)
 
-       plnode_cfg=(""" Please Update Configuration file for PlanetLab node %(hostname)s""", 
+       nodeconfig_notice=(""" Please Update Configuration file for PlanetLab node %(hostname)s""", 
 """As part of PlanetLab node monitoring, we noticed %(hostname)s has an out-dated plnode.txt file with no NODE_ID or a mis-matched HOSTNAME.  This can happen either due to an initial configuration failure at your site, with information entered into our database, or after a software upgrade.  To resolve the issue we require your assistance.  All that is needed is to visit:
 
        https://www.planet-lab.org/db/nodes/index.php?nodepattern=%(hostname)s
@@ -604,7 +615,7 @@ Thanks.
 """)
 
 
-       baddns=("""Planetlab node down: broken DNS configuration for %(hostname)s""", 
+       baddns_notice=("""Planetlab node down: broken DNS configuration for %(hostname)s""", 
 """As part of PlanetLab node monitoring, we noticed the DNS servers used by the following machine(s) are not responding to queries.
 
     %(hostname)s 
index a0490e4..46ca879 100755 (executable)
@@ -44,31 +44,47 @@ def check_node_state(rec, node):
                boot_state = "unknown"
                last_contact = None
 
+       if boot_state == 'disable': boot_state = 'disabled'
+       if boot_state == 'diag':        boot_state = 'diagnose'
+
        # NOTE: 'DOWN' and 'DEBUG'  are temporary states, so only need
        #                       'translations' into the node.status state
        #               'BOOT' is a permanent state, but we want it to have a bit of
        #                       hysteresis (less than 0.5 days)
 
-       #################################################################3
-       # "Translate" the findbad states into nodebad status.
+       #################################################################
+       # "Initialize" the findbad states into nodebad status if they are not already set
 
-       if node_state == 'DOWN' and ( node.status != 'offline' and node.status != 'down' ) and boot_state != 'disable' :
+       if node_state == 'DOWN' and ( node.status != 'offline' and node.status != 'down' ) and boot_state != 'disabled' :
                print "changed status from %s to offline" % node.status
                node.status = 'offline'
                node.last_changed = datetime.now()
 
-       if node_state == 'DEBUG' and node.status != 'monitordebug':
-               print "changed status from %s to monitordebug" % (node.status)
-               node.status = "monitordebug"
-               node.last_changed = datetime.now()
+       if node_state == 'DEBUG' and node.status != 'monitordebug' and \
+                                                                node.status != 'disabled' and \
+                                                                node.status != 'diagnose':
+               if boot_state != 'disabled' and boot_state != 'diagnose':
+
+                       print "changed status from %s to monitordebug" % (node.status)
+                       node.status = "monitordebug"
+                       node.last_changed = datetime.now()
+               else:
+                       print "changed status from %s to %s" % (node.status, boot_state)
+                       node.status = boot_state
+                       node.last_changed = datetime.now()
 
        if node_state == 'BOOT' and node.status != 'online' and node.status != 'good':
                print "changed status from %s to online" % node.status
                node.status = 'online'
                node.last_changed = datetime.now()
 
-       #################################################################3
+       #################################################################
        # Switch temporary hystersis states into their 'firm' states.
+       #         online -> good                after half a day
+       #         offline -> down               after two days
+       #         monitordebug -> down  after 30 days
+       #         diagnose -> monitordebug after 60 days
+       #         disabled -> down              after 60 days
 
        if node.status == 'online' and changed_greaterthan(node.last_changed, 0.5):
                print "changed status from %s to good" % node.status
@@ -80,11 +96,16 @@ def check_node_state(rec, node):
                node.status = 'down'
                # NOTE: do not reset last_changed, or you lose how long it's been down.
 
-       if node.status == 'monitordebug' and changed_greaterthan(node.last_changed, 14):
+       if node.status == 'monitordebug' and changed_greaterthan(node.last_changed, 30):
                print "changed status from %s to down" % node.status
                node.status = 'down'
                # NOTE: do not reset last_changed, or you lose how long it's been down.
-               #node.last_changed = datetime.now()
+
+       if node.status == 'diagnose' and changed_greaterthan(node.last_changed, 60):
+               print "changed status from %s to down" % node.status
+               # NOTE: change an admin mode back into monitordebug after two months.
+               node.status = 'monitordebug'
+               node.last_changed = datetime.now()
 
        # extreme cases of offline nodes
        if ( boot_state == 'disabled' or last_contact == None ) and \
index 899d667..47627b4 100644 (file)
@@ -197,6 +197,7 @@ class SSH(CMD):
        def run_noexcept2(self, cmd, timeout=COMMAND_TIMEOUT*2):
                cmd = "exec ssh -p %s %s %s@%s %s" % (self.port, self.__options_to_str(), 
                                                                        self.user, self.host, cmd)
+               #print cmd
                r = CMD.run_noexcept(self, cmd, timeout)
                self.ret = -1
 
index 1178aa1..0d4e703 100644 (file)
@@ -12,14 +12,15 @@ from monitor.database.zabbixapi.model import *
 from monitor.database.dborm import zab_session as session
 from monitor.database.dborm import zab_metadata as metadata
 
-from pcucontrol import reboot
+from monitor import reboot
+from monitor import scanapi
+
 from monitor.wrapper.plccache import plcdb_id2lb as site_id2lb
 from monitor.wrapper.plccache import plcdb_hn2lb as site_hn2lb
 from monitor.wrapper.plccache import plcdb_lb2hn as site_lb2hn
 
 from monitorweb.templates.links import *
 
-from monitor import scanapi
 
 
 def query_to_dict(query):
@@ -103,7 +104,7 @@ class NodeWidget(widgets.Widget):
 
 def prep_node_for_display(node):
        if node.plc_pcuid:
-               pcu = FindbadPCURecord.get_latest_by(plc_pcuid=node.plc_pcuid).first()
+               pcu = FindbadPCURecord.get_latest_by(plc_pcuid=node.plc_pcuid)
                if pcu:
                        node.pcu_status = pcu.reboot_trial_status
                        node.pcu_short_status = format_pcu_shortstatus(pcu)
@@ -168,40 +169,72 @@ class Root(controllers.RootController):
                return self.pcuview(None, hostname) # dict(nodequery=nodequery)
 
        @expose(template="monitorweb.templates.nodelist")
-       def node(self, filter='BOOT'):
+       def node(self, filter='boot'):
                import time
                fbquery = FindbadNodeRecord.get_all_latest()
                query = []
-               filtercount = {'DOWN' : 0, 'BOOT': 0, 'DEBUG' : 0, 'neverboot' : 0, 'pending' : 0, 'all' : 0, None : 0}
+               filtercount = {'down' : 0, 'boot': 0, 'debug' : 0, 'diagnose' : 0, 'disabled': 0, 
+                                               'neverboot' : 0, 'pending' : 0, 'all' : 0, None : 0}
                for node in fbquery:
                        # NOTE: reformat some fields.
                        prep_node_for_display(node)
 
-                       # NOTE: count filters
-                       if node.observed_status != 'DOWN':
-                               print node.hostname, node.observed_status
-                               filtercount[node.observed_status] += 1
-                       else:
+                       node.history.status
+
+                       if node.history.status in ['down', 'offline']:
                                if node.plc_node_stats and node.plc_node_stats['last_contact'] != None:
-                                       filtercount[node.observed_status] += 1
+                                       filtercount['down'] += 1
                                else:
                                        filtercount['neverboot'] += 1
+                       elif node.history.status in ['good', 'online']:
+                               filtercount['boot'] += 1
+                       elif node.history.status in ['debug', 'monitordebug']:
+                               filtercount['debug'] += 1
+                       else:
+                               filtercount[node.history.status] += 1
+                               
+                       ## NOTE: count filters
+                       #if node.observed_status != 'DOWN':
+                       #       print node.hostname, node.observed_status
+                       #       if node.observed_status == 'DEBUG':
+                       #               if node.plc_node_stats['boot_state'] in ['debug', 'diagnose', 'disabled']:
+                       #                       filtercount[node.plc_node_stats['boot_state']] += 1
+                       #               else:
+                       #                       filtercount['debug'] += 1
+                       #                       
+                       #       else:
+                       #               filtercount[node.observed_status] += 1
+                       #else:
+                       #       if node.plc_node_stats and node.plc_node_stats['last_contact'] != None:
+                       #               filtercount[node.observed_status] += 1
+                       #       else:
+                       #               filtercount['neverboot'] += 1
 
                        # NOTE: apply filter
-                       if filter == node.observed_status:
-                               if filter == "DOWN":
-                                       if node.plc_node_stats['last_contact'] != None:
-                                               query.append(node)
-                               else:
-                                       query.append(node)
-                       elif filter == "neverboot":
+                       if filter == "neverboot":
                                if not node.plc_node_stats or node.plc_node_stats['last_contact'] == None:
                                        query.append(node)
-                       elif filter == "pending":
-                               # TODO: look in message logs...
-                               pass
                        elif filter == "all":
                                query.append(node)
+                       elif filter == node.history.status:
+                               query.append(node)
+
+                       #if filter == node.observed_status:
+                       #       if filter == "DOWN":
+                       #               if node.plc_node_stats['last_contact'] != None:
+                       #                       query.append(node)
+                       #       else:
+                       #               query.append(node)
+                       #elif filter == "neverboot":
+                       #       if not node.plc_node_stats or node.plc_node_stats['last_contact'] == None:
+                       #               query.append(node)
+                       #elif filter == "pending":
+                       #       # TODO: look in message logs...
+                       #       pass
+                       #elif filter == node.plc_node_stats['boot_state']:
+                       #       query.append(node)
+                       #elif filter == "all":
+                       #       query.append(node)
                                
                widget = NodeWidget(template='monitorweb.templates.node_template')
                return dict(now=time.ctime(), query=query, fc=filtercount, nodewidget=widget)
@@ -222,7 +255,7 @@ class Root(controllers.RootController):
                                if 'pcuid' in val:
                                        pcuid = val['pcuid']
                                elif 'hostname' in val:
-                                       pcuid = FindbadNodeRecord.get_latest_by(hostname=val['hostname']).first().plc_pcuid
+                                       pcuid = FindbadNodeRecord.get_latest_by(hostname=val['hostname']).plc_pcuid
                                else:
                                        pcuid=None
                        else:
@@ -304,7 +337,7 @@ class Root(controllers.RootController):
                                        prep_node_for_display(node)
                                        nodequery += [node]
                                        if node.plc_pcuid:      # not None
-                                               pcu = FindbadPCURecord.get_latest_by(plc_pcuid=node.plc_pcuid).first()
+                                               pcu = FindbadPCURecord.get_latest_by(plc_pcuid=node.plc_pcuid)
                                                prep_pcu_for_display(pcu)
                                                pcus[node.plc_pcuid] = pcu
 
@@ -326,7 +359,6 @@ class Root(controllers.RootController):
                                        node = FindbadNodeRecord.get_latest_by(hostname=nodename)
                                        print "%s" % node.port_status
                                        print "%s" % node.to_dict()
-                                       print "%s" % len(q.all())
                                        if node:
                                                prep_node_for_display(node)
                                                nodequery += [node]
index 5b4e7c3..53bbe5b 100644 (file)
@@ -13,17 +13,19 @@ from links import *
        <table width="100%">
                <thead>
                        <tr>
-                               <th><a href="${link('node', filter='BOOT')}">Production(${fc['BOOT']})</a></th>
-                               <th><a href="${link('node', filter='DEBUG')}">Debug(${fc['DEBUG']})</a></th>
-                               <th><a href="${link('node', filter='DOWN')}">Down(${fc['DOWN']})</a></th>
+                               <th><a href="${link('node', filter='boot')}">Prod(${fc['boot']})</a></th>
+                               <th><a href="${link('node', filter='down')}">Down(${fc['down']})</a></th>
+                               <th><a href="${link('node', filter='monitordebug')}">Errors(${fc['debug']})</a></th>
+                               <th><a href="${link('node', filter='diagnose')}">Diagnose (${fc['diagnose']})</a></th>
+                               <th><a href="${link('node', filter='disabled')}">Disabled (${fc['disabled']})</a></th>
                                <th><a href="${link('node', filter='neverboot')}">Never Booted(${fc['neverboot']})</a></th>
-                               <th><a href="${link('node', filter='pending')}">Pending Reply(${fc['pending']})</a></th>
+                               <!--th><a href="${link('node', filter='pending')}">Pending Reply(${fc['pending']})</a></th-->
                                <th><a href="${link('node', filter='all')}">All</a></th>
                        </tr>
                </thead>
                <tbody>
                <tr>
-               <td colspan="5">
+               <td colspan="7">
                <table id="sortable_table" class="datagrid" border="1" width="100%">
                        <thead>
                                <tr>