Tagging module Monitor - Monitor-2.0-10
[monitor.git] / nodebad.py
index 90c3be0..c3aae39 100755 (executable)
@@ -38,41 +38,82 @@ def check_node_state(rec, node):
 
        node_state = rec.observed_status
        if rec.plc_node_stats:
+               print rec.plc_node_stats
                boot_state = rec.plc_node_stats['boot_state']
                last_contact = rec.plc_node_stats['last_contact']
        else:
                boot_state = "unknown"
                last_contact = None
 
-       if node_state == 'DOWN' and ( node.status == 'online' or node.status == 'good' ):
+       if boot_state == 'disable': boot_state = 'disabled'
+       if boot_state == 'diag':        boot_state = 'diagnose'
+
+       if len(rec.plc_node_stats['pcu_ids']) > 0:
+               node.haspcu = True
+       else:
+               node.haspcu = False
+
+       # NOTE: 'DOWN' and 'DEBUG'  are temporary states, so only need
+       #                       'translations' into the node.status state
+       #               'BOOT' is a permanent state, but we want it to have a bit of
+       #                       hysteresis (less than 0.5 days)
+
+       #################################################################
+       # "Initialize" the findbad states into nodebad status if they are not already set
+
+       if node_state == 'DOWN' and ( node.status != 'offline' and node.status != 'down' ) and boot_state != 'disabled' :
                print "changed status from %s to offline" % node.status
                node.status = 'offline'
                node.last_changed = datetime.now()
 
-       if node_state == 'BOOT' and changed_lessthan(node.last_changed, 0.5) and node.status != 'online':
+       if node_state == 'DEBUG' and node.status != 'monitordebug' and \
+                                                                node.status != 'disabled' and \
+                                                                node.status != 'diagnose':
+               if boot_state != 'disabled' and boot_state != 'diagnose':
+
+                       print "changed status from %s to monitordebug" % (node.status)
+                       node.status = "monitordebug"
+                       node.last_changed = datetime.now()
+               else:
+                       print "changed status from %s to %s" % (node.status, boot_state)
+                       node.status = boot_state
+                       node.last_changed = datetime.now()
+
+       if node_state == 'BOOT' and node.status != 'online' and node.status != 'good':
                print "changed status from %s to online" % node.status
                node.status = 'online'
                node.last_changed = datetime.now()
 
+       #################################################################
+       # Switch temporary hysteresis states into their 'firm' states.
+       #         online -> good                after half a day
+       #         offline -> down               after two days
+       #         monitordebug -> down  after 30 days
+       #         diagnose -> monitordebug after 60 days
+       #         disabled -> down              after 60 days
+
        if node.status == 'online' and changed_greaterthan(node.last_changed, 0.5):
-               #send thank you notice, or on-line notice.
                print "changed status from %s to good" % node.status
                node.status = 'good'
                # NOTE: do not reset last_changed, or you lose how long it's been up.
 
-       #if node.status == 'offline' and changed_greaterthan(node.last_changed, 1): #  and pcu.status == 'good' 
-       #       # attempt reboots
-       #       pass
-       #if node.status == 'offline' and changed_greaterthan(node.last_changed, 1.5): # and node.has_pcu
-       #       # send PCU failure message
-       #       pass
-
        if node.status == 'offline' and changed_greaterthan(node.last_changed, 2):
                print "changed status from %s to down" % node.status
-               # send down node notice
                node.status = 'down'
+               # NOTE: do not reset last_changed, or you lose how long it's been down.
+
+       if node.status == 'monitordebug' and changed_greaterthan(node.last_changed, 30):
+               print "changed status from %s to down" % node.status
+               node.status = 'down'
+               # NOTE: do not reset last_changed, or you lose how long it's been down.
+
+       if node.status == 'diagnose' and changed_greaterthan(node.last_changed, 60):
+               print "changed status from %s to down" % node.status
+               # NOTE: change an admin mode back into monitordebug after two months.
+               node.status = 'monitordebug'
                node.last_changed = datetime.now()
 
+       # extreme cases of offline nodes
        if ( boot_state == 'disabled' or last_contact == None ) and \
                        changed_greaterthan(node.last_changed, 2*30) and \
                        node.status != 'down':
@@ -92,10 +133,11 @@ def checkAndRecordState(l_nodes, l_plcnodes):
 
                try:
                        # Find the most recent record
-                       noderec = FindbadNodeRecord.query.filter(FindbadNodeRecord.hostname==nodename).order_by(FindbadNodeRecord.date_checked.desc()).first()
+                       noderec = FindbadNodeRecord.get_latest_by(hostname=nodename)
                except:
                        print "COULD NOT FIND %s" % nodename
                        import traceback
+                       email_exception()
                        print traceback.print_exc()
                        continue
 
@@ -108,11 +150,8 @@ def checkAndRecordState(l_nodes, l_plcnodes):
                count += 1
                print "%d %35s %s since(%s)" % (count, nodename, nodehist.status, diff_time(time.mktime(nodehist.last_changed.timetuple())))
 
-       # NOTE: this commits all pending operations to the DB.  Do not remove, or
-       # replace with another operations that also commits all pending ops, such
-       # as session.commit() or flush() or something
+       # NOTE: this commits all pending operations to the DB.  Do not remove. 
        session.flush()
-       print HistoryNodeRecord.query.count()
 
        return True