svn merge -r 12308:13112 https://svn.planet-lab.org/svn/Monitor/branches/2.0/

[monitor.git] / nodebad.py
diff --git a/nodebad.py b/nodebad.py

index 767a4fe..46ca879 100755 (executable)
--- a/nodebad.py
+++ b/nodebad.py
@@ -22,33 +22,112 @@ api = plc.getAuthAPI()
  
  round = 1
  count = 0
+def main():
+       main2(config)   # NOTE(review): `config` is not a parameter or local here, so it resolves to a module-level name; confirm it is defined whenever main() is called
  
-def main(config):
+def main2(config):
  
         l_plcnodes = plccache.l_nodes
         l_nodes = get_nodeset(config)
         
         checkAndRecordState(l_nodes, l_plcnodes)
  
+# Node states: the values check_node_state() assigns to node.status are
+#       offline, down, monitordebug, disabled, diagnose, online, good.
+def check_node_state(rec, node):
+       """Fold the latest findbad observation into a node's history record.
+
+       rec supplies observed_status and plc_node_stats (a dict, or a false
+       value when absent); node has its status and last_changed attributes
+       updated in place.  Nothing is returned.
+       """
+       node_state = rec.observed_status
+       if rec.plc_node_stats:
+               boot_state = rec.plc_node_stats['boot_state']
+               last_contact = rec.plc_node_stats['last_contact']
+       else:
+               boot_state = "unknown"
+               last_contact = None
+       # Map the short boot_state spellings onto the names used for node.status.
+       if boot_state == 'disable': boot_state = 'disabled'
+       if boot_state == 'diag':        boot_state = 'diagnose'
+
+       # NOTE: 'DOWN' and 'DEBUG' are temporary states, so they only need
+       #       'translations' into the node.status state.  'BOOT' is a permanent
+       #       state, but we want it to have a bit of hysteresis (less than 0.5 days).
+       #################################################################
+       # "Initialize" the findbad states into nodebad status if they are not already set
+       if node_state == 'DOWN' and ( node.status != 'offline' and node.status != 'down' ) and boot_state != 'disabled' :
+               print "changed status from %s to offline" % node.status
+               node.status = 'offline'
+               node.last_changed = datetime.now()
+
+       if node_state == 'DEBUG' and node.status != 'monitordebug' and \
+                                                                node.status != 'disabled' and \
+                                                                node.status != 'diagnose':
+               if boot_state != 'disabled' and boot_state != 'diagnose':
+                       print "changed status from %s to monitordebug" % (node.status)
+                       node.status = "monitordebug"
+                       node.last_changed = datetime.now()
+               else:
+                       print "changed status from %s to %s" % (node.status, boot_state)
+                       node.status = boot_state
+                       node.last_changed = datetime.now()
+
+       if node_state == 'BOOT' and node.status != 'online' and node.status != 'good':
+               print "changed status from %s to online" % node.status
+               node.status = 'online'
+               node.last_changed = datetime.now()
+
+       #################################################################
+       # Switch temporary hysteresis states into their 'firm' states.
+       #         online -> good                after half a day
+       #         offline -> down               after two days
+       #         monitordebug -> down  after 30 days
+       #         diagnose -> monitordebug after 60 days
+       #         disabled -> down              after 60 days
+       if node.status == 'online' and changed_greaterthan(node.last_changed, 0.5):
+               print "changed status from %s to good" % node.status
+               node.status = 'good'
+               # NOTE: do not reset last_changed, or you lose how long it's been up.
+
+       if node.status == 'offline' and changed_greaterthan(node.last_changed, 2):
+               print "changed status from %s to down" % node.status
+               node.status = 'down'
+               # NOTE: do not reset last_changed, or you lose how long it's been down.
+
+       if node.status == 'monitordebug' and changed_greaterthan(node.last_changed, 30):
+               print "changed status from %s to down" % node.status
+               node.status = 'down'
+               # NOTE: do not reset last_changed, or you lose how long it's been down.
+
+       if node.status == 'diagnose' and changed_greaterthan(node.last_changed, 60):
+               print "changed status from %s to monitordebug" % node.status
+               # NOTE: change an admin mode back into monitordebug after two months.
+               node.status = 'monitordebug'
+               node.last_changed = datetime.now()
+
+       # extreme cases of offline nodes
+       if ( boot_state == 'disabled' or last_contact is None ) and \
+                       changed_greaterthan(node.last_changed, 2*30) and \
+                       node.status != 'down':
+               print "changed status from %s to down" % node.status
+               node.status = 'down'
+               node.last_changed = datetime.now()
+
  def checkAndRecordState(l_nodes, l_plcnodes):
         global count
  
         for nodename in l_nodes:
-               d_node = None
-               for node in l_plcnodes:
-                       if node['hostname'] == nodename:
-                               d_node = node
-                               break
-               if not d_node:
-                       continue
  
-               pf = HistoryNodeRecord.findby_or_create(hostname=nodename)
-               pf.last_checked = datetime.now()
+               nodehist = HistoryNodeRecord.findby_or_create(hostname=nodename, 
+                                                       if_new_set={'status' : 'offline', 
+                                                                               'last_changed' : datetime.now()})
+               nodehist.last_checked = datetime.now()
  
                 try:
                         # Find the most recent record
-                       noderec = FindbadNodeRecord.query.filter(FindbadNodeRecord.hostname==nodename).order_by(FindbadNodeRecord.date_checked.desc()).first()
-                       #print "NODEREC: ", noderec.date_checked
+                       noderec = FindbadNodeRecord.get_latest_by(hostname=nodename)
                 except:
                         print "COULD NOT FIND %s" % nodename
                         import traceback
@@ -59,33 +138,16 @@ def checkAndRecordState(l_nodes, l_plcnodes):
                         print "none object for %s"% nodename
                         continue
  
-               node_state = noderec.observed_status
-               if noderec.plc_node_stats:
-                       boot_state = noderec.plc_node_stats['boot_state']
-               else:
-                       boot_state = "unknown"
-
-               if node_state == "BOOT":
-                       if pf.status != "good": 
-                               pf.last_changed = datetime.now()
-                               pf.status = "good"
-               elif node_state == "DEBUG":
-                       if pf.status != boot_state: 
-                               pf.last_changed = datetime.now()
-                               pf.status = boot_state
-               else:
-                       if pf.status != "down": 
-                               pf.last_changed = datetime.now()
-                               pf.status = "down"
+               check_node_state(noderec, nodehist)
  
                 count += 1
-               print "%d %35s %s since(%s)" % (count, nodename, pf.status, diff_time(time.mktime(pf.last_changed.timetuple())))
+               print "%d %35s %s since(%s)" % (count, nodename, nodehist.status, diff_time(time.mktime(nodehist.last_changed.timetuple())))
  
         # NOTE: this commits all pending operations to the DB.  Do not remove, or
         # replace with another operations that also commits all pending ops, such
         # as session.commit() or flush() or something
-       print HistoryNodeRecord.query.count()
         session.flush()
+       print HistoryNodeRecord.query.count()
  
         return True
  
@@ -97,7 +159,7 @@ if __name__ == '__main__':
         config = parsermodule.parse_args(parser)
  
         try:
-               main(config)
+               main2(config)
         except Exception, err:
                 import traceback
                 print traceback.print_exc()