add pcu_name to pcufailed_notice

[monitor.git] / nodebad.py
diff --git a/nodebad.py b/nodebad.py

index 46ca879..e7fc819 100755 (executable)
--- a/nodebad.py
+++ b/nodebad.py
@@ -38,39 +38,60 @@ def check_node_state(rec, node):
  
         node_state = rec.observed_status
         if rec.plc_node_stats:
+               print rec.plc_node_stats
                 boot_state = rec.plc_node_stats['boot_state']
+               run_level = rec.plc_node_stats['run_level']
                 last_contact = rec.plc_node_stats['last_contact']
+               node.plc_nodeid = rec.plc_node_stats['node_id']
         else:
                 boot_state = "unknown"
                 last_contact = None
  
         if boot_state == 'disable': boot_state = 'disabled'
-       if boot_state == 'diag':        boot_state = 'diagnose'
+       if boot_state == 'diag' or boot_state == 'diagnose': boot_state = 'safeboot'
+
+       if len(rec.plc_node_stats['pcu_ids']) > 0:
+               node.haspcu = True
+       else:
+               node.haspcu = False
+
+       node.firewall = rec.firewall
+       node.plc_siteid = rec.plc_node_stats['site_id']
  
         # NOTE: 'DOWN' and 'DEBUG'  are temporary states, so only need
         #                       'translations' into the node.status state
         #               'BOOT' is a permanent state, but we want it to have a bit of
         #                       hysteresis (less than 0.5 days)
-
         #################################################################
         # "Initialize" the findbad states into nodebad status if they are not already set
  
-       if node_state == 'DOWN' and ( node.status != 'offline' and node.status != 'down' ) and boot_state != 'disabled' :
-               print "changed status from %s to offline" % node.status
-               node.status = 'offline'
-               node.last_changed = datetime.now()
+       if node_state == 'DOWN':
+               if boot_state == 'disabled' and changed_lessthan(node.last_changed, 60) and \
+                       node.status != 'disabled':
+                       # NOTE: if the status changed less than 2 months ago, allow the
+                       # 'disabled' status; after more than 2 months, 'down' is applied (below).
  
-       if node_state == 'DEBUG' and node.status != 'monitordebug' and \
-                                                                node.status != 'disabled' and \
-                                                                node.status != 'diagnose':
-               if boot_state != 'disabled' and boot_state != 'diagnose':
+                       print "changed status from %s to %s" % (node.status, boot_state)
+                       node.status = boot_state
+                       node.last_changed = datetime.now()
  
-                       print "changed status from %s to monitordebug" % (node.status)
-                       node.status = "monitordebug"
+               if node.status not in ['offline', 'down', 'disabled']:
+                       print "changed status from %s to offline" % node.status
+                       node.status = 'offline'
                         node.last_changed = datetime.now()
+
+       if node_state == 'DEBUG':
+               if boot_state != 'disabled' and boot_state != 'safeboot':
+                       print "changed status from %s to failboot" % (node.status)
+                       current_status = "failboot"
                 else:
                         print "changed status from %s to %s" % (node.status, boot_state)
-                       node.status = boot_state
+                       current_status = boot_state
+
+               if current_status != node.status and \
+                       current_status in ['failboot', 'disabled', 'safeboot']:
+
+                       node.status = current_status
                         node.last_changed = datetime.now()
  
         if node_state == 'BOOT' and node.status != 'online' and node.status != 'good':
@@ -82,8 +103,8 @@ def check_node_state(rec, node):
         # Switch temporary hystersis states into their 'firm' states.
         #         online -> good                after half a day
         #         offline -> down               after two days
-       #         monitordebug -> down  after 30 days
-       #         diagnose -> monitordebug after 60 days
+       #         failboot -> down  after 30 days
+       #         safeboot -> failboot after 60 days
         #         disabled -> down              after 60 days
  
         if node.status == 'online' and changed_greaterthan(node.last_changed, 0.5):
@@ -96,15 +117,15 @@ def check_node_state(rec, node):
                 node.status = 'down'
                 # NOTE: do not reset last_changed, or you lose how long it's been down.
  
-       if node.status == 'monitordebug' and changed_greaterthan(node.last_changed, 30):
+       if node.status == 'failboot' and changed_greaterthan(node.last_changed, 30):
                 print "changed status from %s to down" % node.status
                 node.status = 'down'
                 # NOTE: do not reset last_changed, or you lose how long it's been down.
  
-       if node.status == 'diagnose' and changed_greaterthan(node.last_changed, 60):
+       if node.status == 'safeboot' and changed_greaterthan(node.last_changed, 60):
                 print "changed status from %s to down" % node.status
-               # NOTE: change an admin mode back into monitordebug after two months.
-               node.status = 'monitordebug'
+               # NOTE: change an admin mode back into failboot after two months.
+               node.status = 'failboot'
                 node.last_changed = datetime.now()
  
         # extreme cases of offline nodes
@@ -131,6 +152,7 @@ def checkAndRecordState(l_nodes, l_plcnodes):
                 except:
                         print "COULD NOT FIND %s" % nodename
                         import traceback
+                       email_exception()
                         print traceback.print_exc()
                         continue
  
@@ -143,11 +165,8 @@ def checkAndRecordState(l_nodes, l_plcnodes):
                 count += 1
                 print "%d %35s %s since(%s)" % (count, nodename, nodehist.status, diff_time(time.mktime(nodehist.last_changed.timetuple())))
  
-       # NOTE: this commits all pending operations to the DB.  Do not remove, or
-       # replace with another operations that also commits all pending ops, such
-       # as session.commit() or flush() or something
+       # NOTE: this commits all pending operations to the DB.  Do not remove. 
         session.flush()
-       print HistoryNodeRecord.query.count()
  
         return True