changed 'monitordebug' to failboot
[monitor.git] / policy.py
index f9605ae..77cf76e 100755 (executable)
--- a/policy.py
+++ b/policy.py
@@ -91,6 +91,7 @@ def main(hostnames, sitenames):
                # if it is offline and HAS a PCU, then try to use it.
                if nodehist.haspcu and nodehist.status in ['offline', 'down'] and \
                        changed_greaterthan(nodehist.last_changed,1.0) and \
+                       not nodehist.firewall and \
                        not found_between(recent_actions, 'try_reboot', 3.5, 1):
 
                                sitehist.attemptReboot(host)
@@ -100,6 +101,7 @@ def main(hostnames, sitenames):
                #               will be false for a day after the above condition is satisfied
                if nodehist.haspcu and nodehist.status in ['offline', 'down'] and \
                        changed_greaterthan(nodehist.last_changed,1.5) and \
+                       not nodehist.firewall and \
                        found_between(recent_actions, 'try_reboot', 3.5, 1) and \
                        not found_within(recent_actions, 'pcufailed_notice', 3.5):
                                
@@ -108,7 +110,7 @@ def main(hostnames, sitenames):
                                sitehist.sendMessage('pcufailed_notice', hostname=host)
                                print "send message for host %s PCU Failure" % host
 
-               if nodehist.status == 'monitordebug' and \
+               if nodehist.status == 'failboot' and \
                        changed_greaterthan(nodehist.last_changed, 1) and \
                        not found_between(recent_actions, 'bootmanager_restore', 0.5, 0):
                                # send down node notice
@@ -119,12 +121,17 @@ def main(hostnames, sitenames):
                        #       sitehist.sendMessage('retry_bootman', hostname=host)
 
                if nodehist.status == 'down' and \
-                       changed_greaterthan(nodehist.last_changed, 2) and \
-                       not found_within(recent_actions, 'down_notice', 3.5):
-                               # send down node notice
-
-                               sitehist.sendMessage('down_notice', hostname=host)
-                               print "send message for host %s down" % host
+                       changed_greaterthan(nodehist.last_changed, 2):
+                               if not nodehist.firewall and not found_within(recent_actions, 'down_notice', 3.5):
+                                       # send down node notice
+                                       sitehist.sendMessage('down_notice', hostname=host)
+                                       print "send message for host %s down" % host
+
+                               if nodehist.firewall and not found_within(recent_actions, 'firewall_notice', 3.5):
+                                       # send firewall notice
+                                       #email_exception(host, "firewall_notice")
+                                       sitehist.sendMessage('firewall_notice', hostname=host)
+                                       print "send message for host %s down" % host
 
                node_count = node_count + 1
                print "time: ", time.strftime('%Y-%m-%d %H:%M:%S')
@@ -139,6 +146,8 @@ def main(hostnames, sitenames):
                if siteblack and not siteblack.expired():
                        print "skipping %s due to blacklist.  will expire %s" % (site, siteblack.willExpire() )
                        skip_due_to_blacklist=True
+                       sitehist.clearPenalty()
+                       sitehist.applyPenalty()
                        continue
 
                # TODO: make query only return records within a certin time range,