X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=policy.py;h=77cf76e2b6097110d53a56ba73efc86cc3de27ee;hb=40884a302bf204a7f42044b72d87f9431ad6dd35;hp=f9605ae77dc6746647840e7f880f8a0b2255b468;hpb=035a846d8617889c01cae12bc6d64eb7c48b64bd;p=monitor.git diff --git a/policy.py b/policy.py index f9605ae..77cf76e 100755 --- a/policy.py +++ b/policy.py @@ -91,6 +91,7 @@ def main(hostnames, sitenames): # if it is offline and HAS a PCU, then try to use it. if nodehist.haspcu and nodehist.status in ['offline', 'down'] and \ changed_greaterthan(nodehist.last_changed,1.0) and \ + not nodehist.firewall and \ not found_between(recent_actions, 'try_reboot', 3.5, 1): sitehist.attemptReboot(host) @@ -100,6 +101,7 @@ def main(hostnames, sitenames): # will be false for a day after the above condition is satisfied if nodehist.haspcu and nodehist.status in ['offline', 'down'] and \ changed_greaterthan(nodehist.last_changed,1.5) and \ + not nodehist.firewall and \ found_between(recent_actions, 'try_reboot', 3.5, 1) and \ not found_within(recent_actions, 'pcufailed_notice', 3.5): @@ -108,7 +110,7 @@ def main(hostnames, sitenames): sitehist.sendMessage('pcufailed_notice', hostname=host) print "send message for host %s PCU Failure" % host - if nodehist.status == 'monitordebug' and \ + if nodehist.status == 'failboot' and \ changed_greaterthan(nodehist.last_changed, 1) and \ not found_between(recent_actions, 'bootmanager_restore', 0.5, 0): # send down node notice @@ -119,12 +121,17 @@ def main(hostnames, sitenames): # sitehist.sendMessage('retry_bootman', hostname=host) if nodehist.status == 'down' and \ - changed_greaterthan(nodehist.last_changed, 2) and \ - not found_within(recent_actions, 'down_notice', 3.5): - # send down node notice - - sitehist.sendMessage('down_notice', hostname=host) - print "send message for host %s down" % host + changed_greaterthan(nodehist.last_changed, 2): + if not nodehist.firewall and not found_within(recent_actions, 'down_notice', 3.5): + # send down node notice + sitehist.sendMessage('down_notice', hostname=host) + print "send message for host %s down" % host + + if nodehist.firewall and not found_within(recent_actions, 'firewall_notice', 3.5): + # send down node notice + #email_exception(host, "firewall_notice") + sitehist.sendMessage('firewall_notice', hostname=host) + print "send message for host %s down" % host node_count = node_count + 1 print "time: ", time.strftime('%Y-%m-%d %H:%M:%S') @@ -139,6 +146,8 @@ def main(hostnames, sitenames): if siteblack and not siteblack.expired(): print "skipping %s due to blacklist. will expire %s" % (site, siteblack.willExpire() ) skip_due_to_blacklist=True + sitehist.clearPenalty() + sitehist.applyPenalty() continue # TODO: make query only return records within a certin time range,