X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=policy.py;h=d3af8e5424c5ea74824699a0805aef4de9b131f3;hb=45fea95bb898f254ea7e987d7417e9091885fbee;hp=fe54863c8d41a6a4198d378cd232ce7b9eed6ce3;hpb=32e64e33bc81735e22024c5a44510848bb3c88df;p=monitor.git diff --git a/policy.py b/policy.py index fe54863..d3af8e5 100755 --- a/policy.py +++ b/policy.py @@ -91,25 +91,62 @@ def main(hostnames, sitenames): # if it is offline and HAS a PCU, then try to use it. if nodehist.haspcu and nodehist.status in ['offline', 'down'] and \ changed_greaterthan(nodehist.last_changed,1.0) and \ + not nodehist.firewall and \ not found_between(recent_actions, 'try_reboot', 3.5, 1): + # TODO: there MUST be a better way to do this... + # get fb node record for pcuid + fbpcu = None + fbnode = FindbadNodeRecord.get_latest_by(hostname=host) + if fbnode: + fbpcu = FindbadPCURecord.get_latest_by(plc_pcuid=fbnode.plc_pcuid) + sitehist.attemptReboot(host) print "send message for host %s try_reboot" % host + if not fbpcu.test_is_ok() and \ + not found_within(recent_actions, 'pcuerror_notice', 3.0): + + args = {} + if fbpcu: + args['pcu_name'] = fbpcu.pcu_name() + args['pcu_errors'] = fbpcu.pcu_errors() + args['plc_pcuid'] = fbpcu.plc_pcuid + else: + args['pcu_name'] = "error looking up pcu name" + args['pcu_errors'] = "" + args['plc_pcuid'] = 0 + + args['hostname'] = host + sitehist.sendMessage('pcuerror_notice', **args) + print "send message for host %s PCU Failure" % host + # NOTE: non-intuitive is that found_between(try_reboot, 3.5, 1) # will be false for a day after the above condition is satisfied if nodehist.haspcu and nodehist.status in ['offline', 'down'] and \ changed_greaterthan(nodehist.last_changed,1.5) and \ + not nodehist.firewall and \ found_between(recent_actions, 'try_reboot', 3.5, 1) and \ not found_within(recent_actions, 'pcufailed_notice', 3.5): + # TODO: there MUST be a better way to do this... + # get fb node record for pcuid + fbpcu = None + fbnode = FindbadNodeRecord.get_latest_by(hostname=host) + if fbnode: + fbpcu = FindbadPCURecord.get_latest_by(plc_pcuid=fbnode.plc_pcuid) + if fbpcu: + pcu_name = fbpcu.pcu_name() + else: + pcu_name = "error looking up pcu name" + + # get fb pcu record for pcuid # send pcu failure message - #act = ActionRecord(**kwargs) - sitehist.sendMessage('pcufailed_notice', hostname=host) + sitehist.sendMessage('pcufailed_notice', hostname=host, pcu_name=pcu_name) print "send message for host %s PCU Failure" % host - if nodehist.status == 'monitordebug' and \ - changed_greaterthan(nodehist.last_changed, 1) and \ + if nodehist.status == 'failboot' and \ + changed_greaterthan(nodehist.last_changed, 0.25) and \ not found_between(recent_actions, 'bootmanager_restore', 0.5, 0): # send down node notice # delay 0.5 days before retrying... @@ -127,7 +164,7 @@ def main(hostnames, sitenames): if nodehist.firewall and not found_within(recent_actions, 'firewall_notice', 3.5): # send down node notice - email_exception(host, "firewall_notice") + #email_exception(host, "firewall_notice") sitehist.sendMessage('firewall_notice', hostname=host) print "send message for host %s down" % host