X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=nodebad.py;h=dc866645eba29d39184df4544391e3b19bfea537;hb=d56e149e9dbd22321a9d843b41b4d279614937e2;hp=7f512e16656e3a499887240d645b495061fae530;hpb=0a49240ceff10f9da64fd470ed883bb17a11c458;p=monitor.git diff --git a/nodebad.py b/nodebad.py index 7f512e1..dc86664 100755 --- a/nodebad.py +++ b/nodebad.py @@ -6,7 +6,7 @@ import string import time from datetime import datetime,timedelta -from nodequery import verify,query_to_dict,node_select +from monitor.query import verify,query_to_dict,node_select from monitor.common import * @@ -40,6 +40,7 @@ def check_node_state(rec, node): if rec.plc_node_stats: print rec.plc_node_stats boot_state = rec.plc_node_stats['boot_state'] + run_level = rec.plc_node_stats['run_level'] last_contact = rec.plc_node_stats['last_contact'] node.plc_nodeid = rec.plc_node_stats['node_id'] else: @@ -54,31 +55,43 @@ def check_node_state(rec, node): else: node.haspcu = False + node.firewall = rec.firewall + node.plc_siteid = rec.plc_node_stats['site_id'] # NOTE: 'DOWN' and 'DEBUG' are temporary states, so only need # 'translations' into the node.status state # 'BOOT' is a permanent state, but we want it to have a bit of # hysteresis (less than 0.5 days) - ################################################################# # "Initialize" the findbad states into nodebad status if they are not already set - if node_state == 'DOWN' and ( node.status != 'offline' and node.status != 'down' ) and boot_state != 'disabled' : - print "changed status from %s to offline" % node.status - node.status = 'offline' - node.last_changed = datetime.now() + if node_state == 'DOWN': + if boot_state == 'disabled' and changed_lessthan(node.last_changed, 60) and \ + node.status != 'disabled': + # NOTE: if changed less than 2 months, then we can allow this. + # otherwise, apply 'down' status after greater than 2 months (below). - if node_state == 'DEBUG' and node.status != 'monitordebug' and \ - node.status != 'disabled' and \ - node.status != 'safeboot': - if boot_state != 'disabled' and boot_state != 'safeboot': + print "changed status from %s to %s" % (node.status, boot_state) + node.status = boot_state + node.last_changed = datetime.now() - print "changed status from %s to monitordebug" % (node.status) - node.status = "monitordebug" + if node.status not in ['offline', 'down', 'disabled']: + print "changed status from %s to offline" % node.status + node.status = 'offline' node.last_changed = datetime.now() + + if node_state == 'DEBUG': + if boot_state != 'disabled' and boot_state != 'safeboot': + print "changed status from %s to failboot" % (node.status) + current_status = "failboot" else: print "changed status from %s to %s" % (node.status, boot_state) - node.status = boot_state + current_status = boot_state + + if current_status != node.status and \ + current_status in ['failboot', 'disabled', 'safeboot']: + + node.status = current_status node.last_changed = datetime.now() if node_state == 'BOOT' and node.status != 'online' and node.status != 'good': @@ -90,8 +103,8 @@ def check_node_state(rec, node): # Switch temporary hystersis states into their 'firm' states. # online -> good after half a day # offline -> down after two days - # monitordebug -> down after 30 days - # safeboot -> monitordebug after 60 days + # failboot -> down after 30 days + # safeboot -> failboot after 60 days # disabled -> down after 60 days if node.status == 'online' and changed_greaterthan(node.last_changed, 0.5): @@ -104,15 +117,15 @@ def check_node_state(rec, node): node.status = 'down' # NOTE: do not reset last_changed, or you lose how long it's been down. - if node.status == 'monitordebug' and changed_greaterthan(node.last_changed, 30): + if node.status == 'failboot' and changed_greaterthan(node.last_changed, 30): print "changed status from %s to down" % node.status node.status = 'down' # NOTE: do not reset last_changed, or you lose how long it's been down. if node.status == 'safeboot' and changed_greaterthan(node.last_changed, 60): print "changed status from %s to down" % node.status - # NOTE: change an admin mode back into monitordebug after two months. - node.status = 'monitordebug' + # NOTE: change an admin mode back into failboot after two months. + node.status = 'failboot' node.last_changed = datetime.now() # extreme cases of offline nodes