X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=nodebad.py;h=7f512e16656e3a499887240d645b495061fae530;hb=820bfaa1c3b4e91209a6c0c55001714b3b78f77f;hp=46ca879bbc78dd05ad6689e3f959e98285bdf807;hpb=c9d06f3b274ecbc092a0b3eb1f5ceb6c0f734aad;p=monitor.git diff --git a/nodebad.py b/nodebad.py index 46ca879..7f512e1 100755 --- a/nodebad.py +++ b/nodebad.py @@ -38,14 +38,22 @@ def check_node_state(rec, node): node_state = rec.observed_status if rec.plc_node_stats: + print rec.plc_node_stats boot_state = rec.plc_node_stats['boot_state'] last_contact = rec.plc_node_stats['last_contact'] + node.plc_nodeid = rec.plc_node_stats['node_id'] else: boot_state = "unknown" last_contact = None if boot_state == 'disable': boot_state = 'disabled' - if boot_state == 'diag': boot_state = 'diagnose' + if boot_state == 'diag' or boot_state == 'diagnose': boot_state = 'safeboot' + + if len(rec.plc_node_stats['pcu_ids']) > 0: + node.haspcu = True + else: + node.haspcu = False + # NOTE: 'DOWN' and 'DEBUG' are temporary states, so only need # 'translations' into the node.status state @@ -62,8 +70,8 @@ def check_node_state(rec, node): if node_state == 'DEBUG' and node.status != 'monitordebug' and \ node.status != 'disabled' and \ - node.status != 'diagnose': - if boot_state != 'disabled' and boot_state != 'diagnose': + node.status != 'safeboot': + if boot_state != 'disabled' and boot_state != 'safeboot': print "changed status from %s to monitordebug" % (node.status) node.status = "monitordebug" @@ -83,7 +91,7 @@ def check_node_state(rec, node): # online -> good after half a day # offline -> down after two days # monitordebug -> down after 30 days - # diagnose -> monitordebug after 60 days + # safeboot -> monitordebug after 60 days # disabled -> down after 60 days if node.status == 'online' and changed_greaterthan(node.last_changed, 0.5): @@ -101,7 +109,7 @@ def check_node_state(rec, node): node.status = 'down' # NOTE: do not reset last_changed, or you lose how long it's been down. - if node.status == 'diagnose' and changed_greaterthan(node.last_changed, 60): + if node.status == 'safeboot' and changed_greaterthan(node.last_changed, 60): print "changed status from %s to down" % node.status # NOTE: change an admin mode back into monitordebug after two months. node.status = 'monitordebug' @@ -131,6 +139,7 @@ def checkAndRecordState(l_nodes, l_plcnodes): except: print "COULD NOT FIND %s" % nodename import traceback + email_exception() print traceback.print_exc() continue @@ -143,11 +152,8 @@ def checkAndRecordState(l_nodes, l_plcnodes): count += 1 print "%d %35s %s since(%s)" % (count, nodename, nodehist.status, diff_time(time.mktime(nodehist.last_changed.timetuple()))) - # NOTE: this commits all pending operations to the DB. Do not remove, or - # replace with another operations that also commits all pending ops, such - # as session.commit() or flush() or something + # NOTE: this commits all pending operations to the DB. Do not remove. session.flush() - print HistoryNodeRecord.query.count() return True