X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=nodebad.py;h=90c3be025a3d7b1d806bf7153b52fd5d6713c352;hb=ceacc2c6b09245cf360dc8f2afecc702d7540553;hp=767a4fe634e0c3aa569b89f04625bf2780e99b10;hpb=6e0002914c3efb262f55cb250a0627b40659ccbf;p=monitor.git diff --git a/nodebad.py b/nodebad.py index 767a4fe..90c3be0 100755 --- a/nodebad.py +++ b/nodebad.py @@ -22,33 +22,77 @@ api = plc.getAuthAPI() round = 1 count = 0 +def main(): + main2(config) -def main(config): +def main2(config): l_plcnodes = plccache.l_nodes l_nodes = get_nodeset(config) checkAndRecordState(l_nodes, l_plcnodes) +# Node states: + +def check_node_state(rec, node): + + node_state = rec.observed_status + if rec.plc_node_stats: + boot_state = rec.plc_node_stats['boot_state'] + last_contact = rec.plc_node_stats['last_contact'] + else: + boot_state = "unknown" + last_contact = None + + if node_state == 'DOWN' and ( node.status == 'online' or node.status == 'good' ): + print "changed status from %s to offline" % node.status + node.status = 'offline' + node.last_changed = datetime.now() + + if node_state == 'BOOT' and changed_lessthan(node.last_changed, 0.5) and node.status != 'online': + print "changed status from %s to online" % node.status + node.status = 'online' + node.last_changed = datetime.now() + + if node.status == 'online' and changed_greaterthan(node.last_changed, 0.5): + #send thank you notice, or on-line notice. + print "changed status from %s to good" % node.status + node.status = 'good' + # NOTE: do not reset last_changed, or you lose how long it's been up. + + #if node.status == 'offline' and changed_greaterthan(node.last_changed, 1): # and pcu.status == 'good' + # # attempt reboots + # pass + #if node.status == 'offline' and changed_greaterthan(node.last_changed, 1.5): # and node.has_pcu + # # send PCU failure message + # pass + + if node.status == 'offline' and changed_greaterthan(node.last_changed, 2): + print "changed status from %s to down" % node.status + # send down node notice + node.status = 'down' + node.last_changed = datetime.now() + + if ( boot_state == 'disabled' or last_contact == None ) and \ + changed_greaterthan(node.last_changed, 2*30) and \ + node.status != 'down': + print "changed status from %s to down" % node.status + node.status = 'down' + node.last_changed = datetime.now() + def checkAndRecordState(l_nodes, l_plcnodes): global count for nodename in l_nodes: - d_node = None - for node in l_plcnodes: - if node['hostname'] == nodename: - d_node = node - break - if not d_node: - continue - pf = HistoryNodeRecord.findby_or_create(hostname=nodename) - pf.last_checked = datetime.now() + nodehist = HistoryNodeRecord.findby_or_create(hostname=nodename, + if_new_set={'status' : 'offline', + 'last_changed' : datetime.now()}) + nodehist.last_checked = datetime.now() try: # Find the most recent record noderec = FindbadNodeRecord.query.filter(FindbadNodeRecord.hostname==nodename).order_by(FindbadNodeRecord.date_checked.desc()).first() - #print "NODEREC: ", noderec.date_checked except: print "COULD NOT FIND %s" % nodename import traceback @@ -59,33 +103,16 @@ def checkAndRecordState(l_nodes, l_plcnodes): print "none object for %s"% nodename continue - node_state = noderec.observed_status - if noderec.plc_node_stats: - boot_state = noderec.plc_node_stats['boot_state'] - else: - boot_state = "unknown" - - if node_state == "BOOT": - if pf.status != "good": - pf.last_changed = datetime.now() - pf.status = "good" - elif node_state == "DEBUG": - if pf.status != boot_state: - pf.last_changed = datetime.now() - pf.status = boot_state - else: - if pf.status != "down": - pf.last_changed = datetime.now() - pf.status = "down" + check_node_state(noderec, nodehist) count += 1 - print "%d %35s %s since(%s)" % (count, nodename, pf.status, diff_time(time.mktime(pf.last_changed.timetuple()))) + print "%d %35s %s since(%s)" % (count, nodename, nodehist.status, diff_time(time.mktime(nodehist.last_changed.timetuple()))) # NOTE: this commits all pending operations to the DB. Do not remove, or # replace with another operations that also commits all pending ops, such # as session.commit() or flush() or something - print HistoryNodeRecord.query.count() session.flush() + print HistoryNodeRecord.query.count() return True @@ -97,7 +124,7 @@ if __name__ == '__main__': config = parsermodule.parse_args(parser) try: - main(config) + main2(config) except Exception, err: import traceback print traceback.print_exc()