X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=nodebad.py;h=c3aae39619d18335e985f1d62f2da2584d053ce5;hb=6a452e8ece2ca8a47105c128eaebc38507bc76c5;hp=96720fbda5d8bbcb3c4e3d8e895658f1347c36de;hpb=6496f5b4a0220e4055fee76c97f92293f9559117;p=monitor.git

diff --git a/nodebad.py b/nodebad.py
index 96720fb..c3aae39 100755
--- a/nodebad.py
+++ b/nodebad.py
@@ -4,158 +4,168 @@ import os
 import sys
 import string
 import time
+from datetime import datetime,timedelta
 
-
-import database
-import comon
-import threadpool
-import syncplcdb
 from nodequery import verify,query_to_dict,node_select
 
-import plc
-import auth
-api = plc.PLC(auth.auth, auth.plc)
-from unified_model import *
-from monitor_policy import MINUP
+from monitor.common import *
+
+from monitor import config
+from monitor.wrapper import plc,plccache
+from monitor.const import MINUP
+from monitor.database.info.model import  FindbadNodeRecord, HistoryNodeRecord
+from monitor.database.dborm import  mon_session as session
+
+from monitor.model import *
+
+api = plc.getAuthAPI()
 
 round = 1
-externalState = {'round': round, 'nodes': {}}
 count = 0
+def main():
+	main2(config)
 
-def main(config):
-	global externalState
-	externalState = database.if_cached_else(1, config.dbname, lambda : externalState) 
-	if config.increment:
-		# update global round number to force refreshes across all nodes
-		externalState['round'] += 1
+def main2(config):
 
-	l_nodes = syncplcdb.create_plcdb()
-	l_plcnodes = database.dbLoad("l_plcnodes")
-
-	if config.node:
-		l_nodes = [config.node]
-	else:
-		l_nodes = [node['hostname'] for node in l_plcnodes]
+	l_plcnodes = plccache.l_nodes
+	l_nodes = get_nodeset(config)
 	
 	checkAndRecordState(l_nodes, l_plcnodes)
 
+# Node states:
+
+def check_node_state(rec, node):
+
+	node_state = rec.observed_status
+	if rec.plc_node_stats:
+		print rec.plc_node_stats
+		boot_state = rec.plc_node_stats['boot_state']
+		last_contact = rec.plc_node_stats['last_contact']
+	else:
+		boot_state = "unknown"
+		last_contact = None
+
+	if boot_state == 'disable': boot_state = 'disabled'
+	if boot_state == 'diag': 	boot_state = 'diagnose'
+
+	if len(rec.plc_node_stats['pcu_ids']) > 0:
+		node.haspcu = True
+	else:
+		node.haspcu = False
+
+	# NOTE: 'DOWN' and 'DEBUG' are temporary states, so they only need
+	# 			'translations' into the node.status state.
+	#		'BOOT' is a permanent state, but we want it to have a bit of
+	#			hysteresis (less than 0.5 days).
+
+	#################################################################
+	# "Initialize" the findbad states into nodebad status if they are not already set
+
+	if node_state == 'DOWN' and ( node.status != 'offline' and node.status != 'down' ) and boot_state != 'disabled' :
+		print "changed status from %s to offline" % node.status
+		node.status = 'offline'
+		node.last_changed = datetime.now()
+
+	if node_state == 'DEBUG' and node.status != 'monitordebug' and \
+								 node.status != 'disabled' and \
+								 node.status != 'diagnose':
+		if boot_state != 'disabled' and boot_state != 'diagnose':
+
+			print "changed status from %s to monitordebug" % (node.status)
+			node.status = "monitordebug"
+			node.last_changed = datetime.now()
+		else:
+			print "changed status from %s to %s" % (node.status, boot_state)
+			node.status = boot_state
+			node.last_changed = datetime.now()
+
+	if node_state == 'BOOT' and node.status != 'online' and node.status != 'good':
+		print "changed status from %s to online" % node.status
+		node.status = 'online'
+		node.last_changed = datetime.now()
+
+	#################################################################
+	# Switch temporary hysteresis states into their 'firm' states.
+	#	  online -> good		after half a day
+	#	  offline -> down		after two days
+	#	  monitordebug -> down  after 30 days
+	#	  diagnose -> monitordebug after 60 days
+	#	  disabled -> down		after 60 days
+
+	if node.status == 'online' and changed_greaterthan(node.last_changed, 0.5):
+		print "changed status from %s to good" % node.status
+		node.status = 'good'
+		# NOTE: do not reset last_changed, or you lose how long it's been up.
+
+	if node.status == 'offline' and changed_greaterthan(node.last_changed, 2):
+		print "changed status from %s to down" % node.status
+		node.status = 'down'
+		# NOTE: do not reset last_changed, or you lose how long it's been down.
+
+	if node.status == 'monitordebug' and changed_greaterthan(node.last_changed, 30):
+		print "changed status from %s to down" % node.status
+		node.status = 'down'
+		# NOTE: do not reset last_changed, or you lose how long it's been down.
+
+	if node.status == 'diagnose' and changed_greaterthan(node.last_changed, 60):
+		print "changed status from %s to down" % node.status
+		# NOTE: change an admin mode back into monitordebug after two months.
+		node.status = 'monitordebug'
+		node.last_changed = datetime.now()
+
+	# extreme cases of offline nodes
+	if ( boot_state == 'disabled' or last_contact == None ) and \
+			changed_greaterthan(node.last_changed, 2*30) and \
+			node.status != 'down':
+		print "changed status from %s to down" % node.status
+		node.status = 'down'
+		node.last_changed = datetime.now()
+
 def checkAndRecordState(l_nodes, l_plcnodes):
-	global externalState
 	global count
-	global_round = externalState['round']
 
 	for nodename in l_nodes:
-		if nodename not in externalState['nodes']:
-			externalState['nodes'][nodename] = {'round': 0, 'values': []}
-
-		node_round   = externalState['nodes'][nodename]['round']
-		if node_round < global_round:
-			# do work
-			values = collectStatusAndState(nodename, l_plcnodes)
-			global_round = externalState['round']
-			externalState['nodes'][nodename]['values'] = values
-			externalState['nodes'][nodename]['round'] = global_round
-		else:
-			count += 1
-
-		if count % 20 == 0:
-			database.dbDump(config.dbname, externalState)
-
-	database.dbDump(config.dbname, externalState)
-
-fb = database.dbLoad('findbad')
-hn2lb = database.dbLoad("plcdb_hn2lb")
-
-def getnodesup(nodelist):
-	up = 0
-	for node in nodelist:
-		if node['hostname'] in fb['nodes'].keys():
-			try:
-				if fb['nodes'][node['hostname']]['values']['state'] == "BOOT":
-					up = up + 1
-			except:
-				pass
-	return up
-
-def get(fb, path):
-	indexes = path.split("/")
-	values = fb
-	for index in indexes:
-		if index in values:
-			values = values[index]
-		else:
-			return None
-	return values
 
-def collectStatusAndState(nodename, l_plcnodes):
-	global count
+		nodehist = HistoryNodeRecord.findby_or_create(hostname=nodename, 
+							if_new_set={'status' : 'offline', 
+										'last_changed' : datetime.now()})
+		nodehist.last_checked = datetime.now()
 
-	d_node = None
-	for node in l_plcnodes:
-		if node['hostname'] == nodename:
-			d_node = node
-			break
-	if not d_node:
-		return None
-
-	pf = PersistFlags(nodename, 1, db='node_persistflags')
-
-	if not pf.checkattr('last_changed'):
-		pf.last_changed = time.time()
-		
-	pf.last_checked = time.time()
-
-	if not pf.checkattr('status'):
-		pf.status = "unknown"
-
-	state_path     = "nodes/" + nodename + "/values/state"
-	bootstate_path = "nodes/" + nodename + "/values/plcnode/boot_state"
-
-	if get(fb, state_path) == "BOOT":
-		if pf.status != "good": pf.last_changed = time.time()
-		pf.status = "good"
-	elif get(fb, state_path)  == "DEBUG":
-		bs = get(fb, bootstate_path)
-		if pf.status != bs: pf.last_changed = time.time()
-		pf.status = bs
-	else:
-		if pf.status != "down": pf.last_changed = time.time()
-		pf.status = "down"
+		try:
+			# Find the most recent record
+			noderec = FindbadNodeRecord.get_latest_by(hostname=nodename)
+		except:
+			print "COULD NOT FIND %s" % nodename
+			import traceback
+			email_exception()
+			print traceback.print_exc()
+			continue
+
+		if not noderec:
+			print "none object for %s"% nodename
+			continue
+
+		check_node_state(noderec, nodehist)
 
-	count += 1
-	print "%d %35s %s since(%s)" % (count, nodename, pf.status, diff_time(pf.last_changed))
-	# updated by other modules
-	#pf.enabled = 
-	#pf.suspended = 
+		count += 1
+		print "%d %35s %s since(%s)" % (count, nodename, nodehist.status, diff_time(time.mktime(nodehist.last_changed.timetuple())))
 
-	pf.save()
+	# NOTE: this commits all pending operations to the DB.  Do not remove. 
+	session.flush()
 
 	return True
 
 if __name__ == '__main__':
-	from config import config
-	from optparse import OptionParser
-	parser = OptionParser()
-	parser.set_defaults(filename=None, node=None, nodeselect=False, nodegroup=None, 
-						increment=False, dbname="nodebad", cachenodes=False)
-	parser.add_option("", "--node", dest="node", metavar="hostname", 
-						help="Provide a single node to operate on")
-	parser.add_option("", "--nodelist", dest="nodelist", metavar="file.list", 
-						help="Provide a list of files to operate on")
-
-	parser.add_option("", "--dbname", dest="dbname", metavar="FILE", 
-						help="Specify the name of the database to which the information is saved")
-	parser.add_option("-i", "--increment", action="store_true", dest="increment", 
-						help="Increment round number to force refresh or retry")
-	config = config(parser)
-	config.parse_args()
+	from monitor import parser as parsermodule
+	parser = parsermodule.getParser(['nodesets'])
+	parser.set_defaults(filename=None, node=None, nodeselect=False, nodegroup=None, cachenodes=False)
+	parser = parsermodule.getParser(['defaults'], parser)
+	config = parsermodule.parse_args(parser)
 
 	try:
-		main(config)
+		main2(config)
 	except Exception, err:
 		import traceback
 		print traceback.print_exc()
 		print "Exception: %s" % err
-		print "Saving data... exitting."
-		database.dbDump(config.dbname, externalState)
 		sys.exit(0)