import sys
import string
import time
+from datetime import datetime,timedelta
-
-import database
-import comon
-import threadpool
-import syncplcdb
from nodequery import verify,query_to_dict,node_select
-import plc
-import auth
-api = plc.PLC(auth.auth, auth.plc)
-from unified_model import *
-from monitor_policy import MINUP
+from monitor.common import *
+
+from monitor import config
+from monitor.wrapper import plc,plccache
+from monitor.const import MINUP
+from monitor.database.info.model import FindbadNodeRecord, HistoryNodeRecord
+from monitor.database.dborm import mon_session as session
+
+from monitor.model import *
+
+# Module-level handle returned by plc.getAuthAPI(); created once at import time.
+api = plc.getAuthAPI()
+# NOTE(review): `round` shadows the builtin of the same name; with externalState
+# removed it looks unused in the visible code -- confirm nothing else reads it.
round = 1
-externalState = {'round': round, 'nodes': {}}
+# Running tally of nodes processed; incremented by checkAndRecordState().
count = 0
+# Convenience entry point: runs main2() with whatever the module-level name
+# `config` is bound to (the `monitor.config` import at the top of the file,
+# or the parsed options when the `__main__` block has rebound it).
+def main():
+ main2(config)
-def main(config):
- global externalState
- externalState = database.if_cached_else(1, config.dbname, lambda : externalState)
- if config.increment:
- # update global round number to force refreshes across all nodes
- externalState['round'] += 1
+# Resolve the hostnames to examine via get_nodeset(config) and pass them,
+# together with the cached PLC node list (plccache.l_nodes), to
+# checkAndRecordState().
+def main2(config):
- l_nodes = syncplcdb.create_plcdb()
- l_plcnodes = database.dbLoad("l_plcnodes")
-
- if config.node:
- l_nodes = [config.node]
- else:
- l_nodes = [node['hostname'] for node in l_plcnodes]
+ l_plcnodes = plccache.l_nodes
+ l_nodes = get_nodeset(config)
checkAndRecordState(l_nodes, l_plcnodes)
+# Node states:
+
+def _set_status(node, new_status, reset_clock=True):
+    """Log and apply a status transition on the history record `node`.
+
+    Prints "changed status from <old> to <new>", assigns `node.status`, and,
+    unless `reset_clock` is False, stamps `node.last_changed` with the
+    current time. Pass reset_clock=False for transitions that must keep the
+    original timestamp so the time spent up/down is not lost.
+    """
+    print "changed status from %s to %s" % (node.status, new_status)
+    node.status = new_status
+    if reset_clock:
+        node.last_changed = datetime.now()
+
+
+def check_node_state(rec, node):
+    """Update the history record `node` from the findbad record `rec`.
+
+    Reads `rec.observed_status` and `rec.plc_node_stats` (keys 'boot_state',
+    'last_contact', 'pcu_ids') and writes `node.status`, `node.last_changed`
+    and `node.haspcu`. Nothing is returned; the caller is responsible for
+    persisting `node`.
+
+    The thresholds passed to changed_greaterthan() are presumably in days,
+    as described by the transition table further down.
+    """
+    node_state = rec.observed_status
+    if rec.plc_node_stats:
+        print rec.plc_node_stats
+        boot_state = rec.plc_node_stats['boot_state']
+        last_contact = rec.plc_node_stats['last_contact']
+        # Only refresh haspcu when PLC data is actually present. Indexing
+        # plc_node_stats without this guard raised for records that have no
+        # PLC stats and aborted the whole run; in that case the previously
+        # recorded haspcu value is left untouched.
+        node.haspcu = len(rec.plc_node_stats['pcu_ids']) > 0
+    else:
+        boot_state = "unknown"
+        last_contact = None
+
+    # Normalise alternate spellings of the PLC boot state.
+    if boot_state == 'disable':
+        boot_state = 'disabled'
+    if boot_state in ('diag', 'diagnose'):
+        boot_state = 'safeboot'
+
+    # NOTE: 'DOWN' and 'DEBUG' are temporary states, so only need
+    #       'translations' into the node.status state
+    #       'BOOT' is a permanent state, but we want it to have a bit of
+    #       hysteresis (less than 0.5 days)
+
+    #################################################################
+    # "Initialize" the findbad states into nodebad status if they are not
+    # already set
+
+    if node_state == 'DOWN' and node.status not in ('offline', 'down') \
+            and boot_state != 'disabled':
+        _set_status(node, 'offline')
+
+    if node_state == 'DEBUG' and \
+            node.status not in ('monitordebug', 'disabled', 'safeboot'):
+        if boot_state not in ('disabled', 'safeboot'):
+            _set_status(node, 'monitordebug')
+        else:
+            # An admin-selected boot state wins over the generic debug state.
+            _set_status(node, boot_state)
+
+    if node_state == 'BOOT' and node.status not in ('online', 'good'):
+        _set_status(node, 'online')
+
+    #################################################################
+    # Switch temporary hysteresis states into their 'firm' states.
+    #     online       -> good          after half a day
+    #     offline      -> down          after two days
+    #     monitordebug -> down          after 30 days
+    #     safeboot     -> monitordebug  after 60 days
+    #     disabled     -> down          after 60 days
+
+    # NOTE: the first three transitions do not reset last_changed, or you
+    #       lose how long the node has been up / down.
+    if node.status == 'online' and changed_greaterthan(node.last_changed, 0.5):
+        _set_status(node, 'good', reset_clock=False)
+
+    if node.status == 'offline' and changed_greaterthan(node.last_changed, 2):
+        _set_status(node, 'down', reset_clock=False)
+
+    if node.status == 'monitordebug' and changed_greaterthan(node.last_changed, 30):
+        _set_status(node, 'down', reset_clock=False)
+
+    if node.status == 'safeboot' and changed_greaterthan(node.last_changed, 60):
+        # NOTE: change an admin mode back into monitordebug after two months.
+        #       The log line names the real target state ('monitordebug').
+        _set_status(node, 'monitordebug')
+
+    # extreme cases of offline nodes
+    if (boot_state == 'disabled' or last_contact is None) and \
+            changed_greaterthan(node.last_changed, 2*30) and \
+            node.status != 'down':
+        _set_status(node, 'down')
+
+
+# For every hostname in l_nodes: fetch (or create) its HistoryNodeRecord,
+# stamp last_checked, look up the newest FindbadNodeRecord and let
+# check_node_state() update the history status. Hosts whose lookup raises or
+# yields nothing are reported and skipped. Pending changes are flushed to the
+# DB session and True is returned.
+# l_plcnodes is kept in the signature for callers; the added code does not read it.
def checkAndRecordState(l_nodes, l_plcnodes):
- global externalState
global count
- global_round = externalState['round']
for nodename in l_nodes:
- if nodename not in externalState['nodes']:
- externalState['nodes'][nodename] = {'round': 0, 'values': []}
-
- node_round = externalState['nodes'][nodename]['round']
- if node_round < global_round:
- # do work
- values = collectStatusAndState(nodename, l_plcnodes)
- global_round = externalState['round']
- externalState['nodes'][nodename]['values'] = values
- externalState['nodes'][nodename]['round'] = global_round
- else:
- count += 1
-
- if count % 20 == 0:
- database.dbDump(config.dbname, externalState)
-
- database.dbDump(config.dbname, externalState)
-
-fb = database.dbLoad('findbad')
-hn2lb = database.dbLoad("plcdb_hn2lb")
-
-def getnodesup(nodelist):
- up = 0
- for node in nodelist:
- if node['hostname'] in fb['nodes'].keys():
- try:
- if fb['nodes'][node['hostname']]['values']['state'] == "BOOT":
- up = up + 1
- except:
- pass
- return up
-
-def get(fb, path):
- indexes = path.split("/")
- values = fb
- for index in indexes:
- if index in values:
- values = values[index]
- else:
- return None
- return values
-def collectStatusAndState(nodename, l_plcnodes):
- global count
+ # A history row that does not exist yet starts out 'offline' as of now.
+ nodehist = HistoryNodeRecord.findby_or_create(hostname=nodename,
+ if_new_set={'status' : 'offline',
+ 'last_changed' : datetime.now()})
+ nodehist.last_checked = datetime.now()
- d_node = None
- for node in l_plcnodes:
- if node['hostname'] == nodename:
- d_node = node
- break
- if not d_node:
- return None
-
- pf = PersistFlags(nodename, 1, db='node_persistflags')
-
- if not pf.checkattr('last_changed'):
- pf.last_changed = time.time()
-
- pf.last_checked = time.time()
-
- if not pf.checkattr('status'):
- pf.status = "unknown"
-
- state_path = "nodes/" + nodename + "/values/state"
- bootstate_path = "nodes/" + nodename + "/values/plcnode/boot_state"
-
- if get(fb, state_path) == "BOOT":
- if pf.status != "good": pf.last_changed = time.time()
- pf.status = "good"
- elif get(fb, state_path) == "DEBUG":
- bs = get(fb, bootstate_path)
- if pf.status != bs: pf.last_changed = time.time()
- pf.status = bs
- else:
- if pf.status != "down": pf.last_changed = time.time()
- pf.status = "down"
+ try:
+ # Find the most recent record
+ noderec = FindbadNodeRecord.get_latest_by(hostname=nodename)
+ # Catch Exception, not everything: a bare except followed by `continue`
+ # would also swallow KeyboardInterrupt/SystemExit and keep looping.
+ except Exception:
+ print "COULD NOT FIND %s" % nodename
+ import traceback
+ email_exception()
+ # print_exc() writes the traceback itself and returns None, so its
+ # result must not be printed (that emitted a stray "None").
+ traceback.print_exc()
+ continue
+
+ if not noderec:
+ print "none object for %s"% nodename
+ continue
+
+ check_node_state(noderec, nodehist)
- count += 1
- print "%d %35s %s since(%s)" % (count, nodename, pf.status, diff_time(pf.last_changed))
- # updated by other modules
- #pf.enabled =
- #pf.suspended =
+ count += 1
+ print "%d %35s %s since(%s)" % (count, nodename, nodehist.status, diff_time(time.mktime(nodehist.last_changed.timetuple())))
- pf.save()
+ # NOTE: this commits all pending operations to the DB. Do not remove.
+ session.flush()
return True
+# Script entry: build the option parser from the 'nodesets' and 'defaults'
+# option groups, parse the command line into `config`, and run main2().
+# Any Exception is reported with a traceback and its message.
+# NOTE(review): the process ends via sys.exit(0), i.e. with a success status
+# even after an exception -- confirm that is intended.
if __name__ == '__main__':
- from config import config
- from optparse import OptionParser
- parser = OptionParser()
- parser.set_defaults(filename=None, node=None, nodeselect=False, nodegroup=None,
- increment=False, dbname="nodebad", cachenodes=False)
- parser.add_option("", "--node", dest="node", metavar="hostname",
- help="Provide a single node to operate on")
- parser.add_option("", "--nodelist", dest="nodelist", metavar="file.list",
- help="Provide a list of files to operate on")
-
- parser.add_option("", "--dbname", dest="dbname", metavar="FILE",
- help="Specify the name of the database to which the information is saved")
- parser.add_option("-i", "--increment", action="store_true", dest="increment",
- help="Increment round number to force refresh or retry")
- config = config(parser)
- config.parse_args()
+ from monitor import parser as parsermodule
+ parser = parsermodule.getParser(['nodesets'])
+ parser.set_defaults(filename=None, node=None, nodeselect=False, nodegroup=None, cachenodes=False)
+ # The second getParser() call receives the existing parser and its result replaces it.
+ parser = parsermodule.getParser(['defaults'], parser)
+ config = parsermodule.parse_args(parser)
try:
- main(config)
+ main2(config)
except Exception, err:
import traceback
+ # NOTE(review): traceback.print_exc() returns None, so the next line also prints "None".
print traceback.print_exc()
print "Exception: %s" % err
- print "Saving data... exitting."
- database.dbDump(config.dbname, externalState)
sys.exit(0)