7 from datetime import datetime,timedelta
9 from nodequery import verify,query_to_dict,node_select
11 from monitor.common import *
13 from monitor import config
14 from monitor.wrapper import plc,plccache
15 from monitor.const import MINUP
16 from monitor.database.info.model import FindbadNodeRecord, HistoryNodeRecord
17 from monitor.database.dborm import mon_session as session
19 from monitor.model import *
# Authenticated PLC API handle, created when the module is loaded.  None of
# the statements visible in this listing use it.
21 api = plc.getAuthAPI()
# NOTE(review): original lines 22-29 are absent from this listing.  The three
# statements below call checkAndRecordState(), which is only defined further
# down the file, so they presumably form the body of a function declared in
# that gap (which the __main__ block would then invoke) -- confirm against
# the repository copy before editing.
#
# Node list taken from the plccache module; it is only passed through to
# checkAndRecordState().
30 l_plcnodes = plccache.l_nodes
# `config` is the monitor.config module imported above; the __main__ block
# rebinds the name to the parsed command-line options.  get_nodeset() is not
# defined in the visible lines -- presumably it comes from one of the star
# imports and returns the hostnames to process.
31 l_nodes = get_nodeset(config)
# Update the history record of every selected node.
33 checkAndRecordState(l_nodes, l_plcnodes)
def _set_status(node, new_status, reset_clock=True):
    """Log a status transition on a history record and apply it.

    node        -- history record exposing ``status`` and ``last_changed``.
    new_status  -- status string to store in ``node.status``.
    reset_clock -- when true, stamp ``node.last_changed`` with the current
                   time; pass False to keep the time of the earlier change.
    """
    # The message is built from the value that is actually stored, so the log
    # can never disagree with the recorded state.  A parenthesised
    # single-argument print behaves the same as the Python 2 print statement.
    print("changed status from %s to %s" % (node.status, new_status))
    node.status = new_status
    if reset_clock:
        node.last_changed = datetime.now()


def check_node_state(rec, node):
    """Fold the latest observation of a node into its history record.

    rec  -- latest observation; ``observed_status`` and ``plc_node_stats``
            (a mapping with 'boot_state' and 'last_contact', or a false
            value) are read.
    node -- history record; ``status`` and ``last_changed`` are read and
            updated in place.

    Returns None.  Every transition is reported on stdout.
    """
    # NOTE(review): the ``else`` branches and the status assignment following
    # each "changed status" message were restored from the transition table
    # and the messages themselves; confirm against the repository copy.
    node_state = rec.observed_status
    if rec.plc_node_stats:
        boot_state = rec.plc_node_stats['boot_state']
        last_contact = rec.plc_node_stats['last_contact']
    else:
        boot_state = "unknown"
        # Bound on every path: the "extreme cases" test below reads it.
        last_contact = None

    # Normalise the abbreviated boot states to the names used in node.status.
    if boot_state == 'disable':
        boot_state = 'disabled'
    elif boot_state == 'diag':
        boot_state = 'diagnose'

    # NOTE: 'DOWN' and 'DEBUG' are temporary states, so they only need
    #       'translations' into the node.status state.
    #       'BOOT' is a permanent state, but we want it to have a bit of
    #       hysteresis (less than 0.5 days).

    #################################################################
    # "Initialize" the findbad states into nodebad status if they are not
    # already set.
    if (node_state == 'DOWN' and node.status not in ('offline', 'down')
            and boot_state != 'disabled'):
        _set_status(node, 'offline')

    if (node_state == 'DEBUG'
            and node.status not in ('monitordebug', 'disabled', 'diagnose')):
        if boot_state in ('disabled', 'diagnose'):
            # An administrative boot state is recorded as-is.
            _set_status(node, boot_state)
        else:
            _set_status(node, 'monitordebug')

    if node_state == 'BOOT' and node.status not in ('online', 'good'):
        _set_status(node, 'online')

    #################################################################
    # Switch temporary hysteresis states into their 'firm' states.
    #     online       -> good          after half a day
    #     offline      -> down          after two days
    #     monitordebug -> down          after 30 days
    #     diagnose     -> monitordebug  after 60 days
    #     disabled     -> down          after 60 days
    # NOTE: last_changed is not reset for the first three, or you lose how
    #       long the node has been up / down.
    if node.status == 'online' and changed_greaterthan(node.last_changed, 0.5):
        _set_status(node, 'good', reset_clock=False)

    if node.status == 'offline' and changed_greaterthan(node.last_changed, 2):
        _set_status(node, 'down', reset_clock=False)

    if (node.status == 'monitordebug'
            and changed_greaterthan(node.last_changed, 30)):
        _set_status(node, 'down', reset_clock=False)

    if node.status == 'diagnose' and changed_greaterthan(node.last_changed, 60):
        # NOTE: change an admin mode back into monitordebug after two months.
        # (The message used to claim "to down" although 'monitordebug' is
        # what gets stored.)
        _set_status(node, 'monitordebug')

    # Extreme cases of offline nodes: disabled at PLC, or never contacted,
    # and unchanged for two months.
    if ((boot_state == 'disabled' or last_contact is None)
            and changed_greaterthan(node.last_changed, 2 * 30)
            and node.status != 'down'):
        _set_status(node, 'down')
# Refresh the history record of every node named in l_nodes.
#
# For each hostname: fetch (or create, starting as 'offline') its
# HistoryNodeRecord, stamp last_checked, load the newest FindbadNodeRecord,
# let check_node_state() update the history record, then print a summary
# line.  At the end the number of HistoryNodeRecord rows is printed.
#
# l_nodes    -- iterable of hostnames.
# l_plcnodes -- accepted but not referenced by any line shown here.
#
# NOTE(review): this listing omits original lines 119-120, 127-128, 131, 133,
# 135-137, 139-140, 142-143, 149 and 151-153.  Presumably they hold the
# try/except around the record lookup, the guard that skips a missing record,
# the update of `count`, the flush/commit call and the return value --
# confirm against the repository copy before changing any logic here.
118 def checkAndRecordState(l_nodes, l_plcnodes):
121 for nodename in l_nodes:
# A record created here starts out 'offline' with last_changed set to now.
123 nodehist = HistoryNodeRecord.findby_or_create(hostname=nodename,
124 if_new_set={'status' : 'offline',
125 'last_changed' : datetime.now()})
126 nodehist.last_checked = datetime.now()
129 # Find the most recent record
130 noderec = FindbadNodeRecord.get_latest_by(hostname=nodename)
# Presumably the failure path of the lookup above (its `except` line is not
# shown).
132 print "COULD NOT FIND %s" % nodename
# NOTE(review): traceback.print_exc() writes the traceback itself and returns
# None, so wrapping it in `print` also emits a literal "None" line.
# `traceback` is not imported in the visible lines.
134 print traceback.print_exc()
# Presumably reached when the lookup returned a false value (the guarding
# `if` is not shown).
138 print "none object for %s"% nodename
# Applies the observation to the history record (mutates nodehist in place).
141 check_node_state(noderec, nodehist)
# `count`, `diff_time` and `time` are not defined in the visible lines;
# presumably they come from the star imports or from omitted lines.
144 print "%d %35s %s since(%s)" % (count, nodename, nodehist.status, diff_time(time.mktime(nodehist.last_changed.timetuple())))
# The NOTE below describes a statement (original line 149) that is missing
# from this listing.
146 # NOTE: this commits all pending operations to the DB. Do not remove, or
147 # replace with another operations that also commits all pending ops, such
148 # as session.commit() or flush() or something
150 print HistoryNodeRecord.query.count()
# Script entry point: build the command-line parser, parse the options into
# `config` (rebinding the module-level name imported from monitor), then run
# the update with a catch-all error report.
154 if __name__ == '__main__':
155 from monitor import parser as parsermodule
# Start from the 'nodesets' option group ...
156 parser = parsermodule.getParser(['nodesets'])
157 parser.set_defaults(filename=None, node=None, nodeselect=False, nodegroup=None, cachenodes=False)
# ... then extend the same parser with the 'defaults' option group.
158 parser = parsermodule.getParser(['defaults'], parser)
159 config = parsermodule.parse_args(parser)
# NOTE(review): original lines 160-162 and 164 are missing from this listing;
# presumably they hold the `try:` with the call that does the work, and
# possibly an `import traceback` -- confirm against the repository copy.
# `except Exception, err` is Python 2-only syntax.
163 except Exception, err:
# NOTE(review): print_exc() prints the traceback itself and returns None, so
# this line additionally prints "None".
165 print traceback.print_exc()
166 print "Exception: %s" % err