7 from datetime import datetime,timedelta
9 from monitor.query import verify,query_to_dict,node_select
11 from monitor.common import *
13 from monitor import config
14 from monitor.wrapper import plc,plccache
15 from monitor.const import MINUP
16 from monitor.database.info.model import FindbadNodeRecord, HistoryNodeRecord
17 from monitor.database.dborm import mon_session as session
19 from monitor.model import *
# API handle returned by plc.getAuthAPI(), created when this statement runs.
21 api = plc.getAuthAPI()
# NOTE(review): the embedded numbering jumps from 21 to 30 here, and the
# statements below use `config` and call checkAndRecordState(), which is only
# defined further down. They presumably belong to a function whose `def` line
# is not visible in this view -- confirm before treating them as module-level.
# Take the node list exposed by plccache and the node set selected through
# `config`, then record the state of those nodes.
30 l_plcnodes = plccache.l_nodes
31 l_nodes = get_nodeset(config)
33 checkAndRecordState(l_nodes, l_plcnodes)
def _set_status(node, new_status, reset_clock=True):
    """Log a status transition and apply it to the history record.

    With reset_clock=False, node.last_changed is left alone so the record
    keeps track of how long the node has really been up or down.
    """
    print("changed status from %s to %s" % (node.status, new_status))
    node.status = new_status
    if reset_clock:
        node.last_changed = datetime.now()


def check_node_state(rec, node):
    """Fold the latest observation of a node into its history record.

    rec  -- observation record; observed_status, plc_node_stats and firewall
            are read from it.
    node -- history record, updated in place (status, last_changed, firewall
            and the PLC identifiers copied from rec.plc_node_stats).

    The update runs in two phases: the observed state ('DOWN', 'DEBUG',
    'BOOT') is first translated into a temporary status, then temporary
    statuses that have lasted long enough are promoted to their firm state.
    Thresholds passed to changed_lessthan/changed_greaterthan are in days,
    per the transition table below.
    """
    node_state = rec.observed_status
    stats = rec.plc_node_stats

    if stats:
        print(stats)
        boot_state = stats['boot_state']
        last_contact = stats['last_contact']
        node.plc_nodeid = stats['node_id']
        node.plc_siteid = stats['site_id']
        # NOTE(review): the body of the original `pcu_ids` branch is missing
        # from the source under review; `haspcu` is a reconstructed attribute
        # name -- confirm against the HistoryNodeRecord model.
        node.haspcu = len(stats['pcu_ids']) > 0
    else:
        # No PLC data for this node.  The PLC-derived fields above are left
        # as they were instead of indexing into an empty/None stats value,
        # which used to raise before any status could be updated.
        boot_state = "unknown"
        last_contact = None

    # Normalise the boot-state spellings used by PLC.
    if boot_state == 'disable':
        boot_state = 'disabled'
    elif boot_state in ('diag', 'diagnose'):
        boot_state = 'safeboot'

    node.firewall = rec.firewall

    # NOTE: 'DOWN' and 'DEBUG' are temporary states, so only need
    #       'translations' into the node.status state
    #       'BOOT' is a permanent state, but we want it to have a bit of
    #       hysteresis (less than 0.5 days)
    #################################################################
    # "Initialize" the findbad states into nodebad status if they are not
    # already set

    if node_state == 'DOWN':
        if boot_state == 'disabled' and \
                changed_lessthan(node.last_changed, 60) and \
                node.status != 'disabled':
            # NOTE: if changed less than 2 months, then we can allow this.
            # otherwise, apply 'down' status after greater than 2 months
            # (below).
            _set_status(node, boot_state)

        # When the branch above fired the status is now 'disabled', so this
        # check is a no-op in that case.
        if node.status not in ['offline', 'down', 'disabled']:
            _set_status(node, 'offline')

    elif node_state == 'DEBUG':
        # An administrative boot state is kept as-is; anything else seen in
        # debug mode counts as a failed boot.
        if boot_state != 'disabled' and boot_state != 'safeboot':
            current_status = "failboot"
        else:
            current_status = boot_state
        # As in the original, this line is logged on every pass, even when
        # the status below turns out to be unchanged.
        print("changed status from %s to %s" % (node.status, current_status))

        # current_status is always one of failboot / disabled / safeboot.
        if current_status != node.status:
            node.status = current_status
            node.last_changed = datetime.now()

    elif node_state == 'BOOT' and node.status not in ('online', 'good'):
        _set_status(node, 'online')

    #################################################################
    # Switch temporary hysteresis states into their 'firm' states.
    #     online -> good        after half a day
    #     offline -> down       after two days
    #     failboot -> down      after 30 days
    #     safeboot -> failboot  after 60 days
    #     disabled -> down      after 60 days

    if node.status == 'online' and changed_greaterthan(node.last_changed, 0.5):
        # NOTE: do not reset last_changed, or you lose how long it's been up.
        _set_status(node, 'good', reset_clock=False)

    if node.status == 'offline' and changed_greaterthan(node.last_changed, 2):
        # NOTE: do not reset last_changed, or you lose how long it's been down.
        _set_status(node, 'down', reset_clock=False)

    if node.status == 'failboot' and changed_greaterthan(node.last_changed, 30):
        # NOTE: do not reset last_changed, or you lose how long it's been down.
        _set_status(node, 'down', reset_clock=False)

    if node.status == 'safeboot' and changed_greaterthan(node.last_changed, 60):
        # NOTE: change an admin mode back into failboot after two months.
        # (The log line used to say "to down" although 'failboot' is set.)
        _set_status(node, 'failboot')

    # extreme cases of offline nodes
    if node.status != 'down' and \
            (boot_state == 'disabled' or last_contact is None) and \
            changed_greaterthan(node.last_changed, 2 * 30):
        _set_status(node, 'down')
def checkAndRecordState(l_nodes, l_plcnodes):
    """Refresh the history record of every hostname in l_nodes.

    For each hostname the HistoryNodeRecord is looked up (or created with
    status 'offline'), its last_checked stamp is updated, and the most
    recent FindbadNodeRecord is passed to check_node_state().  Hosts whose
    latest record cannot be fetched, or is empty, are reported and skipped.

    l_nodes    -- iterable of hostnames to process.
    l_plcnodes -- accepted for the caller's benefit; not used here.
    """
    # Imported locally: neither module is imported in the visible part of
    # this file.
    import time
    import traceback

    # NOTE(review): `count` is printed below but never defined in this
    # function, so it is treated as a module-level running total.  Its
    # definition and increment fall in lines missing from the source under
    # review -- confirm.
    global count

    for nodename in l_nodes:
        nodehist = HistoryNodeRecord.findby_or_create(
            hostname=nodename,
            if_new_set={'status': 'offline',
                        'last_changed': datetime.now()})
        nodehist.last_checked = datetime.now()

        try:
            # Find the most recent record
            noderec = FindbadNodeRecord.get_latest_by(hostname=nodename)
        except Exception:
            print("COULD NOT FIND %s" % nodename)
            # print_exc() writes the traceback itself and returns None, so
            # its result must not be printed as well.
            traceback.print_exc()
            # NOTE(review): the original handler has lines that are not
            # visible here; restore any extra error reporting it performed.
            continue

        if not noderec:
            print("none object for %s" % nodename)
            continue

        check_node_state(noderec, nodehist)

        count += 1
        since = diff_time(time.mktime(nodehist.last_changed.timetuple()))
        print("%d %35s %s since(%s)" % (count, nodename, nodehist.status, since))

    # NOTE: this commits all pending operations to the DB. Do not remove.
    # NOTE(review): the statement itself is missing from the source under
    # review; flush() on the imported mon_session and the True return value
    # are reconstructed -- confirm.
    session.flush()
    return True
# Script entry point: build a parser with the 'nodesets' option group, set
# defaults for the node-selection options, extend it with the 'defaults'
# group and parse the command line into `config`.
173 if __name__ == '__main__':
174 from monitor import parser as parsermodule
175 parser = parsermodule.getParser(['nodesets'])
176 parser.set_defaults(filename=None, node=None, nodeselect=False, nodegroup=None, cachenodes=False)
177 parser = parsermodule.getParser(['defaults'], parser)
178 config = parsermodule.parse_args(parser)
# NOTE(review): the `try:` header and its body are not visible in this view
# (the embedded numbering skips from 178 to 182), so what is being guarded is
# unknown here; the handler may also continue past the last visible line.
182 except Exception, err:
# NOTE(review): traceback.print_exc() prints the traceback itself and returns
# None, so the print statement below additionally emits "None".  `traceback`
# is not imported in the visible lines -- presumably imported on a missing one.
184 print traceback.print_exc()
185 print "Exception: %s" % err