7 from datetime import datetime,timedelta
10 from monitor.common import *
11 from monitor.query import verify,query_to_dict,node_select
13 from monitor import config
14 from monitor.wrapper import plc,plccache
15 from monitor.const import MINUP
16 from monitor.database.info.model import FindbadNodeRecord, HistoryNodeRecord
17 from monitor.database.dborm import mon_session as session
19 from monitor.model import *
# Setup and entry call: fetch an API handle via plc.getAuthAPI(), take the
# node list held by plccache, build the node set selected by `config`, and
# run checkAndRecordState over both.
# NOTE(review): the embedded source numbering jumps (19 -> 21 -> 30), so lines
# are missing from this listing. These statements presumably belong to a
# function (e.g. a main()) whose `def` line is not shown -- confirm against
# the full file. As listed, checkAndRecordState is called before the `def`
# that appears further down.
# NOTE(review): `api` is not referenced by any other visible line.
21 api = plc.getAuthAPI()
30 l_plcnodes = plccache.l_nodes
31 l_nodes = get_nodeset(config)
33 checkAndRecordState(l_nodes, l_plcnodes)
# Parse the first whitespace-separated field of `uptime_str` as a float and
# print it. The caller (check_node_state) compares the result against
# 60*60*24 and passes it as the seconds argument of timedelta, i.e. it is
# treated as uptime in seconds.
# NOTE(review): source lines 36, 38 and 41-46 are absent from this listing, so
# the empty-string path, any error handling around float() and the return
# statement are not visible here -- confirm against the full file.
35 def get_uptime(uptime_str):
# Only attempt to parse a non-empty string.
37 if len(uptime_str) > 0:
39 up = float(uptime_str.split()[0])
40 print "uptime: %s" % up
# Update the history record `node` from the findbad record `rec`.
#   rec  - read for observed_status, plc_node_stats, firewall and uptime.
#   node - plc_nodeid, firewall, plc_siteid, status and last_changed are
#          assigned here.
# Status values appearing below: 'online', 'good', 'offline', 'down',
# 'failboot', 'safeboot', 'disabled'.
# NOTE(review): this listing omits some source lines (the embedded numbering
# has gaps) -- presumably `else:` branches and several assignments. The added
# comments describe only the statements that are visible.
47 def check_node_state(rec, node):
49 node_state = rec.observed_status
# Pull the PLC-side fields when stats are present.
50 if rec.plc_node_stats:
51 print rec.plc_node_stats
52 boot_state = rec.plc_node_stats['boot_state']
# NOTE(review): run_level is not used by any visible line in this function.
53 run_level = rec.plc_node_stats['run_level']
54 last_contact = rec.plc_node_stats['last_contact']
55 node.plc_nodeid = rec.plc_node_stats['node_id']
# Fallback value; presumably the `else:` branch of the test above (the
# `else:` line itself is not in this listing).
57 boot_state = "unknown"
# Normalise alternate spellings of the boot state.
60 if boot_state == 'disable': boot_state = 'disabled'
61 if boot_state == 'diag' or boot_state == 'diagnose': boot_state = 'safeboot'
# NOTE(review): the body of this test (source lines 64-67) is not visible.
63 if rec.plc_node_stats and len(rec.plc_node_stats['pcu_ids']) > 0:
68 node.firewall = rec.firewall
# NOTE(review): no guard is visible here; if rec.plc_node_stats is None or
# empty this subscript would raise -- confirm against the full file.
69 node.plc_siteid = rec.plc_node_stats['site_id']
71 # NOTE: 'DOWN' and 'DEBUG' are temporary states, so only need
72 # 'translations' into the node.status state
73 # 'BOOT' is a permanent state, but we want it to have a bit of
74 # hysteresis (less than 0.5 days)
75 #################################################################
76 # "Initialize" the findbad states into nodebad status if they are not already set
# --- observed DOWN: record 'disabled' (recent change only) or 'offline' ---
78 if node_state == 'DOWN':
79 if boot_state == 'disabled' and changed_lessthan(node.last_changed, 60) and \
80 node.status != 'disabled':
81 # NOTE: if changed less than 2 months, then we can allow this.
82 # otherwise, apply 'down' status after greater than 2 months (below).
84 print "changed status from %s to %s" % (node.status, boot_state)
85 node.status = boot_state
86 node.last_changed = datetime.now()
# Presumably the alternative branch (source line 87 is not shown): move any
# status other than offline/down/disabled to 'offline'.
88 if node.status not in ['offline', 'down', 'disabled']:
89 print "changed status from %s to offline" % node.status
90 node.status = 'offline'
91 node.last_changed = datetime.now()
# --- observed DEBUG: 'failboot' unless PLC says disabled/safeboot ---
# The "changed status" messages below are printed before the comparison with
# node.status, so they appear even when the status ends up unchanged.
93 if node_state == 'DEBUG':
94 if boot_state != 'disabled' and boot_state != 'safeboot':
95 print "changed status from %s to failboot" % (node.status)
96 current_status = "failboot"
98 print "changed status from %s to %s" % (node.status, boot_state)
99 current_status = boot_state
101 if current_status != node.status and \
102 current_status in ['failboot', 'disabled', 'safeboot']:
104 node.status = current_status
105 node.last_changed = datetime.now()
# --- observed BOOT and not yet online/good ---
107 if node_state == 'BOOT' and node.status != 'online' and node.status != 'good':
108 old_status = node.status
109 uptime = get_uptime(rec.uptime)
# More than one day of uptime: back-date last_changed by the uptime
# (timedelta's second positional argument is seconds).
# NOTE(review): source lines 111 and 113 are not shown; the status assigned
# in this branch and the `else:` before the next two lines are presumed.
110 if uptime > (60*60*24):
112 node.last_changed = datetime.now() - timedelta(0,uptime)
114 node.status = 'online'
115 node.last_changed = datetime.now()
116 print "changed status from %s to %s" % (old_status, node.status)
118 #################################################################
119 # Switch temporary hysteresis states into their 'firm' states.
120 # online -> good after half a day
121 # offline -> down after two days
122 # failboot -> down after 30 days
123 # safeboot -> failboot after 60 days
124 # disabled -> down after 60 days
# NOTE(review): in the next three tests the assignment that follows each
# print (source lines 128, 133, 138) is not part of this listing.
126 if node.status == 'online' and changed_greaterthan(node.last_changed, 0.5):
127 print "changed status from %s to good" % node.status
129 # NOTE: do not reset last_changed, or you lose how long it's been up.
131 if node.status == 'offline' and changed_greaterthan(node.last_changed, 2):
132 print "changed status from %s to down" % node.status
134 # NOTE: do not reset last_changed, or you lose how long it's been down.
136 if node.status == 'failboot' and changed_greaterthan(node.last_changed, 30):
137 print "changed status from %s to down" % node.status
139 # NOTE: do not reset last_changed, or you lose how long it's been down.
141 if node.status == 'safeboot' and changed_greaterthan(node.last_changed, 60):
# NOTE(review): the message below says "down" but the status assigned two
# lines later is 'failboot' -- the log text looks wrong.
142 print "changed status from %s to down" % node.status
143 # NOTE: change an admin mode back into failboot after two months.
144 node.status = 'failboot'
145 node.last_changed = datetime.now()
147 # extreme cases of offline nodes
# NOTE(review): in the visible lines last_contact is only assigned when
# rec.plc_node_stats is truthy; unless an omitted line also sets it, this test
# can hit an unbound name. `is None` is the idiomatic comparison.
148 if ( boot_state == 'disabled' or last_contact == None ) and \
149 changed_greaterthan(node.last_changed, 2*30) and \
150 node.status != 'down':
151 print "changed status from %s to down" % node.status
153 node.last_changed = datetime.now()
# For every hostname in l_nodes: find or create its HistoryNodeRecord, stamp
# last_checked, fetch the latest FindbadNodeRecord, run check_node_state on
# the pair and print a one-line summary.
#   l_nodes    - iterable of hostnames.
#   l_plcnodes - not referenced by any visible line of this function.
# NOTE(review): many source lines are absent from this listing (the embedded
# numbering has gaps), presumably the try/except/if scaffolding, the `count`
# bookkeeping and the final commit. `traceback`, `time`, `diff_time`,
# `email_exception` and `count` are not defined in the visible lines --
# presumably supplied by the star imports or by omitted lines; confirm.
155 def checkAndRecordState(l_nodes, l_plcnodes):
158 for nodename in l_nodes:
# A record created here starts as 'offline' with last_changed set to now.
160 nodehist = HistoryNodeRecord.findby_or_create(hostname=nodename,
161 if_new_set={'status' : 'offline',
162 'last_changed' : datetime.now()})
163 nodehist.last_checked = datetime.now()
166 # Find the most recent record
167 noderec = FindbadNodeRecord.get_latest_by(hostname=nodename)
169 print "COULD NOT FIND %s" % nodename
# NOTE(review): traceback.print_exc() writes the traceback itself and returns
# None, so wrapping it in `print` emits an extra "None" line.
172 print traceback.print_exc()
176 print "none object for %s"% nodename
# Apply the state-transition rules to this node's history record.
180 check_node_state(noderec, nodehist)
182 print "check_node_state failed %s" % nodename
184 email_exception(nodename)
185 print traceback.print_exc()
# Summary line: counter, hostname, status, and last_changed converted to a
# timestamp via time.mktime and passed through diff_time.
189 print "%d %35s %s since(%s)" % (count, nodename, nodehist.status, diff_time(time.mktime(nodehist.last_changed.timetuple())))
# NOTE(review): the commit statement this note refers to is not in the listing.
191 # NOTE: this commits all pending operations to the DB. Do not remove.
# Script entry point: build the option parser from the 'nodesets' and
# 'defaults' parser groups, parse the arguments, then report any exception.
# NOTE(review): source lines 202-204 and 206 are absent from this listing, so
# the `try:` line and the call it wraps are not visible here.
196 if __name__ == '__main__':
197 from monitor import parser as parsermodule
198 parser = parsermodule.getParser(['nodesets'])
199 parser.set_defaults(filename=None, node=None, nodeselect=False, nodegroup=None, cachenodes=False)
# Second call passes the existing parser along with the 'defaults' group.
200 parser = parsermodule.getParser(['defaults'], parser)
# Rebinds the module-level name `config` (imported from monitor at the top of
# the file) to the parsed arguments.
201 config = parsermodule.parse_args(parser)
# NOTE(review): `except Exception, err` is Python 2-only syntax;
# `except Exception as err` is accepted by Python 2.6+ and Python 3.
205 except Exception, err:
# NOTE(review): traceback.print_exc() returns None, so this `print` emits an
# extra "None" line after the traceback.
207 print traceback.print_exc()
208 print "Exception: %s" % err