added actionlist_template to display action list consistently on different pages
[monitor.git] / policy.py
1 #!/usr/bin/python
2
3 # This script is used to manipulate the operational state of nodes in
4 # different node groups.  These are basically set operations on nodes via the
5 # PLC api.
6
7 # Take the ng name as an argument....
8 # optionally, 
9 #  * get a list of nodes in the given nodegroup.
10 #  * set some or all in the set to rins.
11 #  * restart them all.
12 #  * do something else to them all.
13
14
15 import os
16 import time
17 import traceback
18 import sys
19 from optparse import OptionParser
20
21 from monitor import config
22 from monitor import parser as parsermodule
23 from monitor.common import *
24 from monitor.model import *
25 from monitor.wrapper import plc
26 from monitor.wrapper import plccache
27 from monitor.database.info.model import *
28 from monitor.database.info.interface import *
29
30 from nodequery import verify,query_to_dict,node_select
31
# Authenticated PLC API handle, created as a side effect of loading this
# module.  NOTE(review): nothing in the visible part of this file reads
# `api` -- confirm whether another module relies on it before removing.
api = plc.getAuthAPI()
33
def logic():
        """Set a node's boot state to 'reinstall' and call node_end_record() for it."""
        # NOTE(review): `host` is neither a parameter nor assigned anywhere in
        # this function, and no module-level `host` is visible in this file.
        # Unless one of the star imports provides it, calling logic() will
        # presumably raise NameError.  Nothing visible in this file calls
        # logic() -- confirm whether it is dead code before relying on it.

        plc.nodeBootState(host, 'reinstall')
        node_end_record(host)
38
def _latest_pcu_record(host):
    """Return the latest FindbadPCURecord for the PCU of `host`, or None.

    None is returned when there is no FindbadNodeRecord for the host;
    otherwise the result of FindbadPCURecord.get_latest_by() for the node's
    plc_pcuid is returned unchanged (callers must still check it for
    truthiness before using it).
    """
    fbnode = FindbadNodeRecord.get_latest_by(hostname=host)
    if not fbnode:
        return None
    return FindbadPCURecord.get_latest_by(plc_pcuid=fbnode.plc_pcuid)


def main(hostnames, sitenames):
    """Apply the notification / reboot / penalty policy to nodes and sites.

    hostnames -- iterable of node hostnames to evaluate.
    sitenames -- iterable of site loginbases to evaluate.

    For every host the recorded history status and the site's recent actions
    decide whether to send an 'online', 'down', 'firewall', 'pcuerror' or
    'pcufailed' notice, attempt a reboot, or run the boot manager.  For every
    site they decide whether penalties are paused, increased or cleared.
    Hosts missing from plccache.plcdb_hn2lb and unexpired blacklist entries
    are skipped.  `session` is flushed after each processed item and once
    more at the end.  Returns None.

    The numeric thresholds passed to changed_*/found_* appear to be expressed
    in days (see the inline notes) -- confirm against monitor.common.
    """
    #print "hosts: %s" % hostnames
    for i, host in enumerate(hostnames):
        try:
            lb = plccache.plcdb_hn2lb[host]
        except:
            # Kept deliberately broad, as in the original: any failure to map
            # the host to a loginbase is mailed out and the host is skipped.
            print("unknown host in plcdb_hn2lb %s" % host)
            email_exception(host)
            continue

        nodeblack = BlacklistRecord.get_by(hostname=host)
        if nodeblack and not nodeblack.expired():
            print("skipping %s due to blacklist.  will expire %s" % (host, nodeblack.willExpire()))
            continue

        sitehist = SiteInterface.get_or_make(loginbase=lb)
        recent_actions = sitehist.getRecentActions(hostname=host)
        nodehist = HistoryNodeRecord.findby_or_create(hostname=host)

        print("%s %s %s" % (i, nodehist.hostname, nodehist.status))
        if (nodehist.status == 'good' and
                changed_lessthan(nodehist.last_changed, 1.0) and
                found_within(recent_actions, 'down_notice', 7.0) and
                not found_within(recent_actions, 'online_notice', 0.5)):
            # NOTE: chronically flapping nodes will not get 'online' notices
            #       since they are never up long enough to be 'good'.
            # NOTE: searching for down_notice proves that the node has
            #       gone through a 'down' state first, rather than just
            #       flapping through: good, offline, online, ...
            # NOTE: there is a narrow window in which this command must be
            #       evaluated, otherwise the notice will not go out.
            #       this is not ideal.
            sitehist.sendMessage('online_notice', hostname=host, viart=False, saveact=True)
            print("send message for host %s online" % host)

        # Disabled policy, kept for reference:
        # if a node is offline and doesn't have a PCU, remind the user that they should have one.
        #if not nodehist.haspcu and nodehist.status in ['offline', 'down'] and \
        #       changed_greaterthan(nodehist.last_changed,1.0) and \
        #       not found_within(recent_actions, 'pcumissing_notice', 7.0):
        #
        #               sitehist.sendMessage('pcumissing_notice', hostname=host)
        #               print "send message for host %s pcumissing_notice" % host

        # if it is offline and HAS a PCU, then try to use it.
        if (nodehist.haspcu and nodehist.status in ['offline', 'down'] and
                changed_greaterthan(nodehist.last_changed, 1.0) and
                not nodehist.firewall and
                not found_between(recent_actions, 'try_reboot', 3.5, 1)):

            # TODO: there MUST be a better way to do this...
            fbpcu = _latest_pcu_record(host)

            sitehist.attemptReboot(host)
            print("send message for host %s try_reboot" % host)

            # A missing PCU record counts as "not ok".  The original code
            # dereferenced fbpcu unconditionally here, which raised
            # AttributeError when the lookup failed and made the fallback
            # arguments below unreachable.
            pcu_ok = bool(fbpcu) and fbpcu.test_is_ok()
            if not pcu_ok and \
                    not found_within(recent_actions, 'pcuerror_notice', 3.0):

                args = {}
                if fbpcu:
                    args['pcu_name'] = fbpcu.pcu_name()
                    args['pcu_errors'] = fbpcu.pcu_errors()
                    args['plc_pcuid'] = fbpcu.plc_pcuid
                else:
                    args['pcu_name'] = "error looking up pcu name"
                    args['pcu_errors'] = ""
                    args['plc_pcuid'] = 0

                args['hostname'] = host
                sitehist.sendMessage('pcuerror_notice', **args)
                print("send message for host %s PCU Failure" % host)

        # NOTE: non-intuitive is that found_between(try_reboot, 3.5, 1)
        #       will be false for a day after the above condition is satisfied
        if (nodehist.haspcu and nodehist.status in ['offline', 'down'] and
                changed_greaterthan(nodehist.last_changed, 1.5) and
                not nodehist.firewall and
                found_between(recent_actions, 'try_reboot', 3.5, 1) and
                not found_within(recent_actions, 'pcufailed_notice', 3.5)):

            # TODO: there MUST be a better way to do this...
            fbpcu = _latest_pcu_record(host)
            if fbpcu:
                pcu_name = fbpcu.pcu_name()
            else:
                pcu_name = "error looking up pcu name"

            # send pcu failure message
            sitehist.sendMessage('pcufailed_notice', hostname=host, pcu_name=pcu_name)
            print("send message for host %s PCU Failure" % host)

        if (nodehist.status == 'failboot' and
                changed_greaterthan(nodehist.last_changed, 0.25) and
                not found_between(recent_actions, 'bootmanager_restore', 0.5, 0)):
            # delay 0.5 days before retrying...
            print("send message for host %s bootmanager_restore" % host)
            sitehist.runBootManager(host)
            #       sitehist.sendMessage('retry_bootman', hostname=host)

        if (nodehist.status == 'down' and
                changed_greaterthan(nodehist.last_changed, 2)):
            if not nodehist.firewall and not found_within(recent_actions, 'down_notice', 3.5):
                # send down node notice
                sitehist.sendMessage('down_notice', hostname=host)
                print("send message for host %s down" % host)

            if nodehist.firewall and not found_within(recent_actions, 'firewall_notice', 3.5):
                # send firewall notice (the log line below is unchanged
                # from the original and still says "down")
                #email_exception(host, "firewall_notice")
                sitehist.sendMessage('firewall_notice', hostname=host)
                print("send message for host %s down" % host)

        # Two spaces after "time:" reproduce the output of the original
        # two-item Python 2 print statement.
        print("time:  %s" % time.strftime('%Y-%m-%d %H:%M:%S'))
        sys.stdout.flush()
        session.flush()

    for i, site in enumerate(sitenames):
        sitehist = SiteInterface.get_or_make(loginbase=site)
        siteblack = BlacklistRecord.get_by(loginbase=site)

        if siteblack and not siteblack.expired():
            print("skipping %s due to blacklist.  will expire %s" % (site, siteblack.willExpire()))
            # Blacklisted sites get their penalty cleared and re-applied,
            # then are skipped for the rest of the policy.
            sitehist.clearPenalty()
            sitehist.applyPenalty()
            continue

        # TODO: make query only return records within a certain time range,
        #       i.e. greater than 0.5 days ago. or 5 days, etc.
        recent_actions = sitehist.getRecentActions(loginbase=site)

        print("%s %s %s" % (i, sitehist.db.loginbase, sitehist.db.status))

        # determine if there are penalties within the last 30 days?
        # if so, add a 'pause_penalty' action.
        if (sitehist.db.message_id != 0 and sitehist.db.message_status == 'open' and
                sitehist.db.penalty_level > 0 and
                not found_within(recent_actions, 'pause_penalty', 30)):
            # pause escalation
            print("Pausing penalties for %s" % site)
            sitehist.pausePenalty()
        else:

            if sitehist.db.status == 'down':
                if (not found_within(recent_actions, 'pause_penalty', 30) and
                        not found_within(recent_actions, 'increase_penalty', 7) and
                        changed_greaterthan(sitehist.db.last_changed, 7)):

                    # TODO: catch errors
                    sitehist.increasePenalty()
                    sitehist.applyPenalty()
                    sitehist.sendMessage('increase_penalty')

                    print("send message for site %s penalty increase" % site)

            if sitehist.db.status == 'good':
                # clear penalty
                # NOTE: because 'all clear' should have an indefinite status, we
                #       have a boolean value rather than a 'recent action'
                if sitehist.db.penalty_applied:
                    # send message that penalties are cleared.
                    sitehist.clearPenalty()
                    sitehist.applyPenalty()
                    sitehist.sendMessage('clear_penalty')
                    sitehist.closeTicket()

                    print("send message for site %s penalty cleared" % site)

        print("time:  %s" % time.strftime('%Y-%m-%d %H:%M:%S'))
        sys.stdout.flush()
        session.flush()

    session.flush()
238
239
if __name__ == "__main__":
    # Command-line entry point: build the option parser, select the hosts and
    # sites to evaluate (all known ones, or those chosen with the site/node
    # options supplied by parsermodule), then run main() on them.
    parser = parsermodule.getParser(['nodesets'])
    parser.set_defaults(timewait=0,
                        skip=0,
                        rins=False,
                        reboot=False,
                        findbad=False,
                        force=False,
                        nosetup=False,
                        verbose=False,
                        quiet=False,)

    parser.add_option("", "--stopselect", dest="stopselect", metavar="",
                      help="The select string that must evaluate to true for the node to be considered 'done'")
    parser.add_option("", "--findbad", dest="findbad", action="store_true",
                      help="Re-run findbad on the nodes we're going to check before acting.")
    parser.add_option("", "--force", dest="force", action="store_true",
                      help="Force action regardless of previous actions/logs.")
    parser.add_option("", "--rins", dest="rins", action="store_true",
                      help="Set the boot_state to 'rins' for all nodes.")
    parser.add_option("", "--reboot", dest="reboot", action="store_true",
                      help="Actively try to reboot the nodes, keeping a log of actions.")

    parser.add_option("", "--verbose", dest="verbose", action="store_true",
                      help="Extra debug output messages.")
    parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
                      help="Do not perform the orginary setup phase.")
    parser.add_option("", "--skip", dest="skip",
                      help="Number of machines to skip on the input queue.")
    parser.add_option("", "--timewait", dest="timewait",
                      help="Minutes to wait between iterations of 10 nodes.")

    parser = parsermodule.getParser(['defaults'], parser)
    # NOTE: this rebinds the module-level name `config` (imported from
    # `monitor` above) to the parsed options, exactly as the original did.
    config = parsermodule.parse_args(parser)

    # Default working set: every node and site that has a history record.
    hostnames = [n.hostname for n in HistoryNodeRecord.query.all()]
    sitenames = [s.loginbase for s in HistorySiteRecord.query.all()]

    if config.site:
        # TODO: replace with calls to local db.  the api fails so often that
        #       these calls should be regarded as unreliable.
        l_nodes = plccache.GetNodesBySite(config.site)
        # A set gives constant-time membership tests; the original order of
        # `hostnames` is preserved by the list comprehension.
        filter_hostnames = set(n['hostname'] for n in l_nodes)

        hostnames = [h for h in hostnames if h in filter_hostnames]
        sitenames = [config.site]

    if config.node:
        # A single node overrides any site selection made above.
        hostnames = [config.node]
        sitenames = [plccache.plcdb_hn2lb[config.node]]

    try:
        main(hostnames, sitenames)
        session.flush()
    except KeyboardInterrupt:
        print("Killed by interrupt")
        session.flush()
        sys.exit(0)
    except:
        # Top-level catch-all, kept broad as in the original: mail the
        # failure and dump the traceback.
        email_exception()
        # print_exc() writes the traceback itself and returns None; the
        # original wrapped it in `print`, which emitted a stray "None" line.
        traceback.print_exc()
        print("fail all...")