Many small updates and fixes:
[monitor.git] / commands / policy.py
1 #!/usr/bin/python
2
# This script is used to manipulate the operational state of nodes in
# different node groups.  These are basically set operations on nodes via the
# PLC API.

# Take the nodegroup (ng) name as an argument and, optionally:
#  * get a list of nodes in the given nodegroup.
#  * set some or all of the nodes in the set to 'rins'.
#  * restart them all.
#  * do something else to them all.
13
14
15 import os
16 import time
17 import traceback
18 import sys
19 from optparse import OptionParser
20
21 from monitor import config
22 from monitor import parser as parsermodule
23 from monitor.common import *
24 from monitor.const import MINUP
25 from monitor.model import *
26 from monitor.wrapper import plc
27 from monitor.wrapper import plccache
28 from monitor.database.info.model import *
29 from monitor.database.info.interface import *
30
31 from monitor.query import verify,query_to_dict,node_select
32
33 api = plc.getAuthAPI()
34
def logic():
    """
    Set the boot state of `host` to 'reinstall' via plc.nodeBootState(),
    then call node_end_record() for the same host.

    NOTE(review): `host` is neither a parameter nor a local; it is looked up
    as a global when this function runs, and no module-level `host` is
    defined in the visible part of this file.  Confirm where it is meant to
    come from before calling this function.
    """
    target = host
    plc.nodeBootState(target, 'reinstall')
    node_end_record(target)
39
def check_node_and_pcu_status_for(loginbase):
    """
    Check whether all the nodes, and the PCUs associated with them, for the
    site `loginbase` are considered 'good'.

    A node is counted as good when all of the following hold:
      * its HistoryNodeRecord exists and has status 'good';
      * it has no HistoryPCURecord, or that record has status 'good';
      * it has no BlacklistRecord, or its blacklist entry has expired.

    Returns True only when every node of the site is good AND the number of
    nodes is greater than MINUP.  Returns False otherwise, including when
    the site has no nodes at all.

    The plccache.plcdb_lb2hn[loginbase] lookup is not guarded, so a failure
    there (e.g. an unknown loginbase) propagates to the caller.
    """
    results = []
    for node in plccache.plcdb_lb2hn[loginbase]:
        hostname = node['hostname']

        noderec = FindbadNodeRecord.findby_or_create(hostname=hostname)
        nodehist = HistoryNodeRecord.findby_or_create(hostname=hostname)
        nodebl = BlacklistRecord.get_by(hostname=hostname)
        pcuhist = HistoryPCURecord.get_by(plc_pcuid=noderec.plc_pcuid)

        node_good = nodehist is not None and nodehist.status == 'good'
        # A node without any PCU history record is not penalised for it.
        pcu_good = pcuhist is None or pcuhist.status == 'good'

        # The blacklist is consulted only for nodes that are otherwise good
        # (short-circuit), so expired() is called exactly when it was before.
        # A missing or expired blacklist entry does not count against the node.
        results.append(bool(node_good and pcu_good and
                            (nodebl is None or nodebl.expired())))

    print("test: %s" % results)

    # An empty site can never satisfy the policy.  This is stated explicitly
    # instead of relying on reduce() raising on an empty list and a bare
    # 'except' turning that (and any other error) into False.
    if not results:
        return False
    return all(results) and len(results) > MINUP
74
def _checkpoint():
    """Print a timestamp, then flush stdout and the database session."""
    # Produces the same bytes as the Python 2 statement
    #   print "time: ", time.strftime(...)
    # (the print statement inserts one extra space between its two items).
    print("%s %s" % ("time: ", time.strftime('%Y-%m-%d %H:%M:%S')))
    sys.stdout.flush()
    session.flush()


def _apply_node_policy(i, host):
    """
    Evaluate the per-node policy rules for `host` and take the resulting
    actions (notices, reboot attempts, boot manager runs).

    `i` is the position of the host in the input list and is used only for
    log output.  Hosts that are unknown to plccache.plcdb_hn2lb, or that have
    an unexpired blacklist entry, are skipped without a checkpoint.

    The rules below are independent 'if' statements evaluated in order; more
    than one may fire for the same node.  The numeric thresholds appear to be
    in days, judging by the inline notes ("delay 0.5 days", "for a day").
    """
    try:
        lb = plccache.plcdb_hn2lb[host]
    except Exception:
        # Best effort: report and move on.  'Exception' rather than a bare
        # 'except' so that KeyboardInterrupt/SystemExit still reach the
        # top-level handlers.
        print("unknown host in plcdb_hn2lb %s" % host)
        email_exception("%s %s" % (i, host))
        return

    nodeblack = BlacklistRecord.get_by(hostname=host)
    if nodeblack and not nodeblack.expired():
        print("skipping %s due to blacklist.  will expire %s" % (host, nodeblack.willExpire()))
        return

    sitehist = SiteInterface.get_or_make(loginbase=lb)
    recent_actions = sitehist.getRecentActions(hostname=host)
    nodehist = HistoryNodeRecord.findby_or_create(hostname=host)

    print("%s %s %s" % (i, nodehist.hostname, nodehist.status))

    # Rule: node recently came back -> send 'online_notice'.
    # NOTE: chronically flapping nodes will not get 'online' notices
    #       since they are never up long enough to be 'good'.
    # NOTE: searching for down_notice proves that the node has
    #       gone through a 'down' state first, rather than just
    #       flapping through: good, offline, online, ...
    # NOTE: there is a narrow window in which this command must be
    #       evaluated, otherwise the notice will not go out.
    #       this is not ideal.
    if (nodehist.status == 'good' and
            changed_lessthan(nodehist.last_changed, 1.0) and
            found_within(recent_actions, 'down_notice', 7.0) and
            not found_within(recent_actions, 'online_notice', 0.5)):
        sitehist.sendMessage('online_notice', hostname=host, viart=False, saveact=True)
        print("send message for host %s online" % host)

    # if a node is offline and doesn't have a PCU, remind the user that they should have one.
    #if not nodehist.haspcu and nodehist.status in ['offline', 'down'] and \
    #       changed_greaterthan(nodehist.last_changed,1.0) and \
    #       not found_within(recent_actions, 'pcumissing_notice', 7.0):
    #
    #               sitehist.sendMessage('pcumissing_notice', hostname=host)
    #               print "send message for host %s pcumissing_notice" % host

    # Rule: if it is offline and HAS a PCU, then try to use it.
    if (nodehist.haspcu and nodehist.status in ['offline', 'down'] and
            changed_greaterthan(nodehist.last_changed, 1.0) and
            not nodehist.firewall and
            not found_between(recent_actions, 'try_reboot', 3.5, 1)):

        # TODO: there MUST be a better way to do this...
        # get fb node record for pcuid
        fbpcu = None
        fbnode = FindbadNodeRecord.get_latest_by(hostname=host)
        if fbnode:
            fbpcu = FindbadPCURecord.get_latest_by(plc_pcuid=fbnode.plc_pcuid)

        sitehist.attemptReboot(host)
        print("send message for host %s try_reboot" % host)

        # Deliberately disabled by the leading 'False' (kept as in the
        # original): the PCU error notice is never sent.
        if (False and not fbpcu.test_is_ok() and
                not found_within(recent_actions, 'pcuerror_notice', 3.0)):

            args = {}
            if fbpcu:
                args['pcu_name'] = fbpcu.pcu_name()
                args['pcu_errors'] = fbpcu.pcu_errors()
                args['plc_pcuid'] = fbpcu.plc_pcuid
            else:
                args['pcu_name'] = "error looking up pcu name"
                args['pcu_errors'] = ""
                args['plc_pcuid'] = 0

            args['hostname'] = host
            sitehist.sendMessage('pcuerror_notice', **args)
            print("send message for host %s PCU Failure" % host)

    # Rule (deliberately disabled by the leading 'False', kept as in the
    # original): PCU failed notice after an unsuccessful reboot attempt.
    # NOTE: non-intuitive is that found_between(try_reboot, 3.5, 1)
    #       will be false for a day after the above condition is satisfied
    if (False and nodehist.haspcu and nodehist.status in ['offline', 'down'] and
            changed_greaterthan(nodehist.last_changed, 1.5) and
            not nodehist.firewall and
            found_between(recent_actions, 'try_reboot', 3.5, 1) and
            not found_within(recent_actions, 'pcufailed_notice', 3.5)):

        # TODO: there MUST be a better way to do this...
        # get fb node record for pcuid
        fbpcu = None
        fbnode = FindbadNodeRecord.get_latest_by(hostname=host)
        if fbnode:
            fbpcu = FindbadPCURecord.get_latest_by(plc_pcuid=fbnode.plc_pcuid)
        if fbpcu:
            pcu_name = fbpcu.pcu_name()
        else:
            pcu_name = "error looking up pcu name"

        # get fb pcu record for pcuid
        # send pcu failure message
        sitehist.sendMessage('pcufailed_notice', hostname=host, pcu_name=pcu_name)
        print("send message for host %s PCU Failure" % host)

    # Rule: node stuck in 'failboot' -> run the boot manager.
    # delay 0.5 days before retrying...
    if (nodehist.status == 'failboot' and
            changed_greaterthan(nodehist.last_changed, 0.25) and
            not found_between(recent_actions, 'bootmanager_restore', 0.5, 0)):
        print("send message for host %s bootmanager_restore" % host)
        sitehist.runBootManager(host)
        #       sitehist.sendMessage('retry_bootman', hostname=host)

    # Rule: node has been 'down' for a while -> send down node notice.
    if (nodehist.status == 'down' and
            changed_greaterthan(nodehist.last_changed, 2)):
        if not nodehist.firewall and not found_within(recent_actions, 'down_notice', 3.5):
            sitehist.sendMessage('down_notice', hostname=host)
            print("send message for host %s down" % host)

        #if nodehist.firewall and not found_within(recent_actions, 'firewall_notice', 3.5):
            # send down node notice
            #email_exception(host, "firewall_notice")
        #       sitehist.sendMessage('firewall_notice', hostname=host)
        #       print "send message for host %s down" % host

    _checkpoint()


def _apply_site_policy(i, site):
    """
    Evaluate the per-site policy rules for the site `site` (a loginbase)
    and take the resulting actions (penalties, notices, ticket handling).

    `i` is the position of the site in the input list and is used only for
    log output.  A site with an unexpired blacklist entry has its penalty
    cleared and applied, and is then skipped without a checkpoint.
    """
    sitehist = SiteInterface.get_or_make(loginbase=site)
    siteblack = BlacklistRecord.get_by(loginbase=site)

    if siteblack and not siteblack.expired():
        print("skipping %s due to blacklist.  will expire %s" % (site, siteblack.willExpire()))
        sitehist.clearPenalty()
        sitehist.applyPenalty()
        return

    # TODO: make query only return records within a certain time range,
    #       i.e. greater than 0.5 days ago. or 5 days, etc.
    recent_actions = sitehist.getRecentActions(loginbase=site)

    print("%s %s %s" % (i, sitehist.db.loginbase, sitehist.db.status))

    if sitehist.db.status == 'down':
        if (sitehist.db.penalty_pause and
                changed_greaterthan(sitehist.db.penalty_pause_time, 30)):
            email_exception("", "clear pause penalty for site: %s" % sitehist.db.loginbase)
            sitehist.closeTicket()
            # NOTE: but preserve the penalty status.
            sitehist.clearPenaltyPause()

        if (sitehist.db.message_id != 0 and
                sitehist.db.message_status == 'open' and
                not sitehist.db.penalty_pause):
            email_exception("", "pause penalty for site: %s" % sitehist.db.loginbase)
            sitehist.setPenaltyPause()

        if (not sitehist.db.penalty_pause and
                not found_within(recent_actions, 'increase_penalty', 7) and
                changed_greaterthan(sitehist.db.last_changed, 7)):
            # TODO: catch errors
            sitehist.increasePenalty()
            sitehist.applyPenalty()
            sitehist.sendMessage('increase_penalty')

            print("send message for site %s penalty increase" % site)

    if sitehist.db.status == 'good':
        # clear penalty
        # NOTE: because 'all clear' should have an indefinite status, we
        #       have a boolean value rather than a 'recent action'
        if sitehist.db.penalty_applied or sitehist.db.penalty_pause:
            # send message that penalties are cleared.
            sitehist.clearPenalty()
            sitehist.applyPenalty()
            sitehist.sendMessage('clear_penalty')
            sitehist.closeTicket()

            print("send message for site %s penalty cleared" % site)

        # check all nodes and pcus for this site; if they're all ok,
        #       close the ticket, else leave it open.
        # NOTE: in the case where a PCU reboots and fails, a message is
        #       sent, but the PCU may appear to be ok according to tests.
        # NOTE: Also, bootmanager sends messages regarding disks,
        #       configuration, etc.  So, the conditions here are 'good'
        #       rather than 'not down' as it is in sitebad.
        if check_node_and_pcu_status_for(site):
            sitehist.closeTicket()

    _checkpoint()


def main(hostnames, sitenames):
    """
    Apply the monitoring policy to every host in `hostnames`, then to every
    site (loginbase) in `sitenames`.

    Each host and site is handled independently; after each one that is not
    skipped, a timestamp is printed and stdout and the database session are
    flushed.  The session is flushed once more at the end.  Returns None.
    """
    print("apply-policy")

    for i, host in enumerate(hostnames):
        _apply_node_policy(i, host)

    for i, site in enumerate(sitenames):
        _apply_site_policy(i, site)

    session.flush()
    return
291
292
if __name__ == "__main__":
    # Command-line entry point: build the option parser, choose the hosts
    # and sites to act on, then run the policy over them.
    parser = parsermodule.getParser(['nodesets'])
    parser.set_defaults(timewait=0,
                        skip=0,
                        rins=False,
                        reboot=False,
                        findbad=False,
                        force=False,
                        nosetup=False,
                        verbose=False,
                        quiet=False,)

    parser.add_option("", "--stopselect", dest="stopselect", metavar="",
                      help="The select string that must evaluate to true for the node to be considered 'done'")
    parser.add_option("", "--findbad", dest="findbad", action="store_true",
                      help="Re-run findbad on the nodes we're going to check before acting.")
    parser.add_option("", "--force", dest="force", action="store_true",
                      help="Force action regardless of previous actions/logs.")
    parser.add_option("", "--rins", dest="rins", action="store_true",
                      help="Set the boot_state to 'rins' for all nodes.")
    parser.add_option("", "--reboot", dest="reboot", action="store_true",
                      help="Actively try to reboot the nodes, keeping a log of actions.")

    parser.add_option("", "--verbose", dest="verbose", action="store_true",
                      help="Extra debug output messages.")
    parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
                      help="Do not perform the orginary setup phase.")
    parser.add_option("", "--skip", dest="skip",
                      help="Number of machines to skip on the input queue.")
    parser.add_option("", "--timewait", dest="timewait",
                      help="Minutes to wait between iterations of 10 nodes.")

    parser = parsermodule.getParser(['defaults'], parser)
    # NOTE: this rebinds the module-level name 'config' (imported from
    # 'monitor' at the top of the file) to the parsed options object.
    config = parsermodule.parse_args(parser)

    # Default: act on every node and every site that has a history record.
    hostnames = [n.hostname for n in HistoryNodeRecord.query.all()]
    sitenames = [s.loginbase for s in HistorySiteRecord.query.all()]

    if config.site:
        # TODO: replace with calls to local db.  the api fails so often that
        #       these calls should be regarded as unreliable.
        l_nodes = plccache.GetNodesBySite(config.site)
        # A set makes each membership test O(1) instead of a list scan.
        site_hostnames = set(n['hostname'] for n in l_nodes)

        hostnames = [h for h in hostnames if h in site_hostnames]
        sitenames = [config.site]

    if config.node:
        # A single node overrides any site selection made above.
        hostnames = [config.node]
        sitenames = [plccache.plcdb_hn2lb[config.node]]

    try:
        main(hostnames, sitenames)
        session.flush()
    except KeyboardInterrupt:
        print("Killed by interrupt")
        session.flush()
        sys.exit(0)
    except Exception:
        email_exception()
        # print_exc() writes the traceback itself and returns None, so its
        # result must not be printed (that emitted a stray "None" line).
        traceback.print_exc()
        print("fail all...")