add external commands as stubs for the nagios plugins
[monitor.git] / commands / policy.py
1 #!/usr/bin/python
2
3 # This script is used to manipulate the operational state of nodes in
4 # different node groups.  These are basically set operations on nodes via the
5 # PLC api.
6
7 # Take the ng name as an argument....
8 # optionally, 
9 #  * get a list of nodes in the given nodegroup.
10 #  * set some or all in the set to rins.
11 #  * restart them all.
12 #  * do something else to them all.
13
14
15 import os
16 import time
17 import traceback
18 import sys
19 from optparse import OptionParser
20
21 from monitor import config
22 from monitor import parser as parsermodule
23 from monitor.common import *
24 from monitor.const import MINUP
25 from monitor.model import *
26 from monitor.wrapper import plc
27 from monitor.wrapper import plccache
28 from monitor.database.info.model import *
29 from monitor.database.info.interface import *
30
31 from monitor.query import verify,query_to_dict,node_select
32
# PLC API handle obtained once, when this module is loaded.
# NOTE(review): judging by the name this is an authenticated connection, and
# nothing else in this file reads `api` -- confirm whether importers rely on it.
api = plc.getAuthAPI()
34
def logic():
    """
    Set a node's boot state to 'reinstall' and end its node record.

    NOTE(review): `host` is neither a parameter nor a name bound at module
    level in this file, so unless a star-import or a caller provides a
    global `host`, calling this function raises NameError.  Confirm the
    intended source of `host` before relying on it.
    """
    target = host
    plc.nodeBootState(target, 'reinstall')
    node_end_record(target)
39
def check_node_and_pcu_status_for(loginbase):
    """
    Check whether all the nodes and associated PCUs for a given site are
    considered 'good'.

    A node listed under `loginbase` in plccache.plcdb_lb2hn counts as good
    when all of the following hold:
      * its HistoryNodeRecord exists and has status 'good';
      * it has no HistoryPCURecord, or that record has status 'good';
      * it has no BlacklistRecord, or that record has expired.

    Returns True only when every node is good AND the number of nodes
    checked is strictly greater than MINUP.  Returns False otherwise,
    including when the site has no nodes at all.
    """
    results = []
    for node in plccache.plcdb_lb2hn[loginbase]:
        hostname = node['hostname']

        noderec = FindbadNodeRecord.findby_or_create(hostname=hostname)
        nodehist = HistoryNodeRecord.findby_or_create(hostname=hostname)
        nodebl = BlacklistRecord.get_by(hostname=hostname)
        pcuhist = HistoryPCURecord.get_by(plc_pcuid=noderec.plc_pcuid)

        # A node without a PCU record is not penalised for it.
        healthy = (nodehist is not None and nodehist.status == 'good' and
                   (pcuhist is None or pcuhist.status == 'good'))

        # Only a blacklist entry that has not yet expired disqualifies an
        # otherwise healthy node.
        results.append(bool(healthy and (nodebl is None or nodebl.expired())))

    print("test: %s" % results)

    # An empty site is never 'all good'.  This is checked explicitly rather
    # than relying on an exception handler, so that unrelated errors are no
    # longer silently turned into False.
    if not results:
        return False
    return all(results) and len(results) > MINUP
74
def _latest_pcu_record(host):
    """Return the latest FindbadPCURecord for `host`'s PCU, or None if the
    node record (or the PCU record) cannot be found."""
    # TODO: there MUST be a better way to do this...
    fbnode = FindbadNodeRecord.get_latest_by(hostname=host)
    if not fbnode:
        return None
    return FindbadPCURecord.get_latest_by(plc_pcuid=fbnode.plc_pcuid)


def _checkpoint():
    """Print a timestamp, then flush stdout and the database session."""
    # Two spaces after the colon keep the log line identical to what the
    # Python 2 statement `print "time: ", stamp` produced.
    print("time:  %s" % time.strftime('%Y-%m-%d %H:%M:%S'))
    sys.stdout.flush()
    session.flush()


def _apply_node_policy(index, host):
    """Evaluate the notification/reboot policy for one node.

    Returns False when the node was skipped (unknown to plcdb_hn2lb or
    covered by an unexpired blacklist entry), True once it was processed.
    """
    try:
        lb = plccache.plcdb_hn2lb[host]
    except Exception:
        # Deliberately not a bare `except:` so that Ctrl-C still reaches
        # the KeyboardInterrupt handler of the script entry point.
        print("unknown host in plcdb_hn2lb %s" % host)
        email_exception(host)
        return False

    nodeblack = BlacklistRecord.get_by(hostname=host)
    if nodeblack and not nodeblack.expired():
        print("skipping %s due to blacklist.  will expire %s" % (host, nodeblack.willExpire()))
        return False

    sitehist = SiteInterface.get_or_make(loginbase=lb)
    # NOTE: fetched once; actions taken below are NOT reflected in this list
    #       during the current pass.
    recent_actions = sitehist.getRecentActions(hostname=host)
    nodehist = HistoryNodeRecord.findby_or_create(hostname=host)

    print("%s %s %s" % (index, nodehist.hostname, nodehist.status))

    # NOTE(review): the numeric thresholds below are presumably in days
    #               (the original comments speak of "0.5 days"); confirm
    #               against changed_*/found_* helpers.

    # --- node came back online ---------------------------------------
    # NOTE: chronically flapping nodes will not get 'online' notices,
    #       since they are never up long enough to be 'good'.
    # NOTE: searching for down_notice proves that the node has gone
    #       through a 'down' state first, rather than just flapping
    #       through: good, offline, online, ...
    # NOTE: there is a narrow window in which this command must be
    #       evaluated, otherwise the notice will not go out.  This is not
    #       ideal.
    if (nodehist.status == 'good' and
            changed_lessthan(nodehist.last_changed, 1.0) and
            found_within(recent_actions, 'down_notice', 7.0) and
            not found_within(recent_actions, 'online_notice', 0.5)):
        sitehist.sendMessage('online_notice', hostname=host, viart=False, saveact=True)
        print("send message for host %s online" % host)

    # if a node is offline and doesn't have a PCU, remind the user that they should have one.
    #if not nodehist.haspcu and nodehist.status in ['offline', 'down'] and \
    #       changed_greaterthan(nodehist.last_changed,1.0) and \
    #       not found_within(recent_actions, 'pcumissing_notice', 7.0):
    #
    #               sitehist.sendMessage('pcumissing_notice', hostname=host)
    #               print "send message for host %s pcumissing_notice" % host

    # --- offline with a PCU: try to reboot through it ----------------
    if (nodehist.haspcu and nodehist.status in ['offline', 'down'] and
            changed_greaterthan(nodehist.last_changed, 1.0) and
            not nodehist.firewall and
            not found_between(recent_actions, 'try_reboot', 3.5, 1)):

        fbpcu = _latest_pcu_record(host)

        sitehist.attemptReboot(host)
        print("send message for host %s try_reboot" % host)

        # A missing PCU record is treated like a failed PCU test; the
        # placeholder arguments below describe that case.  (Previously a
        # missing record raised AttributeError here.)
        pcu_failed = (not fbpcu) or (not fbpcu.test_is_ok())
        if pcu_failed and not found_within(recent_actions, 'pcuerror_notice', 3.0):
            if fbpcu:
                args = {
                    'pcu_name': fbpcu.pcu_name(),
                    'pcu_errors': fbpcu.pcu_errors(),
                    'plc_pcuid': fbpcu.plc_pcuid,
                }
            else:
                args = {
                    'pcu_name': "error looking up pcu name",
                    'pcu_errors': "",
                    'plc_pcuid': 0,
                }
            args['hostname'] = host
            sitehist.sendMessage('pcuerror_notice', **args)
            print("send message for host %s PCU Failure" % host)

    # --- reboot was tried earlier and the node is still down ---------
    # NOTE: non-intuitive is that found_between(try_reboot, 3.5, 1)
    #       will be false for a day after the above condition is satisfied
    if (nodehist.haspcu and nodehist.status in ['offline', 'down'] and
            changed_greaterthan(nodehist.last_changed, 1.5) and
            not nodehist.firewall and
            found_between(recent_actions, 'try_reboot', 3.5, 1) and
            not found_within(recent_actions, 'pcufailed_notice', 3.5)):

        fbpcu = _latest_pcu_record(host)
        if fbpcu:
            pcu_name = fbpcu.pcu_name()
        else:
            pcu_name = "error looking up pcu name"

        # send pcu failure message
        sitehist.sendMessage('pcufailed_notice', hostname=host, pcu_name=pcu_name)
        print("send message for host %s PCU Failure" % host)

    # --- failed boot: re-run the boot manager ------------------------
    if (nodehist.status == 'failboot' and
            changed_greaterthan(nodehist.last_changed, 0.25) and
            not found_between(recent_actions, 'bootmanager_restore', 0.5, 0)):
        # delay 0.5 days before retrying...
        print("send message for host %s bootmanager_restore" % host)
        sitehist.runBootManager(host)
        #       sitehist.sendMessage('retry_bootman', hostname=host)

    # --- down for a while: notify (down or firewall flavour) ---------
    if nodehist.status == 'down' and changed_greaterthan(nodehist.last_changed, 2):
        if not nodehist.firewall and not found_within(recent_actions, 'down_notice', 3.5):
            # send down node notice
            sitehist.sendMessage('down_notice', hostname=host)
            print("send message for host %s down" % host)

        if nodehist.firewall and not found_within(recent_actions, 'firewall_notice', 3.5):
            # send firewall notice
            #email_exception(host, "firewall_notice")
            sitehist.sendMessage('firewall_notice', hostname=host)
            print("send message for host %s down" % host)

    return True


def _apply_site_policy(index, site):
    """Evaluate the penalty/ticket policy for one site.

    Returns False when the site was skipped because of an unexpired
    blacklist entry, True once it was processed.
    """
    sitehist = SiteInterface.get_or_make(loginbase=site)
    siteblack = BlacklistRecord.get_by(loginbase=site)

    if siteblack and not siteblack.expired():
        print("skipping %s due to blacklist.  will expire %s" % (site, siteblack.willExpire()))
        # A blacklisted site carries no penalty while the entry is active.
        sitehist.clearPenalty()
        sitehist.applyPenalty()
        return False

    # TODO: make query only return records within a certain time range,
    #       i.e. greater than 0.5 days ago. or 5 days, etc.
    recent_actions = sitehist.getRecentActions(loginbase=site)

    print("%s %s %s" % (index, sitehist.db.loginbase, sitehist.db.status))

    if sitehist.db.status == 'down':
        # A penalty pause older than the threshold is lifted again.
        if (sitehist.db.penalty_pause and
                changed_greaterthan(sitehist.db.penalty_pause_time, 30)):
            email_exception("", "clear pause penalty for site: %s" % sitehist.db.loginbase)
            sitehist.closeTicket()
            # NOTE: but preserve the penalty status.
            sitehist.clearPenaltyPause()

        # An open ticket pauses further penalties.
        if (sitehist.db.message_id != 0 and
                sitehist.db.message_status == 'open' and
                not sitehist.db.penalty_pause):
            email_exception("", "pause penalty for site: %s" % sitehist.db.loginbase)
            sitehist.setPenaltyPause()

        if (not sitehist.db.penalty_pause and
                not found_within(recent_actions, 'increase_penalty', 7) and
                changed_greaterthan(sitehist.db.last_changed, 7)):
            # TODO: catch errors
            sitehist.increasePenalty()
            sitehist.applyPenalty()
            sitehist.sendMessage('increase_penalty')

            print("send message for site %s penalty increase" % site)

    if sitehist.db.status == 'good':
        # clear penalty
        # NOTE: because 'all clear' should have an indefinite status, we
        #       have a boolean value rather than a 'recent action'
        if sitehist.db.penalty_applied or sitehist.db.penalty_pause:
            # send message that penalties are cleared.
            sitehist.clearPenalty()
            sitehist.applyPenalty()
            sitehist.sendMessage('clear_penalty')
            sitehist.closeTicket()

            print("send message for site %s penalty cleared" % site)

        # check all nodes and pcus for this site; if they're all ok,
        #       close the ticket, else leave it open.
        # NOTE: in the case where a PCU reboots and fails, a message is
        #       sent, but the PCU may appear to be ok according to tests.
        # NOTE: Also, bootmanager sends messages regarding disks,
        #       configuration, etc.  So, the conditions here are 'good'
        #       rather than 'not down' as it is in sitebad.
        if check_node_and_pcu_status_for(site):
            sitehist.closeTicket()

    return True


def main(hostnames, sitenames):
    """
    Apply the monitoring policy to the given nodes, then to the given sites.

    hostnames -- iterable of node hostnames; each is looked up in
                 plccache.plcdb_hn2lb and skipped if unknown or blacklisted.
    sitenames -- iterable of site loginbases; blacklisted sites have their
                 penalty cleared and are otherwise skipped.

    After every node or site that was actually processed (not skipped), a
    timestamp is printed and stdout and the database session are flushed.
    The session is flushed once more before returning.  Returns None.
    """
    for index, host in enumerate(hostnames):
        if _apply_node_policy(index, host):
            _checkpoint()

    for index, site in enumerate(sitenames):
        if _apply_site_policy(index, site):
            _checkpoint()

    session.flush()
    return
290
291
if __name__ == "__main__":
    # Script entry point: build the option parser, pick the nodes and sites
    # to examine, and run the policy over them.
    parser = parsermodule.getParser(['nodesets'])
    parser.set_defaults(timewait=0,
                        skip=0,
                        rins=False,
                        reboot=False,
                        findbad=False,
                        force=False,
                        nosetup=False,
                        verbose=False,
                        quiet=False)

    parser.add_option("", "--stopselect", dest="stopselect", metavar="",
                      help="The select string that must evaluate to true for the node to be considered 'done'")
    parser.add_option("", "--findbad", dest="findbad", action="store_true",
                      help="Re-run findbad on the nodes we're going to check before acting.")
    parser.add_option("", "--force", dest="force", action="store_true",
                      help="Force action regardless of previous actions/logs.")
    parser.add_option("", "--rins", dest="rins", action="store_true",
                      help="Set the boot_state to 'rins' for all nodes.")
    parser.add_option("", "--reboot", dest="reboot", action="store_true",
                      help="Actively try to reboot the nodes, keeping a log of actions.")

    parser.add_option("", "--verbose", dest="verbose", action="store_true",
                      help="Extra debug output messages.")
    parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
                      help="Do not perform the orginary setup phase.")
    parser.add_option("", "--skip", dest="skip",
                      help="Number of machines to skip on the input queue.")
    parser.add_option("", "--timewait", dest="timewait",
                      help="Minutes to wait between iterations of 10 nodes.")

    parser = parsermodule.getParser(['defaults'], parser)
    # NOTE: this rebinds the module-level name `config` (imported above from
    #       the monitor package) to the parsed command-line options.
    config = parsermodule.parse_args(parser)

    # Default scope: every node and every site that has a history record.
    hostnames = [n.hostname for n in HistoryNodeRecord.query.all()]
    sitenames = [s.loginbase for s in HistorySiteRecord.query.all()]

    if config.site:
        # Restrict the run to one site and to the known nodes of that site.
        # TODO: replace with calls to local db.  the api fails so often that
        #       these calls should be regarded as unreliable.
        l_nodes = plccache.GetNodesBySite(config.site)
        site_hostnames = set(n['hostname'] for n in l_nodes)

        hostnames = [h for h in hostnames if h in site_hostnames]
        sitenames = [config.site]

    if config.node:
        # A single node takes precedence over any site selection.
        hostnames = [config.node]
        sitenames = [plccache.plcdb_hn2lb[config.node]]

    try:
        main(hostnames, sitenames)
        session.flush()
    except KeyboardInterrupt:
        print("Killed by interrupt")
        session.flush()
        sys.exit(0)
    except Exception:
        email_exception()
        # print_exc() writes the traceback itself and returns None, so its
        # result must not be printed (that produced a stray "None" line).
        traceback.print_exc()
        print("fail all...")