added blacklist to action.py
[monitor.git] / policy.py
1 #!/usr/bin/python
2
3 # This script is used to manipulate the operational state of nodes in
4 # different node groups.  These are basically set operations on nodes via the
5 # PLC api.
6
7 # Take the ng name as an argument....
8 # optionally, 
9 #  * get a list of nodes in the given nodegroup.
10 #  * set some or all in the set to rins.
11 #  * restart them all.
12 #  * do something else to them all.
13
14
15 import os
16 import time
17 import traceback
18 import sys
19 from optparse import OptionParser
20
21 import bootman          # debug nodes
22
23 from monitor import util
24 from monitor import const
25 from monitor import reboot
26 from monitor import config
27 from monitor import database
28 from monitor import parser as parsermodule
29 from monitor.common import *
30 from monitor.model import *
31 from monitor.wrapper import plc
32 from monitor.wrapper import plccache
33 from monitor.wrapper.emailTxt import mailtxt
34 from monitor.database.info.model import *
35
36 from nodequery import verify,query_to_dict,node_select
37
# Module-wide PLC API handle obtained from plc.getAuthAPI().  The script
# entry point at the bottom of this file uses it for GetSites/GetNodes.
api = plc.getAuthAPI()
39
40
class SiteInterface(HistorySiteRecord):
        """Policy-level wrapper around one site's history record.

        The wrapped record is kept in ``self.db``.  The methods adjust the
        site's penalty level, exchange notices with the site's contacts
        (optionally through RT tickets), attempt node reboots, and log what
        was done by creating ActionRecord rows.
        """

        @classmethod
        def get_or_make(cls, if_new_set=None, **kwargs):
                """Find (or create) a site history record and wrap it.

                if_new_set -- dict forwarded as the first argument of
                        HistorySiteRecord.findby_or_create; defaults to an empty
                        dict.  (Presumably the values applied to a newly created
                        record -- confirm against findby_or_create.)
                kwargs -- lookup keys.  A 'hostname' key is replaced by the
                        'loginbase' of the owning site via plccache.plcdb_hn2lb,
                        so an unknown hostname raises KeyError.

                Returns an instance of this class wrapping the record.
                """
                # Build a fresh dict per call; a shared mutable default would be
                # visible to every later call if the callee ever modified it.
                if if_new_set is None:
                        if_new_set = {}
                if 'hostname' in kwargs:
                        kwargs['loginbase'] = plccache.plcdb_hn2lb[kwargs.pop('hostname')]
                res = HistorySiteRecord.findby_or_create(if_new_set, **kwargs)
                return cls(res)

        def __init__(self, sitehist):
                """Wrap the given site history record; it is stored as self.db."""
                self.db = sitehist

        def getRecentActions(self, **kwargs):
                """Return the ActionRecords of a site or a host, newest first.

                kwargs -- either 'loginbase' or 'hostname'; 'loginbase' takes
                        precedence when both are given.

                Returns an ActionRecord query ordered by date_created descending,
                or an empty list when neither key is supplied.
                """
                # TODO: make query only return records within a certain time range,
                # i.e. greater than 0.5 days ago. or 5 days, etc.
                if 'loginbase' in kwargs:
                        matching = ActionRecord.query.filter_by(loginbase=kwargs['loginbase'])
                elif 'hostname' in kwargs:
                        matching = ActionRecord.query.filter_by(hostname=kwargs['hostname'])
                else:
                        return []
                return matching.order_by(ActionRecord.date_created.desc())

        def increasePenalty(self):
                """Raise the site's penalty level by one, capped at 2, and flag it."""
                #act = ActionRecord(loginbase=self.db.loginbase, action='penalty', action_type='increase_penalty',)
                # NOTE: the cap keeps the level a valid index into the penalty
                #       table built by applyPenalty (levels 0..2).
                self.db.penalty_level = min(self.db.penalty_level + 1, 2)
                self.db.penalty_applied = True

        def applyPenalty(self):
                """Bring the site's PLC restrictions in line with its penalty level.

                Penalties are cumulative: every penalty above the current level is
                disabled (highest first), then every penalty up to and including
                the current level is enabled (lowest first).
                """
                penalty_map = [
                        { 'name': 'noop',
                          'enable'  : lambda site: None,
                          'disable' : lambda site: None },
                        { 'name': 'nocreate',
                          'enable'  : lambda site: plc.removeSiteSliceCreation(site),
                          'disable' : lambda site: plc.enableSiteSliceCreation(site) },
                        { 'name': 'suspendslices',
                          'enable'  : lambda site: plc.suspendSiteSlices(site),
                          'disable' : lambda site: plc.enableSiteSlices(site) },
                ]

                # A stored level beyond the table would raise IndexError in the
                # enable loop; treat it as the strongest known penalty instead.
                level = min(self.db.penalty_level, len(penalty_map) - 1)
                loginbase = self.db.loginbase

                for i in range(len(penalty_map) - 1, level, -1):
                        print("\tdisabling %s on %s" % (penalty_map[i]['name'], loginbase))
                        penalty_map[i]['disable'](loginbase)

                for i in range(0, level + 1):
                        print("\tapplying %s on %s" % (penalty_map[i]['name'], loginbase))
                        penalty_map[i]['enable'](loginbase)

                return

        def pausePenalty(self):
                """Log a 'pause_penalty' action for this site."""
                # Only the creation of the record matters; the object is not
                # used afterwards.
                ActionRecord(loginbase=self.db.loginbase,
                                        action='penalty',
                                        action_type='pause_penalty',)

        def clearPenalty(self):
                """Reset the site's penalty level to 0 and clear the applied flag."""
                #act = ActionRecord(loginbase=self.db.loginbase, action='penalty', action_type='clear_penalty',)
                self.db.penalty_level = 0
                self.db.penalty_applied = False

        def getTicketStatus(self):
                """Refresh message_status/queue/created from the site's ticket.

                Does nothing when the site has no ticket (message_id == 0).
                """
                if self.db.message_id != 0:
                        rtstatus = mailer.getTicketStatus(self.db.message_id)
                        self.db.message_status = rtstatus['Status']
                        self.db.message_queue = rtstatus['Queue']
                        self.db.message_created = datetime.fromtimestamp(rtstatus['Created'])

        def setTicketStatus(self, status):
                """Set the status of the site's ticket, if it has one."""
                print('SETTING status %s' % status)
                if self.db.message_id != 0:
                        mailer.setTicketStatus(self.db.message_id, status)

        def getContacts(self):
                """Return the email addresses to notify at the current penalty level.

                The list widens with the level: tech contacts from level 0, PIs
                from level 1, slice users from level 2.
                """
                contacts = []
                if self.db.penalty_level >= 0:
                        contacts += plc.getTechEmails(self.db.loginbase)

                if self.db.penalty_level >= 1:
                        contacts += plc.getPIEmails(self.db.loginbase)

                if self.db.penalty_level >= 2:
                        contacts += plc.getSliceUserEmails(self.db.loginbase)

                return contacts

        def sendMessage(self, type, **kwargs):
                """Send the notice template named `type` for this site.

                type -- name of an attribute of emailTxt.mailtxt.  Its elements
                        [0] and [1] are %-formatted with the arguments below
                        (presumably subject and body -- confirm against Message).
                kwargs -- extra template arguments, merged over 'loginbase' and
                        'penalty_level'.  'hostname' is also stored in the action
                        record; 'viart' (default True) selects sending via RT.

                An unknown template name only prints a warning.
                """
                # NOTE: evidently changing an RT message's subject opens the ticket.
                #       the logic in this policy depends on a ticket only being 'open'
                #       if a user has replied to it.
                #       So, to preserve these semantics, we check the status before
                #           sending, then after sending, reset the status to the
                #           previous status.
                #       There is a very tiny race here, where a user sends a reply
                #           within the time it takes to check, send, and reset.
                #       This sucks.  It's almost certainly fragile.

                # TODO: catch any errors here, and add an ActionRecord that contains
                #       those errors.

                if not hasattr(mailtxt, type):
                        print("+-- WARNING! ------------------------------")
                        print("| No such message name in emailTxt.mailtxt: %s" % type)
                        print("+------------------------------------------")
                        return

                args = {'loginbase' : self.db.loginbase, 'penalty_level' : self.db.penalty_level}
                args.update(kwargs)
                hostname = args.get('hostname')

                message = getattr(mailtxt, type)
                viart = kwargs.get('viart', True)

                if viart:
                        self.getTicketStatus()          # get current message status

                m = Message(message[0] % args, message[1] % args, viart, self.db.message_id)

                contacts = self.getContacts()
                contacts = [config.cc_email]    # TODO: remove after testing...

                print("sending message: %s to site %s for host %s" % (type, self.db.loginbase, hostname))

                ret = m.send(contacts)
                if viart:
                        self.db.message_id = ret
                        # reset to previous status, since a new subject 'opens' RT tickets.
                        self.setTicketStatus(self.db.message_status)

                        # NOTE: only make a record of it if it's in RT.
                        ActionRecord(loginbase=self.db.loginbase, hostname=hostname, action='notice',
                                                action_type=type, message_id=self.db.message_id)

                return

        def closeTicket(self):
                """Close the site's RT ticket, log 'end_notice', and forget the ticket."""
                # The ticket is closed (and logged) before message_id is overwritten.
                mailer.closeTicketViaRT(self.db.message_id, "Ticket Closed by Monitor")
                ActionRecord(loginbase=self.db.loginbase, action='notice',
                                        action_type='end_notice', message_id=self.db.message_id)
                self.db.message_id = 0
                self.db.message_status = "new"

        def runBootManager(self, hostname):
                """Run bootman.restore for a host and log the attempt.

                Returns whatever bootman.restore returned, or "" when it failed.
                A failure is printed and stored in the action record's
                error_string instead of being raised.
                """
                print("attempting BM reboot of %s" % hostname)
                ret = ""
                err = ""
                try:
                        ret = bootman.restore(self, hostname)
                except (Exception, SystemExit):
                        # Best effort: one failing node must not end the whole run.
                        # KeyboardInterrupt is deliberately left out so Ctrl-C still
                        # reaches the handler at the script's entry point.
                        err = traceback.format_exc()
                        print(err)

                ActionRecord(loginbase=self.db.loginbase,
                                        hostname=hostname,
                                        action='reboot',
                                        action_type='bootmanager_restore',
                                        error_string=err)
                return ret

        def attemptReboot(self, hostname):
                """Try a PCU reboot of a host and log the attempt.

                Returns the error string stored in the action record: "" when
                reboot.reboot_str reported 0 / "0", otherwise its result unchanged.
                """
                print("attempting PCU reboot of %s" % hostname)
                ret = reboot.reboot_str(hostname)
                if ret == 0 or ret == "0":
                        ret = ""
                ActionRecord(loginbase=self.db.loginbase,
                                        hostname=hostname,
                                        action='reboot',
                                        action_type='first_try_reboot',
                                        error_string=ret)
                # Returned for parity with runBootManager; existing callers that
                # ignore the result are unaffected.
                return ret
221
def logic(host=None):
        """Set a node's boot state to 'rins', then call node_end_record on it.

        host -- hostname of the node to act on.  It used to be read from an
                undefined free variable, so every call failed with NameError;
                it is now an explicit argument.

        Raises ValueError when no host is given.

        NOTE(review): node_end_record is not defined in the visible part of
        this module -- presumably it comes from one of the star imports;
        confirm before relying on this function.
        """
        if host is None:
                raise ValueError("logic() requires the hostname of the node to act on")

        plc.nodeBootState(host, 'rins')
        node_end_record(host)
226
227
228
229
def _apply_node_policy(host):
        """Evaluate the node-level policy rules for one hostname.

        Hosts missing from plccache.plcdb_hn2lb and hosts with an unexpired
        blacklist entry are skipped.  The numeric thresholds passed to the
        changed_*/found_* helpers are presumably in days (see the
        "delay 0.5 days" note below) -- confirm against those helpers.
        """
        try:
                lb = plccache.plcdb_hn2lb[host]
        except Exception:
                # Any lookup failure means "skip this host", but Ctrl-C must
                # still be able to stop the run, hence not a bare except.
                print("unknown host in plcdb_hn2lb %s" % host)
                return

        nodeblack = BlacklistRecord.get_by(hostname=host)
        if nodeblack and not nodeblack.expired():
                print("skipping %s due to blacklist.  will expire %s" % (host, nodeblack.willExpire()))
                return

        sitehist = SiteInterface.get_or_make(loginbase=lb)
        recent_actions = sitehist.getRecentActions(hostname=host)
        nodehist = HistoryNodeRecord.findby_or_create(hostname=host)

        print("%s %s" % (nodehist.hostname, nodehist.status))

        # Node recently came back: announce it once.
        if nodehist.status == 'good' and \
                changed_lessthan(nodehist.last_changed, 1.0) and \
                not found_within(recent_actions, 'online_notice', 0.5):
                # NOTE: there is a narrow window in which this command must be
                # evaluated, otherwise the notice will not go out.  this is not ideal.
                sitehist.sendMessage('online_notice', hostname=host)
                print("send message for host %s online" % host)

        # Node unreachable for a while: first try a PCU reboot.
        if ( nodehist.status == 'offline' or nodehist.status == 'down' ) and \
                changed_greaterthan(nodehist.last_changed, 1.0) and \
                not found_between(recent_actions, 'first_try_reboot', 3.5, 1):
                sitehist.attemptReboot(host)
                print("send message for host %s first_try_reboot" % host)

        # NOTE: non-intuitive is that found_between(first_try_reboot, 3.5, 1)
        #       will be false for a day after the above condition is satisfied
        if ( nodehist.status == 'offline' or nodehist.status == 'down' ) and \
                changed_greaterthan(nodehist.last_changed, 1.5) and \
                found_between(recent_actions, 'first_try_reboot', 3.5, 1) and \
                not found_within(recent_actions, 'pcufailed_notice', 3.5):
                # The earlier reboot attempt did not help: send pcu failure message.
                sitehist.sendMessage('pcufailed_notice', hostname=host)
                print("send message for host %s PCU Failure" % host)

        # Node stuck in debug mode: retry the boot manager,
        # with a delay of 0.5 days before retrying...
        if nodehist.status == 'monitordebug' and \
                changed_greaterthan(nodehist.last_changed, 1) and \
                not found_between(recent_actions, 'bootmanager_restore', 0.5, 0):
                print("send message for host %s bootmanager_restore" % host)
                sitehist.runBootManager(host)
                #sitehist.sendMessage('retry_bootman', hostname=host)

        # Node still down: send down node notice.
        if nodehist.status == 'down' and \
                changed_greaterthan(nodehist.last_changed, 2) and \
                not found_within(recent_actions, 'down_notice', 3.5):
                sitehist.sendMessage('down_notice', hostname=host)
                print("send message for host %s offline" % host)


def _apply_site_policy(site):
        """Evaluate the site-level penalty rules for one site loginbase."""
        sitehist = SiteInterface.get_or_make(loginbase=site)
        # TODO: make query only return records within a certain time range,
        #       i.e. greater than 0.5 days ago. or 5 days, etc.
        recent_actions = sitehist.getRecentActions(loginbase=site)

        #sitehist.sendMessage('test_notice', host)

        print("%s %s" % (sitehist.db.loginbase, sitehist.db.status))

        # Site down, escalation neither paused nor stepped recently: escalate.
        if sitehist.db.status == 'down' and \
                not found_within(recent_actions, 'pause_penalty', 30) and \
                not found_within(recent_actions, 'increase_penalty', 7) and \
                changed_greaterthan(sitehist.db.last_changed, 7):
                # TODO: catch errors
                sitehist.increasePenalty()
                #sitehist.applyPenalty()
                sitehist.sendMessage('increase_penalty')

                print("send message for site %s penalty increase" % site)

        # clear penalty
        # NOTE: because 'all clear' should have an indefinite status, we
        #       have a boolean value rather than a 'recent action'
        if sitehist.db.status == 'good' and sitehist.db.penalty_applied:
                # send message that penalties are cleared.
                sitehist.clearPenalty()
                #sitehist.applyPenalty()
                sitehist.sendMessage('clear_penalty')
                sitehist.closeTicket()

                print("send message for site %s penalty cleared" % site)

        # find all ticket ids for site ( could be on the site record? )
        # determine if there are penalties within the last 30 days?
        # if so, add a 'pause_penalty' action.
        if sitehist.db.message_id != 0 and sitehist.db.message_status == 'open' and sitehist.db.penalty_level > 0:
                # An 'open' ticket on a penalised site: pause escalation.
                print("Pausing penalties for %s" % site)
                sitehist.pausePenalty()


def main(hostnames, sitenames):
        """Run the monitoring policy over the given nodes, then the given sites.

        hostnames -- iterable of node hostnames to evaluate.
        sitenames -- iterable of site loginbases to evaluate.

        The database session is flushed once after both passes.
        """
        for host in hostnames:
                _apply_node_policy(host)

        for site in sitenames:
                _apply_site_policy(site)

        session.flush()

        return
357
358
# Script entry point: parse the command line, choose the hosts and sites to
# evaluate (everything in the local history tables by default, narrowed by
# --site / --node style options), then run the policy once.
if __name__ == "__main__":
        parser = parsermodule.getParser(['nodesets'])
        parser.set_defaults( timewait=0,
                                                skip=0,
                                                rins=False,
                                                reboot=False,
                                                findbad=False,
                                                force=False,
                                                nosetup=False,
                                                verbose=False,
                                                quiet=False,
                                                )

        parser.add_option("", "--stopselect", dest="stopselect", metavar="",
                                                help="The select string that must evaluate to true for the node to be considered 'done'")
        parser.add_option("", "--findbad", dest="findbad", action="store_true",
                                                help="Re-run findbad on the nodes we're going to check before acting.")
        parser.add_option("", "--force", dest="force", action="store_true",
                                                help="Force action regardless of previous actions/logs.")
        parser.add_option("", "--rins", dest="rins", action="store_true",
                                                help="Set the boot_state to 'rins' for all nodes.")
        parser.add_option("", "--reboot", dest="reboot", action="store_true",
                                                help="Actively try to reboot the nodes, keeping a log of actions.")

        parser.add_option("", "--verbose", dest="verbose", action="store_true",
                                                help="Extra debug output messages.")
        parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
                                                help="Do not perform the orginary setup phase.")
        parser.add_option("", "--skip", dest="skip",
                                                help="Number of machines to skip on the input queue.")
        parser.add_option("", "--timewait", dest="timewait",
                                                help="Minutes to wait between iterations of 10 nodes.")

        parser = parsermodule.getParser(['defaults'], parser)
        # NOTE(review): this rebinds the module-level name `config` (imported
        # from monitor above), which SiteInterface.sendMessage reads for
        # cc_email -- confirm the object returned here still provides it.
        config = parsermodule.parse_args(parser)

#       # COLLECT nodegroups, nodes and node lists
#       if config.nodegroup:
#               ng = api.GetNodeGroups({'name' : config.nodegroup})
#               nodelist = api.GetNodes(ng[0]['node_ids'])
#               hostnames = [ n['hostname'] for n in nodelist ]

        # Default scope: every node and every site in the local history tables.
        hostnames = [ n.hostname for n in HistoryNodeRecord.query.all() ]
        sitenames = [ s.loginbase for s in HistorySiteRecord.query.all() ]

        if config.site:
                # TODO: replace with calls to local db.  the api fails so often that
                #       these calls should be regarded as unreliable.
                site = api.GetSites(config.site)
                l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname'])
                # A set gives constant-time membership tests while filtering.
                site_hostnames = set(n['hostname'] for n in l_nodes)

                hostnames = [ h for h in hostnames if h in site_hostnames ]
                sitenames = [config.site]

        if config.node:
                hostnames = [ config.node ]
                sitenames = [ plccache.plcdb_hn2lb[config.node] ]

        try:
                main(hostnames, sitenames)
        except KeyboardInterrupt:
                print("Killed by interrupt")
                sys.exit(0)
        except Exception:
                #email_exception()
                # print_exc() writes the traceback itself and returns None, so it
                # must not be wrapped in a print (that emitted a stray "None").
                traceback.print_exc()
                print("Continuing...")