add loginbase to blacklist
[monitor.git] / policy.py
1 #!/usr/bin/python
2
3 # This script is used to manipulate the operational state of nodes in
4 # different node groups.  These are basically set operations on nodes via the
5 # PLC api.
6
# Take the nodegroup (ng) name as an argument and, optionally:
#  * get a list of nodes in the given nodegroup.
#  * set some or all of the nodes in the set to 'rins'.
#  * restart them all.
#  * do something else to them all.
13
14
15 import os
16 import time
17 import traceback
18 import sys
19 from optparse import OptionParser
20
21 import bootman          # debug nodes
22
23 from monitor import util
24 from monitor import const
25 from monitor import reboot
26 from monitor import config
27 from monitor import database
28 from monitor import parser as parsermodule
29 from monitor.common import *
30 from monitor.model import *
31 from monitor.wrapper import plc
32 from monitor.wrapper import plccache
33 from monitor.wrapper.emailTxt import mailtxt
34 from monitor.database.info.model import *
35
36 from nodequery import verify,query_to_dict,node_select
37
# Module-level PLC API handle, obtained once at import time.  In this file it
# is used by the __main__ block (api.GetSites / api.GetNodes).
api = plc.getAuthAPI()
39
40
class SiteInterface(HistorySiteRecord):
    """Wrap a site history record and carry out policy actions for that site.

    The wrapped record is stored in ``self.db``.  The methods read and update
    fields on it (``loginbase``, ``penalty_level``, ``penalty_applied``,
    ``message_id``, ``message_status``, ...) and create ``ActionRecord``
    objects describing what was done.

    NOTE(review): ``mailer``, ``Message`` and ``datetime`` are not imported by
    name in this file; presumably they arrive through one of the star
    imports -- confirm.
    """

    @classmethod
    def get_or_make(cls, if_new_set=None, **kwargs):
        """Find or create the site history record and return it wrapped.

        if_new_set -- passed through as the first argument of
                      HistorySiteRecord.findby_or_create(); defaults to a
                      fresh empty dict on every call.
        kwargs     -- lookup keys.  A 'hostname' key is translated into the
                      matching 'loginbase' through plccache.plcdb_hn2lb (a
                      missing hostname raises whatever that lookup raises).
        """
        # A literal {} default would be one dict shared by every call; build a
        # new one per call instead.
        if if_new_set is None:
            if_new_set = {}
        if 'hostname' in kwargs:
            kwargs['loginbase'] = plccache.plcdb_hn2lb[kwargs.pop('hostname')]
        res = HistorySiteRecord.findby_or_create(if_new_set, **kwargs)
        return cls(res)

    def __init__(self, sitehist):
        """Remember the wrapped site history record as ``self.db``."""
        self.db = sitehist

    def getRecentActions(self, **kwargs):
        """Return ActionRecords for a site or a host, newest first.

        Pass either loginbase=... or hostname=...; 'loginbase' wins when both
        are given.  With neither, an empty list is returned.
        """
        # TODO: make query only return records within a certain time range,
        # i.e. greater than 0.5 days ago. or 5 days, etc.
        for key in ('loginbase', 'hostname'):
            if key in kwargs:
                query = ActionRecord.query.filter_by(**{key: kwargs[key]})
                return query.order_by(ActionRecord.date_created.desc())
        return []

    def increasePenalty(self):
        """Raise the penalty level by one (capped at 2) and mark it applied."""
        # The cap of 2 is the highest index in applyPenalty's penalty table.
        self.db.penalty_level = min(self.db.penalty_level + 1, 2)
        self.db.penalty_applied = True

    def applyPenalty(self):
        """Make the PLC-side restrictions match self.db.penalty_level.

        Every penalty above the current level is disabled (highest first),
        then every penalty up to and including the current level is enabled
        (lowest first).
        """
        penalty_map = [
            {'name': 'noop',
             'enable': lambda site: None,
             'disable': lambda site: None},
            {'name': 'nocreate',
             'enable': lambda site: plc.removeSiteSliceCreation(site),
             'disable': lambda site: plc.enableSiteSliceCreation(site)},
            {'name': 'suspendslices',
             'enable': lambda site: plc.suspendSiteSlices(site),
             'disable': lambda site: plc.enableSiteSlices(site)},
        ]
        top = len(penalty_map) - 1
        # Clamp so a stored level beyond the table cannot raise IndexError.
        level = min(self.db.penalty_level, top)

        for i in range(top, level, -1):
            print("\tdisabling %s on %s" % (penalty_map[i]['name'], self.db.loginbase))
            penalty_map[i]['disable'](self.db.loginbase)

        for i in range(0, level + 1):
            print("\tapplying %s on %s" % (penalty_map[i]['name'], self.db.loginbase))
            penalty_map[i]['enable'](self.db.loginbase)

    def pausePenalty(self):
        """Log a 'pause_penalty' action for this site."""
        # Only the construction matters here; the object is not used further.
        # Presumably the ORM session tracks it until the caller flushes --
        # TODO confirm.
        ActionRecord(loginbase=self.db.loginbase,
                     action='penalty',
                     action_type='pause_penalty')

    def clearPenalty(self):
        """Reset the penalty level to 0 and clear the applied flag."""
        self.db.penalty_level = 0
        self.db.penalty_applied = False

    def getTicketStatus(self):
        """Refresh the message_* fields from the ticket, if there is one.

        Does nothing when self.db.message_id is 0.
        """
        if self.db.message_id != 0:
            rtstatus = mailer.getTicketStatus(self.db.message_id)
            self.db.message_status = rtstatus['Status']
            self.db.message_queue = rtstatus['Queue']
            self.db.message_created = datetime.fromtimestamp(rtstatus['Created'])

    def setTicketStatus(self, status):
        """Set the ticket's status; a message_id of 0 means no ticket."""
        print('SETTING status %s' % status)
        if self.db.message_id != 0:
            mailer.setTicketStatus(self.db.message_id, status)

    def getContacts(self):
        """Return the e-mail addresses to notify for the current penalty level.

        Tech contacts at level >= 0, plus PIs at level >= 1, plus slice users
        at level >= 2.
        """
        contacts = []
        if self.db.penalty_level >= 0:
            contacts += plc.getTechEmails(self.db.loginbase)

        if self.db.penalty_level >= 1:
            contacts += plc.getPIEmails(self.db.loginbase)

        if self.db.penalty_level >= 2:
            contacts += plc.getSliceUserEmails(self.db.loginbase)

        return contacts

    def sendMessage(self, type, **kwargs):
        """Send the mailtxt template named `type` for this site.

        type   -- attribute name on emailTxt.mailtxt.  The attribute is
                  indexed [0] and [1] (presumably subject and body -- TODO
                  confirm) and each part is %-formatted with the arguments
                  below.  An unknown name only prints a warning.
        kwargs -- extra template arguments merged over 'loginbase' and
                  'penalty_level'.  'hostname' is also recorded on the
                  ActionRecord; 'viart' (default True) is passed to Message
                  and controls the ticket bookkeeping.
        """
        # NOTE: evidently changing an RT message's subject opens the ticket.
        #       the logic in this policy depends upon a ticket only being
        #       'open' if a user has replied to it.
        #       So, to preserve these semantics, we check the status before
        #           sending, then after sending, reset the status to the
        #           previous status.
        #       There is a very tiny race here, where a user sends a reply
        #           within the time it takes to check, send, and reset.
        #       This sucks.  It's almost certainly fragile.

        # TODO: catch any errors here, and add an ActionRecord that contains
        #       those errors.

        args = {'loginbase': self.db.loginbase,
                'penalty_level': self.db.penalty_level}
        args.update(kwargs)
        hostname = args.get('hostname')

        if not hasattr(mailtxt, type):
            print("+-- WARNING! ------------------------------")
            print("| No such message name in emailTxt.mailtxt: %s" % type)
            print("+------------------------------------------")
            return

        message = getattr(mailtxt, type)
        viart = kwargs.get('viart', True)

        if viart:
            self.getTicketStatus()          # get current message status

        m = Message(message[0] % args, message[1] % args, viart, self.db.message_id)

        # getContacts() is still called, but its result is currently replaced
        # by the cc address below.
        contacts = self.getContacts()
        contacts = [config.cc_email]    # TODO: remove after testing...

        print("sending message: %s to site %s for host %s" % (type, self.db.loginbase, hostname))

        ret = m.send(contacts)
        if viart:
            self.db.message_id = ret
            # reset to previous status, since a new subject 'opens' RT tickets.
            self.setTicketStatus(self.db.message_status)

            # NOTE: only make a record of it if it's in RT.
            ActionRecord(loginbase=self.db.loginbase, hostname=hostname,
                         action='notice', action_type=type,
                         message_id=self.db.message_id)

    def closeTicket(self):
        """Close the site's ticket, log an 'end_notice', and reset message fields."""
        # The ticket is closed before message_id is overwritten below.
        mailer.closeTicketViaRT(self.db.message_id, "Ticket Closed by Monitor")
        ActionRecord(loginbase=self.db.loginbase, action='notice',
                     action_type='end_notice', message_id=self.db.message_id)
        self.db.message_id = 0
        self.db.message_status = "new"

    def runBootManager(self, hostname):
        """Call bootman.restore() for `hostname` and log the attempt.

        Returns whatever bootman.restore() returned, or "" if it raised; in
        that case the traceback is printed and stored as the ActionRecord's
        error_string.
        """
        print("attempting BM reboot of %s" % hostname)
        ret = ""
        err = ""
        # NOTE(review): the bare except also traps SystemExit and
        # KeyboardInterrupt raised inside bootman.restore(); confirm that is
        # intended before narrowing it.
        try:
            ret = bootman.restore(self, hostname)
        except:
            err = traceback.format_exc()
            print(err)

        ActionRecord(loginbase=self.db.loginbase,
                     hostname=hostname,
                     action='reboot',
                     action_type='bootmanager_restore',
                     error_string=err)
        return ret

    def attemptReboot(self, hostname):
        """Call reboot.reboot_str() for `hostname` and log the attempt.

        Returns the result of reboot.reboot_str() with 0 / "0" mapped to "",
        or str() of the exception if the call raised.  The traceback of a
        failed call is stored as the ActionRecord's error_string.
        """
        print("attempting PCU reboot of %s" % hostname)
        err = ""
        try:
            ret = reboot.reboot_str(hostname)
        except Exception:
            err = traceback.format_exc()
            # sys.exc_info() rather than binding the exception in the except
            # clause: this spelling parses on old and new interpreters alike.
            ret = str(sys.exc_info()[1])

        if ret == 0 or ret == "0":
            ret = ""

        ActionRecord(loginbase=self.db.loginbase,
                     hostname=hostname,
                     action='reboot',
                     action_type='first_try_reboot',
                     error_string=err)
        # The original computed `ret` and then dropped it; hand it back the
        # same way runBootManager() does.
        return ret
228
def logic():
        # Calls plc.nodeBootState(host, 'rins') and then node_end_record(host).
        #
        # NOTE(review): neither `host` nor `node_end_record` is defined in this
        # function or anywhere else in the visible part of this file.  Unless a
        # star import supplies them, calling logic() raises NameError.  Nothing
        # in the visible file calls it.

        plc.nodeBootState(host, 'rins')
        node_end_record(host)
233
234
235
236
def _apply_node_policy(host):
    """Run the per-node rules (notices, PCU reboot, BootManager) for one host."""
    try:
        lb = plccache.plcdb_hn2lb[host]
    except KeyError:
        print("unknown host in plcdb_hn2lb %s" % host)
        return

    nodeblack = BlacklistRecord.get_by(hostname=host)
    if nodeblack and not nodeblack.expired():
        print("skipping %s due to blacklist.  will expire %s" % (host, nodeblack.willExpire()))
        return

    sitehist = SiteInterface.get_or_make(loginbase=lb)
    recent_actions = sitehist.getRecentActions(hostname=host)
    nodehist = HistoryNodeRecord.findby_or_create(hostname=host)

    print("%s %s" % (nodehist.status and nodehist.hostname or nodehist.hostname, nodehist.status))

    # The rules below are independent `if`s evaluated in this order; more than
    # one may fire for the same host in a single pass.

    if nodehist.status == 'good' and \
            changed_lessthan(nodehist.last_changed, 1.0) and \
            not found_within(recent_actions, 'online_notice', 0.5):
        # NOTE: there is a narrow window in which this command must be
        # evaluated, otherwise the notice will not go out.  this is not ideal.
        sitehist.sendMessage('online_notice', hostname=host)
        print("send message for host %s online" % host)

    if (nodehist.status == 'offline' or nodehist.status == 'down') and \
            changed_greaterthan(nodehist.last_changed, 1.0) and \
            not found_between(recent_actions, 'first_try_reboot', 3.5, 1):
        sitehist.attemptReboot(host)
        print("send message for host %s first_try_reboot" % host)

    # NOTE: non-intuitive is that found_between(first_try_reboot, 3.5, 1)
    #       will be false for a day after the above condition is satisfied
    if (nodehist.status == 'offline' or nodehist.status == 'down') and \
            changed_greaterthan(nodehist.last_changed, 1.5) and \
            found_between(recent_actions, 'first_try_reboot', 3.5, 1) and \
            not found_within(recent_actions, 'pcufailed_notice', 3.5):
        # send pcu failure message
        sitehist.sendMessage('pcufailed_notice', hostname=host)
        print("send message for host %s PCU Failure" % host)

    if nodehist.status == 'monitordebug' and \
            changed_greaterthan(nodehist.last_changed, 1) and \
            not found_between(recent_actions, 'bootmanager_restore', 0.5, 0):
        # delay 0.5 days before retrying...
        print("send message for host %s bootmanager_restore" % host)
        sitehist.runBootManager(host)

    if nodehist.status == 'down' and \
            changed_greaterthan(nodehist.last_changed, 2) and \
            not found_within(recent_actions, 'down_notice', 3.5):
        # send down node notice
        sitehist.sendMessage('down_notice', hostname=host)
        print("send message for host %s down" % host)

    session.flush()


def _apply_site_policy(site):
    """Run the per-site rules (penalty increase / clear / pause) for one site."""
    sitehist = SiteInterface.get_or_make(loginbase=site)
    # TODO: make query only return records within a certain time range,
    #       i.e. greater than 0.5 days ago. or 5 days, etc.
    recent_actions = sitehist.getRecentActions(loginbase=site)

    print("%s %s" % (sitehist.db.loginbase, sitehist.db.status))

    if sitehist.db.status == 'down':
        if not found_within(recent_actions, 'pause_penalty', 30) and \
                not found_within(recent_actions, 'increase_penalty', 7) and \
                changed_greaterthan(sitehist.db.last_changed, 7):
            # TODO: catch errors
            sitehist.increasePenalty()
            #sitehist.applyPenalty()
            sitehist.sendMessage('increase_penalty')

            print("send message for site %s penalty increase" % site)

    if sitehist.db.status == 'good':
        # NOTE: because 'all clear' should have an indefinite status, we
        #       have a boolean value rather than a 'recent action'
        if sitehist.db.penalty_applied:
            # send message that penalties are cleared.
            sitehist.clearPenalty()
            #sitehist.applyPenalty()
            sitehist.sendMessage('clear_penalty')
            sitehist.closeTicket()

            print("send message for site %s penalty cleared" % site)

    # An open ticket on a penalised site pauses further escalation: the
    # 'pause_penalty' action logged here is what the 'down' rule above looks
    # for with found_within(..., 'pause_penalty', 30).
    if sitehist.db.message_id != 0 and \
            sitehist.db.message_status == 'open' and \
            sitehist.db.penalty_level > 0:
        print("Pausing penalties for %s" % site)
        sitehist.pausePenalty()

    session.flush()


def main(hostnames, sitenames):
    """Apply the monitoring policy to every given host, then every given site.

    hostnames -- iterable of node hostnames.  Hosts missing from
                 plccache.plcdb_hn2lb, or with an unexpired BlacklistRecord,
                 are reported and skipped.
    sitenames -- iterable of site loginbases.

    session.flush() is called after each processed host and site, and once
    more at the end.  Returns None.
    """
    for host in hostnames:
        _apply_node_policy(host)

    for site in sitenames:
        _apply_site_policy(site)

    session.flush()
    return
366
367
if __name__ == "__main__":
    # Command-line entry point: parse options, choose which hosts and sites to
    # examine, and run main() over them.
    parser = parsermodule.getParser(['nodesets'])
    parser.set_defaults(timewait=0,
                        skip=0,
                        rins=False,
                        reboot=False,
                        findbad=False,
                        force=False,
                        nosetup=False,
                        verbose=False,
                        quiet=False,
                        )

    parser.add_option("", "--stopselect", dest="stopselect", metavar="",
                      help="The select string that must evaluate to true for the node to be considered 'done'")
    parser.add_option("", "--findbad", dest="findbad", action="store_true",
                      help="Re-run findbad on the nodes we're going to check before acting.")
    parser.add_option("", "--force", dest="force", action="store_true",
                      help="Force action regardless of previous actions/logs.")
    parser.add_option("", "--rins", dest="rins", action="store_true",
                      help="Set the boot_state to 'rins' for all nodes.")
    parser.add_option("", "--reboot", dest="reboot", action="store_true",
                      help="Actively try to reboot the nodes, keeping a log of actions.")

    parser.add_option("", "--verbose", dest="verbose", action="store_true",
                      help="Extra debug output messages.")
    parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
                      help="Do not perform the orginary setup phase.")
    parser.add_option("", "--skip", dest="skip",
                      help="Number of machines to skip on the input queue.")
    parser.add_option("", "--timewait", dest="timewait",
                      help="Minutes to wait between iterations of 10 nodes.")

    parser = parsermodule.getParser(['defaults'], parser)
    # NOTE(review): this rebinds the module-level name `config` (imported from
    # `monitor` at the top of the file) to the parsed options.
    # SiteInterface.sendMessage reads config.cc_email through the same global,
    # so it sees this object when the file runs as a script -- confirm that is
    # intended.
    config = parsermodule.parse_args(parser)

#   # COLLECT nodegroups, nodes and node lists
#   if config.nodegroup:
#       ng = api.GetNodeGroups({'name' : config.nodegroup})
#       nodelist = api.GetNodes(ng[0]['node_ids'])
#       hostnames = [ n['hostname'] for n in nodelist ]

    # Default: every host and every site that has a history record.
    hostnames = [n.hostname for n in HistoryNodeRecord.query.all()]
    sitenames = [s.loginbase for s in HistorySiteRecord.query.all()]

    # config.site / config.node presumably come from the 'nodesets' option
    # group -- TODO confirm.
    if config.site:
        # TODO: replace with calls to local db.  the api fails so often that
        #       these calls should be regarded as unreliable.
        site = api.GetSites(config.site)
        l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname'])
        # A set makes each membership test O(1); order of `hostnames` is kept.
        site_hostnames = set(n['hostname'] for n in l_nodes)

        hostnames = [h for h in hostnames if h in site_hostnames]
        sitenames = [config.site]

    # A single node overrides any site selection made above.
    if config.node:
        hostnames = [config.node]
        sitenames = [plccache.plcdb_hn2lb[config.node]]

    try:
        main(hostnames, sitenames)
    except KeyboardInterrupt:
        print("Killed by interrupt")
        session.flush()
        sys.exit(0)
    except Exception:
        #email_exception()
        # print_exc() writes the traceback itself and returns None, so it must
        # not be wrapped in print (that emitted a stray "None" line).
        traceback.print_exc()
        print("fail all...")