added policy.py and updated bootman.py to work with the new policy framework.
[monitor.git] / policy.py
1 #!/usr/bin/python
2
3 # This script is used to manipulate the operational state of nodes in
4 # different node groups.  These are basically set operations on nodes via the
5 # PLC api.
6
7 # Take the ng name as an argument....
8 # optionally, 
9 #  * get a list of nodes in the given nodegroup.
10 #  * set some or all in the set to rins.
11 #  * restart them all.
12 #  * do something else to them all.
13
14
15 import os
16 import time
17 import traceback
18 import sys
19 from optparse import OptionParser
20
21 import bootman          # debug nodes
22
23 from monitor import util
24 from monitor import const
25 from monitor import reboot
26 from monitor import config
27 from monitor import database
28 from monitor import parser as parsermodule
29 from monitor.common import *
30 from monitor.model import *
31 from monitor.wrapper import plc
32 from monitor.wrapper import plccache
33 from monitor.wrapper.emailTxt import mailtxt
34 from monitor.database.info.model import *
35
36 from nodequery import verify,query_to_dict,node_select
37
38 api = plc.getAuthAPI()
39
40
class SiteInterface(HistorySiteRecord):
        """Policy-level wrapper around one site's HistorySiteRecord.

        The wrapped record is kept in ``self.db``.  The methods below read and
        update its penalty and RT-ticket fields, send notices to the site's
        contacts, trigger reboots, and log what was done as ActionRecord
        entries.
        """

        @classmethod
        def get_or_make(cls, if_new_set=None, **kwargs):
                """Build an interface around the site record selected by kwargs.

                if_new_set -- dict handed to HistorySiteRecord.findby_or_create as
                              its first argument; defaults to a fresh empty dict.
                kwargs     -- lookup criteria.  A 'hostname' entry is translated to
                              the owning site's 'loginbase' via plccache.plcdb_hn2lb
                              (the lookup raises if the hostname is not mapped).

                Returns a new instance of this class wrapping the record.
                """
                # A fresh dict per call: a shared {} default could leak state
                # between calls if the callee ever mutates it.
                if if_new_set is None:
                        if_new_set = {}
                if 'hostname' in kwargs:
                        kwargs['loginbase'] = plccache.plcdb_hn2lb[kwargs.pop('hostname')]
                res = HistorySiteRecord.findby_or_create(if_new_set, **kwargs)
                return cls(res)

        def __init__(self, sitehist):
                """Wrap an existing site history record."""
                self.db = sitehist

        def getRecentActions(self, **kwargs):
                """Return ActionRecords for a site or a host, newest first.

                Pass either loginbase=... or hostname=... ('loginbase' wins when
                both are given).  Returns an ActionRecord query ordered by
                date_created descending, or an empty list when neither key is
                supplied.
                """
                # TODO: make query only return records within a certain time range,
                # i.e. greater than 0.5 days ago. or 5 days, etc.
                if 'loginbase' in kwargs:
                        selector = {'loginbase': kwargs['loginbase']}
                elif 'hostname' in kwargs:
                        selector = {'hostname': kwargs['hostname']}
                else:
                        return []
                return ActionRecord.query.filter_by(**selector).order_by(ActionRecord.date_created.desc())

        def increasePenalty(self):
                """Raise the site's penalty level by one (capped at 2) and flag it as applied."""
                # NOTE: the cap keeps the level inside the penalty table used by
                #       applyPenalty.
                self.db.penalty_level = min(self.db.penalty_level + 1, 2)
                self.db.penalty_applied = True

        def applyPenalty(self):
                """Bring the site's PLC restrictions in line with its penalty level.

                Every penalty above the current level is disabled (highest first),
                then every penalty up to and including the current level is
                enabled (lowest first).
                """
                penalty_map = [
                        { 'name': 'noop',
                          'enable'  : lambda site: None,
                          'disable' : lambda site: None },
                        { 'name': 'nocreate',
                          'enable'  : lambda site: plc.removeSiteSliceCreation(site),
                          'disable' : lambda site: plc.enableSiteSliceCreation(site) },
                        { 'name': 'suspendslices',
                          'enable'  : lambda site: plc.suspendSiteSlices(site),
                          'disable' : lambda site: plc.enableSiteSlices(site) },
                ]
                top = len(penalty_map) - 1
                # Clamp so that a stored level beyond the table applies the
                # strongest penalty instead of raising IndexError.
                level = min(self.db.penalty_level, top)

                for i in range(top, level, -1):
                        print("\tdisabling %s on %s" % (penalty_map[i]['name'], self.db.loginbase))
                        penalty_map[i]['disable'](self.db.loginbase)

                for i in range(0, level + 1):
                        print("\tapplying %s on %s" % (penalty_map[i]['name'], self.db.loginbase))
                        penalty_map[i]['enable'](self.db.loginbase)

                return

        def pausePenalty(self):
                """Log a 'pause_penalty' action for this site."""
                ActionRecord(loginbase=self.db.loginbase,
                             action='penalty',
                             action_type='pause_penalty',)

        def clearPenalty(self):
                """Reset the penalty level to 0 and clear the applied flag."""
                self.db.penalty_level = 0
                self.db.penalty_applied = False

        def getTicketStatus(self):
                """Refresh message_status/queue/created from RT when a ticket id is set."""
                if self.db.message_id != 0:
                        rtstatus = mailer.getTicketStatus(self.db.message_id)
                        self.db.message_status = rtstatus['Status']
                        self.db.message_queue = rtstatus['Queue']
                        self.db.message_created = datetime.fromtimestamp(rtstatus['Created'])

        def setTicketStatus(self, status):
                """Set the RT ticket's status; does nothing beyond printing when no ticket id is set."""
                print('SETTING status %s' % status)
                if self.db.message_id != 0:
                        mailer.setTicketStatus(self.db.message_id, status)

        def getContacts(self):
                """Collect contact emails, widening the audience with the penalty level.

                Level >= 0 adds tech contacts, >= 1 adds PIs, >= 2 adds slice users.
                """
                contacts = []
                if self.db.penalty_level >= 0:
                        contacts += plc.getTechEmails(self.db.loginbase)

                if self.db.penalty_level >= 1:
                        contacts += plc.getPIEmails(self.db.loginbase)

                if self.db.penalty_level >= 2:
                        contacts += plc.getSliceUserEmails(self.db.loginbase)

                return contacts

        def sendMessage(self, type, **kwargs):
                """Send the notice template named `type` from mailtxt.

                type   -- attribute name on mailtxt; the attribute must hold two
                          %-format templates, which are filled from a dict holding
                          'loginbase', 'penalty_level' and every entry of kwargs.
                kwargs -- extra template values.  'hostname' is also used for the
                          log line and the ActionRecord; 'viart' (default True) is
                          passed on to Message and controls the RT handling below.

                When viart is true the ticket status is fetched before sending and
                restored afterwards, the returned id is stored as message_id, and a
                'notice' ActionRecord is logged.  An unknown `type` only prints a
                warning.
                """
                # NOTE: evidently changing an RT message's subject opens the ticket.
                #       the logic in this policy depends upon a ticket only being
                #       'open' if a user has replied to it.
                #       So, to preserve these semantics, we check the status before
                #       sending, then after sending, reset the status to the
                #       previous status.
                #       There is a very tiny race here, where a user sends a reply
                #       within the time it takes to check, send, and reset.
                #       This sucks.  It's almost certainly fragile.
                #
                # TODO: catch any errors here, and add an ActionRecord that contains
                #       those errors.

                args = {'loginbase' : self.db.loginbase, 'penalty_level' : self.db.penalty_level}
                args.update(kwargs)
                hostname = args.get('hostname')

                if not hasattr(mailtxt, type):
                        print("+-- WARNING! ------------------------------")
                        print("| No such message name in emailTxt.mailtxt: %s" % type)
                        print("+------------------------------------------")
                        return

                message = getattr(mailtxt, type)
                viart = kwargs.get('viart', True)

                if viart:
                        self.getTicketStatus()          # get current message status

                m = Message(message[0] % args, message[1] % args, viart, self.db.message_id)

                # The real contact list is still computed, but mail currently goes
                # only to the configured cc address.
                contacts = self.getContacts()
                contacts = [config.cc_email]    # TODO: remove after testing...

                print("sending message: %s to site %s for host %s" % (type, self.db.loginbase, hostname))

                ret = m.send(contacts)
                if viart:
                        self.db.message_id = ret
                        # reset to previous status, since a new subject 'opens' RT tickets.
                        self.setTicketStatus(self.db.message_status)

                        # NOTE: only make a record of it if it's in RT.
                        ActionRecord(loginbase=self.db.loginbase, hostname=hostname, action='notice',
                                     action_type=type, message_id=self.db.message_id)

                return

        def closeTicket(self):
                """Close the site's RT ticket, log an 'end_notice', and reset the ticket fields."""
                mailer.closeTicketViaRT(self.db.message_id, "Ticket Closed by Monitor")
                ActionRecord(loginbase=self.db.loginbase, action='notice',
                             action_type='end_notice', message_id=self.db.message_id)
                self.db.message_id = 0
                self.db.message_status = "new"

        def runBootManager(self, hostname):
                """Attempt a BootManager restore of hostname and log the outcome.

                Returns whatever bootman.restore returns, or "" when it raised.
                Any error's traceback is printed and stored in the ActionRecord's
                error_string; KeyboardInterrupt is re-raised so the run can be
                aborted.
                """
                print("attempting BM reboot of %s" % hostname)
                ret = ""
                try:
                        ret = bootman.restore(self, hostname)
                        err = ""
                except KeyboardInterrupt:
                        # Let Ctrl-C reach the caller instead of being logged as a
                        # failed restore.
                        raise
                except:
                        # Deliberately broad: any failure is recorded, not fatal.
                        err = traceback.format_exc()
                        print(err)

                ActionRecord(loginbase=self.db.loginbase,
                             hostname=hostname,
                             action='reboot',
                             action_type='bootmanager_restore',
                             error_string=err)
                return ret

        def attemptReboot(self, hostname):
                """Attempt a PCU reboot of hostname and log it as 'first_try_reboot'.

                A result of 0 or "0" from reboot.reboot_str is recorded as an empty
                error_string; anything else is recorded verbatim.
                """
                print("attempting PCU reboot of %s" % hostname)
                ret = reboot.reboot_str(hostname)
                if ret in (0, "0"):
                        ret = ""
                ActionRecord(loginbase=self.db.loginbase,
                             hostname=hostname,
                             action='reboot',
                             action_type='first_try_reboot',
                             error_string=ret)
221
def logic():
        """Set `host` to the 'rins' boot state and call node_end_record on it.

        NOTE(review): `host` is neither a parameter nor assigned in this
        function, and no module-level `host` is visible in this file, so a
        call would presumably raise NameError unless a star import supplies
        the name -- TODO confirm.  `node_end_record` is likewise not defined
        in this file; presumably it comes from one of the star imports.
        Nothing in this file calls logic().
        """

        plc.nodeBootState(host, 'rins')
        node_end_record(host)
226
227
228
229
def _apply_node_policy(host):
        """Evaluate the per-node rules for one hostname and act on them.

        Thresholds passed to the changed_*/found_* helpers appear to be in
        days (per the original comments).  The rules are evaluated in a fixed
        order because earlier actions can affect later checks.
        """
        try:
                lb = plccache.plcdb_hn2lb[host]
        except Exception:
                # Not a bare except: Ctrl-C / SystemExit must still propagate.
                print("unknown host in plcdb_hn2lb %s" % host)
                return

        sitehist = SiteInterface.get_or_make(loginbase=lb)
        recent_actions = sitehist.getRecentActions(hostname=host)
        nodehist = HistoryNodeRecord.findby_or_create(hostname=host)

        print("%s %s" % ( nodehist.hostname, nodehist.status))

        # Node recently came back: tell the site once.
        # NOTE: there is a narrow window in which this rule must be evaluated,
        # otherwise the notice will not go out.  this is not ideal.
        if (nodehist.status == 'good'
                        and changed_lessthan(nodehist.last_changed, 1.0)
                        and not found_within(recent_actions, 'online_notice', 0.5)):
                sitehist.sendMessage('online_notice', hostname=host)
                print("send message for host %s online" % host)

        # Node has been offline/down for a while: try a PCU reboot first.
        if (nodehist.status in ('offline', 'down')
                        and changed_greaterthan(nodehist.last_changed, 1.0)
                        and not found_between(recent_actions, 'first_try_reboot', 3.5, 1)):
                sitehist.attemptReboot(host)
                print("send message for host %s first_try_reboot" % host)

        # The PCU reboot did not help: send the PCU failure notice.
        # NOTE: non-intuitive is that found_between(first_try_reboot, 3.5, 1)
        #       will be false for a day after the above condition is satisfied
        if (nodehist.status in ('offline', 'down')
                        and changed_greaterthan(nodehist.last_changed, 1.5)
                        and found_between(recent_actions, 'first_try_reboot', 3.5, 1)
                        and not found_within(recent_actions, 'pcufailed_notice', 3.5)):
                sitehist.sendMessage('pcufailed_notice', hostname=host)
                print("send message for host %s PCU Failure" % host)

        # Node stuck in debug mode: retry BootManager, at most every 0.5 days.
        if (nodehist.status == 'monitordebug'
                        and changed_greaterthan(nodehist.last_changed, 1)
                        and not found_between(recent_actions, 'bootmanager_restore', 0.5, 0)):
                print("send message for host %s bootmanager_restore" % host)
                sitehist.runBootManager(host)
                #sitehist.sendMessage('retry_bootman', hostname=host)

        # Node still down: send the down-node notice.
        if (nodehist.status == 'down'
                        and changed_greaterthan(nodehist.last_changed, 2)
                        and not found_within(recent_actions, 'down_notice', 3.5)):
                sitehist.sendMessage('down_notice', hostname=host)
                print("send message for host %s offline" % host)


def _apply_site_policy(site):
        """Evaluate the per-site penalty rules for one loginbase and act on them."""
        sitehist = SiteInterface.get_or_make(loginbase=site)
        # TODO: make query only return records within a certain time range,
        #       i.e. greater than 0.5 days ago. or 5 days, etc.
        recent_actions = sitehist.getRecentActions(loginbase=site)

        print("%s %s" % ( sitehist.db.loginbase , sitehist.db.status))

        # Site is down and has been neither paused nor escalated recently:
        # escalate the penalty and notify.
        if sitehist.db.status == 'down':
                if (not found_within(recent_actions, 'pause_penalty', 30)
                                and not found_within(recent_actions, 'increase_penalty', 7)
                                and changed_greaterthan(sitehist.db.last_changed, 7)):
                        # TODO: catch errors
                        sitehist.increasePenalty()
                        #sitehist.applyPenalty()
                        sitehist.sendMessage('increase_penalty')

                        print("send message for site %s penalty increase" % site)

        # Site is good again: clear any applied penalty.
        # NOTE: because 'all clear' should have an indefinite status, we
        #       have a boolean value rather than a 'recent action'
        if sitehist.db.status == 'good':
                if sitehist.db.penalty_applied:
                        sitehist.clearPenalty()
                        #sitehist.applyPenalty()
                        sitehist.sendMessage('clear_penalty')
                        sitehist.closeTicket()

                        print("send message for site %s penalty cleared" % site)

        # An open ticket on a penalised site pauses further escalation.
        if (sitehist.db.message_id != 0
                        and sitehist.db.message_status == 'open'
                        and sitehist.db.penalty_level > 0):
                print("Pausing penalties for %s" % site)
                sitehist.pausePenalty()


def main(hostnames, sitenames):
        """Run the monitoring policy over the given nodes, then the given sites.

        hostnames -- iterable of node hostnames; hosts missing from
                     plccache.plcdb_hn2lb are reported and skipped.
        sitenames -- iterable of site loginbases.

        Pending changes are flushed through `session` once both passes finish.
        """
        # The blacklist value is not used here; the call is kept in case
        # if_cached_else has caching side effects -- TODO confirm, then drop.
        database.if_cached_else(1, "l_blacklist", lambda : [])

        for host in hostnames:
                _apply_node_policy(host)

        for site in sitenames:
                _apply_site_policy(site)

        session.flush()

        return
352
353
# Script entry point: parse the command line, pick the hosts and sites to
# examine (every known record by default, narrowed by --site / --node), and
# run the policy over them.
if __name__ == "__main__":
        parser = parsermodule.getParser(['nodesets'])
        parser.set_defaults( timewait=0,
                             skip=0,
                             rins=False,
                             reboot=False,
                             findbad=False,
                             force=False,
                             nosetup=False,
                             verbose=False,
                             quiet=False,
                             )

        parser.add_option("", "--stopselect", dest="stopselect", metavar="",
                          help="The select string that must evaluate to true for the node to be considered 'done'")

        # Boolean switches, registered in their original order.
        for flag, helptext in (
                        ("findbad", "Re-run findbad on the nodes we're going to check before acting."),
                        ("force",   "Force action regardless of previous actions/logs."),
                        ("rins",    "Set the boot_state to 'rins' for all nodes."),
                        ("reboot",  "Actively try to reboot the nodes, keeping a log of actions."),
                        ("verbose", "Extra debug output messages."),
                        ("nosetup", "Do not perform the orginary setup phase."),
                        ):
                parser.add_option("", "--" + flag, dest=flag, action="store_true",
                                  help=helptext)

        parser.add_option("", "--skip", dest="skip",
                          help="Number of machines to skip on the input queue.")
        parser.add_option("", "--timewait", dest="timewait",
                          help="Minutes to wait between iterations of 10 nodes.")

        parser = parsermodule.getParser(['defaults'], parser)
        # NOTE(review): this rebinds the module-level name `config` that was
        # imported from monitor; SiteInterface.sendMessage reads
        # config.cc_email through the same name -- confirm parse_args returns
        # an object that still provides it.
        config = parsermodule.parse_args(parser)

#       # COLLECT nodegroups, nodes and node lists
#       if config.nodegroup:
#               ng = api.GetNodeGroups({'name' : config.nodegroup})
#               nodelist = api.GetNodes(ng[0]['node_ids'])
#               hostnames = [ n['hostname'] for n in nodelist ]

#       if config.node or config.nodelist:
#               if config.node: hostnames = [ config.node ]
#               else: hostnames = util.file.getListFromFile(config.nodelist)
#
#       fbquery = FindbadNodeRecord.get_all_latest()
#       fb_nodelist = [ n.hostname for n in fbquery ]

#       if config.nodeselect:
#               hostnames = node_select(config.nodeselect, fb_nodelist)

        # Default scope: every node and every site with a history record.
        hostnames = [ n.hostname for n in HistoryNodeRecord.query.all() ]
        sitenames = [ s.loginbase for s in HistorySiteRecord.query.all() ]

        if config.site:
                # Restrict to the nodes PLC lists for this one site.
                site = api.GetSites(config.site)
                l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname'])
                # A set gives O(1) membership tests while filtering.
                filter_hostnames = set(n['hostname'] for n in l_nodes)

                hostnames = [ h for h in hostnames if h in filter_hostnames ]
                sitenames = [config.site]

        if config.node:
                # A single node overrides everything else.
                hostnames = [ config.node ]
                sitenames = [ plccache.plcdb_hn2lb[config.node] ]

        try:
                main(hostnames, sitenames)
        except KeyboardInterrupt:
                print("Killed by interrupt")
                sys.exit(0)
        except Exception:
                #email_exception()
                # print_exc() writes the traceback itself and returns None, so
                # its result must not be printed.
                traceback.print_exc()
                print("Continuing...")