516a8de8076b57d431c4c8bbb515cc989e71cf3d
[monitor.git] / clean_policy.py
1 import config
2 import database 
3 import time
4 import mailer
5 from unified_model import cmpCategoryVal
6 import sys
7 import emailTxt
8 import string
9 from monitor.wrapper import plccache
10 from datetime import datetime
11
12 from rt import is_host_in_rt_tickets
13 import plc
14
# Interval used when enforcing policy (presumably seconds -- confirm
# against the code that consumes it).
POLSLEEP = 7200

# Address that receives the summary email.
SUMTO = 'soltesz@cs.princeton.edu'
20
21 from const import *
22
23 from unified_model import *
24
class MonitorMergeDiagnoseSendEscellate:
    """
        Merge, diagnose and (optionally) act on the monitoring state of a
        single host.

        run() drives the pipeline:
            1. merge the latest findbad records with the last action record,
            2. diagnose which notices / penalties apply,
            3. when 'act' is set: send email, apply or back off penalties,
               save the action record and close the RT ticket as needed.

        Every print call uses the single-argument parenthesised form, which
        behaves identically under Python 2 and also parses under Python 3.
    """

    act_all = None

    def __init__(self, hostname, act):
        """
            hostname : node to evaluate.  It must be a key of
                       plccache.plcdb_hn2lb; a KeyError propagates otherwise.
            act      : when true, run() also performs the diagnosed actions.
        """
        self.hostname = hostname
        self.act = act
        self.plcdb_hn2lb = plccache.plcdb_hn2lb
        self.loginbase = self.plcdb_hn2lb[self.hostname]

    def getFBRecords(self):
        """
            Return the latest findbad records for this host, or None when
            the lookup yields nothing.
        """
        fbrecs = FindbadNodeRecord.get_latest_n_by(hostname=self.hostname)
        # The original tested the undefined name 'fbrec' here (NameError).
        if fbrecs:
            return fbrecs
        return None

    def getLastActionRecord(self):
        """
            Return the most recent ActionRecord for this host, or None when
            there is none.
        """
        actrec = ActionRecord.get_latest_by(hostname=self.hostname)
        if actrec:
            return actrec
        return None

    def getPreviousCategory(self, actrec):
        """
            Return the observed category stored with the previous action
            record, or the string "ERROR" when there is no previous record.
        """
        if not actrec:
            return "ERROR"
        return actrec.findbad_records[0].observed_category

    def mergeRecord(self, fbnodes, actrec):
        """
            Build the default action dictionary for this host.

            fbnodes : latest findbad records (may be None/empty).
            actrec  : last action record (None when the host has never been
                      acted upon).

            Returns a dict; 'category' is None when there are no findbad
            records and 'rt' is None when there is no previous action
            record, instead of raising as the original did.
        """
        now = datetime.now()
        actdefault = {
            'date_created': now,
            'date_action_taken': now,

            'stage': "initial",
            'message_series': None,
            'message_index': None,
            'message_arguments': None,

            'send_email_to': TECH,
            'penalty_level': 0,
            'action': ['noop'],
            'take_action': False,

            'ticket_id': "",
            'findbad_records': fbnodes,
            'last_action_record': actrec,

            'prev_category': self.getPreviousCategory(actrec),
        }

        if fbnodes:
            actdefault['category'] = fbnodes[0].observed_category
        else:
            actdefault['category'] = None

        # Only a previous action record carries a ticket id to look up.
        if actrec:
            actdefault['rt'] = mailer.getTicketStatus(actrec.ticket_id)
        else:
            actdefault['rt'] = None

        return actdefault

    def run(self):
        """
            Merge, diagnose and, when self.act is set and the diagnosis is
            not None, act on this host.  Hosts without findbad records are
            skipped.
        """
        fbnodes = self.getFBRecords()
        if fbnodes is None:
            print("run: no findbad records for %s; nothing to diagnose" % self.hostname)
            return
        actnode = self.getLastActionRecord()
        actrec = self.mergeRecord(fbnodes, actnode)
        record = Record(self.hostname, actrec)
        diag = self.diagnose(record)
        if self.act and diag is not None:
            self.action(record, diag)

    def diagnose(self, record):
        """
            Inspect the record and return a dict of diagnosis flags
            ('RTEndRecord', 'SendNodedown', 'SendThankyou', 'Squeeze',
            'BackOff'), or None when the node is booted and needs no
            further action.  The record is updated in place (stage, message
            series, log line, and whatever checkStageAndTime sets).
        """
        diag = {}

        # NOTE: change record stage based on RT status.
        if record.stageIswaitforever():
            ticket = record.data['rt']
            # 'rt' may be None/empty when no ticket status is available.
            status = ticket['Status'] if ticket else ""
            if 'new' in status:
                print("Resetting Stage!!!!!")
                record.reset_stage()

            if 'resolved' in status:
                diag['RTEndRecord'] = True

        # NOTE: take category, and prepare action
        category = record.getCategory()
        if category == "error":
            diag['SendNodedown'] = True
            record.data['message_series'] = emailTxt.mailtxt.newdown
            record.data['log'] = self.getDownLog(record)

        elif category in ("prod", "alpha"):
            state = record.getState()
            if state == "boot":
                if record.severity() == 0:
                    # NOTE: do nothing, since we've already done the above.
                    print("DIAGNOSED: %s is boot. no further action necessary." % record.hostname)
                    return None
                diag['SendThankyou'] = True
                print("RESETTING STAGE: improvement")
                record.data['stage'] = 'improvement'
                record.data['message_series'] = emailTxt.mailtxt.newthankyou
                record.data['log'] = self.getThankyouLog(record)
            elif state == "debug":
                pass
            else:
                print("unknown state %s for host %s" % (state, self.hostname))
        else:
            print("unknown category: %s" % category)

        # TODO: how to not send email?...
        record = self.checkStageAndTime(record)
        print("diagnose: checkStageAndTime Returned Valid Record")
        siterec = HistorySiteRecord.by_loginbase(self.loginbase)

        if "good" not in siterec.status:
            print("diagnose: Setting site %s for 'squeeze'" % self.loginbase)
            diag['Squeeze'] = True
        else:
            print("diagnose: Setting site %s for 'backoff'" % self.loginbase)
            diag['BackOff'] = True

        return diag

    def action(self, record, diag):
        """
            Carry out the diagnosis: send email, apply or back off the
            penalty, save the action record, end the record and close the
            RT ticket.

            diag is the flag dict from diagnose(); flags are consumed
            (removed) as they are handled.  Flags that were never set are
            treated as false rather than raising KeyError.
        """
        message = None

        days_down = Record.getDaysDown(record.data)
        print("%s %s DAYS DOWN" % (self.hostname, days_down))

        # NOTE(review): getSendEmailFlag is not defined in the part of the
        # class visible here -- confirm it is provided elsewhere.
        send_mail = self.getSendEmailFlag(record) and days_down >= 2
        if not (send_mail or "monitor-end-record" in record.data['stage']):
            print("NOT sending email : %s %s" % (config.mail, record.data['rt']))
            return

        print("action: getting message")
        #### Send EMAIL
        message = record.getMessage(record.data['ticket_id'])
        if message:
            print("action: sending email")
            message.send(record.getContacts())
            if message.rt.ticket_id:
                print("action: setting record ticket_id")
                record.data['ticket_id'] = message.rt.ticket_id

        #### APPLY PENALTY
        # diag holds either 'Squeeze' or 'BackOff', never both, so both
        # lookups must tolerate a missing key.
        if record.data['take_action'] and diag.get('Squeeze'):
            print("action: taking action")
            record.takeAction(record.data['penalty_level'])
            del diag['Squeeze']
        if diag.get('BackOff'):
            record.takeAction(0)
            del diag['BackOff']

        #### SAVE TO DB
        if record.saveAction():
            print("action: saving act_all db")
            self.add_and_save_act_all(record)
        else:
            print("action: NOT saving act_all db")
            print("stage: %s %s" % (record.data['stage'], record.data['save_act_all']))

        #### END RECORD
        if record.improved() or diag.get('RTEndRecord'):
            print("action: end record for %s" % self.hostname)
            record.end_record()
            diag['CloseRT'] = True
            diag.pop('RTEndRecord', None)

        #### CLOSE RT TICKET
        if message and diag.pop('CloseRT', False):
            message.rt.closeTicket()

        return

    def add_and_save_act_all(self, record):
        """
            Read the sync record for this node, and increment the round and
            create an ActionRecord for this host using the record.data values.
        """
        # NOTE(review): this method uses RecordAction while
        # getLastActionRecord uses ActionRecord -- confirm both names exist.
        # A missing sync record (None) is not handled here.
        recsync = RecordActionSync.get_by(hostname=self.hostname)
        rec = RecordAction(hostname=self.hostname)
        recsync.round += 1
        record.data['round'] = recsync.round
        # TODO: we will need to delete some of these before setting them in the DB.
        rec.set(**record.data)
        rec.flush()

    def _ticketForLog(self, data):
        """
            Return the ticket reference to show in a log line: the ticket
            found in RT when the record has no ticket id of its own,
            otherwise the record's ticket id.
        """
        if data['ticket_id'] == "" and 'found_rt_ticket' in data:
            return data['found_rt_ticket']
        return data['ticket_id']

    def getDownLog(self, record):
        """
            Set record.data['args'] and record.data['info'] for a node-down
            notice and return the "DOWN: ..." log line.
        """
        data = record.data
        data['args'] = {'nodename': self.hostname}
        data['info'] = (self.hostname, Record.getStrDaysDown(data), "")

        return "DOWN: %20s : %-40s == %20s %s" % (
            self.loginbase, self.hostname, data['info'][1:],
            self._ticketForLog(data))

    def getThankyouLog(self, record):
        """
            Set record.data['args'] and record.data['info'] for a thank-you
            notice and return the "IMPR: ..." log line.  When a key needed
            for the detailed line is missing, a short line is returned
            instead.
        """
        data = record.data
        data['args'] = {'nodename': self.hostname}
        data['info'] = (self.hostname, data['prev_category'], data['category'])

        try:
            log = "IMPR: %20s : %-40s == %20s %20s %s %s" % (
                self.loginbase, self.hostname, data['stage'],
                data['prev_category'], data['category'],
                self._ticketForLog(data))
        except KeyError:
            # Only a missing key is expected here; anything else is a bug
            # and should surface.
            log = "IMPR: %s improved to %s " % (self.hostname, data['category'])
        return log

    def makeRecord(self, **kwargs):
        """Return the keyword arguments as a new dict."""
        return dict(kwargs)

    def _stageRecord(self, stage, values, when, take_action):
        """
            Build the record update for 'stage' from one stage-table entry.
            penalty_level follows the entry's message index (0 when the
            entry has none).
        """
        return self.makeRecord(
            stage=stage, send_email_to=values['email'],
            action=list(values['action']), take_action=take_action,
            message_index=values['index'], save_act_all=values['save'],
            penalty_level=values['index'] or 0,
            date_action_taken=when)

    def checkStageAndTime(self, record):
        """
            Advance the record through the escalation stages based on the
            time elapsed since the most recent action, update record.data
            accordingly and return the record.

            The core variables are:

                send_email_to  : defines who to send messages to at this time
                take_action    : whether or not to take action
                penalty_level  : how much of a penalty to apply
                message_index  : where in the escellation sequence we are.
                save_act_all   : whether or not to save the action record in the db.

                action/stage   : stage tracks which state we're in.

            For a known stage one of three things happens:
                - 'length' seconds have elapsed: move to the 'next' stage
                  and adopt that stage's settings;
                - a third of 'length' has elapsed and no second mail has
                  been recorded: stay in the stage and re-send its message
                  without taking action;
                - otherwise: do nothing (no message, nothing saved).
            An unknown stage restarts the sequence at "weekone".
        """
        # Each entry describes the settings in force while in that stage.
        # 'length' is how long the stage lasts before moving to 'next';
        # 0 means "leave immediately".
        # NOTE(review): the 'take' values and the email target for "paused"
        # are assumptions (noop stages take no action, penalising stages
        # do) -- confirm against the intended policy.
        stages = {
            "initial":     dict(action=['noop'],          index=0,    save=True, take=False,
                                email=TECH,               length=0,           next="weekone"),
            "weekone":     dict(action=['noop'],          index=0,    save=True, take=False,
                                email=TECH,               length=7*SPERDAY,   next="weektwo"),
            "weektwo":     dict(action=['nocreate'],      index=1,    save=True, take=True,
                                email=TECH|PI,            length=7*SPERDAY,   next="waitforever"),
            "waitforever": dict(action=['suspendslices'], index=2,    save=True, take=True,
                                email=TECH|PI|USER,       length=7*SPERDAY,   next="waitforever"),
            "paused":      dict(action=['noop'],          index=None, save=True, take=False,
                                email=TECH,               length=30*SPERDAY,  next="weekone"),
            "improvement": dict(action=['close_rt'],      index=0,    save=True, take=True,
                                email=TECH,               length=0,           next="monitor-end-record"),
        }

        # TODO: make this time relative to the PREVIOUS action taken.
        current_time = time.time()
        current_stage = record.getMostRecentStage()
        recent_time = record.getMostRecentTime()

        delta = current_time - recent_time

        # diagnose() marks a recovered node directly on the record; that
        # marking takes precedence over the stored stage.
        if 'improvement' in (record.data.get('stage') or ""):
            current_stage = 'improvement'

        entry = stages.get(current_stage)

        if entry is None:
            # Unknown stage: restart the sequence.
            print("UNKNOWN stage for %s; nothing done" % self.hostname)
            rec = self.makeRecord(
                stage='weekone', send_email_to=TECH,
                action=['noop'], take_action=False,
                save_act_all=True, date_action_taken=current_time,
                message_index=0, penalty_level=0)

        elif delta >= entry['length']:
            if current_stage == 'improvement':
                print("checkStageAndTime: backing off of %s" % self.hostname)
            else:
                print("checkStageAndTime: transition to next stage")
            new_stage = entry['next']
            # A terminal stage ("monitor-end-record") has no table entry of
            # its own; it is entered with the settings of the stage being
            # left (close_rt for "improvement").
            values = stages.get(new_stage, entry)
            rec = self._stageRecord(new_stage, values, current_time,
                                    take_action=values['take'])

        elif (entry['index'] is not None
              and delta >= entry['length'] / 3.0
              and 'second_mail_at_oneweek' not in record.data):
            print("checkStageAndTime: second message in one week for stage two")
            rec = self._stageRecord(current_stage, entry, current_time,
                                    take_action=False)
            # NOTE(review): the flag was tested but never set; recording it
            # here is the presumed intent -- confirm it persists as expected.
            rec['second_mail_at_oneweek'] = True

        else:
            # DO NOTHING: stay in the stage, send nothing, save nothing.
            print("checkStageAndTime: no action needed at this time")
            rec = self._stageRecord(current_stage, entry, current_time,
                                    take_action=False)
            rec['message_index'] = None
            rec['save_act_all'] = False

        record.data.update(rec)

        # 'log' is only set by diagnose() for some categories/states.
        print("%s %15s" % (record.data.get('log', ''), record.data['action']))
        return record
353