add sorting tables to the pcu view.
[monitor.git] / monitor / policy.py
1 import config
2 import database 
3 import time
4 import mailer
5 import sys
6 import emailTxt
7 import string
8 from monitor.wrapper import plccache
9 from datetime import datetime
10
11 from rt import is_host_in_rt_tickets
12 import plc
13
14 # Time to enforce policy
15 POLSLEEP = 7200
16
17 # Where to email the summary
18 SUMTO = "soltesz@cs.princeton.edu"
19
20 from const import *
21
22 from monitor.model import *
23
24 class MonitorMergeDiagnoseSendEscellate:
25         act_all = None
26
27         def __init__(self, hostname, act):
28                 self.hostname = hostname
29                 self.act = act
30                 self.plcdb_hn2lb = None
31                 if self.plcdb_hn2lb is None:
32                         self.plcdb_hn2lb = plccache.plcdb_hn2lb 
33                 self.loginbase = self.plcdb_hn2lb[self.hostname]
34                 return
35
36         def getFBRecords(self):
37                 fbrecs = FindbadNodeRecord.get_latest_n_by(hostname=self.hostname)
38                 fbnodes = None
39                 if fbrec: 
40                         fbnodes = fbrecs
41                 else:
42                         fbnodes = None
43                 return fbnodes
44
45         def getLastActionRecord(self):
46                 actrec = ActionRecord.get_latest_by(hostname=self.hostname)
47                 actnode = None
48                 if actrec:
49                         actnode = actrec
50                 else:
51                         actnode = None
52                 return actnode
53
54         def getPreviousCategory(self, actrec):
55                 ret = None
56                 if actrec:
57                         ret = actrec.findbad_records[0].observed_category
58                 else:
59                         ret = "ERROR"
60                 return ret
61
62
63         def mergeRecord(self, fbnodes, actrec):
64
65                 actdefault = {}
66                 actdefault['date_created'] = datetime.now()
67                 actdefault['date_action_taken'] = datetime.now()
68
69                 actdefault['stage'] = "initial"
70                 actdefault['message_series'] = None
71                 actdefault['message_index'] = None
72                 actdefault['message_arguments'] = None
73
74                 actdefault['send_email_to'] = TECH
75                 actdefault['penalty_level'] = 0
76                 actdefault['action'] = [ 'noop' ]
77                 actdefault['take_action'] = False
78
79                 actdefault['ticket_id'] = ""
80                 actdefault['findbad_records'] = fbnodes
81                 actdefault['last_action_record'] = actrec
82
83                 actdefault['prev_category'] = self.getPreviousCategory(actrec)
84                 actdefault['category']          = fbnodes[0].observed_category
85
86                 actdefault['rt'] = mailer.getTicketStatus(actrec.ticket_id)
87
88                 return actdefault
89
90         def run(self):
91                 fbnodes = self.getFBRecords()
92                 actnode= self.getLastActionRecord()
93                 actrec = self.mergeRecord(fbnodes, actnode)
94                 record = Record(self.hostname, actrec)
95                 diag   = self.diagnose(record)
96                 if self.act and diag is not None:
97                         self.action(record,diag)
98         
99         def diagnose(self, record):
100
101                 diag = {}
102                 # NOTE: change record stage based on RT status.
103                 if record.stageIswaitforever():
104                         ticket = record.data['rt']
105                         if 'new' in ticket['Status']:
106                                 print "Resetting Stage!!!!!"
107                                 record.reset_stage()
108                                 
109                         if 'resolved' in ticket['Status']:
110                                 diag['RTEndRecord'] = True
111
112                 # NOTE: take category, and prepare action
113                 category = record.getCategory()
114                 if category == "error":
115                         diag['SendNodedown'] = True
116                         record.data['message_series'] = emailTxt.mailtxt.newdown
117                         record.data['log'] = self.getDownLog(record)
118
119                 elif category == "prod" or category == "alpha":
120                         state = record.getState()
121                         if state == "boot":
122                                 if record.severity() != 0:
123                                         diag['SendThankyou'] = True
124                                         print "RESETTING STAGE: improvement"
125                                         record.data['stage'] = 'improvement'
126                                         record.data['message_series'] = emailTxt.mailtxt.newthankyou
127                                         record.data['log'] = self.getThankyouLog(record)
128                                 else:
129                                         # NOTE: do nothing, since we've already done the above.
130                                         print "DIAGNOSED: %s is boot. no further action necessary." % record.hostname
131                                         return None
132                         elif state == "debug":
133                                 pass
134                         else:
135                                 print "unknown state %s for host %s" % (state, self.hostname)
136                 else:
137                         print "unknown category: %s" % category
138
139
140                 # TODO: how to not send email?...
141                 record = self.checkStageAndTime(record)
142                 #if record:
143                 print "diagnose: checkStageAndTime Returned Valid Record"
144                 siterec = HistorySiteRecord.by_loginbase(self.loginbase)
145
146                 if "good" not in siterec.status: #  != "good":
147                         print "diagnose: Setting site %s for 'squeeze'" % self.loginbase
148                         diag['Squeeze'] = True
149                 else:
150                         print "diagnose: Setting site %s for 'backoff'" % self.loginbase
151                         diag['BackOff'] = True
152
153                 return diag
154
155         def action(self, record, diag):
156
157                 message = None
158
159                 print "%s %s DAYS DOWN" % ( self.hostname, Record.getDaysDown(record.data) )
160                 if ( self.getSendEmailFlag(record) and Record.getDaysDown(record.data) >= 2 ) or \
161                         "monitor-end-record" in record.data['stage']:
162                         print "action: getting message"
163                         #### Send EMAIL
164                         message = record.getMessage(record.data['ticket_id'])
165                         if message:
166                                 print "action: sending email"
167                                 message.send(record.getContacts())
168                                 if message.rt.ticket_id:
169                                         print "action: setting record ticket_id"
170                                         record.data['ticket_id'] = message.rt.ticket_id
171
172                         #### APPLY PENALTY
173                         if ( record.data['take_action'] and diag['Squeeze'] ): 
174                                 print "action: taking action"
175                                 record.takeAction(record.data['penalty_level'])
176                                 del diag['Squeeze']
177                         if diag.getFlag('BackOff'):
178                                 record.takeAction(0)
179                                 del diag['BackOff']
180
181                         #### SAVE TO DB
182                         if record.saveAction():
183                                 print "action: saving act_all db"
184                                 self.add_and_save_act_all(record)
185                         else:
186                                 print "action: NOT saving act_all db"
187                                 print "stage: %s %s" % ( record.data['stage'], record.data['save_act_all'] )
188
189                         #### END RECORD
190                         if record.improved() or diag['RTEndRecord']:
191                                 print "action: end record for %s" % self.hostname
192                                 record.end_record()
193                                 diag['CloseRT'] = True
194                                 del diag['RTEndRecord']
195
196                         #### CLOSE RT TICKET
197                         if message:
198                                 if diag['CloseRT']:
199                                         message.rt.closeTicket()
200                                         del diag['CloseRT']
201
202                 else:
203                         print "NOT sending email : %s" % config.mail
204
205                 return
206
207         def add_and_save_act_all(self, record):
208                 """
209                         Read the sync record for this node, and increment the round and
210                         create an ActionRecord for this host using the record.data values.
211                 """
212                 recsync = RecordActionSync.get_by(hostname=self.hostname)
213                 rec = RecordAction(hostname=self.hostname)
214                 recsync.round += 1
215                 record.data['round'] = recsync.round
216                 # TODO: we will need to delete some of these before setting them in the DB.
217                 rec.set(**record.data)
218                 rec.flush()
219
220         def getDownLog(self, record):
221
222                 record.data['args'] = {'nodename': self.hostname}
223                 record.data['info'] = (self.hostname, Record.getStrDaysDown(record.data), "")
224
225                 #for key in record.data.keys():
226                 #       print "%10s %s %s " % (key, "==", record.data[key])
227
228                 if record.data['ticket_id'] == "" and 'found_rt_ticket' in record.data:
229                         log = "DOWN: %20s : %-40s == %20s %s" % \
230                                 (self.loginbase, self.hostname, record.data['info'][1:], record.data['found_rt_ticket'])
231                 else:
232                         log = "DOWN: %20s : %-40s == %20s %s" % \
233                                 (self.loginbase, self.hostname, record.data['info'][1:], record.data['ticket_id'])
234                 return log
235
236         def getThankyouLog(self, record):
237
238                 record.data['args'] = {'nodename': self.hostname}
239                 record.data['info'] = (self.hostname, record.data['prev_category'], record.data['category'])
240
241                 try:
242                         if record.data['ticket_id'] == "" and 'found_rt_ticket' in record.data:
243                                 log = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
244                                                 (self.loginbase, self.hostname, record.data['stage'], 
245                                                  record.data['prev_category'], record.data['category'], record.data['found_rt_ticket'])
246                         else:
247                                 log = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
248                                                 (self.loginbase, self.hostname, record.data['stage'], 
249                                                  record.data['prev_category'], record.data['category'], record.data['ticket_id'])
250                 except:
251                         log = "IMPR: %s improved to %s " % (self.hostname, record.data['category'])
252                 return log
253
254         def makeRecord(self, **kwargs):
255                 rec = {}
256                 for key in kwargs.keys():
257                         rec[key] = kwargs[key]
258                 return rec
259
260         def checkStageAndTime(self, record):
261                 """
262                 The core variables are:
263
264                         send_email_to  : defines who to send messages to at this time
265                         take_action    : whether or not to take action
266                         penalty_level  : how much of a penalty to apply
267                         message_index  : where in the escellation sequence we are.
268                         save_act_all   : whether or not to save the action record in the db.
269
270                         action/stage   : stage tracks which state we're in.
271                 """
272                 #stages = {
273                 #       "initial"               : [ { action='noop', next="weekone"}],
274                 #       "weekone"               : [ { action='noop',         index=0, save=True, email=TECH,         length=7*SPERDAY,  next="weektwo" }, ],
275                 #       "weektwo"               : [ { action='nocreate',     index=1, save=True, email=TECH|PI,      length=7*SPERDAY,  next="waitforever" }, ],
276                 #       "waitforever"   : [ { action='suspendslices',index=2, save=True, email=TECH|PI|USER, length=7*SPERDAY,  next="waitforever" }, ],
277                 #       "paused"                : [ { action='noop',                              save=True                                              length=30*SPERDAY, next="weekone" }, ]
278                 #       "improvement"   : [ { action='close_rt',     index=0, save=True, email=TECH,         next="monitor-end-record" }, ],
279                 #}
280                 # TODO: make this time relative to the PREVIOUS action taken.
281                 current_time = time.time()
282                 current_stage = record.getMostRecentStage()
283                 recent_time   = record.getMostRecentTime()
284
285                 delta = current_time - recent_time
286
287                 if current_stage in stages:
288                         values = stages[current_stage][0]
289
290                 if delta >= values['length']:
291                         print "checkStageAndTime: transition to next stage"
292                         new_stage = values['next']
293                         values = stages[new_stage]
294
295                 elif delta >= values['length']/3 and not 'second_mail_at_oneweek' in record.data:
296                         print "checkStageAndTime: second message in one week for stage two"
297                         take_action=False
298                         pass
299                 else:
300                         # DO NOTHING
301                         take_action=False, 
302                         save_act_all=False, 
303                         message_index=None, 
304                         print "checkStageAndTime: second message in one week for stage two"
305
306                 rec = self.makeRecord( stage=new_stage, send_email_to=values['email'],
307                                                            action=values['action'], message_index=values['index'], 
308                                                            save_act_all=values['save'], penalty_level=values['index'], 
309                                                            date_action_taken=current_time)
310                 record.data.update(rec)
311
312
313                 if   'initial' in record.data['stage']:
314                         # The node is bad, and there's no previous record of it.
315                         rec = self.makeRecord(
316                                                         stage="weekone", send_email_to=TECH, 
317                                                         action=['noop'], take_action=False, 
318                                                         message_index=0, save_act_all=True, 
319                                                         penalty_level=0, )
320                         record.data.update(rec)
321
322                 elif 'improvement' in record.data['stage']:
323                         print "checkStageAndTime: backing off of %s" % self.hostname
324                         rec = self.makeRecord(
325                                                         stage='monitor-end-record', send_email_to=TECH, 
326                                                         action=['close_rt'], take_action=True, 
327                                                         message_index=0, save_act_all=True, 
328                                                         penalty_level=0, )
329                         record.data.update(rec)
330
331                 else:
332                         # There is no action to be taken, possibly b/c the stage has
333                         # already been performed, but diagnose picked it up again.
334                         # two cases, 
335                         #       1. stage is unknown, or 
336                         #       2. delta is not big enough to bump it to the next stage.
337                         # TODO: figure out which. for now assume 2.
338                         print "UNKNOWN stage for %s; nothing done" % self.hostname
339                         rec = self.makeRecord(
340                                                         stage='weekone', send_email_to=TECH,
341                                                         action=['noop'], 
342                                                         take_action=False, 
343                                                         save_act_all=True, 
344                                                         date_action_taken=current_time,
345                                                         message_index=0, 
346                                                         penalty_level=0, )
347                         record.data.update(rec)
348
349                 print "%s" % record.data['log'],
350                 print "%15s" % record.data['action']
351                 return record
352