remove monitor-client.cron
[monitor.git] / monitor / policy.py
1 import config
2 import database 
3 import time
4 import sys
5 import string
6 from monitor.wrapper import mailer
7 from monitor.wrapper import emailTxt
8 from monitor.wrapper import plccache
9 from datetime import datetime
10
11 from monitor.wrapper.rt import is_host_in_rt_tickets
12 from monitor.wrapper import plc
13
14 # Interval associated with policy enforcement (presumably seconds, i.e. two hours -- TODO confirm; not referenced in the visible part of this file)
15 POLSLEEP = 7200
16
17 # Address the summary email is sent to (not referenced in the visible part of this file)
18 SUMTO = "soltesz@cs.princeton.edu"
19
20 from const import *
21
22 from monitor.model import *
23
class MonitorMergeDiagnoseSendEscellate:
    """
    Per-host policy driver.

    run() merges the host's latest findbad observations with its last action
    record, diagnoses what should happen next and, when `act` is true, sends
    mail, applies penalties and saves the resulting action record.

    The file is Python 2.  Every print below passes one parenthesised
    argument, which the Python 2 print statement renders exactly like the
    unparenthesised form while also parsing under Python 3.
    """

    # Not referenced by any method below.
    act_all = None

    def __init__(self, hostname, act):
        """
        hostname -- node to evaluate; looked up in plccache.plcdb_hn2lb to
                    obtain the loginbase, so the lookup fails for a hostname
                    that mapping does not contain.
        act      -- when true, run() follows a diagnosis with action().
        """
        self.hostname = hostname
        self.act = act
        # hostname -> loginbase mapping provided by plccache.
        self.plcdb_hn2lb = plccache.plcdb_hn2lb
        self.loginbase = self.plcdb_hn2lb[self.hostname]

    def getFBRecords(self):
        """
        Return FindbadNodeRecord.get_latest_n_by() for this host, or None when
        that result is empty/false.
        """
        fbrecs = FindbadNodeRecord.get_latest_n_by(hostname=self.hostname)
        # The original tested the undefined name `fbrec` here and therefore
        # always raised NameError.
        if fbrecs:
            return fbrecs
        return None

    def getLastActionRecord(self):
        """Return the latest ActionRecord for this host, or None if there is none."""
        actrec = ActionRecord.get_latest_by(hostname=self.hostname)
        if actrec:
            return actrec
        return None

    def getPreviousCategory(self, actrec):
        """
        Return the observed category attached to the last action record.

        Returns "ERROR" when there is no action record, or when the record
        carries no findbad records to read the category from.
        """
        if actrec and actrec.findbad_records:
            return actrec.findbad_records[0].observed_category
        return "ERROR"

    def mergeRecord(self, fbnodes, actrec):
        """
        Build the default action dictionary for this host.

        fbnodes -- result of getFBRecords(); may be None.
        actrec  -- result of getLastActionRecord(); may be None.

        Returns a dict of defaults ('stage', 'action', 'penalty_level', ...)
        combined with the current and previous categories and the RT ticket
        status of the last action.
        """
        now = datetime.now()

        actdefault = {}
        actdefault['date_created'] = now
        actdefault['date_action_taken'] = now

        actdefault['stage'] = "initial"
        actdefault['message_series'] = None
        actdefault['message_index'] = None
        actdefault['message_arguments'] = None

        actdefault['send_email_to'] = TECH
        actdefault['penalty_level'] = 0
        actdefault['action'] = ['noop']
        actdefault['take_action'] = False

        actdefault['ticket_id'] = ""
        actdefault['findbad_records'] = fbnodes
        actdefault['last_action_record'] = actrec

        actdefault['prev_category'] = self.getPreviousCategory(actrec)
        # getFBRecords() may return None; without observations there is no
        # category to report.
        if fbnodes:
            actdefault['category'] = fbnodes[0].observed_category
        else:
            actdefault['category'] = None

        # getLastActionRecord() may return None; a host that was never acted
        # on has no ticket to query.
        if actrec:
            actdefault['rt'] = mailer.getTicketStatus(actrec.ticket_id)
        else:
            actdefault['rt'] = None

        return actdefault

    def run(self):
        """
        Merge, diagnose and (when self.act is true and a diagnosis was
        produced) act for this host.  Hosts without findbad records are
        skipped.
        """
        fbnodes = self.getFBRecords()
        if not fbnodes:
            print("run: no findbad records for %s; skipping" % self.hostname)
            return

        last_action = self.getLastActionRecord()
        merged = self.mergeRecord(fbnodes, last_action)
        record = Record(self.hostname, merged)
        diag = self.diagnose(record)
        if self.act and diag is not None:
            self.action(record, diag)

    def diagnose(self, record):
        """
        Decide what should happen to the host described by `record`.

        Returns a dict of flags ('SendNodedown', 'SendThankyou',
        'RTEndRecord', and exactly one of 'Squeeze' / 'BackOff'), or None when
        the node is booted with severity 0 and nothing needs doing.
        `record.data` is updated in place.
        """
        diag = {}

        # NOTE: change record stage based on RT status.
        if record.stageIswaitforever():
            ticket = record.data['rt']
            # 'rt' is None when there was no previous action to take a ticket
            # id from.
            status = ticket['Status'] if ticket else ''
            if 'new' in status:
                print("Resetting Stage!!!!!")
                record.reset_stage()

            if 'resolved' in status:
                diag['RTEndRecord'] = True

        # NOTE: take category, and prepare action
        category = record.getCategory()
        if category == "error":
            diag['SendNodedown'] = True
            record.data['message_series'] = emailTxt.mailtxt.newdown
            record.data['log'] = self.getDownLog(record)

        elif category == "prod" or category == "alpha":
            state = record.getState()
            if state == "boot":
                if record.severity() != 0:
                    diag['SendThankyou'] = True
                    print("RESETTING STAGE: improvement")
                    record.data['stage'] = 'improvement'
                    record.data['message_series'] = emailTxt.mailtxt.newthankyou
                    record.data['log'] = self.getThankyouLog(record)
                else:
                    # NOTE: do nothing, since we've already done the above.
                    print("DIAGNOSED: %s is boot. no further action necessary." % record.hostname)
                    return None
            elif state == "debug":
                pass
            else:
                print("unknown state %s for host %s" % (state, self.hostname))
        else:
            print("unknown category: %s" % category)

        # TODO: how to not send email?...
        record = self.checkStageAndTime(record)
        print("diagnose: checkStageAndTime Returned Valid Record")
        siterec = HistorySiteRecord.by_loginbase(self.loginbase)

        if "good" not in siterec.status:
            print("diagnose: Setting site %s for 'squeeze'" % self.loginbase)
            diag['Squeeze'] = True
        else:
            print("diagnose: Setting site %s for 'backoff'" % self.loginbase)
            diag['BackOff'] = True

        return diag

    def action(self, record, diag):
        """
        Carry out the diagnosis: send mail, apply or lift the penalty, save
        the action record, end the record and close the RT ticket.

        diag -- flag dict produced by diagnose(); flags are optional and are
                removed from the dict as they are consumed.
        """
        message = None

        days_down = Record.getDaysDown(record.data)
        print("%s %s DAYS DOWN" % (self.hostname, days_down))

        # NOTE(review): getSendEmailFlag is not defined anywhere in the
        # visible class; unless it is supplied elsewhere this call raises
        # AttributeError.  Its policy cannot be reconstructed from this file.
        if (self.getSendEmailFlag(record) and days_down >= 2) or \
                "monitor-end-record" in record.data['stage']:
            print("action: getting message")
            #### Send EMAIL
            message = record.getMessage(record.data['ticket_id'])
            if message:
                print("action: sending email")
                message.send(record.getContacts())
                if message.rt.ticket_id:
                    print("action: setting record ticket_id")
                    record.data['ticket_id'] = message.rt.ticket_id

            #### APPLY PENALTY
            # diagnose() sets either 'Squeeze' or 'BackOff', never both, so
            # the flags must be read with .get(): indexing raised KeyError on
            # whichever one was absent, and plain dicts have no getFlag().
            if record.data['take_action'] and diag.get('Squeeze'):
                print("action: taking squeeze action")
                record.takeAction(record.data['penalty_level'])
                del diag['Squeeze']
            if diag.get('BackOff'):
                print("action: taking backoff action")
                record.takeAction(0)
                del diag['BackOff']

            #### SAVE TO DB
            if record.saveAction():
                print("action: saving act_all db")
                self.add_and_save_act_all(record)
            else:
                print("action: NOT saving act_all db")
                print("stage: %s %s" % (record.data['stage'], record.data.get('save_act_all')))

            #### END RECORD
            # 'RTEndRecord' is only present when the ticket was resolved; the
            # record may also end purely because the node improved.
            if record.improved() or diag.get('RTEndRecord'):
                print("action: end record for %s" % self.hostname)
                record.end_record()
                diag['CloseRT'] = True
                diag.pop('RTEndRecord', None)

            #### CLOSE RT TICKET
            if message and diag.get('CloseRT'):
                message.rt.closeTicket()
                del diag['CloseRT']

        else:
            print("NOT sending email : %s" % config.mail)

    def add_and_save_act_all(self, record):
        """
        Read the sync record for this node, increment its round, and create
        an action record for this host from the record.data values.
        """
        recsync = RecordActionSync.get_by(hostname=self.hostname)
        # NOTE(review): the rest of this class uses the name ActionRecord
        # (see getLastActionRecord); confirm that RecordAction exists in
        # monitor.model and is the intended entity.
        rec = RecordAction(hostname=self.hostname)
        # NOTE(review): only `rec` is flushed below; confirm the incremented
        # round on `recsync` is persisted by the ORM session as well.
        recsync.round += 1
        record.data['round'] = recsync.round
        # TODO: we will need to delete some of these before setting them in the DB.
        rec.set(**record.data)
        rec.flush()

    def getDownLog(self, record):
        """
        Fill record.data['args'] and ['info'] for a node-down message and
        return the matching "DOWN: ..." log line.

        The line ends with 'found_rt_ticket' when the record has no ticket id
        of its own but a ticket was found, and with 'ticket_id' otherwise.
        """
        data = record.data
        data['args'] = {'nodename': self.hostname}
        data['info'] = (self.hostname, Record.getStrDaysDown(data), "")

        ticket = data['ticket_id']
        if ticket == "" and 'found_rt_ticket' in data:
            ticket = data['found_rt_ticket']

        return "DOWN: %20s : %-40s == %20s %s" % \
            (self.loginbase, self.hostname, data['info'][1:], ticket)

    def getThankyouLog(self, record):
        """
        Fill record.data['args'] and ['info'] for an improvement message and
        return the matching "IMPR: ..." log line.

        Falls back to a short line when a field needed by the full format
        ('ticket_id', 'stage') is missing from record.data.
        """
        data = record.data
        data['args'] = {'nodename': self.hostname}
        data['info'] = (self.hostname, data['prev_category'], data['category'])

        try:
            ticket = data['ticket_id']
            if ticket == "" and 'found_rt_ticket' in data:
                ticket = data['found_rt_ticket']
            log = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
                (self.loginbase, self.hostname, data['stage'],
                 data['prev_category'], data['category'], ticket)
        except (KeyError, TypeError):
            # Best-effort log line; was a bare `except:` that would also have
            # hidden unrelated errors.
            log = "IMPR: %s improved to %s " % (self.hostname, data['category'])
        return log

    def makeRecord(self, **kwargs):
        """Return the keyword arguments as a new dict."""
        return dict(kwargs)

    def _stages(self):
        """
        Return the escalation table, transcribed from the design that was
        left commented out (and was not valid Python) in checkStageAndTime.

        Not every stage defines every key, so read entries with .get().
        """
        # NOTE(review): the commented-out table wrote the lengths as
        # N*SPERDAY; SPERDAY is presumably seconds per day -- confirm.
        day = 24 * 60 * 60
        # NOTE(review): PI and USER are used nowhere else in the visible
        # file; presumably they come from `const` alongside TECH -- confirm.
        return {
            "initial":     dict(action='noop', next="weekone"),
            "weekone":     dict(action='noop', index=0, save=True,
                                email=TECH, length=7 * day, next="weektwo"),
            "weektwo":     dict(action='nocreate', index=1, save=True,
                                email=TECH | PI, length=7 * day, next="waitforever"),
            "waitforever": dict(action='suspendslices', index=2, save=True,
                                email=TECH | PI | USER, length=7 * day, next="waitforever"),
            "paused":      dict(action='noop', save=True,
                                length=30 * day, next="weekone"),
            "improvement": dict(action='close_rt', index=0, save=True,
                                email=TECH, next="monitor-end-record"),
        }

    def checkStageAndTime(self, record):
        """
        Advance record.data through the escalation stages and return record.

        The core variables are:

            send_email_to  : defines who to send messages to at this time
            take_action    : whether or not to take action
            penalty_level  : how much of a penalty to apply
            message_index  : where in the escalation sequence we are.
            save_act_all   : whether or not to save the action record in the db.

            action/stage   : stage tracks which state we're in.
        """
        stages = self._stages()

        # TODO: make this time relative to the PREVIOUS action taken.
        current_time = time.time()
        current_stage = record.getMostRecentStage()
        # NOTE(review): assumes getMostRecentTime() returns epoch seconds
        # comparable with time.time() -- confirm (mergeRecord stores
        # datetime objects in 'date_action_taken').
        delta = current_time - record.getMostRecentTime()

        # A stage missing from the table used to leave `values` unbound; it
        # now skips straight to the stage dispatch below.
        if current_stage in stages:
            new_stage = current_stage
            values = stages[current_stage]
            length = values.get('length')
            overrides = {}

            if length is not None and delta >= length:
                print("checkStageAndTime: transition to next stage")
                new_stage = values['next']
                # A 'next' stage without its own table entry keeps the
                # current entry's settings.
                values = stages.get(new_stage, values)

            elif length is not None and delta >= length / 3 and \
                    'second_mail_at_oneweek' not in record.data:
                print("checkStageAndTime: second message in one week for stage two")
                overrides['take_action'] = False

            else:
                # DO NOTHING.  The original wrote these as trailing-comma
                # assignments to unused locals and repeated the message of
                # the branch above.
                overrides = dict(take_action=False, save_act_all=False,
                                 message_index=None)
                print("checkStageAndTime: no stage transition; nothing to do")

            rec = self.makeRecord(stage=new_stage, send_email_to=values.get('email'),
                                  action=values.get('action'), message_index=values.get('index'),
                                  save_act_all=values.get('save'), penalty_level=values.get('index'),
                                  date_action_taken=current_time)
            rec.update(overrides)
            record.data.update(rec)

        # NOTE(review): every branch below overwrites the stage, recipients,
        # action, indices and flags just derived from the table, so as
        # written a host never stays in weektwo/waitforever.  This matches
        # the original statement order; see the TODO in the final branch.
        if 'initial' in record.data['stage']:
            # The node is bad, and there's no previous record of it.
            rec = self.makeRecord(
                stage="weekone", send_email_to=TECH,
                action=['noop'], take_action=False,
                message_index=0, save_act_all=True,
                penalty_level=0)
            record.data.update(rec)

        elif 'improvement' in record.data['stage']:
            print("checkStageAndTime: backing off of %s" % self.hostname)
            rec = self.makeRecord(
                stage='monitor-end-record', send_email_to=TECH,
                action=['close_rt'], take_action=True,
                message_index=0, save_act_all=True,
                penalty_level=0)
            record.data.update(rec)

        else:
            # There is no action to be taken, possibly b/c the stage has
            # already been performed, but diagnose picked it up again.
            # two cases,
            #       1. stage is unknown, or
            #       2. delta is not big enough to bump it to the next stage.
            # TODO: figure out which. for now assume 2.
            print("UNKNOWN stage for %s; nothing done" % self.hostname)
            rec = self.makeRecord(
                stage='weekone', send_email_to=TECH,
                action=['noop'],
                take_action=False,
                save_act_all=True,
                date_action_taken=current_time,
                message_index=0,
                penalty_level=0)
            record.data.update(rec)

        # 'log' is only set by diagnose() for the node-down and improvement
        # cases, so it may be absent here.  One print reproduces the
        # original "log<space>action" line.
        print("%s %15s" % (record.data.get('log', ''), record.data['action']))
        return record
353