add email_exception() to all except: statements.
[monitor.git] / clean_policy.py
1 import config
2 import database 
3 import time
4 import mailer
5 import sys
6 import emailTxt
7 import string
8
9 from rt import is_host_in_rt_tickets
10 import plc
11
12 # Time to enforce policy
13 POLSLEEP = 7200
14
15 # Where to email the summary
16 SUMTO = "soltesz@cs.princeton.edu"
17
18 from const import *
19
20 from unified_model import *
21
def get_ticket_id(record):
    """Return the ticket id stored in *record*, or None when there is none.

    The 'ticket_id' key takes precedence over 'found_rt_ticket'.  A key that
    is absent, or whose value is None or the empty string, counts as unset.

    record -- mapping that may carry 'ticket_id' and/or 'found_rt_ticket'.
    """
    for key in ('ticket_id', 'found_rt_ticket'):
        if key not in record:
            continue
        value = record[key]
        # Compare by value, not identity: ``value is not ""`` only works when
        # the interpreter happens to share the empty-string object, and it
        # treats an equal-but-distinct empty string (e.g. u"") as a real id.
        if value is not None and value != "":
            return value
    return None
31
class MonitorMergeDiagnoseSendEscellate:
    """Per-host policy pipeline: merge records, diagnose, then act.

    run() loads the host's scan ("findbad") record and its most recent
    action ("act_all") record, merges them into a Record, asks diagnose()
    which flags to raise, and - when the instance was created with a true
    ``act`` - calls action() to send mail, apply the action and persist it.
    """

    # Databases shared by every instance; each is loaded lazily on first use
    # and then reused for the lifetime of the process.
    act_all = None
    fb = None
    plcdb_hn2lb = None

    def __init__(self, hostname, act):
        """Remember the host and look up its site loginbase.

        hostname -- node name; must be a key of the "plcdb_hn2lb" database,
                    otherwise the lookup raises KeyError.
        act      -- when false, run() only diagnoses and takes no action.
        """
        self.hostname = hostname
        self.act = act
        cls = MonitorMergeDiagnoseSendEscellate
        # The original reset the attribute to None right before testing it,
        # so the mapping was reloaded for every instance.  Cache it on the
        # class instead, the same way 'fb' and 'act_all' are cached.
        if cls.plcdb_hn2lb is None:
            cls.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")
        self.plcdb_hn2lb = cls.plcdb_hn2lb
        self.loginbase = self.plcdb_hn2lb[self.hostname]

    def getFBRecord(self):
        """Return the 'values' dict of this host from the "findbad" database.

        Raises Exception when the host is not present in the scan database.
        """
        cls = MonitorMergeDiagnoseSendEscellate
        if cls.fb is None:
            cls.fb = database.dbLoad("findbad")

        nodes = cls.fb['nodes']
        if self.hostname not in nodes:
            raise Exception("Hostname %s not in scan database" % self.hostname)
        return nodes[self.hostname]['values']

    def getActionRecord(self):
        """Return the newest "act_all" entry for this host, or None.

        Entries are kept newest-first (add_and_save_act_all() inserts at
        index 0), so the first element is the most recent one.
        """
        cls = MonitorMergeDiagnoseSendEscellate
        if cls.act_all is None:
            cls.act_all = database.dbLoad("act_all")

        history = cls.act_all
        if self.hostname in history and len(history[self.hostname]) > 0:
            return history[self.hostname][0]
        return None

    def getKernel(self, unamestr):
        """Return the third whitespace-separated field of *unamestr*.

        Returns "" when the string has fewer than three fields.
        """
        fields = unamestr.split()
        if len(fields) > 2:
            return fields[2]
        return ""

    def mergeRecord(self, fbnode, actnode):
        """Combine the scan record with the previous action record.

        fbnode  -- scan record; NOTE it is modified in place (kernel is
                   reduced via getKernel() and bookkeeping keys are reset).
        actnode -- previous action record, or None when the host has none.

        Returns the dict to build the Record from: a fresh copy of *fbnode*
        when there was no previous record, otherwise *actnode* itself,
        refreshed with the current scan fields and its ticket status.
        """
        fbnode['kernel'] = self.getKernel(fbnode['kernel'])
        fbnode['stage'] = "findbad"
        fbnode['message'] = None
        fbnode['args'] = None
        fbnode['info'] = None
        fbnode['log'] = None
        fbnode['time'] = time.time()
        fbnode['email'] = TECH
        fbnode['action-level'] = 0
        fbnode['action'] = ['noop']
        fbnode['date_created'] = time.time()

        if actnode is None:
            # There is no entry in act_all: start from the scan record.
            actnode = {}
            actnode.update(fbnode)
            actnode['ticket_id'] = ""
            actnode['prev_category'] = "ERROR"
            actnode['prev_state'] = "DOWN"
        else:
            # Remember the previous classification before overwriting it.
            actnode['prev_category'] = actnode['category']
            actnode['prev_state'] = actnode['state']
            for key in ('comonstats', 'category', 'state',
                        'kernel', 'bootcd', 'plcnode'):
                actnode[key] = fbnode[key]
            ticket = get_ticket_id(actnode)
            if ticket is None:
                actnode['ticket_id'] = ""
            # NOTE(review): getTicketStatus() is also called with None when
            # no ticket is recorded - confirm the mailer accepts that.
            actnode['rt'] = mailer.getTicketStatus(ticket)

        return actnode

    def run(self):
        """Merge, diagnose and (if enabled) act on this host."""
        fbnode = self.getFBRecord()
        actnode = self.getActionRecord()
        actrec = self.mergeRecord(fbnode, actnode)
        record = Record(self.hostname, actrec)
        diag = self.diagnose(record)
        if self.act and diag is not None:
            self.action(record, diag)

    def diagnose(self, record):
        """Decide which flags apply to *record* and advance its stage.

        Returns the saved PersistFlags object, or None when the node is in
        the "boot" state with severity 0 (nothing left to do).
        """
        diag = PersistFlags(record.hostname, 60*60*24, db='persist_diagnose_flags')

        # NOTE: change record stage based on RT status.
        if record.stageIswaitforever():
            ticket = record.data['rt']
            if 'new' in ticket['Status']:
                print("Resetting Stage!!!!!")
                record.reset_stage()

            if 'resolved' in ticket['Status']:
                diag.setFlag('RTEndRecord')

        # NOTE: take category, and prepare action
        category = record.getCategory()
        if category == "error":
            diag.setFlag('SendNodedown')
            record.data['message_series'] = emailTxt.mailtxt.newdown
            record.data['log'] = self.getDownLog(record)

        elif category == "prod" or category == "alpha":
            state = record.getState()
            if state == "boot":
                if record.severity() != 0:
                    diag.setFlag('SendThankyou')
                    print("RESETTING STAGE: improvement")
                    record.data['stage'] = 'improvement'
                    record.data['message_series'] = emailTxt.mailtxt.newthankyou
                    record.data['log'] = self.getThankyouLog(record)
                else:
                    # NOTE: do nothing, since we've already done the above.
                    print("DIAGNOSED: %s is boot. no further action necessary." % record.hostname)
                    return None
            elif state == "debug":
                pass
            else:
                print("unknown state %s for host %s" % (state, self.hostname))
        else:
            print("unknown category: %s" % category)

        # NOTE(review): the branches above that set no 'message_series'
        # rely on the stored record already carrying one; most branches of
        # checkStageAndTime() index it and would raise KeyError otherwise.
        # TODO: how to not send email?...
        record = self.checkStageAndTime(diag, record)
        print("diagnose: checkStageAndTime Returned Valid Record")
        site = PersistFlags(self.loginbase, 1, db='site_persistflags')

        if "good" not in site.status:
            print("diagnose: Setting site %s for 'squeeze'" % self.loginbase)
            diag.setFlag('Squeeze')
        else:
            print("diagnose: Setting site %s for 'backoff'" % self.loginbase)
            diag.setFlag('BackOff')

        diag.save()
        return diag

    def action(self, record, diag):
        """Send mail, apply the action and persist the record as needed.

        Work is done only when mail may be sent and the node has been down
        for at least two days, or when the record reached the
        'monitor-end-record' stage; otherwise a notice is printed.
        """
        message = None

        days_down = Record.getDaysDown(record.data)
        print("%s %s DAYS DOWN" % (self.hostname, days_down))
        if (self.getSendEmailFlag(record) and days_down >= 2) or \
                "monitor-end-record" in record.data['stage']:
            print("action: getting message")
            message = record.getMessage(record.data['ticket_id'])
            if message:
                print("action: sending email")
                message.send(record.getContacts())
                if message.rt.ticket_id:
                    print("action: setting record ticket_id")
                    record.data['ticket_id'] = message.rt.ticket_id

            if record.data['takeaction'] and diag.getFlag('Squeeze'):
                print("action: taking squeeze action")
                record.takeAction(record.data['action-level'])
                diag.resetFlag('Squeeze')
                diag.save()
            if diag.getFlag('BackOff'):
                print("action: taking backoff action")
                record.takeAction(0)
                diag.resetFlag('BackOff')
                diag.save()

            if record.saveAction():
                print("action: saving act_all db")
                self.add_and_save_act_all(record)
            else:
                print("action: NOT saving act_all db")
                print("stage: %s %s" % (record.data['stage'], record.data['save-act-all']))

            if record.improved() or diag.getFlag('RTEndRecord'):
                print("action: end record for %s" % self.hostname)
                record.end_record()
                diag.setFlag('CloseRT')
                diag.resetFlag('RTEndRecord')
                diag.save()

            # The ticket can only be closed through a message object.
            if message and diag.getFlag('CloseRT'):
                message.rt.closeTicket()
                diag.resetFlag('CloseRT')
                diag.save()

        else:
            print("NOT sending email : %s" % config.mail)

        return

    def getSendEmailFlag(self, record):
        """Return True when a mail may be sent for *record*.

        False when mail is disabled in config, or when the record has an
        'open' ticket that was created within the last 30 days (such a
        ticket is only re-mailed once it is older than that).
        """
        if not config.mail:
            return False

        thirty_days_ago = int(time.time() - 60*60*24*30)
        recently_opened = (
            'rt' in record.data and
            'Status' in record.data['rt'] and
            "open" in record.data['rt']['Status'] and
            record.data['rt']['Created'] > thirty_days_ago
        )
        return not recently_opened

    def add_and_save_act_all(self, record):
        """Reload "act_all", prepend record.data for this host, and dump it.

        The reloaded database is stored on the instance (self.act_all); the
        class-level cache used by getActionRecord() is not refreshed.
        """
        self.act_all = database.dbLoad("act_all")
        if self.hostname not in self.act_all:
            self.act_all[self.hostname] = []
        self.act_all[self.hostname].insert(0, record.data)
        database.dbDump("act_all", self.act_all)

    def getDownLog(self, record):
        """Fill in 'args'/'info' on the record and return the DOWN log line.

        The line ends with 'found_rt_ticket' when 'ticket_id' is empty and a
        found ticket exists, otherwise with 'ticket_id'.
        """
        data = record.data
        data['args'] = {'nodename': self.hostname}
        data['info'] = (self.hostname, Record.getStrDaysDown(data), "")

        if data['ticket_id'] == "" and 'found_rt_ticket' in data:
            ticket = data['found_rt_ticket']
        else:
            ticket = data['ticket_id']
        return "DOWN: %20s : %-40s == %20s %s" % \
            (self.loginbase, self.hostname, data['info'][1:], ticket)

    def getThankyouLog(self, record):
        """Fill in 'args'/'info' on the record and return the IMPR log line.

        Falls back to a short "improved to" line when the detailed line
        cannot be built (for example a key is missing from the record).
        """
        data = record.data
        data['args'] = {'nodename': self.hostname}
        data['info'] = (self.hostname, data['prev_category'], data['category'])

        try:
            if data['ticket_id'] == "" and 'found_rt_ticket' in data:
                ticket = data['found_rt_ticket']
            else:
                ticket = data['ticket_id']
            log = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
                (self.loginbase, self.hostname, data['stage'],
                 data['prev_category'], data['category'], ticket)
        except Exception:
            # Best effort only, but unlike a bare ``except:`` this lets
            # KeyboardInterrupt and SystemExit propagate.
            log = "IMPR: %s improved to %s " % (self.hostname, data['category'])
        return log

    def checkStageAndTime(self, diag, record):
        """Advance *record* through the escalation stages and return it.

        The stage is matched by substring, so the order of the branches
        matters ('ticket_waitforever' must be tested before 'waitforever').
        ``delta`` is the time in seconds since record.data['time']; stages
        that escalate reset that clock.  Each branch sets the 'action',
        'message', 'takeaction' and 'save-act-all' keys read by action().
        *diag* is accepted for interface compatibility and is not used.
        """
        data = record.data
        current_time = time.time()
        delta = current_time - data['time']
        stage = data['stage']

        if 'findbad' in stage:
            # The node is bad, and there's no previous record of it.
            data['email'] = TECH
            data['action'] = ['noop']
            data['takeaction'] = False
            data['message'] = data['message_series'][0]
            data['stage'] = 'stage_actinoneweek'
            data['save-act-all'] = True
            data['action-level'] = 0

        elif 'reboot_node' in stage:
            data['email'] = TECH
            data['action'] = ['noop']
            data['message'] = data['message_series'][0]
            data['stage'] = 'stage_actinoneweek'
            data['takeaction'] = False
            data['save-act-all'] = False
            data['action-level'] = 0

        elif 'improvement' in stage:
            print("checkStageAndTime: backing off of %s" % self.hostname)
            data['action'] = ['close_rt']
            data['takeaction'] = True
            data['message'] = data['message_series'][0]
            data['stage'] = 'monitor-end-record'
            data['save-act-all'] = True
            data['action-level'] = 0

        elif 'actinoneweek' in stage:
            if delta >= 7 * SPERDAY:
                print("checkStageAndTime: transition to next stage actintwoweeks")
                data['email'] = TECH | PI
                data['stage'] = 'stage_actintwoweeks'
                data['message'] = data['message_series'][1]
                data['action'] = ['nocreate']
                data['time'] = current_time     # reset clock for next stage
                data['takeaction'] = True
                data['save-act-all'] = True
                data['action-level'] = 1
            elif delta >= 3 * SPERDAY and 'second-mail-at-oneweek' not in data:
                print("checkStageAndTime: second message in one week")
                data['email'] = TECH
                data['message'] = data['message_series'][0]
                data['action'] = ['sendmailagain-waitforoneweekaction']
                data['second-mail-at-oneweek'] = True
                data['takeaction'] = False
                data['save-act-all'] = True
                data['action-level'] = 0
            else:
                # Nothing to send yet; keep waiting.
                data['message'] = None
                data['action'] = ['waitforoneweekaction']
                data['takeaction'] = False
                data['save-act-all'] = False
                data['action-level'] = 0
                print("checkStageAndTime: ignoring this record for: %s" % self.hostname)

        elif 'actintwoweeks' in stage:
            if delta >= 7 * SPERDAY:
                print("checkStageAndTime: transition to next stage waitforever")
                data['email'] = TECH | PI | USER
                data['stage'] = 'stage_waitforever'
                data['message'] = data['message_series'][2]
                data['action'] = ['suspendslices']
                data['time'] = current_time     # reset clock for waitforever
                data['takeaction'] = True
                data['save-act-all'] = True
                data['action-level'] = 2
            elif delta >= 3 * SPERDAY and 'second-mail-at-twoweeks' not in data:
                print("checkStageAndTime: second message in one week for stage two")
                data['email'] = TECH | PI
                data['message'] = data['message_series'][1]
                data['action'] = ['sendmailagain-waitfortwoweeksaction']
                data['second-mail-at-twoweeks'] = True
                data['takeaction'] = False
                data['save-act-all'] = True
                data['action-level'] = 1
            else:
                # Nothing to send yet; keep waiting.  (The original printed
                # the "second message" line here, copied from the branch
                # above, although no message is sent.)
                data['message'] = None
                data['takeaction'] = False
                data['action'] = ['waitfortwoweeksaction']
                data['save-act-all'] = False
                data['action-level'] = 1
                print("checkStageAndTime: ignoring this record for: %s" % self.hostname)

        elif 'ticket_waitforever' in stage:
            data['email'] = TECH
            data['takeaction'] = True
            if 'first-found' not in data:
                data['first-found'] = True
                # 'log' may be None (mergeRecord() resets it to None).
                data['log'] = (data['log'] or "") + " firstfound"
                data['action'] = ['ticket_waitforever']
                data['message'] = None
                data['time'] = current_time
                data['save-act-all'] = True
                data['action-level'] = 2
            elif delta >= 7 * SPERDAY:
                data['action'] = ['ticket_waitforever']
                data['message'] = None
                data['time'] = current_time     # reset clock
                data['save-act-all'] = True
                data['action-level'] = 2
            else:
                data['action'] = ['ticket_waitforever']
                data['message'] = None
                data['takeaction'] = False
                data['save-act-all'] = False
                data['action-level'] = 2

        elif 'waitforever' in stage:
            # more than 3 days since last action
            # TODO: send only on weekdays.
            # NOTE: expects that 'time' has been reset before entering
            # the waitforever stage
            if delta >= 3 * SPERDAY:
                data['takeaction'] = True
                data['action'] = ['email-againwaitforever']
                data['message'] = data['message_series'][2]
                data['time'] = current_time     # reset clock
                data['save-act-all'] = True
                data['action-level'] = 2
            else:
                data['action'] = ['waitforever']
                data['message'] = None
                data['takeaction'] = False
                data['save-act-all'] = False
                data['action-level'] = 2

        else:
            # There is no action to be taken, possibly b/c the stage has
            # already been performed, but diagnose picked it up again.
            # two cases,
            #       1. stage is unknown, or
            #       2. delta is not big enough to bump it to the next stage.
            # TODO: figure out which. for now assume 2.
            print("UNKNOWN stage for %s; nothing done" % self.hostname)
            data['email'] = TECH
            data['action'] = ['noop']
            data['message'] = data['message_series'][0]
            data['stage'] = 'stage_actinoneweek'
            data['time'] = current_time         # reset clock
            data['takeaction'] = False
            data['save-act-all'] = True

        # One line: the log text followed by the right-aligned action.
        print("%s %15s" % (data['log'], data['action']))
        return record
465