M emailTxt.py
[monitor.git] / clean_policy.py
1 import config
2 import database 
3 import time
4 import mailer
5 from unified_model import cmpCategoryVal
6 import sys
7 import emailTxt
8 import string
9
10 from rt import is_host_in_rt_tickets
11 import plc
12
13 # Time to enforce policy
14 POLSLEEP = 7200
15
16 # Where to email the summary
17 SUMTO = "soltesz@cs.princeton.edu"
18
19 from const import *
20
21 from unified_model import *
22
def get_ticket_id(record):
        """Return the RT ticket id stored in *record*, or None if there is none.

        'ticket_id' takes precedence over 'found_rt_ticket'.  A key that is
        missing, None, or equal to the empty string counts as "no ticket".
        """
        for key in ('ticket_id', 'found_rt_ticket'):
                if key not in record:
                        continue
                value = record[key]
                # Compare by value: an identity test against a string literal
                # ('is not ""') only works when the interpreter happens to
                # share the empty-string object (it fails for e.g. u"").
                if value is not None and value != "":
                        return value
        return None
32
33 class MonitorMergeDiagnoseSendEscellate:
34         act_all = None
35         fb = None
36
37         def __init__(self, hostname, act):
38                 self.hostname = hostname
39                 self.act = act
40                 self.plcdb_hn2lb = None
41                 if self.plcdb_hn2lb is None:
42                         self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")
43                 self.loginbase = self.plcdb_hn2lb[self.hostname]
44                 return
45
46         def getFBRecord(self):
47                 if MonitorMergeDiagnoseSendEscellate.fb == None:
48                         MonitorMergeDiagnoseSendEscellate.fb = database.dbLoad("findbad")
49
50                 fb = MonitorMergeDiagnoseSendEscellate.fb
51
52                 if self.hostname in fb['nodes']:
53                         fbnode = fb['nodes'][self.hostname]['values']
54                 else:
55                         raise Exception("Hostname %s not in scan database"% self.hostname)
56                 return fbnode
57
58         def getActionRecord(self):
59                 # update ticket status
60                 if MonitorMergeDiagnoseSendEscellate.act_all == None:
61                         MonitorMergeDiagnoseSendEscellate.act_all = database.dbLoad("act_all")
62
63                 act_all = MonitorMergeDiagnoseSendEscellate.act_all 
64
65                 if self.hostname in act_all and len(act_all[self.hostname]) > 0:
66                         actnode = act_all[self.hostname][0]
67                 else:
68                         actnode = None
69                 return actnode
70
71         def getKernel(self, unamestr):
72                 s = unamestr.split()
73                 if len(s) > 2:
74                         return s[2]
75                 else:
76                         return ""
77
78         def mergeRecord(self, fbnode, actnode):
79                 fbnode['kernel'] = self.getKernel(fbnode['kernel'])
80                 fbnode['stage'] = "findbad"
81                 fbnode['message'] = None
82                 fbnode['args'] = None
83                 fbnode['info'] = None
84                 fbnode['log'] = None
85                 fbnode['time'] = time.time()
86                 fbnode['email'] = TECH
87                 fbnode['action-level'] = 0
88                 fbnode['action'] = ['noop']
89                 fbnode['date_created'] = time.time()
90
91                 if actnode is None: # there is no entry in act_all
92                         actnode = {} 
93                         actnode.update(fbnode)
94                         actnode['ticket_id'] = ""
95                         actnode['prev_category'] = "ERROR" 
96                 else:
97                         actnode['prev_category']= actnode['category']
98                         actnode['comonstats']   = fbnode['comonstats']
99                         actnode['category']             = fbnode['category']
100                         actnode['state']                = fbnode['state']
101                         actnode['kernel']               = fbnode['kernel']
102                         actnode['bootcd']               = fbnode['bootcd']
103                         actnode['plcnode']              = fbnode['plcnode']
104                         ticket = get_ticket_id(actnode)
105                         if ticket is None: actnode['ticket_id'] = ""
106                         actnode['rt'] = mailer.getTicketStatus(ticket)
107
108                         #for key in actnode.keys():
109                         #       print "%10s %s %s " % (key, "==", actnode[key])
110                         #print "----------------------------"
111
112                 return actnode
113
114         def run(self):
115                 fbnode = self.getFBRecord()
116                 actnode= self.getActionRecord()
117                 actrec = self.mergeRecord(fbnode, actnode)
118                 record = Record(self.hostname, actrec)
119                 diag   = self.diagnose(record)
120                 if self.act and diag is not None:
121                         self.action(record,diag)
122         
123         def diagnose(self, record):
124
125                 diag = PersistFlags(record.hostname, 60*60*24, db='persist_diagnose_flags')
126                 # NOTE: change record stage based on RT status.
127                 #diag.setFlag('ResetStage')
128                 if record.stageIswaitforever():
129                         ticket = record.data['rt']
130                         if 'new' in ticket['Status']:
131                                 print "Resetting Stage!!!!!"
132                         #       diag.setFlag('ResetStage')
133                                 record.reset_stage()
134                         #if diag.getFlag('ResetStage'):
135                         #       print "diagnose: resetting stage"
136                         #       diag.resetFlag('ResetStage')
137                                 
138                         if 'resolved' in ticket['Status']:
139                                 diag.setFlag('RTEndRecord')
140
141                 # NOTE: take category, and prepare action
142                 category = record.getCategory()
143                 if category == "error":
144                         diag.setFlag('SendNodedown')
145                         record.data['message_series'] = emailTxt.mailtxt.newdown
146                         record.data['log'] = self.getDownLog(record)
147
148                 elif category == "prod" or category == "alpha":
149                         state = record.getState()
150                         if state == "boot":
151                                 if record.severity() != 0:
152                                         diag.setFlag('SendThankyou')
153                                         print "RESETTING STAGE: improvement"
154                                         record.data['stage'] = 'improvement'
155                                         record.data['message_series'] = emailTxt.mailtxt.newthankyou
156                                         record.data['log'] = self.getThankyouLog(record)
157                                 else:
158                                         # NOTE: do nothing, since we've already done the above.
159                                         print "DIAGNOSED: %s is boot. no further action necessary." % record.hostname
160                                         return None
161                         elif state == "debug":
162                                 pass
163                         else:
164                                 print "unknown state %s for host %s" % (state, self.hostname)
165                 else:
166                         print "unknown category: %s" % category
167
168
169                 # TODO: how to not send email?...
170                 record = self.checkStageAndTime(diag,record)
171                 #if record:
172                 print "diagnose: checkStageAndTime Returned Valid Record"
173                 site = PersistFlags(self.loginbase, 1, db='site_persistflags')
174
175                 if "good" not in site.status: #  != "good":
176                         print "diagnose: Setting site %s for 'squeeze'" % self.loginbase
177                         diag.setFlag('Squeeze')
178                 else:
179                         print "diagnose: Setting site %s for 'backoff'" % self.loginbase
180                         diag.setFlag('BackOff')
181
182                 diag.save()
183                 return diag
184                 #else:
185                 #       print "checkStageAndTime Returned NULL Record"
186                 #       return None
187
188         def action(self, record, diag):
189
190                 message = None
191
192                 #print record.data['stage']
193                 #print "improvement" in record.data['stage']
194                 #print self.getSendEmailFlag(record)
195                 print "%s %s DAYS DOWN" % ( self.hostname, Record.getDaysDown(record.data) )
196                 if ( self.getSendEmailFlag(record) and Record.getDaysDown(record.data) >= 2 ) or \
197                         "monitor-end-record" in record.data['stage']:
198                         print "action: getting message"
199                         message = record.getMessage(record.data['ticket_id'])
200                         if message:
201                                 #message.reset()
202                                 print "action: sending email"
203                                 message.send(record.getContacts())
204                                 #print "DEBUG NOT SENDING MESSAGE WHEN I SHOULD BE!!!!!"
205                                 #print "DEBUG NOT SENDING MESSAGE WHEN I SHOULD BE!!!!!"
206                                 #print "DEBUG NOT SENDING MESSAGE WHEN I SHOULD BE!!!!!"
207                                 #print message
208                                 if message.rt.ticket_id:
209                                         print "action: setting record ticket_id"
210                                         record.data['ticket_id'] = message.rt.ticket_id
211
212                         if ( record.data['takeaction'] and diag.getFlag('Squeeze') ): 
213                                 print "action: taking action"
214                                 record.takeAction(record.data['action-level'])
215                                 diag.resetFlag('Squeeze')
216                                 diag.save()
217                         if diag.getFlag('BackOff'):
218                                 record.takeAction(0)
219                                 diag.resetFlag('BackOff')
220                                 diag.save()
221
222                         if record.saveAction():
223                                 print "action: saving act_all db"
224                                 self.add_and_save_act_all(record)
225                         else:
226                                 print "action: NOT saving act_all db"
227                                 print "stage: %s %s" % ( record.data['stage'], record.data['save-act-all'] )
228
229                         if record.improved() or diag.getFlag('RTEndRecord'):
230                                 print "action: end record for %s" % self.hostname
231                                 record.end_record()
232                                 diag.setFlag('CloseRT')
233                                 diag.resetFlag('RTEndRecord')
234                                 diag.save()
235                                 #return None
236
237                         if message:
238                                 if diag.getFlag('CloseRT'):
239                                         message.rt.closeTicket()
240                                         diag.resetFlag('CloseRT')
241                                         diag.save()
242
243                 else:
244                         print "NOT sending email : %s %s" % (config.mail, record.data['rt'])
245
246                 return
247
248         def getSendEmailFlag(self, record):
249                 if not config.mail:
250                         return False
251
252                 # resend if open & created longer than 30 days ago.
253                 if  'rt' in record.data and \
254                         'Status' in record.data['rt'] and \
255                         "open" in record.data['rt']['Status'] and \
256                         record.data['rt']['Created'] > int(time.time() - 60*60*24*30):
257                         # if created-time is greater than the thirty days ago from the current time
258                         return False
259
260                 return True
261
262         def add_and_save_act_all(self, record):
263                 self.act_all = database.dbLoad("act_all")
264                 if self.hostname not in self.act_all:
265                         self.act_all[self.hostname] = []
266                 self.act_all[self.hostname].insert(0,record.data)
267                 database.dbDump("act_all", self.act_all)
268                 
269         def getDownLog(self, record):
270
271                 record.data['args'] = {'nodename': self.hostname}
272                 record.data['info'] = (self.hostname, Record.getStrDaysDown(record.data), "")
273
274                 #for key in record.data.keys():
275                 #       print "%10s %s %s " % (key, "==", record.data[key])
276
277                 if record.data['ticket_id'] == "" and 'found_rt_ticket' in record.data:
278                         log = "DOWN: %20s : %-40s == %20s %s" % \
279                                 (self.loginbase, self.hostname, record.data['info'][1:], record.data['found_rt_ticket'])
280                 else:
281                         log = "DOWN: %20s : %-40s == %20s %s" % \
282                                 (self.loginbase, self.hostname, record.data['info'][1:], record.data['ticket_id'])
283                 return log
284
285         def getThankyouLog(self, record):
286
287                 record.data['args'] = {'nodename': self.hostname}
288                 record.data['info'] = (self.hostname, record.data['prev_category'], record.data['category'])
289
290                 try:
291                         if record.data['ticket_id'] == "" and 'found_rt_ticket' in record.data:
292                                 log = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
293                                                 (self.loginbase, self.hostname, record.data['stage'], 
294                                                  record.data['prev_category'], record.data['category'], record.data['found_rt_ticket'])
295                         else:
296                                 log = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
297                                                 (self.loginbase, self.hostname, record.data['stage'], 
298                                                  record.data['prev_category'], record.data['category'], record.data['ticket_id'])
299                 except:
300                         log = "IMPR: %s improved to %s " % (self.hostname, record.data['category'])
301                 return log
302
303         def checkStageAndTime(self, diag, record):
304                 current_time = time.time()
305                 delta = current_time - record.data['time']
306                 #print record.data
307                 if   'findbad' in record.data['stage']:
308                         # The node is bad, and there's no previous record of it.
309                         record.data['email'] = TECH
310                         record.data['action'] = ['noop']
311                         record.data['takeaction'] = False
312                         record.data['message'] = record.data['message_series'][0]
313                         record.data['stage'] = 'stage_actinoneweek'
314                         record.data['save-act-all'] = True
315                         record.data['action-level'] = 0
316
317                 elif 'reboot_node' in record.data['stage']:
318                         record.data['email'] = TECH
319                         record.data['action'] = ['noop']
320                         record.data['message'] = record.data['message_series'][0]
321                         record.data['stage'] = 'stage_actinoneweek'
322                         record.data['takeaction'] = False
323                         record.data['save-act-all'] = False
324                         record.data['action-level'] = 0
325                         
326                 elif 'improvement' in record.data['stage']:
327                         print "checkStageAndTime: backing off of %s" % self.hostname
328                         record.data['action'] = ['close_rt']
329                         record.data['takeaction'] = True
330                         record.data['message'] = record.data['message_series'][0]
331                         record.data['stage'] = 'monitor-end-record'
332                         record.data['save-act-all'] = True
333                         record.data['action-level'] = 0
334
335                 elif 'actinoneweek' in record.data['stage']:
336                         if delta >= 7 * SPERDAY: 
337                                 print "checkStageAndTime: transition to next stage actintwoweeks"
338                                 record.data['email'] = TECH | PI
339                                 record.data['stage'] = 'stage_actintwoweeks'
340                                 record.data['message'] = record.data['message_series'][1]
341                                 record.data['action'] = ['nocreate' ]
342                                 record.data['time'] = current_time              # reset clock for waitforever
343                                 record.data['takeaction'] = True
344                                 record.data['save-act-all'] = True
345                                 record.data['action-level'] = 1
346                         elif delta >= 3* SPERDAY and not 'second-mail-at-oneweek' in record.data:
347                                 print "checkStageAndTime: second message in one week"
348                                 record.data['email'] = TECH 
349                                 record.data['message'] = record.data['message_series'][0]
350                                 record.data['action'] = ['sendmailagain-waitforoneweekaction' ]
351                                 record.data['second-mail-at-oneweek'] = True
352                                 record.data['takeaction'] = False
353                                 record.data['save-act-all'] = True
354                                 record.data['action-level'] = 0
355                         else:
356                                 record.data['message'] = None
357                                 record.data['action'] = ['waitforoneweekaction' ]
358                                 record.data['takeaction'] = False
359                                 record.data['save-act-all'] = False
360                                 record.data['action-level'] = 0
361                                 print "checkStageAndTime: ignoring this record for: %s" % self.hostname
362                                 #return None                    # don't send if there's no action
363
364                 elif 'actintwoweeks' in record.data['stage']:
365                         if delta >= 7 * SPERDAY:
366                                 print "checkStageAndTime: transition to next stage waitforever"
367                                 record.data['email'] = TECH | PI | USER
368                                 record.data['stage'] = 'stage_waitforever'
369                                 record.data['message'] = record.data['message_series'][2]
370                                 record.data['action'] = ['suspendslices']
371                                 record.data['time'] = current_time              # reset clock for waitforever
372                                 record.data['takeaction'] = True
373                                 record.data['save-act-all'] = True
374                                 record.data['action-level'] = 2
375                         elif delta >= 3* SPERDAY and not 'second-mail-at-twoweeks' in record.data:
376                                 print "checkStageAndTime: second message in one week for stage two"
377                                 record.data['email'] = TECH | PI
378                                 record.data['message'] = record.data['message_series'][1]
379                                 record.data['action'] = ['sendmailagain-waitfortwoweeksaction' ]
380                                 record.data['second-mail-at-twoweeks'] = True
381                                 record.data['takeaction'] = False
382                                 record.data['save-act-all'] = True
383                                 record.data['action-level'] = 1
384                         else:
385                                 record.data['message'] = None
386                                 record.data['takeaction'] = False
387                                 record.data['action'] = ['waitfortwoweeksaction']
388                                 record.data['save-act-all'] = False
389                                 print "checkStageAndTime: second message in one week for stage two"
390                                 record.data['action-level'] = 1
391                                 #return None                    # don't send if there's no action
392
393                 elif 'ticket_waitforever' in record.data['stage']:
394                         record.data['email'] = TECH
395                         record.data['takeaction'] = True
396                         if 'first-found' not in record.data:
397                                 record.data['first-found'] = True
398                                 record.data['log'] += " firstfound"
399                                 record.data['action'] = ['ticket_waitforever']
400                                 record.data['message'] = None
401                                 record.data['time'] = current_time
402                                 record.data['save-act-all'] = True
403                                 record.data['action-level'] = 2
404                         else:
405                                 if delta >= 7*SPERDAY:
406                                         record.data['action'] = ['ticket_waitforever']
407                                         record.data['message'] = None
408                                         record.data['time'] = current_time              # reset clock
409                                         record.data['save-act-all'] = True
410                                         record.data['action-level'] = 2
411                                 else:
412                                         record.data['action'] = ['ticket_waitforever']
413                                         record.data['message'] = None
414                                         record.data['takeaction'] = False
415                                         record.data['save-act-all'] = False
416                                         record.data['action-level'] = 2
417                                         #return None
418
419                 elif 'waitforever' in record.data['stage']:
420                         # more than 3 days since last action
421                         # TODO: send only on weekdays.
422                         # NOTE: expects that 'time' has been reset before entering waitforever stage
423                         record.data['takeaction'] = True
424                         if delta >= 3*SPERDAY:
425                                 record.data['action'] = ['email-againwaitforever']
426                                 record.data['message'] = record.data['message_series'][2]
427                                 record.data['time'] = current_time              # reset clock
428                                 record.data['save-act-all'] = True
429                                 record.data['action-level'] = 2
430                         else:
431                                 record.data['action'] = ['waitforever']
432                                 record.data['message'] = None
433                                 record.data['takeaction'] = False
434                                 record.data['save-act-all'] = False
435                                 record.data['action-level'] = 2
436                                 #return None                    # don't send if there's no action
437
438                 else:
439                         # There is no action to be taken, possibly b/c the stage has
440                         # already been performed, but diagnose picked it up again.
441                         # two cases, 
442                         #       1. stage is unknown, or 
443                         #       2. delta is not big enough to bump it to the next stage.
444                         # TODO: figure out which. for now assume 2.
445                         print "UNKNOWN stage for %s; nothing done" % self.hostname
446                         record.data['action'] = ['unknown']
447                         record.data['message'] = record.data['message_series'][0]
448
449                         record.data['email'] = TECH
450                         record.data['action'] = ['noop']
451                         record.data['message'] = record.data['message_series'][0]
452                         record.data['stage'] = 'stage_actinoneweek'
453                         record.data['time'] = current_time              # reset clock
454                         record.data['takeaction'] = False
455                         record.data['save-act-all'] = True
456
457                 print "%s" % record.data['log'],
458                 print "%15s" % record.data['action']
459                 return record
460