update of all changes in the last week that fine-tuned the behavior of Monitor
[monitor.git] / clean_policy.py
1 import config
2 import database 
3 import time
4 import mailer
5 from unified_model import cmpCategoryVal
6 import sys
7 import emailTxt
8 import string
9
10 from rt import is_host_in_rt_tickets
11 import plc
12
# Time to enforce policy.
# NOTE(review): presumably an interval in seconds (7200 s = 2 h); it is not
# referenced anywhere in the visible code -- confirm against callers.
POLSLEEP = 7200

# Where to email the summary.
# NOTE(review): not referenced in the visible code -- confirm it is still used.
SUMTO = "soltesz@cs.princeton.edu"
18
19 from const import *
20
21 from unified_model import *
22
def get_ticket_id(record):
        """Return the RT ticket id stored in `record`, or None if there is none.

        The 'ticket_id' key is preferred and 'found_rt_ticket' is the fallback.
        A key whose value is None or the empty string is treated as absent.

        record -- mapping supporting `in` and item lookup.
        """
        for key in ('ticket_id', 'found_rt_ticket'):
                if key not in record:
                        continue
                value = record[key]
                # Compare by value: `is not ""` tests object identity, which only
                # works when the interpreter happens to share the string object
                # (it fails, for instance, for an empty unicode string).
                if value is not None and value != "":
                        return value
        return None
32
33 class MonitorMergeDiagnoseSendEscellate:
34         act_all = None
35         fb = None
36
37         def __init__(self, hostname, act):
38                 self.hostname = hostname
39                 self.act = act
40                 self.plcdb_hn2lb = None
41                 if self.plcdb_hn2lb is None:
42                         self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")
43                 self.loginbase = self.plcdb_hn2lb[self.hostname]
44                 return
45
46         def getFBRecord(self):
47                 if MonitorMergeDiagnoseSendEscellate.fb == None:
48                         MonitorMergeDiagnoseSendEscellate.fb = database.dbLoad("findbad")
49
50                 fb = MonitorMergeDiagnoseSendEscellate.fb
51
52                 if self.hostname in fb['nodes']:
53                         fbnode = fb['nodes'][self.hostname]['values']
54                 else:
55                         raise Exception("Hostname %s not in scan database"% self.hostname)
56                 return fbnode
57
58         def getActionRecord(self):
59                 # update ticket status
60                 if MonitorMergeDiagnoseSendEscellate.act_all == None:
61                         MonitorMergeDiagnoseSendEscellate.act_all = database.dbLoad("act_all")
62
63                 act_all = MonitorMergeDiagnoseSendEscellate.act_all 
64
65                 if self.hostname in act_all and len(act_all[self.hostname]) > 0:
66                         actnode = act_all[self.hostname][0]
67                 else:
68                         actnode = None
69                 return actnode
70
71         def getKernel(self, unamestr):
72                 s = unamestr.split()
73                 if len(s) > 2:
74                         return s[2]
75                 else:
76                         return ""
77
78         def mergeRecord(self, fbnode, actnode):
79                 fbnode['kernel'] = self.getKernel(fbnode['kernel'])
80                 fbnode['stage'] = "findbad"
81                 fbnode['message'] = None
82                 fbnode['args'] = None
83                 fbnode['info'] = None
84                 fbnode['log'] = None
85                 fbnode['time'] = time.time()
86                 fbnode['email'] = TECH
87                 fbnode['action'] = ['noop']
88                 fbnode['date_created'] = time.time()
89
90                 if actnode is None: # there is no entry in act_all
91                         actnode = {} 
92                         actnode.update(fbnode)
93                         actnode['ticket_id'] = ""
94                         actnode['prev_category'] = "ERROR" 
95                 else:
96                         actnode['prev_category']= actnode['category']
97                         actnode['comonstats']   = fbnode['comonstats']
98                         actnode['category']             = fbnode['category']
99                         actnode['state']                = fbnode['state']
100                         actnode['kernel']               = fbnode['kernel']
101                         actnode['bootcd']               = fbnode['bootcd']
102                         actnode['plcnode']              = fbnode['plcnode']
103                         ticket = get_ticket_id(actnode)
104                         if ticket is None: actnode['ticket_id'] = ""
105                         actnode['rt'] = mailer.getTicketStatus(ticket)
106
107                         #for key in actnode.keys():
108                         #       print "%10s %s %s " % (key, "==", actnode[key])
109                         #print "----------------------------"
110
111                 return actnode
112
113         def run(self):
114                 fbnode = self.getFBRecord()
115                 actnode= self.getActionRecord()
116                 actrec = self.mergeRecord(fbnode, actnode)
117                 record = Record(self.hostname, actrec)
118                 diag   = self.diagnose(record)
119                 if self.act and diag is not None:
120                         self.action(record,diag)
121         
122         def diagnose(self, record):
123
124                 diag = PersistFlags(record.hostname, 60*60*24, db='persist_diagnose_flags')
125                 # NOTE: change record stage based on RT status.
126                 #diag.setFlag('ResetStage')
127                 if record.stageIswaitforever():
128                         ticket = record.data['rt']
129                         if 'new' in ticket['Status']:
130                                 print "Resetting Stage!!!!!"
131                         #       diag.setFlag('ResetStage')
132                                 record.reset_stage()
133                         #if diag.getFlag('ResetStage'):
134                         #       print "diagnose: resetting stage"
135                         #       diag.resetFlag('ResetStage')
136                                 
137                         if 'resolved' in ticket['Status']:
138                                 diag.setFlag('RTEndRecord')
139
140                 # NOTE: take category, and prepare action
141                 category = record.getCategory()
142                 if category == "error":
143                         diag.setFlag('SendNodedown')
144                         record.data['message_series'] = emailTxt.mailtxt.newdown
145                         record.data['log'] = self.getDownLog(record)
146
147                 elif category == "prod" or category == "alpha":
148                         state = record.getState()
149                         if state == "boot":
150                                 if record.severity() != 0:
151                                         diag.setFlag('SendThankyou')
152                                         print "RESETTING STAGE: improvement"
153                                         record.data['stage'] = 'improvement'
154                                         record.data['message_series'] = emailTxt.mailtxt.newthankyou
155                                         record.data['log'] = self.getThankyouLog(record)
156                                 else:
157                                         # NOTE: do nothing, since we've already done the above.
158                                         print "DIAGNOSED: %s is boot. no further action necessary." % record.hostname
159                                         return None
160                         elif state == "debug":
161                                 pass
162                         else:
163                                 print "unknown state %s for host %s" % (state, self.hostname)
164                 else:
165                         print "unknown category: %s" % category
166
167
168                 # TODO: how to not send email?...
169                 record = self.checkStageAndTime(diag,record)
170                 #if record:
171                 print "diagnose: checkStageAndTime Returned Valid Record"
172                 site = PersistFlags(self.loginbase, 1, db='site_persistflags')
173
174                 if site.status != "good":
175                         print "diagnose: Setting site %s for 'squeeze'" % self.loginbase
176                         diag.setFlag('Squeeze')
177                 else:
178                         print "diagnose: Setting site %s for 'backoff'" % self.loginbase
179                         diag.setFlag('BackOff')
180
181                 diag.save()
182                 return diag
183                 #else:
184                 #       print "checkStageAndTime Returned NULL Record"
185                 #       return None
186
187         def action(self, record, diag):
188
189                 message = None
190
191                 #print record.data['stage']
192                 #print "improvement" in record.data['stage']
193                 #print self.getSendEmailFlag(record)
194                 if self.getSendEmailFlag(record) or "monitor-end-record" in record.data['stage']: 
195                         print "action: getting message"
196                         message = record.getMessage(record.data['ticket_id'])
197                         if message:
198                                 #message.reset()
199                                 print "action: sending email"
200                                 message.send(record.getContacts())
201                                 #print "DEBUG NOT SENDING MESSAGE WHEN I SHOULD BE!!!!!"
202                                 #print "DEBUG NOT SENDING MESSAGE WHEN I SHOULD BE!!!!!"
203                                 #print "DEBUG NOT SENDING MESSAGE WHEN I SHOULD BE!!!!!"
204                                 #print message
205                                 if message.rt.ticket_id:
206                                         print "action: setting record ticket_id"
207                                         record.data['ticket_id'] = message.rt.ticket_id
208
209                         if (record.data['takeaction'] and diag.getFlag('Squeeze') ) or diag.getFlag('BackOff'):
210                                 print "action: taking action"
211                                 record.takeAction()
212                                 diag.resetFlag('Squeeze')
213                                 diag.resetFlag('BackOff')
214                                 diag.save()
215
216                         if record.saveAction():
217                                 print "action: saving act_all db"
218                                 self.add_and_save_act_all(record)
219                         else:
220                                 print "action: NOT saving act_all db"
221                                 print "stage: %s %s" % ( record.data['stage'], record.data['save-act-all'] )
222
223                         if record.improved() or diag.getFlag('RTEndRecord'):
224                                 print "action: end record for %s" % self.hostname
225                                 record.end_record()
226                                 diag.setFlag('CloseRT')
227                                 diag.resetFlag('RTEndRecord')
228                                 diag.save()
229                                 #return None
230
231                         if message:
232                                 if diag.getFlag('CloseRT'):
233                                         message.rt.closeTicket()
234                                         diag.resetFlag('CloseRT')
235                                         diag.save()
236
237                 else:
238                         print "NOT sending email : %s %s" % (config.mail, record.data['rt'])
239
240                 return
241
242         def getSendEmailFlag(self, record):
243                 if not config.mail:
244                         return False
245
246                 # resend if open & created longer than 30 days ago.
247                 if  'rt' in record.data and \
248                         'Status' in record.data['rt'] and \
249                         "open" in record.data['rt']['Status'] and \
250                         record.data['rt']['Created'] > int(time.time() - 60*60*24*30):
251                         # if created-time is greater than the thirty days ago from the current time
252                         return False
253
254                 return True
255
256         def add_and_save_act_all(self, record):
257                 self.act_all = database.dbLoad("act_all")
258                 if self.hostname not in self.act_all:
259                         self.act_all[self.hostname] = []
260                 self.act_all[self.hostname].insert(0,record.data)
261                 database.dbDump("act_all", self.act_all)
262                 
263         def getDownLog(self, record):
264
265                 record.data['args'] = {'nodename': self.hostname}
266                 record.data['info'] = (self.hostname, Record.getStrDaysDown(record.data), "")
267
268                 #for key in record.data.keys():
269                 #       print "%10s %s %s " % (key, "==", record.data[key])
270
271                 if record.data['ticket_id'] == "" and 'found_rt_ticket' in record.data:
272                         log = "DOWN: %20s : %-40s == %20s %s" % \
273                                 (self.loginbase, self.hostname, record.data['info'][1:], record.data['found_rt_ticket'])
274                 else:
275                         log = "DOWN: %20s : %-40s == %20s %s" % \
276                                 (self.loginbase, self.hostname, record.data['info'][1:], record.data['ticket_id'])
277                 return log
278
279         def getThankyouLog(self, record):
280
281                 record.data['args'] = {'nodename': self.hostname}
282                 record.data['info'] = (self.hostname, record.data['prev_category'], record.data['category'])
283
284                 try:
285                         if record.data['ticket_id'] == "" and 'found_rt_ticket' in record.data:
286                                 log = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
287                                                 (self.loginbase, self.hostname, record.data['stage'], 
288                                                  record.data['prev_category'], record.data['category'], record.data['found_rt_ticket'])
289                         else:
290                                 log = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
291                                                 (self.loginbase, self.hostname, record.data['stage'], 
292                                                  record.data['prev_category'], record.data['category'], record.data['ticket_id'])
293                 except:
294                         log = "IMPR: %s improved to %s " % (self.hostname, record.data['category'])
295                 return log
296
297         def checkStageAndTime(self, diag, record):
298                 current_time = time.time()
299                 delta = current_time - record.data['time']
300                 #print record.data
301                 if   'findbad' in record.data['stage']:
302                         # The node is bad, and there's no previous record of it.
303                         record.data['email'] = TECH
304                         record.data['action'] = ['noop']
305                         record.data['takeaction'] = False
306                         record.data['message'] = record.data['message_series'][0]
307                         record.data['stage'] = 'stage_actinoneweek'
308                         record.data['save-act-all'] = True
309
310                 elif 'reboot_node' in record.data['stage']:
311                         record.data['email'] = TECH
312                         record.data['action'] = ['noop']
313                         record.data['message'] = record.data['message_series'][0]
314                         record.data['stage'] = 'stage_actinoneweek'
315                         record.data['takeaction'] = False
316                         record.data['save-act-all'] = False
317                         
318                 elif 'improvement' in record.data['stage']:
319                         print "checkStageAndTime: backing off of %s" % self.hostname
320                         record.data['action'] = ['close_rt']
321                         record.data['takeaction'] = True
322                         record.data['message'] = record.data['message_series'][0]
323                         record.data['stage'] = 'monitor-end-record'
324                         record.data['save-act-all'] = True
325
326                 elif 'actinoneweek' in record.data['stage']:
327                         if delta >= 7 * SPERDAY: 
328                                 print "checkStageAndTime: transition to next stage actintwoweeks"
329                                 record.data['email'] = TECH | PI
330                                 record.data['stage'] = 'stage_actintwoweeks'
331                                 record.data['message'] = record.data['message_series'][1]
332                                 record.data['action'] = ['nocreate' ]
333                                 record.data['time'] = current_time              # reset clock for waitforever
334                                 record.data['takeaction'] = True
335                                 record.data['save-act-all'] = True
336                         elif delta >= 3* SPERDAY and not 'second-mail-at-oneweek' in record.data:
337                                 print "checkStageAndTime: second message in one week"
338                                 record.data['email'] = TECH 
339                                 record.data['message'] = record.data['message_series'][0]
340                                 record.data['action'] = ['sendmailagain-waitforoneweekaction' ]
341                                 record.data['second-mail-at-oneweek'] = True
342                                 record.data['takeaction'] = False
343                                 record.data['save-act-all'] = True
344                         else:
345                                 record.data['message'] = None
346                                 record.data['action'] = ['waitforoneweekaction' ]
347                                 record.data['takeaction'] = False
348                                 record.data['save-act-all'] = False
349                                 print "checkStageAndTime: ignoring this record for: %s" % self.hostname
350                                 #return None                    # don't send if there's no action
351
352                 elif 'actintwoweeks' in record.data['stage']:
353                         if delta >= 7 * SPERDAY:
354                                 print "checkStageAndTime: transition to next stage waitforever"
355                                 record.data['email'] = TECH | PI | USER
356                                 record.data['stage'] = 'stage_waitforever'
357                                 record.data['message'] = record.data['message_series'][2]
358                                 record.data['action'] = ['suspendslices']
359                                 record.data['time'] = current_time              # reset clock for waitforever
360                                 record.data['takeaction'] = True
361                                 record.data['save-act-all'] = True
362                         elif delta >= 3* SPERDAY and not 'second-mail-at-twoweeks' in record.data:
363                                 print "checkStageAndTime: second message in one week for stage two"
364                                 record.data['email'] = TECH | PI
365                                 record.data['message'] = record.data['message_series'][1]
366                                 record.data['action'] = ['sendmailagain-waitfortwoweeksaction' ]
367                                 record.data['second-mail-at-twoweeks'] = True
368                                 record.data['takeaction'] = False
369                                 record.data['save-act-all'] = True
370                         else:
371                                 record.data['message'] = None
372                                 record.data['takeaction'] = False
373                                 record.data['action'] = ['waitfortwoweeksaction']
374                                 record.data['save-act-all'] = False
375                                 print "checkStageAndTime: second message in one week for stage two"
376                                 #return None                    # don't send if there's no action
377
378                 elif 'ticket_waitforever' in record.data['stage']:
379                         record.data['email'] = TECH
380                         record.data['takeaction'] = True
381                         if 'first-found' not in record.data:
382                                 record.data['first-found'] = True
383                                 record.data['log'] += " firstfound"
384                                 record.data['action'] = ['ticket_waitforever']
385                                 record.data['message'] = None
386                                 record.data['time'] = current_time
387                                 record.data['save-act-all'] = True
388                         else:
389                                 if delta >= 7*SPERDAY:
390                                         record.data['action'] = ['ticket_waitforever']
391                                         record.data['message'] = None
392                                         record.data['time'] = current_time              # reset clock
393                                         record.data['save-act-all'] = True
394                                 else:
395                                         record.data['action'] = ['ticket_waitforever']
396                                         record.data['message'] = None
397                                         record.data['takeaction'] = False
398                                         record.data['save-act-all'] = False
399                                         return None
400
401                 elif 'waitforever' in record.data['stage']:
402                         # more than 3 days since last action
403                         # TODO: send only on weekdays.
404                         # NOTE: expects that 'time' has been reset before entering waitforever stage
405                         record.data['takeaction'] = True
406                         if delta >= 3*SPERDAY:
407                                 record.data['action'] = ['email-againwaitforever']
408                                 record.data['message'] = record.data['message_series'][2]
409                                 record.data['time'] = current_time              # reset clock
410                                 record.data['save-act-all'] = True
411                         else:
412                                 record.data['action'] = ['waitforever']
413                                 record.data['message'] = None
414                                 record.data['takeaction'] = False
415                                 record.data['save-act-all'] = False
416                                 #return None                    # don't send if there's no action
417
418                 else:
419                         # There is no action to be taken, possibly b/c the stage has
420                         # already been performed, but diagnose picked it up again.
421                         # two cases, 
422                         #       1. stage is unknown, or 
423                         #       2. delta is not big enough to bump it to the next stage.
424                         # TODO: figure out which. for now assume 2.
425                         print "UNKNOWN stage for %s; nothing done" % self.hostname
426                         record.data['action'] = ['unknown']
427                         record.data['message'] = record.data['message_series'][0]
428
429                         record.data['email'] = TECH
430                         record.data['action'] = ['noop']
431                         record.data['message'] = record.data['message_series'][0]
432                         record.data['stage'] = 'stage_actinoneweek'
433                         record.data['time'] = current_time              # reset clock
434                         record.data['takeaction'] = False
435                         record.data['save-act-all'] = True
436
437                 print "%s" % record.data['log'],
438                 print "%15s" % record.data['action']
439                 return record
440