updates to improve generalization and auto-installation.
[monitor.git] / clean_policy.py
1 import config
2 import database 
3 import time
4 import mailer
5 import sys
6 import emailTxt
7 import string
8
9 from rt import is_host_in_rt_tickets
10 import plc
11
# Time to enforce policy
# NOTE(review): presumably an interval in seconds (7200 s = 2 hours); it is not
# referenced in the visible part of this file -- confirm where it is consumed.
POLSLEEP = 7200

# Where to email the summary
# NOTE(review): not referenced in the visible part of this file -- confirm it
# is still used before relying on it.
SUMTO = "soltesz@cs.princeton.edu"
17
18 from const import *
19
20 from unified_model import *
21
def get_ticket_id(record):
        """Return the RT ticket id stored in `record`, or None if there is none.

        record['ticket_id'] is preferred; record['found_rt_ticket'] is the
        fallback.  A key that is missing, None, or equal to the empty string
        counts as "no ticket".
        """
        # Compare against "" by value.  `is not ""` tests object identity, which
        # only works when the value happens to be the interpreter's interned
        # empty string (a unicode or subclassed empty string slips through).
        for key in ('ticket_id', 'found_rt_ticket'):
                if key in record and record[key] is not None and record[key] != "":
                        return record[key]
        return None
31
32 class MonitorMergeDiagnoseSendEscellate:
33         act_all = None
34         fb = None
35
36         def __init__(self, hostname, act):
37                 self.hostname = hostname
38                 self.act = act
39                 self.plcdb_hn2lb = None
40                 if self.plcdb_hn2lb is None:
41                         self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")
42                 self.loginbase = self.plcdb_hn2lb[self.hostname]
43                 return
44
45         def getFBRecord(self):
46                 if MonitorMergeDiagnoseSendEscellate.fb == None:
47                         MonitorMergeDiagnoseSendEscellate.fb = database.dbLoad("findbad")
48
49                 fb = MonitorMergeDiagnoseSendEscellate.fb
50
51                 if self.hostname in fb['nodes']:
52                         fbnode = fb['nodes'][self.hostname]['values']
53                 else:
54                         raise Exception("Hostname %s not in scan database"% self.hostname)
55                 return fbnode
56
57         def getActionRecord(self):
58                 # update ticket status
59                 if MonitorMergeDiagnoseSendEscellate.act_all == None:
60                         MonitorMergeDiagnoseSendEscellate.act_all = database.dbLoad("act_all")
61
62                 act_all = MonitorMergeDiagnoseSendEscellate.act_all 
63
64                 if self.hostname in act_all and len(act_all[self.hostname]) > 0:
65                         actnode = act_all[self.hostname][0]
66                 else:
67                         actnode = None
68                 return actnode
69
70         def getKernel(self, unamestr):
71                 s = unamestr.split()
72                 if len(s) > 2:
73                         return s[2]
74                 else:
75                         return ""
76
77         def mergeRecord(self, fbnode, actnode):
78                 fbnode['kernel'] = self.getKernel(fbnode['kernel'])
79                 fbnode['stage'] = "findbad"
80                 fbnode['message'] = None
81                 fbnode['args'] = None
82                 fbnode['info'] = None
83                 fbnode['log'] = None
84                 fbnode['time'] = time.time()
85                 fbnode['email'] = TECH
86                 fbnode['action-level'] = 0
87                 fbnode['action'] = ['noop']
88                 fbnode['date_created'] = time.time()
89
90                 if actnode is None: # there is no entry in act_all
91                         actnode = {} 
92                         actnode.update(fbnode)
93                         actnode['ticket_id'] = ""
94                         actnode['prev_category'] = "ERROR" 
95                 else:
96                         actnode['prev_category']= actnode['category']
97                         actnode['comonstats']   = fbnode['comonstats']
98                         actnode['category']             = fbnode['category']
99                         actnode['state']                = fbnode['state']
100                         actnode['kernel']               = fbnode['kernel']
101                         actnode['bootcd']               = fbnode['bootcd']
102                         actnode['plcnode']              = fbnode['plcnode']
103                         ticket = get_ticket_id(actnode)
104                         if ticket is None: actnode['ticket_id'] = ""
105                         actnode['rt'] = mailer.getTicketStatus(ticket)
106
107                         #for key in actnode.keys():
108                         #       print "%10s %s %s " % (key, "==", actnode[key])
109                         #print "----------------------------"
110
111                 return actnode
112
113         def run(self):
114                 fbnode = self.getFBRecord()
115                 actnode= self.getActionRecord()
116                 actrec = self.mergeRecord(fbnode, actnode)
117                 record = Record(self.hostname, actrec)
118                 diag   = self.diagnose(record)
119                 if self.act and diag is not None:
120                         self.action(record,diag)
121         
122         def diagnose(self, record):
123
124                 diag = PersistFlags(record.hostname, 60*60*24, db='persist_diagnose_flags')
125                 # NOTE: change record stage based on RT status.
126                 #diag.setFlag('ResetStage')
127                 if record.stageIswaitforever():
128                         ticket = record.data['rt']
129                         if 'new' in ticket['Status']:
130                                 print "Resetting Stage!!!!!"
131                         #       diag.setFlag('ResetStage')
132                                 record.reset_stage()
133                         #if diag.getFlag('ResetStage'):
134                         #       print "diagnose: resetting stage"
135                         #       diag.resetFlag('ResetStage')
136                                 
137                         if 'resolved' in ticket['Status']:
138                                 diag.setFlag('RTEndRecord')
139
140                 # NOTE: take category, and prepare action
141                 category = record.getCategory()
142                 if category == "error":
143                         diag.setFlag('SendNodedown')
144                         record.data['message_series'] = emailTxt.mailtxt.newdown
145                         record.data['log'] = self.getDownLog(record)
146
147                 elif category == "prod" or category == "alpha":
148                         state = record.getState()
149                         if state == "boot":
150                                 if record.severity() != 0:
151                                         diag.setFlag('SendThankyou')
152                                         print "RESETTING STAGE: improvement"
153                                         record.data['stage'] = 'improvement'
154                                         record.data['message_series'] = emailTxt.mailtxt.newthankyou
155                                         record.data['log'] = self.getThankyouLog(record)
156                                 else:
157                                         # NOTE: do nothing, since we've already done the above.
158                                         print "DIAGNOSED: %s is boot. no further action necessary." % record.hostname
159                                         return None
160                         elif state == "debug":
161                                 pass
162                         else:
163                                 print "unknown state %s for host %s" % (state, self.hostname)
164                 else:
165                         print "unknown category: %s" % category
166
167
168                 # TODO: how to not send email?...
169                 record = self.checkStageAndTime(diag,record)
170                 #if record:
171                 print "diagnose: checkStageAndTime Returned Valid Record"
172                 site = PersistFlags(self.loginbase, 1, db='site_persistflags')
173
174                 if "good" not in site.status: #  != "good":
175                         print "diagnose: Setting site %s for 'squeeze'" % self.loginbase
176                         diag.setFlag('Squeeze')
177                 else:
178                         print "diagnose: Setting site %s for 'backoff'" % self.loginbase
179                         diag.setFlag('BackOff')
180
181                 diag.save()
182                 return diag
183                 #else:
184                 #       print "checkStageAndTime Returned NULL Record"
185                 #       return None
186
187         def action(self, record, diag):
188
189                 message = None
190
191                 #print record.data['stage']
192                 #print "improvement" in record.data['stage']
193                 #print self.getSendEmailFlag(record)
194                 print "%s %s DAYS DOWN" % ( self.hostname, Record.getDaysDown(record.data) )
195                 if ( self.getSendEmailFlag(record) and Record.getDaysDown(record.data) >= 2 ) or \
196                         "monitor-end-record" in record.data['stage']:
197                         print "action: getting message"
198                         message = record.getMessage(record.data['ticket_id'])
199                         if message:
200                                 print "action: sending email"
201                                 message.send(record.getContacts())
202                                 #print "DEBUG NOT SENDING MESSAGE WHEN I SHOULD BE!!!!!"
203                                 #print "DEBUG NOT SENDING MESSAGE WHEN I SHOULD BE!!!!!"
204                                 #print "DEBUG NOT SENDING MESSAGE WHEN I SHOULD BE!!!!!"
205                                 #print message
206                                 if message.rt.ticket_id:
207                                         print "action: setting record ticket_id"
208                                         record.data['ticket_id'] = message.rt.ticket_id
209
210                         if ( record.data['takeaction'] and diag.getFlag('Squeeze') ): 
211                                 print "action: taking action"
212                                 record.takeAction(record.data['action-level'])
213                                 diag.resetFlag('Squeeze')
214                                 diag.save()
215                         if diag.getFlag('BackOff'):
216                                 record.takeAction(0)
217                                 diag.resetFlag('BackOff')
218                                 diag.save()
219
220                         if record.saveAction():
221                                 print "action: saving act_all db"
222                                 self.add_and_save_act_all(record)
223                         else:
224                                 print "action: NOT saving act_all db"
225                                 print "stage: %s %s" % ( record.data['stage'], record.data['save-act-all'] )
226
227                         if record.improved() or diag.getFlag('RTEndRecord'):
228                                 print "action: end record for %s" % self.hostname
229                                 record.end_record()
230                                 diag.setFlag('CloseRT')
231                                 diag.resetFlag('RTEndRecord')
232                                 diag.save()
233                                 #return None
234
235                         if message:
236                                 if diag.getFlag('CloseRT'):
237                                         message.rt.closeTicket()
238                                         diag.resetFlag('CloseRT')
239                                         diag.save()
240
241                 else:
242                         print "NOT sending email : %s" % config.mail
243
244                 return
245
246         def getSendEmailFlag(self, record):
247                 if not config.mail:
248                         return False
249
250                 # resend if open & created longer than 30 days ago.
251                 if  'rt' in record.data and \
252                         'Status' in record.data['rt'] and \
253                         "open" in record.data['rt']['Status'] and \
254                         record.data['rt']['Created'] > int(time.time() - 60*60*24*30):
255                         # if created-time is greater than the thirty days ago from the current time
256                         return False
257
258                 return True
259
260         def add_and_save_act_all(self, record):
261                 self.act_all = database.dbLoad("act_all")
262                 if self.hostname not in self.act_all:
263                         self.act_all[self.hostname] = []
264                 self.act_all[self.hostname].insert(0,record.data)
265                 database.dbDump("act_all", self.act_all)
266                 
267         def getDownLog(self, record):
268
269                 record.data['args'] = {'nodename': self.hostname}
270                 record.data['info'] = (self.hostname, Record.getStrDaysDown(record.data), "")
271
272                 #for key in record.data.keys():
273                 #       print "%10s %s %s " % (key, "==", record.data[key])
274
275                 if record.data['ticket_id'] == "" and 'found_rt_ticket' in record.data:
276                         log = "DOWN: %20s : %-40s == %20s %s" % \
277                                 (self.loginbase, self.hostname, record.data['info'][1:], record.data['found_rt_ticket'])
278                 else:
279                         log = "DOWN: %20s : %-40s == %20s %s" % \
280                                 (self.loginbase, self.hostname, record.data['info'][1:], record.data['ticket_id'])
281                 return log
282
283         def getThankyouLog(self, record):
284
285                 record.data['args'] = {'nodename': self.hostname}
286                 record.data['info'] = (self.hostname, record.data['prev_category'], record.data['category'])
287
288                 try:
289                         if record.data['ticket_id'] == "" and 'found_rt_ticket' in record.data:
290                                 log = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
291                                                 (self.loginbase, self.hostname, record.data['stage'], 
292                                                  record.data['prev_category'], record.data['category'], record.data['found_rt_ticket'])
293                         else:
294                                 log = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
295                                                 (self.loginbase, self.hostname, record.data['stage'], 
296                                                  record.data['prev_category'], record.data['category'], record.data['ticket_id'])
297                 except:
298                         log = "IMPR: %s improved to %s " % (self.hostname, record.data['category'])
299                 return log
300
301         def checkStageAndTime(self, diag, record):
302                 current_time = time.time()
303                 delta = current_time - record.data['time']
304                 #print record.data
305                 if   'findbad' in record.data['stage']:
306                         # The node is bad, and there's no previous record of it.
307                         record.data['email'] = TECH
308                         record.data['action'] = ['noop']
309                         record.data['takeaction'] = False
310                         record.data['message'] = record.data['message_series'][0]
311                         record.data['stage'] = 'stage_actinoneweek'
312                         record.data['save-act-all'] = True
313                         record.data['action-level'] = 0
314
315                 elif 'reboot_node' in record.data['stage']:
316                         record.data['email'] = TECH
317                         record.data['action'] = ['noop']
318                         record.data['message'] = record.data['message_series'][0]
319                         record.data['stage'] = 'stage_actinoneweek'
320                         record.data['takeaction'] = False
321                         record.data['save-act-all'] = False
322                         record.data['action-level'] = 0
323                         
324                 elif 'improvement' in record.data['stage']:
325                         print "checkStageAndTime: backing off of %s" % self.hostname
326                         record.data['action'] = ['close_rt']
327                         record.data['takeaction'] = True
328                         record.data['message'] = record.data['message_series'][0]
329                         record.data['stage'] = 'monitor-end-record'
330                         record.data['save-act-all'] = True
331                         record.data['action-level'] = 0
332
333                 elif 'actinoneweek' in record.data['stage']:
334                         if delta >= 7 * SPERDAY: 
335                                 print "checkStageAndTime: transition to next stage actintwoweeks"
336                                 record.data['email'] = TECH | PI
337                                 record.data['stage'] = 'stage_actintwoweeks'
338                                 record.data['message'] = record.data['message_series'][1]
339                                 record.data['action'] = ['nocreate' ]
340                                 record.data['time'] = current_time              # reset clock for waitforever
341                                 record.data['takeaction'] = True
342                                 record.data['save-act-all'] = True
343                                 record.data['action-level'] = 1
344                         elif delta >= 3* SPERDAY and not 'second-mail-at-oneweek' in record.data:
345                                 print "checkStageAndTime: second message in one week"
346                                 record.data['email'] = TECH 
347                                 record.data['message'] = record.data['message_series'][0]
348                                 record.data['action'] = ['sendmailagain-waitforoneweekaction' ]
349                                 record.data['second-mail-at-oneweek'] = True
350                                 record.data['takeaction'] = False
351                                 record.data['save-act-all'] = True
352                                 record.data['action-level'] = 0
353                         else:
354                                 record.data['message'] = None
355                                 record.data['action'] = ['waitforoneweekaction' ]
356                                 record.data['takeaction'] = False
357                                 record.data['save-act-all'] = False
358                                 record.data['action-level'] = 0
359                                 print "checkStageAndTime: ignoring this record for: %s" % self.hostname
360                                 #return None                    # don't send if there's no action
361
362                 elif 'actintwoweeks' in record.data['stage']:
363                         if delta >= 7 * SPERDAY:
364                                 print "checkStageAndTime: transition to next stage waitforever"
365                                 record.data['email'] = TECH | PI | USER
366                                 record.data['stage'] = 'stage_waitforever'
367                                 record.data['message'] = record.data['message_series'][2]
368                                 record.data['action'] = ['suspendslices']
369                                 record.data['time'] = current_time              # reset clock for waitforever
370                                 record.data['takeaction'] = True
371                                 record.data['save-act-all'] = True
372                                 record.data['action-level'] = 2
373                         elif delta >= 3* SPERDAY and not 'second-mail-at-twoweeks' in record.data:
374                                 print "checkStageAndTime: second message in one week for stage two"
375                                 record.data['email'] = TECH | PI
376                                 record.data['message'] = record.data['message_series'][1]
377                                 record.data['action'] = ['sendmailagain-waitfortwoweeksaction' ]
378                                 record.data['second-mail-at-twoweeks'] = True
379                                 record.data['takeaction'] = False
380                                 record.data['save-act-all'] = True
381                                 record.data['action-level'] = 1
382                         else:
383                                 record.data['message'] = None
384                                 record.data['takeaction'] = False
385                                 record.data['action'] = ['waitfortwoweeksaction']
386                                 record.data['save-act-all'] = False
387                                 print "checkStageAndTime: second message in one week for stage two"
388                                 record.data['action-level'] = 1
389                                 #return None                    # don't send if there's no action
390
391                 elif 'ticket_waitforever' in record.data['stage']:
392                         record.data['email'] = TECH
393                         record.data['takeaction'] = True
394                         if 'first-found' not in record.data:
395                                 record.data['first-found'] = True
396                                 record.data['log'] += " firstfound"
397                                 record.data['action'] = ['ticket_waitforever']
398                                 record.data['message'] = None
399                                 record.data['time'] = current_time
400                                 record.data['save-act-all'] = True
401                                 record.data['action-level'] = 2
402                         else:
403                                 if delta >= 7*SPERDAY:
404                                         record.data['action'] = ['ticket_waitforever']
405                                         record.data['message'] = None
406                                         record.data['time'] = current_time              # reset clock
407                                         record.data['save-act-all'] = True
408                                         record.data['action-level'] = 2
409                                 else:
410                                         record.data['action'] = ['ticket_waitforever']
411                                         record.data['message'] = None
412                                         record.data['takeaction'] = False
413                                         record.data['save-act-all'] = False
414                                         record.data['action-level'] = 2
415                                         #return None
416
417                 elif 'waitforever' in record.data['stage']:
418                         # more than 3 days since last action
419                         # TODO: send only on weekdays.
420                         # NOTE: expects that 'time' has been reset before entering waitforever stage
421                         record.data['takeaction'] = True
422                         if delta >= 3*SPERDAY:
423                                 record.data['action'] = ['email-againwaitforever']
424                                 record.data['message'] = record.data['message_series'][2]
425                                 record.data['time'] = current_time              # reset clock
426                                 record.data['save-act-all'] = True
427                                 record.data['action-level'] = 2
428                         else:
429                                 record.data['action'] = ['waitforever']
430                                 record.data['message'] = None
431                                 record.data['takeaction'] = False
432                                 record.data['save-act-all'] = False
433                                 record.data['action-level'] = 2
434                                 #return None                    # don't send if there's no action
435
436                 else:
437                         # There is no action to be taken, possibly b/c the stage has
438                         # already been performed, but diagnose picked it up again.
439                         # two cases, 
440                         #       1. stage is unknown, or 
441                         #       2. delta is not big enough to bump it to the next stage.
442                         # TODO: figure out which. for now assume 2.
443                         print "UNKNOWN stage for %s; nothing done" % self.hostname
444                         record.data['action'] = ['unknown']
445                         record.data['message'] = record.data['message_series'][0]
446
447                         record.data['email'] = TECH
448                         record.data['action'] = ['noop']
449                         record.data['message'] = record.data['message_series'][0]
450                         record.data['stage'] = 'stage_actinoneweek'
451                         record.data['time'] = current_time              # reset clock
452                         record.data['takeaction'] = False
453                         record.data['save-act-all'] = True
454
455                 print "%s" % record.data['log'],
456                 print "%15s" % record.data['action']
457                 return record
458