adds checks for readonly fs, dns errors, resets message timer if
[monitor.git] / clean_policy.py
1 import config
2 import database 
3 import time
4 import mailer
5 from unified_model import cmpCategoryVal
6 import sys
7 import emailTxt
8 import string
9
10 from rt import is_host_in_rt_tickets
11 import plc
12
# Delay before policy is enforced again (7200; presumably seconds, i.e.
# two hours -- TODO confirm the unit against the code that sleeps on it).
POLSLEEP = 2 * 60 * 60

# Address that receives the summary email.
SUMTO = "soltesz@cs.princeton.edu"
18
19 from const import *
20
21 from unified_model import *
22
def get_ticket_id(record):
    """Return the RT ticket id stored in *record*, or None if there is none.

    The 'ticket_id' key is preferred; 'found_rt_ticket' is the fallback.
    A key that is missing, None, or the empty string counts as "no ticket".

    The emptiness test uses equality rather than identity: `x is not ""`
    only works when *x* happens to be the very same string object as the
    literal, which is not guaranteed (e.g. a unicode empty string).
    """
    for key in ('ticket_id', 'found_rt_ticket'):
        if key in record and record[key] not in ("", None):
            return record[key]
    return None
32
class MonitorMergeDiagnoseSendEscellate:
    """Per-host pipeline: merge records, diagnose, then notify / act.

    `run()` loads the host's scan record from the "findbad" database and
    its newest entry from the "act_all" database, merges them into one
    action record, wraps that in a `Record`, and calls `diagnose()`.
    When the instance was created with a true `act` and a diagnosis was
    produced, `action()` is called to send mail, take action and persist
    the result.
    """

    # Class-level caches of the "act_all" and "findbad" databases.  They
    # are loaded on first use and shared by every instance.
    act_all = None
    fb = None

    def __init__(self, hostname, act):
        """Remember the host and look up its loginbase.

        hostname -- key into the "plcdb_hn2lb" and scan databases.
        act      -- when false, run() only diagnoses and never acts.

        Raises KeyError if *hostname* is absent from "plcdb_hn2lb".
        """
        self.hostname = hostname
        self.act = act
        self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")
        self.loginbase = self.plcdb_hn2lb[self.hostname]

    def getFBRecord(self):
        """Return the 'values' dict of this host from the "findbad" db.

        Raises Exception if the host is not present in fb['nodes'].
        """
        if MonitorMergeDiagnoseSendEscellate.fb is None:
            MonitorMergeDiagnoseSendEscellate.fb = database.dbLoad("findbad")

        nodes = MonitorMergeDiagnoseSendEscellate.fb['nodes']
        if self.hostname not in nodes:
            raise Exception("Hostname %s not in scan database"% self.hostname)
        return nodes[self.hostname]['values']

    def getActionRecord(self):
        """Return the first "act_all" entry for this host, or None.

        add_and_save_act_all() inserts new records at index 0, so the
        first entry is the most recently saved one.
        """
        if MonitorMergeDiagnoseSendEscellate.act_all is None:
            MonitorMergeDiagnoseSendEscellate.act_all = database.dbLoad("act_all")

        history = MonitorMergeDiagnoseSendEscellate.act_all
        if self.hostname in history and len(history[self.hostname]) > 0:
            return history[self.hostname][0]
        return None

    def getKernel(self, unamestr):
        """Return the third whitespace-separated field of *unamestr*.

        Returns "" when the string has fewer than three fields.
        """
        fields = unamestr.split()
        if len(fields) > 2:
            return fields[2]
        return ""

    def mergeRecord(self, fbnode, actnode):
        """Combine the scan record with the previous action record.

        *fbnode* is modified in place: its 'kernel' entry is reduced to
        the kernel field and default bookkeeping keys are filled in.

        When *actnode* is None a new record is built from *fbnode* with an
        empty 'ticket_id' and 'prev_category' set to "ERROR".  Otherwise
        *actnode* is updated in place with the fresh scan values, its old
        category is kept as 'prev_category', and the ticket status is
        fetched into 'rt'.

        Returns the merged action record.
        """
        fbnode['kernel'] = self.getKernel(fbnode['kernel'])
        fbnode['stage'] = "findbad"
        fbnode['message'] = None
        fbnode['args'] = None
        fbnode['info'] = None
        fbnode['log'] = None
        fbnode['time'] = time.time()
        fbnode['email'] = TECH
        fbnode['action-level'] = 0
        fbnode['action'] = ['noop']
        fbnode['date_created'] = time.time()

        if actnode is None:
            # There is no entry in act_all for this host yet.
            actnode = {}
            actnode.update(fbnode)
            actnode['ticket_id'] = ""
            actnode['prev_category'] = "ERROR"
            return actnode

        actnode['prev_category'] = actnode['category']
        for key in ('comonstats', 'category', 'state',
                    'kernel', 'bootcd', 'plcnode'):
            actnode[key] = fbnode[key]

        ticket = get_ticket_id(actnode)
        if ticket is None:
            actnode['ticket_id'] = ""
        # NOTE(review): getTicketStatus is also called when ticket is None;
        # presumably it tolerates that -- confirm against mailer.
        actnode['rt'] = mailer.getTicketStatus(ticket)
        return actnode

    def run(self):
        """Merge, diagnose and (if enabled) act on this host."""
        fbnode = self.getFBRecord()
        actnode = self.getActionRecord()
        merged = self.mergeRecord(fbnode, actnode)
        record = Record(self.hostname, merged)
        diag = self.diagnose(record)
        if self.act and diag is not None:
            self.action(record, diag)

    def diagnose(self, record):
        """Set diagnosis flags for *record* and advance its stage.

        Returns the saved PersistFlags object, or None when the node is in
        state "boot" with severity 0 and nothing further has to be done.
        """
        diag = PersistFlags(record.hostname, 60*60*24, db='persist_diagnose_flags')

        # NOTE: change record stage based on RT status.
        if record.stageIswaitforever():
            ticket = record.data['rt']
            if 'new' in ticket['Status']:
                print("Resetting Stage!!!!!")
                record.reset_stage()

            if 'resolved' in ticket['Status']:
                diag.setFlag('RTEndRecord')

        # NOTE: take category, and prepare action
        category = record.getCategory()
        if category == "error":
            diag.setFlag('SendNodedown')
            record.data['message_series'] = emailTxt.mailtxt.newdown
            record.data['log'] = self.getDownLog(record)

        elif category == "prod" or category == "alpha":
            state = record.getState()
            if state == "boot":
                if record.severity() == 0:
                    # NOTE: do nothing, since we've already done the above.
                    print("DIAGNOSED: %s is boot. no further action necessary." % record.hostname)
                    return None
                diag.setFlag('SendThankyou')
                print("RESETTING STAGE: improvement")
                record.data['stage'] = 'improvement'
                record.data['message_series'] = emailTxt.mailtxt.newthankyou
                record.data['log'] = self.getThankyouLog(record)
            elif state == "debug":
                pass
            else:
                print("unknown state %s for host %s" % (state, self.hostname))
        else:
            print("unknown category: %s" % category)

        # NOTE(review): 'message_series' is only assigned in two of the
        # branches above; checkStageAndTime indexes it in most stages, so
        # the other paths rely on the stored record already carrying it.
        # TODO: how to not send email?...
        record = self.checkStageAndTime(diag, record)
        print("diagnose: checkStageAndTime Returned Valid Record")
        site = PersistFlags(self.loginbase, 1, db='site_persistflags')

        if "good" not in site.status:
            print("diagnose: Setting site %s for 'squeeze'" % self.loginbase)
            diag.setFlag('Squeeze')
        else:
            print("diagnose: Setting site %s for 'backoff'" % self.loginbase)
            diag.setFlag('BackOff')

        diag.save()
        return diag

    def action(self, record, diag):
        """Send the pending message and apply the diagnosed actions.

        Work is done only when mail may be sent (see getSendEmailFlag) and
        the node has been down at least two days, or when the record is in
        the 'monitor-end-record' stage.  Otherwise a notice is printed.
        """
        message = None

        days_down = Record.getDaysDown(record.data)
        print("%s %s DAYS DOWN" % (self.hostname, days_down))

        notify = self.getSendEmailFlag(record) and days_down >= 2
        if not (notify or "monitor-end-record" in record.data['stage']):
            # A brand-new record has no 'rt' entry (mergeRecord only sets
            # it for existing records), so do not index it blindly here.
            rt_status = record.data['rt'] if 'rt' in record.data else None
            print("NOT sending email : %s %s" % (config.mail, rt_status))
            return

        print("action: getting message")
        message = record.getMessage(record.data['ticket_id'])
        if message:
            print("action: sending email")
            message.send(record.getContacts())
            if message.rt.ticket_id:
                print("action: setting record ticket_id")
                record.data['ticket_id'] = message.rt.ticket_id

        if record.data['takeaction'] and diag.getFlag('Squeeze'):
            print("action: taking action")
            record.takeAction(record.data['action-level'])
            diag.resetFlag('Squeeze')
            diag.save()
        if diag.getFlag('BackOff'):
            record.takeAction(0)
            diag.resetFlag('BackOff')
            diag.save()

        if record.saveAction():
            print("action: saving act_all db")
            self.add_and_save_act_all(record)
        else:
            print("action: NOT saving act_all db")
            print("stage: %s %s" % (record.data['stage'], record.data['save-act-all']))

        if record.improved() or diag.getFlag('RTEndRecord'):
            print("action: end record for %s" % self.hostname)
            record.end_record()
            diag.setFlag('CloseRT')
            diag.resetFlag('RTEndRecord')
            diag.save()

        # The ticket can only be closed through a message object.
        if message and diag.getFlag('CloseRT'):
            message.rt.closeTicket()
            diag.resetFlag('CloseRT')
            diag.save()

    def getSendEmailFlag(self, record):
        """Return True when a message may be sent for *record*.

        Returns False when config.mail is false, or when the record has an
        RT ticket whose status contains "open" and whose 'Created' time is
        less than thirty days old; i.e. mail for an open ticket is only
        re-sent once the ticket is older than thirty days.
        """
        if not config.mail:
            return False

        data = record.data
        thirty_days_ago = int(time.time() - 60*60*24*30)
        if 'rt' in data and \
                'Status' in data['rt'] and \
                "open" in data['rt']['Status'] and \
                data['rt']['Created'] > thirty_days_ago:
            return False

        return True

    def add_and_save_act_all(self, record):
        """Prepend *record*'s data to this host's history and dump the db.

        The "act_all" database is re-read before it is modified.
        """
        # NOTE(review): this binds an *instance* attribute; the class-level
        # cache read by getActionRecord is not refreshed here.
        self.act_all = database.dbLoad("act_all")
        if self.hostname not in self.act_all:
            self.act_all[self.hostname] = []
        self.act_all[self.hostname].insert(0, record.data)
        database.dbDump("act_all", self.act_all)

    def getDownLog(self, record):
        """Fill in 'args' and 'info' on *record* and return a DOWN log line.

        The line ends with 'found_rt_ticket' when 'ticket_id' is empty and
        a found ticket exists, otherwise with 'ticket_id'.
        """
        data = record.data
        data['args'] = {'nodename': self.hostname}
        data['info'] = (self.hostname, Record.getStrDaysDown(data), "")

        if data['ticket_id'] == "" and 'found_rt_ticket' in data:
            ticket = data['found_rt_ticket']
        else:
            ticket = data['ticket_id']
        return "DOWN: %20s : %-40s == %20s %s" % \
            (self.loginbase, self.hostname, data['info'][1:], ticket)

    def getThankyouLog(self, record):
        """Fill in 'args' and 'info' on *record* and return an IMPR log line.

        If the detailed line cannot be built (for instance a key is missing
        from the record), a short fallback line is returned instead.
        """
        data = record.data
        data['args'] = {'nodename': self.hostname}
        data['info'] = (self.hostname, data['prev_category'], data['category'])

        try:
            if data['ticket_id'] == "" and 'found_rt_ticket' in data:
                ticket = data['found_rt_ticket']
            else:
                ticket = data['ticket_id']
            log = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
                (self.loginbase, self.hostname, data['stage'],
                 data['prev_category'], data['category'], ticket)
        except Exception:
            # Best effort: logging must not abort the diagnosis.
            log = "IMPR: %s improved to %s " % (self.hostname, data['category'])
        return log

    def checkStageAndTime(self, diag, record):
        """Advance *record* through the notification stages.

        Based on record.data['stage'] and the seconds elapsed since
        record.data['time'], this sets 'email', 'action', 'message',
        'stage', 'takeaction', 'save-act-all' and 'action-level' (not every
        branch sets every key).  The branch order matters because stages
        are matched by substring: 'ticket_waitforever' must be tested
        before 'waitforever'.

        *diag* is accepted but not used here.  Returns *record*.
        """
        current_time = time.time()
        data = record.data
        delta = current_time - data['time']
        stage = data['stage']

        if 'findbad' in stage:
            # The node is bad, and there's no previous record of it.
            data['email'] = TECH
            data['action'] = ['noop']
            data['takeaction'] = False
            data['message'] = data['message_series'][0]
            data['stage'] = 'stage_actinoneweek'
            data['save-act-all'] = True
            data['action-level'] = 0

        elif 'reboot_node' in stage:
            data['email'] = TECH
            data['action'] = ['noop']
            data['message'] = data['message_series'][0]
            data['stage'] = 'stage_actinoneweek'
            data['takeaction'] = False
            data['save-act-all'] = False
            data['action-level'] = 0

        elif 'improvement' in stage:
            print("checkStageAndTime: backing off of %s" % self.hostname)
            data['action'] = ['close_rt']
            data['takeaction'] = True
            data['message'] = data['message_series'][0]
            data['stage'] = 'monitor-end-record'
            data['save-act-all'] = True
            data['action-level'] = 0

        elif 'actinoneweek' in stage:
            if delta >= 7 * SPERDAY:
                print("checkStageAndTime: transition to next stage actintwoweeks")
                data['email'] = TECH | PI
                data['stage'] = 'stage_actintwoweeks'
                data['message'] = data['message_series'][1]
                data['action'] = ['nocreate' ]
                data['time'] = current_time     # reset clock for the next stage
                data['takeaction'] = True
                data['save-act-all'] = True
                data['action-level'] = 1
            elif delta >= 3 * SPERDAY and 'second-mail-at-oneweek' not in data:
                print("checkStageAndTime: second message in one week")
                data['email'] = TECH
                data['message'] = data['message_series'][0]
                data['action'] = ['sendmailagain-waitforoneweekaction' ]
                data['second-mail-at-oneweek'] = True
                data['takeaction'] = False
                data['save-act-all'] = True
                data['action-level'] = 0
            else:
                # Nothing to send yet; keep waiting in this stage.
                data['message'] = None
                data['action'] = ['waitforoneweekaction' ]
                data['takeaction'] = False
                data['save-act-all'] = False
                data['action-level'] = 0
                print("checkStageAndTime: ignoring this record for: %s" % self.hostname)

        elif 'actintwoweeks' in stage:
            if delta >= 7 * SPERDAY:
                print("checkStageAndTime: transition to next stage waitforever")
                data['email'] = TECH | PI | USER
                data['stage'] = 'stage_waitforever'
                data['message'] = data['message_series'][2]
                data['action'] = ['suspendslices']
                data['time'] = current_time     # reset clock for waitforever
                data['takeaction'] = True
                data['save-act-all'] = True
                data['action-level'] = 2
            elif delta >= 3 * SPERDAY and 'second-mail-at-twoweeks' not in data:
                print("checkStageAndTime: second message in one week for stage two")
                data['email'] = TECH | PI
                data['message'] = data['message_series'][1]
                data['action'] = ['sendmailagain-waitfortwoweeksaction' ]
                data['second-mail-at-twoweeks'] = True
                data['takeaction'] = False
                data['save-act-all'] = True
                data['action-level'] = 1
            else:
                # Nothing to send yet; keep waiting in this stage.  (This
                # branch used to print the "second message" text although
                # no message is sent here.)
                data['message'] = None
                data['takeaction'] = False
                data['action'] = ['waitfortwoweeksaction']
                data['save-act-all'] = False
                print("checkStageAndTime: ignoring this record for: %s" % self.hostname)
                data['action-level'] = 1

        elif 'ticket_waitforever' in stage:
            data['email'] = TECH
            data['takeaction'] = True
            if 'first-found' not in data:
                data['first-found'] = True
                data['log'] += " firstfound"
                data['action'] = ['ticket_waitforever']
                data['message'] = None
                data['time'] = current_time
                data['save-act-all'] = True
                data['action-level'] = 2
            elif delta >= 7 * SPERDAY:
                data['action'] = ['ticket_waitforever']
                data['message'] = None
                data['time'] = current_time     # reset clock
                data['save-act-all'] = True
                data['action-level'] = 2
            else:
                data['action'] = ['ticket_waitforever']
                data['message'] = None
                data['takeaction'] = False
                data['save-act-all'] = False
                data['action-level'] = 2

        elif 'waitforever' in stage:
            # more than 3 days since last action
            # TODO: send only on weekdays.
            # NOTE: expects that 'time' has been reset before entering waitforever stage
            data['takeaction'] = True
            if delta >= 3 * SPERDAY:
                data['action'] = ['email-againwaitforever']
                data['message'] = data['message_series'][2]
                data['time'] = current_time     # reset clock
                data['save-act-all'] = True
                data['action-level'] = 2
            else:
                data['action'] = ['waitforever']
                data['message'] = None
                data['takeaction'] = False
                data['save-act-all'] = False
                data['action-level'] = 2

        else:
            # There is no action to be taken, possibly b/c the stage has
            # already been performed, but diagnose picked it up again.
            # two cases,
            #       1. stage is unknown, or
            #       2. delta is not big enough to bump it to the next stage.
            # TODO: figure out which. for now assume 2.
            print("UNKNOWN stage for %s; nothing done" % self.hostname)
            data['message'] = data['message_series'][0]
            data['email'] = TECH
            data['action'] = ['noop']
            data['stage'] = 'stage_actinoneweek'
            data['time'] = current_time         # reset clock
            data['takeaction'] = False
            data['save-act-all'] = True

        # One line: the log text, a space, then the right-aligned action.
        print("%s %15s" % (data['log'], data['action']))
        return record
459