changes for 3.0
[monitor.git] / clean_policy.py
1 import config
2 import database 
3 import time
4 import mailer
5 import sys
6 import emailTxt
7 import string
8
9 from rt import is_host_in_rt_tickets
10 import plc
11
# Time to enforce policy
# NOTE(review): presumably an interval in seconds (7200 s = 2 h); it is not
# referenced in the visible part of this file -- confirm against callers.
POLSLEEP = 7200

# Where to email the summary
# NOTE(review): hard-coded recipient; not referenced in the visible part of
# this file.
SUMTO = "soltesz@cs.princeton.edu"
17
18 from const import *
19
20 from unified_model import *
21
def get_ticket_id(record):
    """Return the ticket id stored in *record*, or None when there is none.

    'ticket_id' takes precedence and 'found_rt_ticket' is the fallback.  A
    key that is missing, or whose value is None or the empty string, is
    skipped.

    record -- mapping that may contain 'ticket_id' and/or 'found_rt_ticket'.
    """
    for key in ('ticket_id', 'found_rt_ticket'):
        if key not in record:
            continue
        value = record[key]
        # Compare with the empty string by value.  The original identity
        # test (`is not ""`) only worked when the value happened to be the
        # interned "" object; e.g. a Python 2 u"" slipped through and was
        # returned as a ticket id.
        if value is not None and value != "":
            return value
    return None
31
class MonitorMergeDiagnoseSendEscellate:
    """Per-host policy pipeline: merge, diagnose, send mail, escalate.

    For one hostname, run() merges the latest "findbad" scan values with the
    newest "act_all" record, diagnose() picks the message series and advances
    the escalation stage, and action() sends mail, applies the flagged
    actions and stores the updated record.

    Every print uses the single-argument call form, which produces the same
    output as the Python 2 print statement and also parses on Python 3.
    """

    # Class-level caches of the "act_all" and "findbad" databases.  They are
    # filled on first use and shared by every instance in the process.
    act_all = None
    fb = None

    def __init__(self, hostname, act):
        """Remember the host and look up its loginbase.

        hostname -- node to process; it is used as a key into the
                    "plcdb_hn2lb" table, so an unknown host fails here
                    (KeyError, assuming dbLoad returns a dict -- TODO confirm).
        act      -- when false, run() only diagnoses and never calls action().
        """
        self.hostname = hostname
        self.act = act
        # The original set this attribute to None and immediately tested it
        # for None, so the table was loaded for every instance; that
        # behaviour is kept, only the dead test is gone.
        self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")
        self.loginbase = self.plcdb_hn2lb[self.hostname]

    def getFBRecord(self):
        """Return the 'values' entry of this host from the findbad database.

        The database is loaded once and cached on the class.

        Raises Exception when the host has no entry under 'nodes'.
        """
        cls = MonitorMergeDiagnoseSendEscellate
        if cls.fb is None:
            cls.fb = database.dbLoad("findbad")

        nodes = cls.fb['nodes']
        if self.hostname not in nodes:
            raise Exception("Hostname %s not in scan database"% self.hostname)
        return nodes[self.hostname]['values']

    def getActionRecord(self):
        """Return the newest act_all record for this host, or None.

        The database is loaded once and cached on the class.  Records are
        kept newest-first (add_and_save_act_all inserts at index 0), so the
        first list element is returned.  A missing host or an empty list
        gives None.
        """
        cls = MonitorMergeDiagnoseSendEscellate
        if cls.act_all is None:
            cls.act_all = database.dbLoad("act_all")

        history = cls.act_all
        if self.hostname in history and len(history[self.hostname]) > 0:
            return history[self.hostname][0]
        return None

    def getKernel(self, unamestr):
        """Return the third whitespace-separated field of *unamestr*.

        Returns "" when the string has fewer than three fields.
        """
        fields = unamestr.split()
        if len(fields) > 2:
            return fields[2]
        return ""

    def mergeRecord(self, fbnode, actnode):
        """Combine the scan values with the previous action record.

        fbnode  -- 'values' dict of this host from the findbad database.
        actnode -- newest act_all record of this host, or None.

        When actnode is None a new dict is returned, seeded from the scan
        values plus default bookkeeping fields.  Otherwise actnode itself is
        updated in place (previous category/state saved, scan fields
        refreshed, 'rt' set from mailer.getTicketStatus) and returned.
        """
        # Work on a shallow copy.  fbnode is the object stored in the
        # class-level findbad cache; rewriting its 'kernel' in place meant a
        # second merge for the same host re-parsed the already extracted
        # kernel string and produced "".
        fbnode = dict(fbnode)
        fbnode['kernel'] = self.getKernel(fbnode['kernel'])
        fbnode['stage'] = "findbad"
        fbnode['message'] = None
        fbnode['args'] = None
        fbnode['info'] = None
        fbnode['log'] = None
        fbnode['time'] = time.time()
        fbnode['email'] = TECH
        fbnode['action-level'] = 0
        fbnode['action'] = ['noop']
        fbnode['date_created'] = time.time()

        if actnode is None:
            # There is no entry in act_all: start a fresh record from the
            # (already private) copy of the scan values.
            actnode = fbnode
            actnode['ticket_id'] = ""
            actnode['prev_category'] = "ERROR"
            actnode['prev_state'] = "DOWN"
            return actnode

        # Save the old classification before it is overwritten below.
        actnode['prev_category'] = actnode['category']
        actnode['prev_state'] = actnode['state']
        for key in ('comonstats', 'category', 'state',
                    'kernel', 'bootcd', 'plcnode'):
            actnode[key] = fbnode[key]

        ticket = get_ticket_id(actnode)
        if ticket is None:
            actnode['ticket_id'] = ""
        # ticket may be None here; it is passed on unchanged, as before.
        actnode['rt'] = mailer.getTicketStatus(ticket)
        return actnode

    def run(self):
        """Merge, diagnose and (when self.act is true) act on this host."""
        fbnode = self.getFBRecord()
        actnode = self.getActionRecord()
        actrec = self.mergeRecord(fbnode, actnode)
        record = Record(self.hostname, actrec)
        diag = self.diagnose(record)
        if self.act and diag is not None:
            self.action(record, diag)

    def diagnose(self, record):
        """Choose the message series, advance the stage and set flags.

        Returns the PersistFlags object holding the diagnosis flags, or None
        when a "prod"/"alpha" host is in state "boot" with severity 0, in
        which case nothing further is done.
        """
        diag = PersistFlags(record.hostname, 60*60*24, db='persist_diagnose_flags')

        # NOTE: change record stage based on RT status.
        if record.stageIswaitforever():
            # Guard the lookup the same way getSendEmailFlag does, instead of
            # failing with KeyError when the ticket status is missing.
            status = ""
            if 'rt' in record.data and 'Status' in record.data['rt']:
                status = record.data['rt']['Status']
            if 'new' in status:
                print("Resetting Stage!!!!!")
                record.reset_stage()
            if 'resolved' in status:
                diag.setFlag('RTEndRecord')

        # NOTE: try to give a default value to catch the errors for
        # planetlab1.ias.csusb.edu which seems to have an out-of-date node config
        record.data['message_series'] = emailTxt.mailtxt.newdown

        # NOTE: take category, and prepare action
        category = record.getCategory()
        if category == "error":
            # message_series already holds the "newdown" series set above.
            diag.setFlag('SendNodedown')
            record.data['log'] = self.getDownLog(record)

        elif category == "prod" or category == "alpha":
            state = record.getState()
            if state == "boot":
                if record.severity() == 0:
                    # NOTE: do nothing, since we've already done the above.
                    print("DIAGNOSED: %s is boot. no further action necessary." % record.hostname)
                    return None
                diag.setFlag('SendThankyou')
                print("RESETTING STAGE: improvement")
                record.data['stage'] = 'improvement'
                record.data['message_series'] = emailTxt.mailtxt.newthankyou
                record.data['log'] = self.getThankyouLog(record)
            elif state == "debug":
                pass
            else:
                print("unknown state %s for host %s" % (state, self.hostname))
        else:
            print("unknown category: %s" % category)

        # TODO: how to not send email?...
        record = self.checkStageAndTime(diag, record)
        print("diagnose: checkStageAndTime Returned Valid Record")
        site = PersistFlags(self.loginbase, 1, db='site_persistflags')

        if "good" not in site.status:
            print("diagnose: Setting site %s for 'squeeze'" % self.loginbase)
            diag.setFlag('Squeeze')
        else:
            print("diagnose: Setting site %s for 'backoff'" % self.loginbase)
            diag.setFlag('BackOff')

        diag.save()
        return diag

    def action(self, record, diag):
        """Send mail and carry out the actions flagged by diagnose().

        Work is done only when mail may be sent (getSendEmailFlag) and the
        host has been down for at least 2 days, or when the record's stage
        contains "monitor-end-record".  Otherwise only a notice is printed.
        """
        message = None

        days_down = Record.getDaysDown(record.data)
        print("%s %s DAYS DOWN" % (self.hostname, days_down))

        if (self.getSendEmailFlag(record) and days_down >= 2) or \
                "monitor-end-record" in record.data['stage']:
            print("action: getting message")
            message = record.getMessage(record.data['ticket_id'])
            if message:
                print("action: sending email")
                message.send(record.getContacts())
                if message.rt.ticket_id:
                    print("action: setting record ticket_id")
                    record.data['ticket_id'] = message.rt.ticket_id

            if record.data['takeaction'] and diag.getFlag('Squeeze'):
                print("action: taking squeeze action")
                record.takeAction(record.data['action-level'])
                diag.resetFlag('Squeeze')
                diag.save()
            if diag.getFlag('BackOff'):
                print("action: taking backoff action")
                record.takeAction(0)
                diag.resetFlag('BackOff')
                diag.save()

            if record.saveAction():
                print("action: saving act_all db")
                self.add_and_save_act_all(record)
            else:
                print("action: NOT saving act_all db")
                print("stage: %s %s" % (record.data['stage'], record.data['save-act-all']))

            if record.improved() or diag.getFlag('RTEndRecord'):
                print("action: end record for %s" % self.hostname)
                record.end_record()
                diag.setFlag('CloseRT')
                diag.resetFlag('RTEndRecord')
                diag.save()

            # The ticket can only be closed through a message object; without
            # one the 'CloseRT' flag stays set.
            if message and diag.getFlag('CloseRT'):
                message.rt.closeTicket()
                diag.resetFlag('CloseRT')
                diag.save()
        else:
            print("NOT sending email : %s" % config.mail)

    def getSendEmailFlag(self, record):
        """Return True when mail may be sent for *record*.

        False when config.mail is false, or when the ticket status contains
        "open" and its 'Created' value is newer than 30 days ago; i.e. an
        open ticket is mailed again only once it is older than 30 days.
        """
        if not config.mail:
            return False

        thirty_days_ago = int(time.time() - 60*60*24*30)
        if 'rt' in record.data and \
                'Status' in record.data['rt'] and \
                "open" in record.data['rt']['Status'] and \
                record.data['rt']['Created'] > thirty_days_ago:
            return False

        return True

    def add_and_save_act_all(self, record):
        """Insert record.data as the newest act_all entry and write it out.

        The database is reloaded before the insert.  The reloaded copy is
        kept on the instance; the class-level cache read by getActionRecord
        is not replaced.
        """
        self.act_all = database.dbLoad("act_all")
        if self.hostname not in self.act_all:
            self.act_all[self.hostname] = []
        self.act_all[self.hostname].insert(0, record.data)
        database.dbDump("act_all", self.act_all)

    def getDownLog(self, record):
        """Fill record.data['args'] / ['info'] and return the DOWN log line.

        The last field of the line is 'found_rt_ticket' when 'ticket_id' is
        empty and a found ticket exists, otherwise 'ticket_id'.
        """
        record.data['args'] = {'nodename': self.hostname}
        record.data['info'] = (self.hostname, Record.getStrDaysDown(record.data), "")

        if record.data['ticket_id'] == "" and 'found_rt_ticket' in record.data:
            ticket = record.data['found_rt_ticket']
        else:
            ticket = record.data['ticket_id']

        # info[1:] is a tuple and is formatted as such, as in the original.
        return "DOWN: %20s : %-40s == %20s %s" % \
            (self.loginbase, self.hostname, record.data['info'][1:], ticket)

    def getThankyouLog(self, record):
        """Fill record.data['args'] / ['info'] and return the IMPR log line.

        Falls back to a short "improved to" line when the full line cannot
        be built (for example a missing 'ticket_id' or 'stage' key).
        """
        record.data['args'] = {'nodename': self.hostname}
        record.data['info'] = (self.hostname, record.data['prev_category'], record.data['category'])

        try:
            if record.data['ticket_id'] == "" and 'found_rt_ticket' in record.data:
                ticket = record.data['found_rt_ticket']
            else:
                ticket = record.data['ticket_id']
            log = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
                (self.loginbase, self.hostname, record.data['stage'],
                 record.data['prev_category'], record.data['category'], ticket)
        except Exception:
            # Was a bare `except:`, which also swallowed KeyboardInterrupt
            # and SystemExit.
            log = "IMPR: %s improved to %s " % (self.hostname, record.data['category'])
        return log

    def checkStageAndTime(self, diag, record):
        """Advance record.data through the escalation stages.

        The branch is chosen by substring match on record.data['stage'], so
        'ticket_waitforever' has to be tested before 'waitforever'.  delta
        is the time in seconds since record.data['time'].  Each branch sets
        the fields read later by action(): 'message', 'action',
        'takeaction', 'save-act-all' and, except in the fallback branch,
        'action-level'.

        diag is accepted for interface compatibility and is not used.
        Returns the same record object.
        """
        current_time = time.time()
        delta = current_time - record.data['time']
        stage = record.data['stage']

        if 'findbad' in stage:
            # The node is bad, and there's no previous record of it.
            record.data['email'] = TECH
            record.data['action'] = ['noop']
            record.data['takeaction'] = False
            record.data['message'] = record.data['message_series'][0]
            record.data['stage'] = 'stage_actinoneweek'
            record.data['save-act-all'] = True
            record.data['action-level'] = 0

        elif 'reboot_node' in stage:
            record.data['email'] = TECH
            record.data['action'] = ['noop']
            record.data['message'] = record.data['message_series'][0]
            record.data['stage'] = 'stage_actinoneweek'
            record.data['takeaction'] = False
            record.data['save-act-all'] = False
            record.data['action-level'] = 0

        elif 'improvement' in stage:
            print("checkStageAndTime: backing off of %s" % self.hostname)
            record.data['action'] = ['close_rt']
            record.data['takeaction'] = True
            record.data['message'] = record.data['message_series'][0]
            record.data['stage'] = 'monitor-end-record'
            record.data['save-act-all'] = True
            record.data['action-level'] = 0

        elif 'actinoneweek' in stage:
            if delta >= 7 * SPERDAY:
                print("checkStageAndTime: transition to next stage actintwoweeks")
                record.data['email'] = TECH | PI
                record.data['stage'] = 'stage_actintwoweeks'
                record.data['message'] = record.data['message_series'][1]
                record.data['action'] = ['nocreate']
                record.data['time'] = current_time      # reset clock for waitforever
                record.data['takeaction'] = True
                record.data['save-act-all'] = True
                record.data['action-level'] = 1
            elif delta >= 3 * SPERDAY and 'second-mail-at-oneweek' not in record.data:
                print("checkStageAndTime: second message in one week")
                record.data['email'] = TECH
                record.data['message'] = record.data['message_series'][0]
                record.data['action'] = ['sendmailagain-waitforoneweekaction']
                record.data['second-mail-at-oneweek'] = True
                record.data['takeaction'] = False
                record.data['save-act-all'] = True
                record.data['action-level'] = 0
            else:
                record.data['message'] = None
                record.data['action'] = ['waitforoneweekaction']
                record.data['takeaction'] = False
                record.data['save-act-all'] = False
                record.data['action-level'] = 0
                print("checkStageAndTime: ignoring this record for: %s" % self.hostname)

        elif 'actintwoweeks' in stage:
            if delta >= 7 * SPERDAY:
                print("checkStageAndTime: transition to next stage waitforever")
                record.data['email'] = TECH | PI | USER
                record.data['stage'] = 'stage_waitforever'
                record.data['message'] = record.data['message_series'][2]
                record.data['action'] = ['suspendslices']
                record.data['time'] = current_time      # reset clock for waitforever
                record.data['takeaction'] = True
                record.data['save-act-all'] = True
                record.data['action-level'] = 2
            elif delta >= 3 * SPERDAY and 'second-mail-at-twoweeks' not in record.data:
                print("checkStageAndTime: second message in one week for stage two")
                record.data['email'] = TECH | PI
                record.data['message'] = record.data['message_series'][1]
                record.data['action'] = ['sendmailagain-waitfortwoweeksaction']
                record.data['second-mail-at-twoweeks'] = True
                record.data['takeaction'] = False
                record.data['save-act-all'] = True
                record.data['action-level'] = 1
            else:
                record.data['message'] = None
                record.data['takeaction'] = False
                record.data['action'] = ['waitfortwoweeksaction']
                record.data['save-act-all'] = False
                # No mail is sent in this branch; the original printed the
                # copy-pasted "second message" text here.
                print("checkStageAndTime: ignoring this record for: %s" % self.hostname)
                record.data['action-level'] = 1

        elif 'ticket_waitforever' in stage:
            record.data['email'] = TECH
            record.data['takeaction'] = True
            if 'first-found' not in record.data:
                record.data['first-found'] = True
                # The log may still be None (mergeRecord initialises it so).
                record.data['log'] = (record.data['log'] or "") + " firstfound"
                record.data['action'] = ['ticket_waitforever']
                record.data['message'] = None
                record.data['time'] = current_time
                record.data['save-act-all'] = True
                record.data['action-level'] = 2
            elif delta >= 7 * SPERDAY:
                record.data['action'] = ['ticket_waitforever']
                record.data['message'] = None
                record.data['time'] = current_time      # reset clock
                record.data['save-act-all'] = True
                record.data['action-level'] = 2
            else:
                record.data['action'] = ['ticket_waitforever']
                record.data['message'] = None
                record.data['takeaction'] = False
                record.data['save-act-all'] = False
                record.data['action-level'] = 2

        elif 'waitforever' in stage:
            # more than 3 days since last action
            # TODO: send only on weekdays.
            # NOTE: expects that 'time' has been reset before entering waitforever stage
            if delta >= 3 * SPERDAY:
                record.data['takeaction'] = True
                record.data['action'] = ['email-againwaitforever']
                record.data['message'] = record.data['message_series'][2]
                record.data['time'] = current_time      # reset clock
                record.data['save-act-all'] = True
                record.data['action-level'] = 2
            else:
                record.data['action'] = ['waitforever']
                record.data['message'] = None
                record.data['takeaction'] = False
                record.data['save-act-all'] = False
                record.data['action-level'] = 2

        else:
            # The stage matched none of the known names.  The record is put
            # back at the start of the escalation ('stage_actinoneweek') with
            # a fresh clock.  'action-level' is left untouched, as in the
            # original; the duplicated stores to 'action' and 'message' that
            # were immediately overwritten are gone.
            print("UNKNOWN stage for %s; nothing done" % self.hostname)
            record.data['email'] = TECH
            record.data['action'] = ['noop']
            record.data['message'] = record.data['message_series'][0]
            record.data['stage'] = 'stage_actinoneweek'
            record.data['time'] = current_time          # reset clock
            record.data['takeaction'] = False
            record.data['save-act-all'] = True

        # One line "<log> <action>", the same text the original produced
        # with a trailing-comma print followed by a second print.
        print("%s %15s" % (record.data['log'], record.data['action']))
        return record
468