5 from unified_model import cmpCategoryVal
10 from rt import is_host_in_rt_tickets
13 # Time to enforce policy
16 # Where to email the summary
17 SUMTO = "soltesz@cs.princeton.edu"
21 from unified_model import *
def get_ticket_id(record):
    """Return the RT ticket id stored in *record*, or None if there is none.

    The 'ticket_id' key is preferred; 'found_rt_ticket' is the fallback.
    A key counts as unset when it is missing, None, or the empty string.

    The emptiness check uses ``!=`` rather than ``is not``: identity
    comparison against a string literal depends on interpreter interning and
    is not a reliable way to test for an empty string.
    """
    for key in ('ticket_id', 'found_rt_ticket'):
        if key in record:
            value = record[key]
            if value is not None and value != "":
                return value
    # Neither key holds a usable id; the caller tests for None.
    return None
# Per-host helper: the methods visible below load the scan ("findbad") record
# and the action ("act_all") record for one hostname, merge them, diagnose the
# merged record and optionally act on the diagnosis.
# NOTE(review): this excerpt has lost its indentation and some original lines
# (see the gaps in the embedded line numbers). The class attributes `fb` and
# `act_all` that the methods read are presumably declared in such a gap - confirm.
33 class MonitorMergeDiagnoseSendEscellate:
# Remember the hostname and resolve its loginbase through the "plcdb_hn2lb" database.
# NOTE(review): `act` is not stored by any visible line although `self.act` is
# read later in the class; presumably assigned on the missing line 39 - confirm.
37 def __init__(self, hostname, act):
38 self.hostname = hostname
# The attribute is set to None right before the None check, so the
# dbLoad below runs on every construction.
40 self.plcdb_hn2lb = None
41 if self.plcdb_hn2lb is None:
42 self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")
# Direct indexing: fails if the hostname has no entry in the mapping.
43 self.loginbase = self.plcdb_hn2lb[self.hostname]
# Look up this host's entry in the "findbad" scan database.
# Raises Exception when the hostname is not present under fb['nodes'].
# NOTE(review): the `else:` before the raise and the return statement are not
# visible in this excerpt; the caller uses the return value as `fbnode`.
46 def getFBRecord(self):
# Load the database once and keep it on the class so later calls reuse it.
47 if MonitorMergeDiagnoseSendEscellate.fb == None:
48 MonitorMergeDiagnoseSendEscellate.fb = database.dbLoad("findbad")
50 fb = MonitorMergeDiagnoseSendEscellate.fb
52 if self.hostname in fb['nodes']:
53 fbnode = fb['nodes'][self.hostname]['values']
55 raise Exception("Hostname %s not in scan database"% self.hostname)
# Fetch the first stored action record for this host from the "act_all" database.
# NOTE(review): the branch taken when the host has no records and the return
# statement are not visible here; mergeRecord handles `actnode is None`, so
# presumably None is returned in that case - confirm.
58 def getActionRecord(self):
59 # update ticket status
# Load the database once and keep it on the class so later calls reuse it.
60 if MonitorMergeDiagnoseSendEscellate.act_all == None:
61 MonitorMergeDiagnoseSendEscellate.act_all = database.dbLoad("act_all")
63 act_all = MonitorMergeDiagnoseSendEscellate.act_all
# Index 0 is where add_and_save_act_all inserts new records, so this picks
# the most recently saved one.
65 if self.hostname in act_all and len(act_all[self.hostname]) > 0:
66 actnode = act_all[self.hostname][0]
# NOTE(review): the body of this method (original lines 72-77) is not visible
# in this excerpt. mergeRecord calls it with fbnode['kernel'] and stores the
# result back under the same key.
71 def getKernel(self, unamestr):
# Combine the scan record (fbnode) with the stored action record (actnode).
# fbnode is first given default policy fields; then either a fresh action
# record is built from it (no prior record) or the existing record is refreshed
# with the current scan values. Finally the RT ticket status is attached.
# NOTE(review): several lines are missing from this excerpt (e.g. 92, 96 and the
# return); the caller uses the return value as the merged record.
78 def mergeRecord(self, fbnode, actnode):
79 fbnode['kernel'] = self.getKernel(fbnode['kernel'])
# Defaults for a record that has not been through any policy stage yet.
80 fbnode['stage'] = "findbad"
81 fbnode['message'] = None
85 fbnode['time'] = time.time()
86 fbnode['email'] = TECH
87 fbnode['action-level'] = 0
88 fbnode['action'] = ['noop']
89 fbnode['date_created'] = time.time()
# NOTE(review): actnode is None here, yet the next visible line calls
# actnode.update(); presumably the missing line 92 creates a new dict - confirm.
91 if actnode is None: # there is no entry in act_all
93 actnode.update(fbnode)
94 actnode['ticket_id'] = ""
95 actnode['prev_category'] = "ERROR"
# Presumably the `else:` branch (line 96 missing): keep the stored record but
# remember its old category and copy over the current scan values.
97 actnode['prev_category']= actnode['category']
98 actnode['comonstats'] = fbnode['comonstats']
99 actnode['category'] = fbnode['category']
100 actnode['state'] = fbnode['state']
101 actnode['kernel'] = fbnode['kernel']
102 actnode['bootcd'] = fbnode['bootcd']
103 actnode['plcnode'] = fbnode['plcnode']
104 ticket = get_ticket_id(actnode)
105 if ticket is None: actnode['ticket_id'] = ""
# Note that `ticket` may still be None when passed to getTicketStatus.
106 actnode['rt'] = mailer.getTicketStatus(ticket)
108 #for key in actnode.keys():
109 # print "%10s %s %s " % (key, "==", actnode[key])
110 #print "----------------------------"
# NOTE(review): the `def` line of this method is missing from the excerpt.
# Visible steps: load the scan and action records, merge them, wrap the result
# in a Record, diagnose it, and act on it only when self.act is set and
# diagnose returned something other than None.
115 fbnode = self.getFBRecord()
116 actnode= self.getActionRecord()
117 actrec = self.mergeRecord(fbnode, actnode)
118 record = Record(self.hostname, actrec)
119 diag = self.diagnose(record)
120 if self.act and diag is not None:
121 self.action(record,diag)
# Decide what should happen to the host described by `record`.
# Sets flags on a per-host PersistFlags object from the RT ticket status and the
# node category/state, prepares the message series and log line on the record,
# runs checkStageAndTime, and finally flags the site for 'Squeeze' or 'BackOff'.
# NOTE(review): indentation and several lines (else branches, returns) are
# missing from this excerpt; the caller treats a None result as "do nothing".
123 def diagnose(self, record):
# 60*60*24 is one day in seconds; presumably a timeout for the flags - confirm against PersistFlags.
125 diag = PersistFlags(record.hostname, 60*60*24, db='persist_diagnose_flags')
126 # NOTE: change record stage based on RT status.
127 #diag.setFlag('ResetStage')
128 if record.stageIswaitforever():
129 ticket = record.data['rt']
130 if 'new' in ticket['Status']:
131 print "Resetting Stage!!!!!"
132 # diag.setFlag('ResetStage')
134 #if diag.getFlag('ResetStage'):
135 # print "diagnose: resetting stage"
136 # diag.resetFlag('ResetStage')
# NOTE(review): `ticket` is only bound inside the waitforever check above;
# whether this test is nested under it cannot be told from this excerpt.
138 if 'resolved' in ticket['Status']:
139 diag.setFlag('RTEndRecord')
141 # NOTE: take category, and prepare action
142 category = record.getCategory()
# Category "error": prepare the node-down message series and log line.
143 if category == "error":
144 diag.setFlag('SendNodedown')
145 record.data['message_series'] = emailTxt.mailtxt.newdown
146 record.data['log'] = self.getDownLog(record)
148 elif category == "prod" or category == "alpha":
149 state = record.getState()
# NOTE(review): the test on `state` that guards this branch (line 150) is missing.
# A non-zero severity leads to a thank-you message and stage 'improvement'.
151 if record.severity() != 0:
152 diag.setFlag('SendThankyou')
153 print "RESETTING STAGE: improvement"
154 record.data['stage'] = 'improvement'
155 record.data['message_series'] = emailTxt.mailtxt.newthankyou
156 record.data['log'] = self.getThankyouLog(record)
158 # NOTE: do nothing, since we've already done the above.
159 print "DIAGNOSED: %s is boot. no further action necessary." % record.hostname
161 elif state == "debug":
164 print "unknown state %s for host %s" % (state, self.hostname)
166 print "unknown category: %s" % category
169 # TODO: how to not send email?...
# Advance (or hold) the record's policy stage based on elapsed time.
170 record = self.checkStageAndTime(diag,record)
172 print "diagnose: checkStageAndTime Returned Valid Record"
173 site = PersistFlags(self.loginbase, 1, db='site_persistflags')
# Site status without "good" -> 'Squeeze'; otherwise (presumably the missing
# `else:` on line 178) -> 'BackOff'.
175 if "good" not in site.status: # != "good":
176 print "diagnose: Setting site %s for 'squeeze'" % self.loginbase
177 diag.setFlag('Squeeze')
179 print "diagnose: Setting site %s for 'backoff'" % self.loginbase
180 diag.setFlag('BackOff')
185 # print "checkStageAndTime Returned NULL Record"
# Carry out what diagnose decided for `record`, driven by the flags in `diag`:
# send the notification email, take the action when the site is flagged
# 'Squeeze', save the record to act_all, and close the RT ticket when the
# record has ended.
# NOTE(review): indentation and several lines are missing from this excerpt, so
# the exact nesting of the steps below cannot be confirmed.
188 def action(self, record, diag):
192 #print record.data['stage']
193 #print "improvement" in record.data['stage']
194 #print self.getSendEmailFlag(record)
195 print "%s %s DAYS DOWN" % ( self.hostname, Record.getDaysDown(record.data) )
# Proceed when email is allowed and the node has been down at least 2 days,
# or when the record is in the closing 'monitor-end-record' stage.
196 if ( self.getSendEmailFlag(record) and Record.getDaysDown(record.data) >= 2 ) or \
197 "monitor-end-record" in record.data['stage']:
198 print "action: getting message"
199 message = record.getMessage(record.data['ticket_id'])
201 print "action: sending email"
202 message.send(record.getContacts())
203 #print "DEBUG NOT SENDING MESSAGE WHEN I SHOULD BE!!!!!"
204 #print "DEBUG NOT SENDING MESSAGE WHEN I SHOULD BE!!!!!"
205 #print "DEBUG NOT SENDING MESSAGE WHEN I SHOULD BE!!!!!"
# Remember the ticket id reported by the message's RT object, if any.
207 if message.rt.ticket_id:
208 print "action: setting record ticket_id"
209 record.data['ticket_id'] = message.rt.ticket_id
211 if ( record.data['takeaction'] and diag.getFlag('Squeeze') ):
212 print "action: taking action"
213 record.takeAction(record.data['action-level'])
214 diag.resetFlag('Squeeze')
216 if diag.getFlag('BackOff'):
218 diag.resetFlag('BackOff')
221 if record.saveAction():
222 print "action: saving act_all db"
223 self.add_and_save_act_all(record)
225 print "action: NOT saving act_all db"
226 print "stage: %s %s" % ( record.data['stage'], record.data['save-act-all'] )
# An improved node or a resolved ticket ends the record and schedules the
# RT ticket for closing.
228 if record.improved() or diag.getFlag('RTEndRecord'):
229 print "action: end record for %s" % self.hostname
231 diag.setFlag('CloseRT')
232 diag.resetFlag('RTEndRecord')
# NOTE(review): `message` is only bound in the email branch above; if this
# check sits outside that branch it can hit an unbound name - confirm nesting.
237 if diag.getFlag('CloseRT'):
238 message.rt.closeTicket()
239 diag.resetFlag('CloseRT')
# Presumably the `else:` of the send-email condition at the top.
243 print "NOT sending email : %s %s" % (config.mail, record.data['rt'])
# Decide whether an email may be sent for `record`, based on its RT ticket data.
# NOTE(review): the lines before the check (248-250) and both result branches
# (257-260) are missing from this excerpt, so the returned values are not visible.
247 def getSendEmailFlag(self, record):
# NOTE(review): the comment below says "longer than 30 days ago", but the
# condition is true when 'Created' is MORE RECENT than 30 days ago (Created >
# now - 30 days). Confirm which is intended.
251 # resend if open & created longer than 30 days ago.
252 if 'rt' in record.data and \
253 'Status' in record.data['rt'] and \
254 "open" in record.data['rt']['Status'] and \
255 record.data['rt']['Created'] > int(time.time() - 60*60*24*30):
256 # if created-time is greater than the thirty days ago from the current time
def add_and_save_act_all(self, record):
    """Prepend record.data to this host's history in "act_all" and save it.

    The database is reloaded from storage on every call, the record is
    placed at index 0 of the host's list (created if absent), and the whole
    database is written back.
    """
    history = database.dbLoad("act_all")
    self.act_all = history
    host = self.hostname
    if host not in history:
        history[host] = []
    # Newest record first.
    history[host].insert(0, record.data)
    database.dbDump("act_all", history)
# Fill in the message arguments for a node-down notice and build its "DOWN:"
# log line. The caller stores the result in record.data['log'].
# NOTE(review): the `else:` between the two log assignments and the return
# statement are missing from this excerpt.
268 def getDownLog(self, record):
270 record.data['args'] = {'nodename': self.hostname}
271 record.data['info'] = (self.hostname, Record.getStrDaysDown(record.data), "")
273 #for key in record.data.keys():
274 # print "%10s %s %s " % (key, "==", record.data[key])
# With no ticket_id yet, report the ticket found in RT; otherwise the ticket_id.
# record.data['info'][1:] is a tuple slice, formatted as a tuple by %20s.
276 if record.data['ticket_id'] == "" and 'found_rt_ticket' in record.data:
277 log = "DOWN: %20s : %-40s == %20s %s" % \
278 (self.loginbase, self.hostname, record.data['info'][1:], record.data['found_rt_ticket'])
280 log = "DOWN: %20s : %-40s == %20s %s" % \
281 (self.loginbase, self.hostname, record.data['info'][1:], record.data['ticket_id'])
# Fill in the message arguments for an improvement (thank-you) notice and build
# its "IMPR:" log line. The caller stores the result in record.data['log'].
# NOTE(review): the `else:` between the first two log assignments and the
# return statement are missing from this excerpt.
284 def getThankyouLog(self, record):
286 record.data['args'] = {'nodename': self.hostname}
287 record.data['info'] = (self.hostname, record.data['prev_category'], record.data['category'])
# With no ticket_id yet, report the ticket found in RT; otherwise the ticket_id.
290 if record.data['ticket_id'] == "" and 'found_rt_ticket' in record.data:
291 log = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
292 (self.loginbase, self.hostname, record.data['stage'],
293 record.data['prev_category'], record.data['category'], record.data['found_rt_ticket'])
295 log = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
296 (self.loginbase, self.hostname, record.data['stage'],
297 record.data['prev_category'], record.data['category'], record.data['ticket_id'])
# NOTE(review): if this assignment is at function level it replaces both
# detailed log lines built above - confirm the original nesting.
299 log = "IMPR: %s improved to %s " % (self.hostname, record.data['category'])
# Stage machine for the escalation policy. Based on record.data['stage'] and
# the seconds elapsed since record.data['time'], it fills in who to email, which
# message of the series to send, the action name, whether to take the action,
# whether to save the record, and the action level.
# Stages are matched by substring, so the order of the tests matters.
# TECH, PI, USER and SPERDAY are defined outside this excerpt.
# NOTE(review): `else:` lines and the return statement are missing from this
# excerpt; diagnose rebinds `record` to the return value. `diag` is not used
# by any visible line.
302 def checkStageAndTime(self, diag, record):
303 current_time = time.time()
304 delta = current_time - record.data['time']
306 if 'findbad' in record.data['stage']:
307 # The node is bad, and there's no previous record of it.
308 record.data['email'] = TECH
309 record.data['action'] = ['noop']
310 record.data['takeaction'] = False
311 record.data['message'] = record.data['message_series'][0]
312 record.data['stage'] = 'stage_actinoneweek'
313 record.data['save-act-all'] = True
314 record.data['action-level'] = 0
# Same first-notice settings as 'findbad', except the record is not saved.
316 elif 'reboot_node' in record.data['stage']:
317 record.data['email'] = TECH
318 record.data['action'] = ['noop']
319 record.data['message'] = record.data['message_series'][0]
320 record.data['stage'] = 'stage_actinoneweek'
321 record.data['takeaction'] = False
322 record.data['save-act-all'] = False
323 record.data['action-level'] = 0
# Node improved (stage set by diagnose): close the ticket and end the record.
325 elif 'improvement' in record.data['stage']:
326 print "checkStageAndTime: backing off of %s" % self.hostname
327 record.data['action'] = ['close_rt']
328 record.data['takeaction'] = True
329 record.data['message'] = record.data['message_series'][0]
330 record.data['stage'] = 'monitor-end-record'
331 record.data['save-act-all'] = True
332 record.data['action-level'] = 0
# First week: escalate after 7 days, send one reminder after 3 days, else wait.
334 elif 'actinoneweek' in record.data['stage']:
335 if delta >= 7 * SPERDAY:
336 print "checkStageAndTime: transition to next stage actintwoweeks"
337 record.data['email'] = TECH | PI
338 record.data['stage'] = 'stage_actintwoweeks'
339 record.data['message'] = record.data['message_series'][1]
340 record.data['action'] = ['nocreate' ]
341 record.data['time'] = current_time # reset clock for waitforever
342 record.data['takeaction'] = True
343 record.data['save-act-all'] = True
344 record.data['action-level'] = 1
# The marker key ensures the reminder is sent only once per stage.
345 elif delta >= 3* SPERDAY and not 'second-mail-at-oneweek' in record.data:
346 print "checkStageAndTime: second message in one week"
347 record.data['email'] = TECH
348 record.data['message'] = record.data['message_series'][0]
349 record.data['action'] = ['sendmailagain-waitforoneweekaction' ]
350 record.data['second-mail-at-oneweek'] = True
351 record.data['takeaction'] = False
352 record.data['save-act-all'] = True
353 record.data['action-level'] = 0
# Presumably the `else:` (line 354 missing): nothing to send, keep waiting.
355 record.data['message'] = None
356 record.data['action'] = ['waitforoneweekaction' ]
357 record.data['takeaction'] = False
358 record.data['save-act-all'] = False
359 record.data['action-level'] = 0
360 print "checkStageAndTime: ignoring this record for: %s" % self.hostname
361 #return None # don't send if there's no action
# Second week: same pattern, one level higher; after 7 days move to waitforever.
363 elif 'actintwoweeks' in record.data['stage']:
364 if delta >= 7 * SPERDAY:
365 print "checkStageAndTime: transition to next stage waitforever"
366 record.data['email'] = TECH | PI | USER
367 record.data['stage'] = 'stage_waitforever'
368 record.data['message'] = record.data['message_series'][2]
369 record.data['action'] = ['suspendslices']
370 record.data['time'] = current_time # reset clock for waitforever
371 record.data['takeaction'] = True
372 record.data['save-act-all'] = True
373 record.data['action-level'] = 2
374 elif delta >= 3* SPERDAY and not 'second-mail-at-twoweeks' in record.data:
375 print "checkStageAndTime: second message in one week for stage two"
376 record.data['email'] = TECH | PI
377 record.data['message'] = record.data['message_series'][1]
378 record.data['action'] = ['sendmailagain-waitfortwoweeksaction' ]
379 record.data['second-mail-at-twoweeks'] = True
380 record.data['takeaction'] = False
381 record.data['save-act-all'] = True
382 record.data['action-level'] = 1
# Presumably the `else:` (line 383 missing): nothing to send, keep waiting.
384 record.data['message'] = None
385 record.data['takeaction'] = False
386 record.data['action'] = ['waitfortwoweeksaction']
387 record.data['save-act-all'] = False
# NOTE(review): this message repeats the reminder branch's text although no
# mail is sent here; looks like a copy-paste leftover.
388 print "checkStageAndTime: second message in one week for stage two"
389 record.data['action-level'] = 1
390 #return None # don't send if there's no action
# Must be tested before the plain 'waitforever' branch below, because
# 'waitforever' is a substring of 'ticket_waitforever'.
392 elif 'ticket_waitforever' in record.data['stage']:
393 record.data['email'] = TECH
394 record.data['takeaction'] = True
# First time in this stage: mark it, tag the log line and save.
395 if 'first-found' not in record.data:
396 record.data['first-found'] = True
397 record.data['log'] += " firstfound"
398 record.data['action'] = ['ticket_waitforever']
399 record.data['message'] = None
400 record.data['time'] = current_time
401 record.data['save-act-all'] = True
402 record.data['action-level'] = 2
# Afterwards the record is re-saved with a fresh clock once 7 days have passed.
404 if delta >= 7*SPERDAY:
405 record.data['action'] = ['ticket_waitforever']
406 record.data['message'] = None
407 record.data['time'] = current_time # reset clock
408 record.data['save-act-all'] = True
409 record.data['action-level'] = 2
411 record.data['action'] = ['ticket_waitforever']
412 record.data['message'] = None
413 record.data['takeaction'] = False
414 record.data['save-act-all'] = False
415 record.data['action-level'] = 2
418 elif 'waitforever' in record.data['stage']:
419 # more than 3 days since last action
420 # TODO: send only on weekdays.
421 # NOTE: expects that 'time' has been reset before entering waitforever stage
422 record.data['takeaction'] = True
423 if delta >= 3*SPERDAY:
424 record.data['action'] = ['email-againwaitforever']
425 record.data['message'] = record.data['message_series'][2]
426 record.data['time'] = current_time # reset clock
427 record.data['save-act-all'] = True
428 record.data['action-level'] = 2
430 record.data['action'] = ['waitforever']
431 record.data['message'] = None
432 record.data['takeaction'] = False
433 record.data['save-act-all'] = False
434 record.data['action-level'] = 2
435 #return None # don't send if there's no action
438 # There is no action to be taken, possibly b/c the stage has
439 # already been performed, but diagnose picked it up again.
441 # 1. stage is unknown, or
442 # 2. delta is not big enough to bump it to the next stage.
443 # TODO: figure out which. for now assume 2.
444 print "UNKNOWN stage for %s; nothing done" % self.hostname
445 record.data['action'] = ['unknown']
446 record.data['message'] = record.data['message_series'][0]
# NOTE(review): if these lines are at the same level as the two above, the
# 'unknown' action is immediately overwritten with 'noop' and the record is
# restarted at 'stage_actinoneweek' - confirm the original nesting.
448 record.data['email'] = TECH
449 record.data['action'] = ['noop']
450 record.data['message'] = record.data['message_series'][0]
451 record.data['stage'] = 'stage_actinoneweek'
452 record.data['time'] = current_time # reset clock
453 record.data['takeaction'] = False
454 record.data['save-act-all'] = True
# Print the log line and the chosen action on one output line (the trailing
# comma suppresses the newline in Python 2).
456 print "%s" % record.data['log'],
457 print "%15s" % record.data['action']