9 from rt import is_host_in_rt_tickets
12 # Time to enforce policy
15 # Where to email the summary
16 SUMTO = "soltesz@cs.princeton.edu"
20 from unified_model import *
def get_ticket_id(record):
    """Return the RT ticket id stored in *record*, or None if there is none.

    The record's own 'ticket_id' is preferred; 'found_rt_ticket' is only
    consulted when 'ticket_id' is absent, empty or None.

    record -- mapping that may contain 'ticket_id' and/or 'found_rt_ticket'.
    """
    for key in ('ticket_id', 'found_rt_ticket'):
        if key not in record:
            continue
        value = record[key]
        # Compare by value.  The earlier `is not ""` test compared object
        # identity with a literal, so an equal-but-distinct empty string
        # (e.g. a unicode one) was returned as if it were a ticket id.
        if value is not None and value != "":
            return value
    return None
32 class MonitorMergeDiagnoseSendEscellate:
def __init__(self, hostname, act):
    """Bind the monitor to one host and look up the login base it maps to.

    hostname -- node name; it is used as the key into the "plcdb_hn2lb"
                database, so a name missing from that mapping makes the
                lookup below fail.
    act      -- kept as self.act; it is tested before action() is called,
                so a false value means "diagnose only".
    """
    self.hostname = hostname
    # self.act is read later when deciding whether to act on a diagnosis;
    # store the constructor argument so that read cannot fail.
    self.act = act
    # The mapping is loaded on every construction: the former
    # "assign None, then test for None" guard was always true.
    self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")
    self.loginbase = self.plcdb_hn2lb[self.hostname]
45 def getFBRecord(self):
46 if MonitorMergeDiagnoseSendEscellate.fb == None:
47 MonitorMergeDiagnoseSendEscellate.fb = database.dbLoad("findbad")
49 fb = MonitorMergeDiagnoseSendEscellate.fb
51 if self.hostname in fb['nodes']:
52 fbnode = fb['nodes'][self.hostname]['values']
54 raise Exception("Hostname %s not in scan database"% self.hostname)
57 def getActionRecord(self):
58 # update ticket status
59 if MonitorMergeDiagnoseSendEscellate.act_all == None:
60 MonitorMergeDiagnoseSendEscellate.act_all = database.dbLoad("act_all")
62 act_all = MonitorMergeDiagnoseSendEscellate.act_all
64 if self.hostname in act_all and len(act_all[self.hostname]) > 0:
65 actnode = act_all[self.hostname][0]
# Called by mergeRecord with fbnode['kernel']; whatever it returns is stored
# back into fbnode['kernel'].
# NOTE(review): only the signature is visible in this listing -- the body is
# missing, so how the uname string is interpreted cannot be documented here.
70 def getKernel(self, unamestr):
# Merge the fresh scan record (fbnode) with the most recent action record
# (actnode, or None when the host has no history) into one dict.
# NOTE(review): this listing has lost its indentation and several lines
# (listing numbers 81-83, 91, 96, 112-113 are absent).  The notes below mark
# where structure appears to be missing; confirm against the real file.
77 def mergeRecord(self, fbnode, actnode):
# Normalise the kernel field through getKernel and stamp default policy
# fields onto the scan record.
78 fbnode['kernel'] = self.getKernel(fbnode['kernel'])
79 fbnode['stage'] = "findbad"
80 fbnode['message'] = None
# NOTE(review): three statements are missing here (listing lines 81-83).
84 fbnode['time'] = time.time()
85 fbnode['email'] = TECH
86 fbnode['action-level'] = 0
87 fbnode['action'] = ['noop']
88 fbnode['date_created'] = time.time()
# No history: the new record is seeded from the scan record with an empty
# ticket id and a previous category/state of ERROR/DOWN.
# NOTE(review): as listed, update() is called while actnode is still None;
# presumably an `actnode = {}` line was dropped just before it -- confirm.
90 if actnode is None: # there is no entry in act_all
92 actnode.update(fbnode)
93 actnode['ticket_id'] = ""
94 actnode['prev_category'] = "ERROR"
95 actnode['prev_state'] = "DOWN"
# NOTE(review): presumably an `else:` was dropped here -- the lines below
# read actnode['category'] / ['state'], which only an existing record has.
# Existing history: remember the previous category/state, then refresh the
# scan-derived fields from fbnode.
97 actnode['prev_category']= actnode['category']
98 actnode['prev_state'] = actnode['state']
99 actnode['comonstats'] = fbnode['comonstats']
100 actnode['category'] = fbnode['category']
101 actnode['state'] = fbnode['state']
102 actnode['kernel'] = fbnode['kernel']
103 actnode['bootcd'] = fbnode['bootcd']
104 actnode['plcnode'] = fbnode['plcnode']
# Resolve the ticket id ('ticket_id' first, then 'found_rt_ticket') and fetch
# its status from the mailer module.  Note that getTicketStatus is still
# called when no ticket was found (ticket is None).
# NOTE(review): whether these three lines sit inside the branch above or
# after the whole if/else cannot be told without indentation.
105 ticket = get_ticket_id(actnode)
106 if ticket is None: actnode['ticket_id'] = ""
107 actnode['rt'] = mailer.getTicketStatus(ticket)
# Disabled debugging dump of the merged record.
109 #for key in actnode.keys():
110 # print "%10s %s %s " % (key, "==", actnode[key])
111 #print "----------------------------"
# NOTE(review): no return statement is visible, yet the caller assigns the
# result of mergeRecord and wraps it in a Record -- presumably the merged
# actnode is returned; confirm.
# NOTE(review): the `def` line that owns these statements is not visible in
# this listing (listing lines 112-115 are absent).
# Pipeline for one host: load the scan record and the last action record,
# merge them, wrap the result in a Record, and diagnose it.
116 fbnode = self.getFBRecord()
117 actnode= self.getActionRecord()
118 actrec = self.mergeRecord(fbnode, actnode)
119 record = Record(self.hostname, actrec)
# Disabled debugging output.
122 #print record.data['time']
123 #print time.time() - record.data['time']
124 diag = self.diagnose(record)
# Act only when acting is enabled (self.act) and diagnose produced a result.
125 if self.act and diag is not None:
126 self.action(record,diag)
# Inspect `record`, set the persistent flags that describe what should be
# done for the host, choose the message series and log line, and run
# checkStageAndTime on the record.
# NOTE(review): indentation and several lines are missing from this listing
# (listing numbers 138, 142, 158, 165, 168, 170-171, 173, 175, 179, 186,
# 189-192, 194-195), including at least one condition and the return
# statements.  The caller tests the result against None, so the method
# presumably returns the flag object or None -- confirm.
128 def diagnose(self, record):
# Per-host flag store.  The second argument is 60*60*24; presumably a
# lifetime in seconds (one day) -- TODO confirm against PersistFlags.
130 diag = PersistFlags(record.hostname, 60*60*24, db='persist_diagnose_flags')
131 # NOTE: change record stage based on RT status.
132 #diag.setFlag('ResetStage')
# For records in the wait-forever stage, look at the RT ticket status.
133 if record.stageIswaitforever():
134 ticket = record.data['rt']
135 if 'new' in ticket['Status']:
136 print "Resetting Stage!!!!!"
137 # diag.setFlag('ResetStage')
# NOTE(review): one line is missing here (listing line 138); only the print
# above is visible for the 'new' case.
139 #if diag.getFlag('ResetStage'):
140 # print "diagnose: resetting stage"
141 # diag.resetFlag('ResetStage')
# A resolved ticket flags the record to be ended (consumed in action()).
143 if 'resolved' in ticket['Status']:
144 diag.setFlag('RTEndRecord')
146 # NOTE: try to give a default value to catch the errors for
147 # planetlab1.ias.csusb.edu which seems to have an out-of-date node config
148 record.data['message_series'] = emailTxt.mailtxt.newdown
149 # NOTE: take category, and prepare action
150 category = record.getCategory()
# "error" category: use the node-down message series and log line.
151 if category == "error":
152 diag.setFlag('SendNodedown')
153 record.data['message_series'] = emailTxt.mailtxt.newdown
154 record.data['log'] = self.getDownLog(record)
# "prod" / "alpha" category: the outcome depends on the node state.
156 elif category == "prod" or category == "alpha":
157 state = record.getState()
# NOTE(review): the `if` that the later `elif state == "debug":` pairs with
# is missing (listing line 158); the messages below suggest it tests for the
# "boot" state -- confirm.
# Non-zero severity: switch the record to the 'improvement' stage with the
# thank-you message series and log line.
159 if record.severity() != 0:
160 diag.setFlag('SendThankyou')
161 print "RESETTING STAGE: improvement"
162 record.data['stage'] = 'improvement'
163 record.data['message_series'] = emailTxt.mailtxt.newthankyou
164 record.data['log'] = self.getThankyouLog(record)
# NOTE(review): presumably an `else:` precedes the next two lines, and a line
# after the print is missing (listing lines 165, 168).
166 # NOTE: do nothing, since we've already done the above.
167 print "DIAGNOSED: %s is boot. no further action necessary." % record.hostname
# NOTE(review): the body of the "debug" arm and the `else:` before the
# "unknown state" print are missing (listing lines 170-171).
169 elif state == "debug":
172 print "unknown state %s for host %s" % (state, self.hostname)
# NOTE(review): presumably the `else:` arm of the category test; the line
# after the print is missing (listing lines 173, 175).
174 print "unknown category: %s" % category
177 # TODO: how to not send email?...
# Apply the stage/time policy; the record it returns replaces `record`.
178 record = self.checkStageAndTime(diag,record)
180 print "diagnose: checkStageAndTime Returned Valid Record"
# Per-site flag store keyed by login base.
181 site = PersistFlags(self.loginbase, 1, db='site_persistflags')
# A site whose status does not contain "good" is flagged 'Squeeze';
# NOTE(review): presumably an `else:` (listing line 186) makes 'BackOff' the
# alternative for good sites -- as listed both flags would be set.
183 if "good" not in site.status: # != "good":
184 print "diagnose: Setting site %s for 'squeeze'" % self.loginbase
185 diag.setFlag('Squeeze')
187 print "diagnose: Setting site %s for 'backoff'" % self.loginbase
188 diag.setFlag('BackOff')
# NOTE(review): the closing statements (listing lines 189-192) are missing.
193 # print "checkStageAndTime Returned NULL Record"
# Carry out what diagnose() decided for `record`, driven by the flags in
# `diag`: optionally e-mail the contacts, apply the squeeze / backoff action,
# save the record, end the record, and close the RT ticket.
# NOTE(review): indentation and several statements are missing from this
# listing (listing numbers 197-199, 208, 214, 218, 223, 226, 228-229, 233,
# 236, 239, 242-245, 249-251, 253-255); notes below mark the visible gaps.
196 def action(self, record, diag):
# Disabled debugging output.
200 #print record.data['stage']
201 #print "improvement" in record.data['stage']
202 #print self.getSendEmailFlag(record)
203 print "%s %s DAYS DOWN" % ( self.hostname, Record.getDaysDown(record.data) )
# Mail is sent when the send flag is set and the node has been down at least
# 2 days, or when the record is in the "monitor-end-record" stage.
204 if ( self.getSendEmailFlag(record) and Record.getDaysDown(record.data) >= 2 ) or \
205 "monitor-end-record" in record.data['stage']:
206 print "action: getting message"
207 message = record.getMessage(record.data['ticket_id'])
# NOTE(review): one line is missing here (listing line 208), possibly a
# guard on `message` -- confirm.
209 print "action: sending email"
210 message.send(record.getContacts())
211 #print "DEBUG NOT SENDING MESSAGE WHEN I SHOULD BE!!!!!"
212 #print "DEBUG NOT SENDING MESSAGE WHEN I SHOULD BE!!!!!"
213 #print "DEBUG NOT SENDING MESSAGE WHEN I SHOULD BE!!!!!"
# Sending may create a ticket; copy its id back onto the record.
215 if message.rt.ticket_id:
216 print "action: setting record ticket_id"
217 record.data['ticket_id'] = message.rt.ticket_id
# Squeeze: apply the action for the record's current action level, then
# clear the flag.
219 if ( record.data['takeaction'] and diag.getFlag('Squeeze') ):
220 print "action: taking squeeze action"
221 record.takeAction(record.data['action-level'])
222 diag.resetFlag('Squeeze')
# BackOff: NOTE(review): the statement between the print and the flag reset
# is missing (listing line 226), so the backoff action itself is not visible.
224 if diag.getFlag('BackOff'):
225 print "action: taking backoff action"
227 diag.resetFlag('BackOff')
# Persist the record when it asks to be saved; NOTE(review): presumably an
# `else:` (listing line 233) precedes the "NOT saving" prints.
230 if record.saveAction():
231 print "action: saving act_all db"
232 self.add_and_save_act_all(record)
234 print "action: NOT saving act_all db"
235 print "stage: %s %s" % ( record.data['stage'], record.data['save-act-all'] )
# An improved node or a resolved ticket ends the record and schedules the
# ticket to be closed.  NOTE(review): listing line 239 is missing here.
237 if record.improved() or diag.getFlag('RTEndRecord'):
238 print "action: end record for %s" % self.hostname
240 diag.setFlag('CloseRT')
241 diag.resetFlag('RTEndRecord')
# NOTE(review): `message` is only assigned in the mail branch above; confirm
# against the full file that it cannot be unbound when 'CloseRT' is set.
246 if diag.getFlag('CloseRT'):
247 message.rt.closeTicket()
248 diag.resetFlag('CloseRT')
# NOTE(review): presumably the `else:` arm of the mail condition at the top
# -- confirm.
252 print "NOT sending email : %s" % config.mail
# Decide whether a notification e-mail may be sent for `record`; action()
# combines the result with a minimum of 2 days down.
# NOTE(review): every return statement, and the lines before the test
# (listing numbers 257-259, 266-269), are missing from this listing, so the
# value returned for either outcome of the test below must be confirmed
# against the real file.
256 def getSendEmailFlag(self, record):
260 # resend if open & created longer than 30 days ago.
# The test is true when the record carries an RT status containing "open"
# and the ticket's 'Created' value is later than (now - 30 days).
# Assumes 'Created' is a timestamp in seconds since the epoch -- TODO confirm.
261 if 'rt' in record.data and \
262 'Status' in record.data['rt'] and \
263 "open" in record.data['rt']['Status'] and \
264 record.data['rt']['Created'] > int(time.time() - 60*60*24*30):
265 # if created-time is greater than the thirty days ago from the current time
def add_and_save_act_all(self, record):
    """Prepend record.data to this host's "act_all" history and save it.

    The database is re-read from storage first, kept on self.act_all, and
    written back in full once the record has been inserted.
    """
    host = self.hostname
    self.act_all = database.dbLoad("act_all")
    history_db = self.act_all
    if not (host in history_db):
        history_db[host] = []
    # Newest record goes first, ahead of any earlier entries for the host.
    history_db[host].insert(0, record.data)
    database.dbDump("act_all", history_db)
def getDownLog(self, record):
    """Fill in 'args' and 'info' on the record and return its DOWN log line.

    Side effects on record.data:
      'args' -- {'nodename': self.hostname}
      'info' -- (self.hostname, Record.getStrDaysDown(record.data), "")

    Returns the formatted "DOWN: ..." string.  The ticket shown is
    'found_rt_ticket' when the record's own 'ticket_id' is empty and a
    found ticket exists, otherwise 'ticket_id'.
    """
    data = record.data
    data['args'] = {'nodename': self.hostname}
    data['info'] = (self.hostname, Record.getStrDaysDown(data), "")

    # Pick exactly one ticket.  Building both lines unconditionally would
    # let the 'ticket_id' variant always win.
    if data['ticket_id'] == "" and 'found_rt_ticket' in data:
        ticket = data['found_rt_ticket']
    else:
        ticket = data['ticket_id']

    # The caller stores the result in record.data['log'], so return it.
    return "DOWN: %20s : %-40s == %20s %s" % (
        self.loginbase, self.hostname, data['info'][1:], ticket)
# Fill in 'args' and 'info' on the record and build the "IMPR" log line for
# a node whose category improved; diagnose() stores the result in
# record.data['log'].
# NOTE(review): indentation and several lines are missing from this listing
# (listing numbers 294, 297-298, 303, 307, 309-310), including the return.
293 def getThankyouLog(self, record):
295 record.data['args'] = {'nodename': self.hostname}
296 record.data['info'] = (self.hostname, record.data['prev_category'], record.data['category'])
# Detailed line: shows the found RT ticket when the record has no ticket id
# of its own, otherwise the record's ticket id.
# NOTE(review): presumably an `else:` (listing line 303) separates the two
# variants -- confirm.
299 if record.data['ticket_id'] == "" and 'found_rt_ticket' in record.data:
300 log = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
301 (self.loginbase, self.hostname, record.data['stage'],
302 record.data['prev_category'], record.data['category'], record.data['found_rt_ticket'])
304 log = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
305 (self.loginbase, self.hostname, record.data['stage'],
306 record.data['prev_category'], record.data['category'], record.data['ticket_id'])
# Short form.  NOTE(review): a line is missing just before this one (listing
# line 307); it may be an exception handler that makes this a fallback, or
# this may simply override the detailed line -- cannot tell from here.
308 log = "IMPR: %s improved to %s " % (self.hostname, record.data['category'])
311 def checkStageAndTime(self, diag, record):
312 current_time = time.time()
313 delta = current_time - record.data['time']
315 if 'findbad' in record.data['stage']:
316 # The node is bad, and there's no previous record of it.
317 record.data['email'] = TECH
318 record.data['action'] = ['noop']
319 record.data['takeaction'] = False
320 record.data['message'] = record.data['message_series'][0]
321 record.data['stage'] = 'stage_actinoneweek'
322 record.data['save-act-all'] = True
323 record.data['action-level'] = 0
325 elif 'reboot_node' in record.data['stage']:
326 record.data['email'] = TECH
327 record.data['action'] = ['noop']
328 record.data['message'] = record.data['message_series'][0]
329 record.data['stage'] = 'stage_actinoneweek'
330 record.data['takeaction'] = False
331 record.data['save-act-all'] = False
332 record.data['action-level'] = 0
334 elif 'improvement' in record.data['stage']:
335 print "checkStageAndTime: backing off of %s" % self.hostname
336 record.data['action'] = ['close_rt']
337 record.data['takeaction'] = True
338 record.data['message'] = record.data['message_series'][0]
339 record.data['stage'] = 'monitor-end-record'
340 record.data['save-act-all'] = True
341 record.data['action-level'] = 0
343 elif 'actinoneweek' in record.data['stage']:
344 if delta >= 7 * SPERDAY:
345 print "checkStageAndTime: transition to next stage actintwoweeks"
346 record.data['email'] = TECH | PI
347 record.data['stage'] = 'stage_actintwoweeks'
348 record.data['message'] = record.data['message_series'][1]
349 record.data['action'] = ['nocreate' ]
350 record.data['time'] = current_time # reset clock for waitforever
351 record.data['takeaction'] = True
352 record.data['save-act-all'] = True
353 record.data['action-level'] = 1
354 elif delta >= 3* SPERDAY and not 'second-mail-at-oneweek' in record.data:
355 print "checkStageAndTime: second message in one week"
356 record.data['email'] = TECH
357 record.data['message'] = record.data['message_series'][0]
358 record.data['action'] = ['sendmailagain-waitforoneweekaction' ]
359 record.data['second-mail-at-oneweek'] = True
360 record.data['takeaction'] = False
361 record.data['save-act-all'] = True
362 record.data['action-level'] = 0
364 record.data['message'] = None
365 record.data['action'] = ['waitforoneweekaction' ]
366 record.data['takeaction'] = False
367 record.data['save-act-all'] = False
368 record.data['action-level'] = 0
369 print "checkStageAndTime: ignoring this record for: %s" % self.hostname
370 #return None # don't send if there's no action
372 elif 'actintwoweeks' in record.data['stage']:
373 if delta >= 7 * SPERDAY:
374 print "checkStageAndTime: transition to next stage waitforever"
375 record.data['email'] = TECH | PI | USER
376 record.data['stage'] = 'stage_waitforever'
377 record.data['message'] = record.data['message_series'][2]
378 record.data['action'] = ['suspendslices']
379 record.data['time'] = current_time # reset clock for waitforever
380 record.data['takeaction'] = True
381 record.data['save-act-all'] = True
382 record.data['action-level'] = 2
383 elif delta >= 3* SPERDAY and not 'second-mail-at-twoweeks' in record.data:
384 print "checkStageAndTime: second message in one week for stage two"
385 record.data['email'] = TECH | PI
386 record.data['message'] = record.data['message_series'][1]
387 record.data['action'] = ['sendmailagain-waitfortwoweeksaction' ]
388 record.data['second-mail-at-twoweeks'] = True
389 record.data['takeaction'] = False
390 record.data['save-act-all'] = True
391 record.data['action-level'] = 1
393 record.data['message'] = None
394 record.data['takeaction'] = False
395 record.data['action'] = ['waitfortwoweeksaction']
396 record.data['save-act-all'] = False
397 print "checkStageAndTime: second message in one week for stage two"
398 record.data['action-level'] = 1
399 #return None # don't send if there's no action
401 elif 'ticket_waitforever' in record.data['stage']:
402 record.data['email'] = TECH
403 record.data['takeaction'] = True
404 if 'first-found' not in record.data:
405 record.data['first-found'] = True
406 record.data['log'] += " firstfound"
407 record.data['action'] = ['ticket_waitforever']
408 record.data['message'] = None
409 record.data['time'] = current_time
410 record.data['save-act-all'] = True
411 record.data['action-level'] = 2
413 if delta >= 7*SPERDAY:
414 record.data['action'] = ['ticket_waitforever']
415 record.data['message'] = None
416 record.data['time'] = current_time # reset clock
417 record.data['save-act-all'] = True
418 record.data['action-level'] = 2
420 record.data['action'] = ['ticket_waitforever']
421 record.data['message'] = None
422 record.data['takeaction'] = False
423 record.data['save-act-all'] = False
424 record.data['action-level'] = 2
427 elif 'waitforever' in record.data['stage']:
428 # more than 3 days since last action
429 # TODO: send only on weekdays.
430 # NOTE: expects that 'time' has been reset before entering waitforever stage
431 record.data['takeaction'] = True
432 if delta >= 3*SPERDAY:
433 record.data['action'] = ['email-againwaitforever']
434 record.data['message'] = record.data['message_series'][2]
435 record.data['time'] = current_time # reset clock
436 record.data['save-act-all'] = True
437 record.data['action-level'] = 2
439 record.data['action'] = ['waitforever']
440 record.data['message'] = None
441 record.data['takeaction'] = False
442 record.data['save-act-all'] = False
443 record.data['action-level'] = 2
444 #return None # don't send if there's no action
447 # There is no action to be taken, possibly b/c the stage has
448 # already been performed, but diagnose picked it up again.
450 # 1. stage is unknown, or
451 # 2. delta is not big enough to bump it to the next stage.
452 # TODO: figure out which. for now assume 2.
453 print "UNKNOWN stage for %s; nothing done" % self.hostname
454 record.data['action'] = ['unknown']
455 record.data['message'] = record.data['message_series'][0]
457 record.data['email'] = TECH
458 record.data['action'] = ['noop']
459 record.data['message'] = record.data['message_series'][0]
460 record.data['stage'] = 'stage_actinoneweek'
461 record.data['time'] = current_time # reset clock
462 record.data['takeaction'] = False
463 record.data['save-act-all'] = True
465 print "%s" % record.data['log'],
466 print "%15s" % record.data['action']