6 from monitor.wrapper import mailer
7 from monitor.wrapper import emailTxt
8 from monitor.wrapper import plccache
9 from datetime import datetime
11 from monitor.wrapper.rt import is_host_in_rt_tickets
12 from monitor.wrapper import plc
14 # Time to enforce policy
17 # Where to email the summary
# NOTE(review): SUMTO is not referenced by any statement visible in this
# listing; confirm it is still used before relying on it.
18 SUMTO = "soltesz@cs.princeton.edu"
22 from monitor.model import *
# Per-host policy driver. Judging only by the methods visible below, an
# instance is bound to one hostname and can: fetch that host's findbad and
# action records, merge them into a single dict of fields, diagnose the
# node's category/stage, and act on the diagnosis (send mail, apply a
# penalty level, close an RT ticket, save an action record).
# NOTE(review): this listing has lost its indentation and some of its lines
# (the embedded line numbers skip), so the class body is incomplete as shown.
24 class MonitorMergeDiagnoseSendEscellate:
# Bind the instance to `hostname` and look up the loginbase for that host.
# NOTE(review): `act` is accepted here but no visible statement stores it,
# although a later statement in this class reads `self.act`. The embedded
# numbering skips 29, so the assignment is presumably on a line missing from
# this listing -- confirm against the real file.
27 def __init__(self, hostname, act):
28 self.hostname = hostname
# The attribute is set to None immediately before being tested for None, so
# as shown the test always passes and the mapping always comes from
# plccache.plcdb_hn2lb.
30 self.plcdb_hn2lb = None
31 if self.plcdb_hn2lb is None:
32 self.plcdb_hn2lb = plccache.plcdb_hn2lb
# Subscript lookup by hostname; presumably raises KeyError for a hostname
# absent from the cache if the mapping is a plain dict -- TODO confirm.
33 self.loginbase = self.plcdb_hn2lb[self.hostname]
# Query FindbadNodeRecord.get_latest_n_by() for this instance's hostname.
# NOTE(review): only the query is visible; the embedded numbering skips
# 38-44, so whatever validates or returns `fbrecs` is missing from this
# listing.
36 def getFBRecords(self):
37 fbrecs = FindbadNodeRecord.get_latest_n_by(hostname=self.hostname)
# Query ActionRecord.get_latest_by() for this instance's hostname.
# NOTE(review): only the query is visible; the embedded numbering skips
# 47-53, so any handling of a missing record and the return statement are
# not shown here.
45 def getLastActionRecord(self):
46 actrec = ActionRecord.get_latest_by(hostname=self.hostname)
# Derive the previously observed category from an action record.
# The one visible statement reads `observed_category` from the first entry of
# `actrec.findbad_records`.
# NOTE(review): the embedded numbering skips 55-56 and 58-62, so any guard
# for a missing `actrec` / empty list, any fallback value, and the return of
# `ret` are not visible in this listing.
54 def getPreviousCategory(self, actrec):
57 ret = actrec.findbad_records[0].observed_category
# Populate `actdefault` with the starting fields for this host's record,
# combining fixed defaults with values taken from `fbnodes` (findbad records)
# and `actrec` (last action record).
# NOTE(review): the statement that creates `actdefault` and the method's
# return are not visible (embedded numbering skips 64-65 and 87-89), nor are
# a few intermediate lines.
63 def mergeRecord(self, fbnodes, actrec):
# datetime.now() is called twice, so the two timestamps are separate
# readings of the clock rather than one shared value.
66 actdefault['date_created'] = datetime.now()
67 actdefault['date_action_taken'] = datetime.now()
69 actdefault['stage'] = "initial"
70 actdefault['message_series'] = None
71 actdefault['message_index'] = None
72 actdefault['message_arguments'] = None
# TECH is not defined in the visible part of this file; presumably it comes
# from the star import of monitor.model -- TODO confirm.
74 actdefault['send_email_to'] = TECH
75 actdefault['penalty_level'] = 0
76 actdefault['action'] = [ 'noop' ]
77 actdefault['take_action'] = False
79 actdefault['ticket_id'] = ""
80 actdefault['findbad_records'] = fbnodes
81 actdefault['last_action_record'] = actrec
83 actdefault['prev_category'] = self.getPreviousCategory(actrec)
# Indexes the first findbad record, so `fbnodes` must be non-empty here.
84 actdefault['category'] = fbnodes[0].observed_category
# Reads actrec.ticket_id; as shown this needs `actrec` to be a real record.
# NOTE(review): a guard may sit on one of the missing lines -- confirm.
86 actdefault['rt'] = mailer.getTicketStatus(actrec.ticket_id)
# Top-level sequence for one host: fetch the findbad and last-action records,
# merge them, wrap the result in a Record, diagnose it, and -- only when
# self.act is truthy and diagnose() returned something other than None --
# carry out the action.
# NOTE(review): the `def` line for this body is missing from the listing
# (embedded numbering skips 90), so the method's name and signature are not
# visible here.
91 fbnodes = self.getFBRecords()
92 actnode= self.getLastActionRecord()
93 actrec = self.mergeRecord(fbnodes, actnode)
94 record = Record(self.hostname, actrec)
95 diag = self.diagnose(record)
96 if self.act and diag is not None:
97 self.action(record,diag)
# Inspect `record` and decide what should happen for this host.
# Visible effects: sets flags on `diag` (RTEndRecord, SendNodedown,
# SendThankyou, Squeeze, BackOff), updates record.data (stage,
# message_series, log), and passes the record through checkStageAndTime().
# NOTE(review): the creation of `diag`, several branch keywords (else/elif
# bodies) and the return statement fall on lines missing from this listing,
# so the exact branch structure cannot be confirmed from here.
99 def diagnose(self, record):
102 # NOTE: change record stage based on RT status.
103 if record.stageIswaitforever():
104 ticket = record.data['rt']
# `in` is a membership test; if ticket['Status'] is a string this is a
# substring match rather than an equality check -- TODO confirm the type.
105 if 'new' in ticket['Status']:
106 print "Resetting Stage!!!!!"
109 if 'resolved' in ticket['Status']:
110 diag['RTEndRecord'] = True
112 # NOTE: take category, and prepare action
113 category = record.getCategory()
# Category "error": flag a node-down notice and attach the "newdown" message
# series plus a DOWN log line.
114 if category == "error":
115 diag['SendNodedown'] = True
116 record.data['message_series'] = emailTxt.mailtxt.newdown
117 record.data['log'] = self.getDownLog(record)
# Category "prod" or "alpha": behaviour then depends on record.getState().
119 elif category == "prod" or category == "alpha":
120 state = record.getState()
# A non-zero severity here leads to a thank-you notice and moves the record
# to the 'improvement' stage.
122 if record.severity() != 0:
123 diag['SendThankyou'] = True
124 print "RESETTING STAGE: improvement"
125 record.data['stage'] = 'improvement'
126 record.data['message_series'] = emailTxt.mailtxt.newthankyou
127 record.data['log'] = self.getThankyouLog(record)
129 # NOTE: do nothing, since we've already done the above.
130 print "DIAGNOSED: %s is boot. no further action necessary." % record.hostname
132 elif state == "debug":
135 print "unknown state %s for host %s" % (state, self.hostname)
137 print "unknown category: %s" % category
140 # TODO: how to not send email?...
# checkStageAndTime() result is rebound to `record`; the condition guarding
# the "Returned Valid Record" message is on a missing line.
141 record = self.checkStageAndTime(record)
143 print "diagnose: checkStageAndTime Returned Valid Record"
144 siterec = HistorySiteRecord.by_loginbase(self.loginbase)
# Site status decides between the Squeeze and BackOff flags; the keyword
# separating the two outcomes (presumably `else:`) is not visible.
146 if "good" not in siterec.status: # != "good":
147 print "diagnose: Setting site %s for 'squeeze'" % self.loginbase
148 diag['Squeeze'] = True
150 print "diagnose: Setting site %s for 'backoff'" % self.loginbase
151 diag['BackOff'] = True
# Carry out what diagnose() decided for `record`: optionally send mail and
# remember the resulting ticket id, apply a squeeze or backoff action, save
# the action record, and close the RT ticket when the record has ended.
# NOTE(review): branch keywords (else/try/except) and some statements fall
# on lines missing from this listing, so the nesting of the statements below
# cannot be confirmed from here.
155 def action(self, record, diag):
159 print "%s %s DAYS DOWN" % ( self.hostname, Record.getDaysDown(record.data) )
# Mail is attempted when either (a) the send-email flag is set and the node
# has been down at least 2 days, or (b) the stage contains
# "monitor-end-record".
160 if ( self.getSendEmailFlag(record) and Record.getDaysDown(record.data) >= 2 ) or \
161 "monitor-end-record" in record.data['stage']:
162 print "action: getting message"
164 message = record.getMessage(record.data['ticket_id'])
166 print "action: sending email"
167 message.send(record.getContacts())
# Keep the ticket id produced by sending, if there is one.
168 if message.rt.ticket_id:
169 print "action: setting record ticket_id"
170 record.data['ticket_id'] = message.rt.ticket_id
# NOTE(review): 'Squeeze' is read by subscript here while 'BackOff' below is
# read via diag.getFlag(); whether the subscript can fail when the flag was
# never set depends on diag's type, which is not visible -- confirm.
173 if ( record.data['take_action'] and diag['Squeeze'] ):
174 print "action: taking squeeze action"
175 record.takeAction(record.data['penalty_level'])
177 if diag.getFlag('BackOff'):
178 print "action: taking backoff action"
183 if record.saveAction():
184 print "action: saving act_all db"
185 self.add_and_save_act_all(record)
187 print "action: NOT saving act_all db"
188 print "stage: %s %s" % ( record.data['stage'], record.data['save_act_all'] )
# End-of-record handling: mark the ticket for closing and drop the
# RTEndRecord flag.
# NOTE(review): if record.improved() is true but 'RTEndRecord' was never
# set, the `del` may fail depending on diag's type -- confirm.
191 if record.improved() or diag['RTEndRecord']:
192 print "action: end record for %s" % self.hostname
194 diag['CloseRT'] = True
195 del diag['RTEndRecord']
# `message` is only assigned in the mail branch above; whether this call is
# nested inside that branch is not visible here.
200 message.rt.closeTicket()
# `config` is not imported in the visible part of this file; presumably it
# arrives via a missing line or the monitor.model star import -- TODO confirm.
204 print "NOT sending email : %s" % config.mail
# Build a RecordAction for this host from record.data, tagging it with the
# round number read from the host's RecordActionSync row.
# The two prose lines below the `def` are the method's docstring text; the
# quote lines around them are missing from this listing.
# NOTE(review): that text says the round is incremented and an ActionRecord
# is created, but the visible code only copies recsync.round and constructs
# a RecordAction. The embedded numbering skips 215, so an increment may sit
# on a missing line -- confirm against the real file.
208 def add_and_save_act_all(self, record):
210 Read the sync record for this node, and increment the round and
211 create an ActionRecord for this host using the record.data values.
213 recsync = RecordActionSync.get_by(hostname=self.hostname)
214 rec = RecordAction(hostname=self.hostname)
216 record.data['round'] = recsync.round
217 # TODO: we will need to delete some of these before setting them in the DB.
# Every key in record.data is passed to rec.set() as a keyword argument.
218 rec.set(**record.data)
# Prepare the message arguments for a node-down notice and build a one-line
# "DOWN:" log string for this host.
# Side effects: overwrites record.data['args'] and record.data['info'].
# The log's last field is record.data['found_rt_ticket'] when ticket_id is
# the empty string and that key exists; the second format uses
# record.data['ticket_id'] instead.
# NOTE(review): the keyword separating the two assignments (presumably
# `else:`) and the return of `log` are on lines missing from this listing.
221 def getDownLog(self, record):
223 record.data['args'] = {'nodename': self.hostname}
224 record.data['info'] = (self.hostname, Record.getStrDaysDown(record.data), "")
226 #for key in record.data.keys():
227 # print "%10s %s %s " % (key, "==", record.data[key])
# record.data['info'][1:] is a 2-element tuple slice, so the third %20s
# field is filled with the text form of that tuple.
229 if record.data['ticket_id'] == "" and 'found_rt_ticket' in record.data:
230 log = "DOWN: %20s : %-40s == %20s %s" % \
231 (self.loginbase, self.hostname, record.data['info'][1:], record.data['found_rt_ticket'])
233 log = "DOWN: %20s : %-40s == %20s %s" % \
234 (self.loginbase, self.hostname, record.data['info'][1:], record.data['ticket_id'])
# Prepare the message arguments for a thank-you (improvement) notice and
# build an "IMPR:" log string for this host.
# Side effects: overwrites record.data['args'] and record.data['info'].
# NOTE(review): three assignments to `log` are visible. The first two mirror
# the found_rt_ticket / ticket_id choice; the third is a shorter message. If
# the third runs unconditionally it replaces whichever of the first two was
# built -- the lines that would show this (and the return) are missing from
# this listing, so confirm against the real file.
237 def getThankyouLog(self, record):
239 record.data['args'] = {'nodename': self.hostname}
240 record.data['info'] = (self.hostname, record.data['prev_category'], record.data['category'])
243 if record.data['ticket_id'] == "" and 'found_rt_ticket' in record.data:
244 log = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
245 (self.loginbase, self.hostname, record.data['stage'],
246 record.data['prev_category'], record.data['category'], record.data['found_rt_ticket'])
248 log = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
249 (self.loginbase, self.hostname, record.data['stage'],
250 record.data['prev_category'], record.data['category'], record.data['ticket_id'])
252 log = "IMPR: %s improved to %s " % (self.hostname, record.data['category'])
# Copy every keyword argument into `rec`, key by key.
# NOTE(review): the statement that creates `rec` and the return are on lines
# missing from this listing (embedded numbering skips 256 and 259); callers
# in this file pass the result to dict.update(), so `rec` is presumably a
# dict -- confirm.
255 def makeRecord(self, **kwargs):
257 for key in kwargs.keys():
258 rec[key] = kwargs[key]
# Advance `record` through the escalation stages based on how much time has
# passed since its most recent stage, updating record.data in place with the
# fields for the chosen stage.
# The prose lines directly below the `def` are the method's docstring text;
# the quote lines around them are missing from this listing.
# NOTE(review): `stages` is not defined in any visible statement (the only
# description is the commented-out table below), `time` is not imported in
# the visible imports, and several branch keywords plus the return fall on
# missing lines. The nesting of the statements below cannot be confirmed
# from here.
261 def checkStageAndTime(self, record):
263 The core variables are:
265 send_email_to : defines who to send messages to at this time
266 take_action : whether or not to take action
267 penalty_level : how much of a penalty to apply
268 message_index : where in the escellation sequence we are.
269 save_act_all : whether or not to save the action record in the db.
271 action/stage : stage tracks which state we're in.
274 # "initial" : [ { action='noop', next="weekone"}],
275 # "weekone" : [ { action='noop', index=0, save=True, email=TECH, length=7*SPERDAY, next="weektwo" }, ],
276 # "weektwo" : [ { action='nocreate', index=1, save=True, email=TECH|PI, length=7*SPERDAY, next="waitforever" }, ],
277 # "waitforever" : [ { action='suspendslices',index=2, save=True, email=TECH|PI|USER, length=7*SPERDAY, next="waitforever" }, ],
278 # "paused" : [ { action='noop', save=True length=30*SPERDAY, next="weekone" }, ]
279 # "improvement" : [ { action='close_rt', index=0, save=True, email=TECH, next="monitor-end-record" }, ],
281 # TODO: make this time relative to the PREVIOUS action taken.
282 current_time = time.time()
283 current_stage = record.getMostRecentStage()
284 recent_time = record.getMostRecentTime()
# Elapsed time since the record's most recent stage; compared below against
# the stage's 'length'.
286 delta = current_time - recent_time
288 if current_stage in stages:
289 values = stages[current_stage][0]
291 if delta >= values['length']:
292 print "checkStageAndTime: transition to next stage"
293 new_stage = values['next']
# NOTE(review): the lookup above indexes [0] but this one does not. If the
# entries are one-element lists as the commented table suggests, `values`
# becomes a list here and the later values['email'] style lookups would
# fail -- confirm against the real `stages` definition.
294 values = stages[new_stage]
# A second message is considered once a third of the stage length has
# elapsed and the 'second_mail_at_oneweek' marker is not yet in record.data.
296 elif delta >= values['length']/3 and not 'second_mail_at_oneweek' in record.data:
297 print "checkStageAndTime: second message in one week for stage two"
# Same message text as the print above; the branch it belongs to is on a
# missing line.
305 print "checkStageAndTime: second message in one week for stage two"
# Both message_index and penalty_level are taken from values['index'].
# date_action_taken is the float from time.time() here, whereas
# mergeRecord() stores a datetime under the same key.
307 rec = self.makeRecord( stage=new_stage, send_email_to=values['email'],
308 action=values['action'], message_index=values['index'],
309 save_act_all=values['save'], penalty_level=values['index'],
310 date_action_taken=current_time)
311 record.data.update(rec)
# Stage names are matched with `in`, i.e. membership/substring rather than
# equality.
314 if 'initial' in record.data['stage']:
315 # The node is bad, and there's no previous record of it.
316 rec = self.makeRecord(
317 stage="weekone", send_email_to=TECH,
318 action=['noop'], take_action=False,
319 message_index=0, save_act_all=True,
321 record.data.update(rec)
323 elif 'improvement' in record.data['stage']:
324 print "checkStageAndTime: backing off of %s" % self.hostname
325 rec = self.makeRecord(
326 stage='monitor-end-record', send_email_to=TECH,
327 action=['close_rt'], take_action=True,
328 message_index=0, save_act_all=True,
330 record.data.update(rec)
333 # There is no action to be taken, possibly b/c the stage has
334 # already been performed, but diagnose picked it up again.
336 # 1. stage is unknown, or
337 # 2. delta is not big enough to bump it to the next stage.
338 # TODO: figure out which. for now assume 2.
339 print "UNKNOWN stage for %s; nothing done" % self.hostname
340 rec = self.makeRecord(
341 stage='weekone', send_email_to=TECH,
345 date_action_taken=current_time,
348 record.data.update(rec)
# Python 2 print with a trailing comma suppresses the newline, so the action
# is printed on the same output line as the log text.
350 print "%s" % record.data['log'],
351 print "%15s" % record.data['action']