5 from unified_model import cmpCategoryVal
9 from monitor.wrapper import plccache
10 from datetime import datetime
12 from rt import is_host_in_rt_tickets
15 # Time to enforce policy
18 # Where to email the summary
19 SUMTO = "soltesz@cs.princeton.edu"
23 from unified_model import *
25 class MonitorMergeDiagnoseSendEscellate:
# Constructor: stores the hostname and resolves the site loginbase for it.
# NOTE(review): original line 30 is absent from this extract. `act` is not
# used on any visible line of this method, while self.act is read elsewhere
# in the class -- presumably it is stored on the missing line; confirm.
28 def __init__(self, hostname, act):
29 self.hostname = hostname
# The attribute is set to None and then tested for None right away, so the
# test always succeeds and the mapping from plccache is always installed.
31 self.plcdb_hn2lb = None
32 if self.plcdb_hn2lb is None:
33 self.plcdb_hn2lb = plccache.plcdb_hn2lb
# Index the mapping by hostname; the result is kept as self.loginbase.
34 self.loginbase = self.plcdb_hn2lb[self.hostname]
# Queries FindbadNodeRecord.get_latest_n_by() for this object's hostname.
# NOTE(review): original lines 39-45 are absent from this extract, so how
# the result is validated and returned cannot be seen here.
37 def getFBRecords(self):
38 fbrecs = FindbadNodeRecord.get_latest_n_by(hostname=self.hostname)
# Queries ActionRecord.get_latest_by() for this object's hostname.
# NOTE(review): original lines 48-54 are absent from this extract, so how
# the result is validated and returned cannot be seen here.
46 def getLastActionRecord(self):
47 actrec = ActionRecord.get_latest_by(hostname=self.hostname)
# Reads the observed_category of the first entry in actrec.findbad_records.
# NOTE(review): original lines 56-57 and 59-63 are absent from this extract;
# any guard around this lookup (e.g. for a missing actrec) and the return
# statement are not visible -- confirm against the full file.
55 def getPreviousCategory(self, actrec):
58 ret = actrec.findbad_records[0].observed_category
# Populates the `actdefault` mapping with default action-record fields and
# the supplied findbad records / previous action record.
#   fbnodes : indexable collection of findbad records; element 0 supplies
#             the current category.
#   actrec  : previous action record; its ticket_id is used for the RT lookup.
# NOTE(review): the line that creates `actdefault` (original 65-66) and
# everything after original line 87 (including any return) are absent from
# this extract.
64 def mergeRecord(self, fbnodes, actrec):
# Both timestamps are taken from datetime.now() (two separate calls).
67 actdefault['date_created'] = datetime.now()
68 actdefault['date_action_taken'] = datetime.now()
# Defaults for stage and message bookkeeping.
70 actdefault['stage'] = "initial"
71 actdefault['message_series'] = None
72 actdefault['message_index'] = None
73 actdefault['message_arguments'] = None
# Defaults for recipients and penalties: TECH only, level 0, no action taken.
75 actdefault['send_email_to'] = TECH
76 actdefault['penalty_level'] = 0
77 actdefault['action'] = [ 'noop' ]
78 actdefault['take_action'] = False
# No ticket yet; keep references to the inputs on the merged record.
80 actdefault['ticket_id'] = ""
81 actdefault['findbad_records'] = fbnodes
82 actdefault['last_action_record'] = actrec
# Previous category comes from the last action record, the current one from
# fbnodes[0] (presumably the newest findbad record -- confirm the ordering
# returned by get_latest_n_by).
84 actdefault['prev_category'] = self.getPreviousCategory(actrec)
85 actdefault['category'] = fbnodes[0].observed_category
# Ticket status is looked up through mailer using the previous record's id.
87 actdefault['rt'] = mailer.getTicketStatus(actrec.ticket_id)
# NOTE(review): the `def` line for these statements (around original line
# 90-91) is absent from this extract; they appear to form the body of a
# separate driver method -- confirm its name and signature in the full file.
# Pipeline: fetch findbad records and the last action record, merge them,
# wrap the result in a Record, then diagnose.
92 fbnodes = self.getFBRecords()
93 actnode= self.getLastActionRecord()
94 actrec = self.mergeRecord(fbnodes, actnode)
95 record = Record(self.hostname, actrec)
96 diag = self.diagnose(record)
# action() is only invoked when self.act is truthy and diagnose() produced
# something other than None.
97 if self.act and diag is not None:
98 self.action(record,diag)
# Inspects `record` and sets flags on `diag` describing what should happen
# next (SendNodedown, SendThankyou, RTEndRecord, Squeeze, BackOff).
# NOTE(review): the creation of `diag`, several branch headers (original
# lines 101-102, 108-109, 122, 129, 134-135, 137, 139-140, 143, 146, 150)
# and the return statement are absent from this extract; the comments below
# describe only what the visible lines do.
100 def diagnose(self, record):
103 # NOTE: change record stage based on RT status.
104 if record.stageIswaitforever():
105 ticket = record.data['rt']
# Membership tests on ticket['Status'] ('new' / 'resolved').
106 if 'new' in ticket['Status']:
107 print "Resetting Stage!!!!!"
110 if 'resolved' in ticket['Status']:
111 diag['RTEndRecord'] = True
113 # NOTE: take category, and prepare action
114 category = record.getCategory()
# "error" category: flag a node-down notice and attach the down log line.
115 if category == "error":
116 diag['SendNodedown'] = True
117 record.data['message_series'] = emailTxt.mailtxt.newdown
118 record.data['log'] = self.getDownLog(record)
# "prod" / "alpha" categories: decide further based on the node state.
120 elif category == "prod" or category == "alpha":
121 state = record.getState()
# Non-zero severity: flag a thank-you message and move the record to the
# 'improvement' stage.
123 if record.severity() != 0:
124 diag['SendThankyou'] = True
125 print "RESETTING STAGE: improvement"
126 record.data['stage'] = 'improvement'
127 record.data['message_series'] = emailTxt.mailtxt.newthankyou
128 record.data['log'] = self.getThankyouLog(record)
130 # NOTE: do nothing, since we've already done the above.
131 print "DIAGNOSED: %s is boot. no further action necessary." % record.hostname
133 elif state == "debug":
136 print "unknown state %s for host %s" % (state, self.hostname)
138 print "unknown category: %s" % category
141 # TODO: how to not send email?...
# `record` is rebound to whatever checkStageAndTime() returns.
142 record = self.checkStageAndTime(record)
144 print "diagnose: checkStageAndTime Returned Valid Record"
145 siterec = HistorySiteRecord.by_loginbase(self.loginbase)
# Site status without "good" in it -> Squeeze. The BackOff lines that follow
# are presumably under an `else:` on the missing original line 150 -- confirm.
147 if "good" not in siterec.status: # != "good":
148 print "diagnose: Setting site %s for 'squeeze'" % self.loginbase
149 diag['Squeeze'] = True
151 print "diagnose: Setting site %s for 'backoff'" % self.loginbase
152 diag['BackOff'] = True
# Carries out what diagnose() decided: sends mail, applies penalties, saves
# the action record and closes the RT ticket, driven by `record.data` and
# the flags in `diag`.
# NOTE(review): many original lines (157-159, 164, 166, 172-173, 177,
# 179-182, 186, 189-190, 193, 196-199, 201-203, 205-207) are absent from
# this extract, so the exact nesting of the statements below is not visible.
156 def action(self, record, diag):
160 print "%s %s DAYS DOWN" % ( self.hostname, Record.getDaysDown(record.data) )
# Mail is sent when the send-email flag is set and the node has been down
# at least 2 days, or when the stage contains "monitor-end-record".
161 if ( self.getSendEmailFlag(record) and Record.getDaysDown(record.data) >= 2 ) or \
162 "monitor-end-record" in record.data['stage']:
163 print "action: getting message"
165 message = record.getMessage(record.data['ticket_id'])
167 print "action: sending email"
168 message.send(record.getContacts())
# If the message carries an RT ticket id, store it back on the record.
169 if message.rt.ticket_id:
170 print "action: setting record ticket_id"
171 record.data['ticket_id'] = message.rt.ticket_id
# NOTE(review): diag is indexed with [] here but queried via getFlag()
# below; confirm both behave the same when the flag was never set.
174 if ( record.data['take_action'] and diag['Squeeze'] ):
175 print "action: taking action"
176 record.takeAction(record.data['penalty_level'])
178 if diag.getFlag('BackOff'):
# Persist the action record only when record.saveAction() is truthy.
183 if record.saveAction():
184 print "action: saving act_all db"
185 self.add_and_save_act_all(record)
187 print "action: NOT saving act_all db"
188 print "stage: %s %s" % ( record.data['stage'], record.data['save_act_all'] )
# End-of-record handling: swap the RTEndRecord flag for CloseRT.
191 if record.improved() or diag['RTEndRecord']:
192 print "action: end record for %s" % self.hostname
194 diag['CloseRT'] = True
195 del diag['RTEndRecord']
# NOTE(review): `message` is assigned on original line 165 only; confirm it
# is always bound on the path that reaches this call.
200 message.rt.closeTicket()
204 print "NOT sending email : %s %s" % (config.mail, record.data['rt'])
# Copies record.data into a new RecordAction row for this host, tagging it
# with the round number read from the host's RecordActionSync entry.
# NOTE(review): original lines 209, 212, 215 and 219-220 are absent from
# this extract (the delimiters of the description text below, and possibly
# the round increment that the description mentions, are not visible).
208 def add_and_save_act_all(self, record):
210 Read the sync record for this node, and increment the round and
211 create an ActionRecord for this host using the record.data values.
# Look up the sync entry and create the new action row, both by hostname.
213 recsync = RecordActionSync.get_by(hostname=self.hostname)
214 rec = RecordAction(hostname=self.hostname)
216 record.data['round'] = recsync.round
217 # TODO: we will need to delete some of these before setting them in the DB.
# Every key of record.data is passed to rec.set() as a keyword argument.
218 rec.set(**record.data)
# Fills record.data['args'] / record.data['info'] and formats a "DOWN:" log
# line for this host.
# NOTE(review): original lines 222, 225, 228, 232 and 235-236 are absent
# from this extract; the second assignment to `log` is presumably under an
# `else:` and the value presumably returned -- confirm.
221 def getDownLog(self, record):
223 record.data['args'] = {'nodename': self.hostname}
# info is a 3-tuple: hostname, days-down string, and an empty string.
224 record.data['info'] = (self.hostname, Record.getStrDaysDown(record.data), "")
226 #for key in record.data.keys():
227 # print "%10s %s %s " % (key, "==", record.data[key])
# The two format calls differ only in the last field: found_rt_ticket when
# no ticket_id is recorded but one was found, ticket_id otherwise.
# record.data['info'][1:] drops the hostname from the tuple.
229 if record.data['ticket_id'] == "" and 'found_rt_ticket' in record.data:
230 log = "DOWN: %20s : %-40s == %20s %s" % \
231 (self.loginbase, self.hostname, record.data['info'][1:], record.data['found_rt_ticket'])
233 log = "DOWN: %20s : %-40s == %20s %s" % \
234 (self.loginbase, self.hostname, record.data['info'][1:], record.data['ticket_id'])
# Fills record.data['args'] / record.data['info'] and formats an "IMPR:" log
# line describing the category change for this host.
# NOTE(review): original lines 238, 241-242, 247, 251 and 253-254 are absent
# from this extract, so it is not visible whether the final short "IMPR:"
# assignment replaces the longer one unconditionally or sits in a branch,
# nor how `log` is returned -- confirm.
237 def getThankyouLog(self, record):
239 record.data['args'] = {'nodename': self.hostname}
# info is a 3-tuple: hostname, previous category, current category.
240 record.data['info'] = (self.hostname, record.data['prev_category'], record.data['category'])
# The two long format calls differ only in the last field: found_rt_ticket
# when no ticket_id is recorded but one was found, ticket_id otherwise.
243 if record.data['ticket_id'] == "" and 'found_rt_ticket' in record.data:
244 log = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
245 (self.loginbase, self.hostname, record.data['stage'],
246 record.data['prev_category'], record.data['category'], record.data['found_rt_ticket'])
248 log = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
249 (self.loginbase, self.hostname, record.data['stage'],
250 record.data['prev_category'], record.data['category'], record.data['ticket_id'])
252 log = "IMPR: %s improved to %s " % (self.hostname, record.data['category'])
# Copies every keyword argument into `rec`, key by key.
# NOTE(review): the line creating `rec` (original 256) and any return
# (original 259) are absent from this extract; callers in this class pass
# the result to record.data.update(), so `rec` is presumably a dict that is
# returned -- confirm.
255 def makeRecord(self, **kwargs):
257 for key in kwargs.keys():
258 rec[key] = kwargs[key]
# Decides which escalation stage `record` is in and updates record.data
# (stage, send_email_to, action, message_index, save_act_all, ...) through
# makeRecord() + record.data.update().
# NOTE(review): many original lines (262, 264, 270, 272-273, 280, 285, 287,
# 290, 295, 298-304, 306, 312-313, 320, 322, 329, 331-332, 335, 342-344,
# 346-347, 349) are absent from this extract, including the delimiters of
# the description text, the header that introduces the stage table, several
# branch headers and the return statement. Comments below cover only what
# the visible lines show.
261 def checkStageAndTime(self, record):
263 The core variables are:
265 send_email_to : defines who to send messages to at this time
266 take_action : whether or not to take action
267 penalty_level : how much of a penalty to apply
268 message_index : where in the escellation sequence we are.
269 save_act_all : whether or not to save the action record in the db.
271 action/stage : stage tracks which state we're in.
# Stage table: stage name -> one-element list of settings.
# NOTE(review): if this table is live code rather than descriptive text,
# `{ key=value }` is not valid dict-literal syntax. The "paused" entry also
# lacks a comma between `save=True` and `length=...` and after its closing
# bracket, unlike its neighbours. The "initial", "paused" and "improvement"
# entries omit some of the keys ('length', 'index', 'email') that the code
# below reads from `values`.
274 "initial" : [ { action='noop', next="weekone"}],
275 "weekone" : [ { action='noop', index=0, save=True, email=TECH, length=7*SPERDAY, next="weektwo" }, ],
276 "weektwo" : [ { action='nocreate', index=1, save=True, email=TECH|PI, length=7*SPERDAY, next="waitforever" }, ],
277 "waitforever" : [ { action='suspendslices',index=2, save=True, email=TECH|PI|USER, length=7*SPERDAY, next="waitforever" }, ],
278 "paused" : [ { action='noop', save=True length=30*SPERDAY, next="weekone" }, ]
279 "improvement" : [ { action='close_rt', index=0, save=True, email=TECH, next="monitor-end-record" }, ],
281 # TODO: make this time relative to the PREVIOUS action taken.
# delta = time elapsed between now (time.time()) and the record's most
# recent time, in the units returned by getMostRecentTime() (presumably
# seconds since the epoch -- confirm).
282 current_time = time.time()
283 current_stage = record.getMostRecentStage()
284 recent_time = record.getMostRecentTime()
286 delta = current_time - recent_time
288 if current_stage in stages:
289 values = stages[current_stage][0]
# Stage length exceeded: move on to the stage named by 'next'.
291 if delta >= values['length']:
292 print "checkStageAndTime: transition to next stage"
293 new_stage = values['next']
# NOTE(review): the lookup on original line 289 takes element [0] of the
# stage list, this one does not; values['email'] etc. further down would
# then be indexing a list -- confirm.
294 values = stages[new_stage]
# A third of the stage length has passed and no 'second_mail_at_oneweek'
# marker is on the record yet.
296 elif delta >= values['length']/3 and not 'second_mail_at_oneweek' in record.data:
297 print "checkStageAndTime: second message in one week for stage two"
# NOTE(review): same message text as on original line 297; the branch this
# print belongs to is not visible.
305 print "checkStageAndTime: second message in one week for stage two"
# penalty_level and message_index are both taken from values['index'].
# NOTE(review): date_action_taken is a time.time() float here, whereas
# mergeRecord in this class stores datetime.now() under the same key --
# confirm consumers accept both.
307 rec = self.makeRecord( stage=new_stage, send_email_to=values['email'],
308 action=values['action'], message_index=values['index'],
309 save_act_all=values['save'], penalty_level=values['index'],
310 date_action_taken=current_time)
311 record.data.update(rec)
# Substring test on the stage name: 'initial' -> start at "weekone".
314 if 'initial' in record.data['stage']:
315 # The node is bad, and there's no previous record of it.
316 rec = self.makeRecord(
317 stage="weekone", send_email_to=TECH,
318 action=['noop'], take_action=False,
319 message_index=0, save_act_all=True,
321 record.data.update(rec)
# 'improvement' -> close out with the 'monitor-end-record' stage.
323 elif 'improvement' in record.data['stage']:
324 print "checkStageAndTime: backing off of %s" % self.hostname
325 rec = self.makeRecord(
326 stage='monitor-end-record', send_email_to=TECH,
327 action=['close_rt'], take_action=True,
328 message_index=0, save_act_all=True,
330 record.data.update(rec)
333 # There is no action to be taken, possibly b/c the stage has
334 # already been performed, but diagnose picked it up again.
336 # 1. stage is unknown, or
337 # 2. delta is not big enough to bump it to the next stage.
338 # TODO: figure out which. for now assume 2.
339 print "UNKNOWN stage for %s; nothing done" % self.hostname
340 rec = self.makeRecord(
341 stage='weekone', send_email_to=TECH,
345 date_action_taken=current_time,
348 record.data.update(rec)
# Python 2 print with a trailing comma: the log text and the action are
# emitted on the same output line.
350 print "%s" % record.data['log'],
351 print "%15s" % record.data['action']