5 from unified_model import cmpCategoryVal
10 from rt import is_host_in_rt_tickets
13 # Time to enforce policy
16 # Where to email the summary
17 SUMTO = "soltesz@cs.princeton.edu"
21 from unified_model import *
def get_ticket_id(record):
    """Return the RT ticket id stored in *record*, or None if there is none.

    The 'ticket_id' key is consulted first and 'found_rt_ticket' second; the
    first one whose value is neither None nor the empty string is returned.

    Args:
        record: mapping that may contain 'ticket_id' and/or 'found_rt_ticket'.

    Returns:
        The stored ticket id, or None when both keys are missing or blank.
    """
    for key in ('ticket_id', 'found_rt_ticket'):
        if key not in record:
            continue
        value = record[key]
        # Compare by value, not identity: `x is not ""` only tests whether x
        # is the very same object as the literal, which is not guaranteed for
        # equal strings (e.g. a unicode empty string vs. a str one).
        if value is not None and value != "":
            return value
    return None
33 class MonitorMergeDiagnoseSendEscellate:
34 def __init__(self, hostname, act):
# Set up per-host state: remember the hostname and resolve its loginbase
# through the "plcdb_hn2lb" database.
#   hostname -- node name; used as the lookup key below.
#   act      -- not stored by any line visible here, although self.act is
#               read elsewhere in this class.
# NOTE(review): the embedded numbering skips a line after the hostname
# assignment; presumably that is where `act` was saved -- confirm.
35 self.hostname = hostname
37 self.plcdb_hn2lb = None
# The attribute was set to None on the previous line, so this test always
# passes and the table is loaded on every construction.
38 if self.plcdb_hn2lb is None:
39 self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")
# Direct indexing: no fallback is provided for a hostname that is not in
# the table.
40 self.loginbase = self.plcdb_hn2lb[self.hostname]
43 def getFBRecord(self):
# Look up this host's scan record in the "findbad" database.
# Raises Exception when the hostname is not listed under fb['nodes'].
# NOTE(review): the `else:` before the raise and the final return are not
# visible in this excerpt; the caller uses the result as `fbnode`, so this
# presumably returns the 'values' entry -- confirm.
44 fb = database.dbLoad("findbad")
45 if self.hostname in fb['nodes']:
# Only the 'values' sub-entry of the node's record is kept.
46 fbnode = fb['nodes'][self.hostname]['values']
48 raise Exception("Hostname %s not in scan database"% self.hostname)
51 def getActionRecord(self):
# Fetch this host's current action record from the "act_all" database.
# NOTE(review): the branch taken when the host has no history, and the
# return statement, are not visible in this excerpt -- confirm what is
# returned in that case.
52 # update ticket status
53 act_all = database.dbLoad("act_all")
# Guard against both a missing host and an empty history list.
54 if self.hostname in act_all and len(act_all[self.hostname]) > 0:
# Index 0 is where add_and_save_act_all inserts new records.
55 actnode = act_all[self.hostname][0]
61 def getKernel(self, unamestr):
# The body of this method is not visible in this excerpt.
# mergeRecord passes it fbnode['kernel'] and stores the result back under
# the same key; presumably it extracts a kernel identifier from a uname
# string -- confirm against the full file.
68 def mergeRecord(self, fbnode, actnode):
# Combine the fresh scan record (fbnode) with the stored action record
# (actnode) and attach the current RT ticket status under 'rt'.
# NOTE(review): the embedded numbering skips lines in several places below
# (before the update() call, before the 'prev_category' copy, and after
# the commented-out debug code). Those presumably held the if/else that
# separates "no previous record" from "existing record", and the return --
# confirm against the full file.
69 fbnode['kernel'] = self.getKernel(fbnode['kernel'])
# Initial bookkeeping fields written onto the scan record.
70 fbnode['stage'] = "findbad"
71 fbnode['message'] = None
75 fbnode['time'] = time.time()
76 fbnode['date_created'] = time.time()
# Copy every scan field into actnode, then mark it as having no ticket and
# no previous category.
80 actnode.update(fbnode)
81 actnode['ticket_id'] = ""
82 actnode['prev_category'] = "NORECORD"
# Remember the old category before it is overwritten, then refresh only
# the scan-derived fields listed here.
84 actnode['prev_category']= actnode['category']
85 actnode['comonstats'] = fbnode['comonstats']
86 actnode['category'] = fbnode['category']
87 actnode['state'] = fbnode['state']
88 actnode['kernel'] = fbnode['kernel']
89 actnode['bootcd'] = fbnode['bootcd']
90 actnode['plcnode'] = fbnode['plcnode']
# get_ticket_id returns None when neither ticket key holds a value; in
# that case 'ticket_id' is normalised to the empty string.
91 ticket = get_ticket_id(actnode)
92 if ticket is None: actnode['ticket_id'] = ""
# The status lookup is called with whatever get_ticket_id returned,
# including None.
93 actnode['rt'] = mailer.getTicketStatus(ticket)
# Disabled debug dump of the merged record.
95 #for key in actnode.keys():
96 # print "%10s %s %s " % (key, "==", actnode[key])
97 #print "----------------------------"
# NOTE(review): the `def` line for this method is not visible in this
# excerpt; these statements use `self`, so they belong to a method of the
# class -- confirm its name and signature against the full file.
# Pipeline: load scan record -> load action record -> merge -> wrap in a
# Record -> diagnose -> optionally act.
102 fbnode = self.getFBRecord()
103 actnode= self.getActionRecord()
104 actrec = self.mergeRecord(fbnode, actnode)
105 record = Record(self.hostname, actrec)
106 diag = self.diagnose(record)
# Act only when self.act is truthy and diagnose produced a result.
107 if self.act and diag is not None:
108 self.action(record,diag)
110 def diagnose(self, record):
# Decide which flags apply to this host: stage reset / end-of-record based
# on RT ticket status, which mail to prepare based on category and state,
# and squeeze / back-off based on site status.
# NOTE(review): the embedded numbering skips many lines in this method
# (else branches, some conditions, and the return). The caller compares
# the result with None, so this presumably returns `diag` or None --
# confirm against the full file.
# 60*60*24 is one day in seconds; its meaning to PersistFlags is not
# visible here.
112 diag = PersistFlags(record.hostname, 60*60*24, db='persist_diagnose_flags')
113 # NOTE: change record stage based on RT status.
# NOTE(review): the flag is set here unconditionally and again below for
# 'new' tickets. If setFlag simply turns the flag on, the later
# getFlag('ResetStage') test is always true -- confirm whether a reset was
# intended on this line.
114 diag.setFlag('ResetStage')
115 if record.stageIswaitforever():
116 ticket = record.data['rt']
# Substring tests against the ticket's 'Status' field.
117 if 'new' in ticket['Status']:
118 diag.setFlag('ResetStage')
120 if 'resolved' in ticket['Status']:
121 diag.setFlag('EndRecord')
123 # NOTE: take category, and prepare action
124 category = record.getCategory()
# "error": prepare the node-down mail and its log line.
125 if category == "error":
126 diag.setFlag('SendNodedown')
127 record.data['message'] = emailTxt.mailtxt.newdown
128 record.data['log'] = self.getDownLog(record)
# "prod": inspect the state; the condition guarding the thank-you branch
# is not visible in this excerpt.
130 elif category == "prod":
131 state = record.getState()
133 diag.setFlag('SendThankyou')
134 record.data['message'] = emailTxt.mailtxt.newthankyou
135 record.data['log'] = self.getThankyouLog(record)
137 elif state == "debug":
140 print "unknown state %s for host %s" % (state, self.hostname)
142 print "unknown category: %s" % category
144 if diag.getFlag('ResetStage'):
145 print "resetting stage"
# checkStageAndTime returns None when there is nothing to send.
148 record = self.checkStageAndTime(diag,record)
150 print "checkStageAndTime Returned Valid Record"
151 site = PersistFlags(self.loginbase, 1, db='site_persistflags')
# NOTE(review): `is not` compares object identity, not value. A status
# string equal to "good" but held in a different object would still take
# this branch -- `!=` is likely what was meant.
153 if site.status is not "good":
154 print "Setting site %s for 'squeeze'" % self.loginbase
155 diag.setFlag('Squeeze')
157 print "Setting site %s for 'backoff'" % self.loginbase
158 diag.setFlag('BackOff')
163 print "checkStageAndTime Returned NULL Record"
166 def action(self, record, diag):
# Carry out what diagnose decided: mark the RT ticket for closing when the
# record has ended, send mail when allowed, record the ticket id the mail
# produced, and persist the record to "act_all".
# NOTE(review): the embedded numbering skips several lines here (else
# branches, and whatever followed "taking action") -- confirm against the
# full file.
# An improved node or an explicit EndRecord flag both end the record.
167 if record.improved() or diag.getFlag('EndRecord'):
168 print "end record for %s" % self.hostname
170 diag.setFlag('CloseRT')
173 if self.getSendEmailFlag(record):
174 print "sending email"
# The message is built for the ticket id currently on the record.
175 message = record.getMessage(record.data['ticket_id'])
177 message.send(record.getContacts())
# After sending, copy back the ticket id carried by the message when it
# is truthy.
178 if message.rt.ticket_id:
179 print "setting record ticket_id"
180 record.data['ticket_id'] = message.rt.ticket_id
181 if diag.getFlag('CloseRT'):
182 message.rt.closeTicket()
184 print "NOT sending email : %s %s" % (config.mail, record.data['rt'])
# Both the record and the 'Squeeze' flag must ask for action.
186 if record.data['takeaction'] and diag.getFlag('Squeeze'):
187 print "taking action"
190 print "saving act_all db"
191 self.add_and_save_act_all(record)
195 def getSendEmailFlag(self, record):
# Decide whether a mail should be sent for this record; action() sends
# only when the result is truthy.
# NOTE(review): only the condition below is visible in this excerpt; the
# statements that produce the return value are missing -- confirm against
# the full file.
# NOTE(review): the comment below says "created longer than 30 days ago",
# but the code tests `Created < 60*60*24*30` (30 days in seconds). Whether
# 'Created' is an age or a timestamp is not visible here -- confirm the
# direction of the comparison.
199 # resend if open & created longer than 30 days ago.
200 if 'rt' in record.data and \
201 'Status' in record.data['rt'] and \
202 "open" in record.data['rt']['Status'] and \
203 record.data['rt']['Created'] < 60*60*24*30:
def add_and_save_act_all(self, record):
    """Prepend record.data to this host's history in "act_all" and save it.

    The database is re-loaded first so the write is applied to the stored
    copy rather than to a stale in-memory one. The loaded table is also kept
    on self.act_all.

    Args:
        record: object whose `.data` is stored as the newest history entry.
    """
    self.act_all = database.dbLoad("act_all")
    # A host seen for the first time has no history list yet (the
    # getActionRecord lookup explicitly allows for that case), so create
    # one instead of failing with KeyError.
    if self.hostname not in self.act_all:
        self.act_all[self.hostname] = []
    # Index 0 is the slot getActionRecord reads back as the current record.
    self.act_all[self.hostname].insert(0, record.data)
    database.dbDump("act_all", self.act_all)
213 def getDownLog(self, record):
# Fill in the node-down mail fields on the record and build a "DOWN" log
# line for it.
# NOTE(review): the `else:` between the two `log =` assignments and the
# final return are not visible in this excerpt. The caller stores this
# method's result in record.data['log'], so it presumably returns `log` --
# confirm against the full file.
215 record.data['args'] = {'nodename': self.hostname}
# 'info' is (hostname, days-down string, "").
216 record.data['info'] = (self.hostname, Record.getStrDaysDown(record.data), "")
# Disabled debug dump of the record.
218 #for key in record.data.keys():
219 # print "%10s %s %s " % (key, "==", record.data[key])
# With no ticket id on the record, the log line shows 'found_rt_ticket'
# instead. The [1:] slice formats the 'info' tuple minus the hostname.
221 if record.data['ticket_id'] == "":
222 log = "DOWN: %20s : %-40s == %20s %s" % \
223 (self.loginbase, self.hostname, record.data['info'][1:], record.data['found_rt_ticket'])
225 log = "DOWN: %20s : %-40s == %20s %s" % \
226 (self.loginbase, self.hostname, record.data['info'][1:], record.data['ticket_id'])
def getThankyouLog(self, record):
    """Fill in the thank-you mail fields on *record* and return an "IMPR" log line.

    Side effects on record.data:
        'args' -- {'nodename': <hostname>}
        'info' -- (hostname, previous category, current category)

    Args:
        record: object exposing a `.data` mapping plus getState() and
            getCategory(), as used by diagnose().

    Returns:
        str: the log line; its last field is the record's 'ticket_id', or
        'found_rt_ticket' when 'ticket_id' is the empty string.
    """
    data = record.data
    data['args'] = {'nodename': self.hostname}
    data['info'] = (self.hostname, data['prev_category'], data['category'])

    # With no ticket id on the record, show the ticket found in RT instead.
    if data['ticket_id'] == "":
        ticket = data['found_rt_ticket']
    else:
        ticket = data['ticket_id']

    # This method used to format the bare names `state` and `category`,
    # which are locals of diagnose() and undefined here (NameError). Fetch
    # them from the record the same way diagnose() does.
    return "IMPR: %20s : %-40s == %20s %20s %s %s" % (
        self.loginbase, self.hostname, data['stage'],
        record.getState(), record.getCategory(), ticket)
244 def checkStageAndTime(self, diag, record):
# Advance the record through its escalation stages based on the current
# 'stage' and the time elapsed since record.data['time'].
# Each branch fills in 'email', 'action', 'message', 'takeaction' and,
# where the stage changes, 'stage' and 'time'.
# Returns None from the "nothing to do yet" branches. The caller treats any
# other result as a valid record.
# NOTE(review): the embedded numbering skips lines throughout this method
# (the `else:` lines inside each stage, the final `else:` of the chain, and
# the closing return) -- confirm the exact nesting against the full file.
# Stages are matched with substring tests (`in`), so branch order matters:
# 'ticket_waitforever' must be tested before 'waitforever'.
# record.data['message'] is indexed with 0, 1 and 2 below; presumably it is
# a sequence of increasingly severe mail templates -- confirm.
245 current_time = time.time()
246 delta = current_time - record.data['time']
247 if 'findbad' in record.data['stage']:
248 # The node is bad, and there's no previous record of it.
249 record.data['email'] = TECH
250 record.data['action'] = ['noop']
251 record.data['takeaction'] = False
252 record.data['message'] = record.data['message'][0]
253 record.data['stage'] = 'stage_actinoneweek'
# Same outcome as the 'findbad' branch: first notice, then wait a week.
255 elif 'reboot_node' in record.data['stage']:
256 record.data['email'] = TECH
257 record.data['action'] = ['noop']
258 record.data['message'] = record.data['message'][0]
259 record.data['stage'] = 'stage_actinoneweek'
260 record.data['takeaction'] = False
# The node improved: close the ticket and end the record.
262 elif 'improvement' in record.data['stage']:
263 print "backing off of %s" % self.hostname
264 record.data['action'] = ['close_rt']
265 record.data['takeaction'] = True
266 record.data['message'] = record.data['message'][0]
267 record.data['stage'] = 'monitor-end-record'
# First-week stage: escalate after 7 days, send one reminder after 3 days,
# otherwise do nothing.
269 elif 'actinoneweek' in record.data['stage']:
270 if delta >= 7 * SPERDAY:
# The recipient set grows with each escalation; the values are combined
# with bitwise OR.
271 record.data['email'] = TECH | PI
272 record.data['stage'] = 'stage_actintwoweeks'
273 record.data['message'] = record.data['message'][1]
274 record.data['action'] = ['nocreate' ]
275 record.data['time'] = current_time # reset clock for waitforever
276 record.data['takeaction'] = True
# The marker key set inside this branch keeps the reminder from being
# sent more than once.
277 elif delta >= 3* SPERDAY and not 'second-mail-at-oneweek' in record.data:
278 record.data['email'] = TECH
279 record.data['message'] = record.data['message'][0]
280 record.data['action'] = ['sendmailagain-waitforoneweekaction' ]
281 record.data['second-mail-at-oneweek'] = True
282 record.data['takeaction'] = False
284 record.data['message'] = None
285 record.data['action'] = ['waitforoneweekaction' ]
286 print "ignoring this record for: %s" % self.hostname
287 return None # don't send if there's no action
# Second-week stage: same shape as the first-week stage, one level up.
289 elif 'actintwoweeks' in record.data['stage']:
290 if delta >= 7 * SPERDAY:
291 record.data['email'] = TECH | PI | USER
292 record.data['stage'] = 'stage_waitforever'
293 record.data['message'] = record.data['message'][2]
294 record.data['action'] = ['suspendslices']
295 record.data['time'] = current_time # reset clock for waitforever
296 record.data['takeaction'] = True
297 elif delta >= 3* SPERDAY and not 'second-mail-at-twoweeks' in record.data:
298 record.data['email'] = TECH | PI
299 record.data['message'] = record.data['message'][1]
300 record.data['action'] = ['sendmailagain-waitfortwoweeksaction' ]
301 record.data['second-mail-at-twoweeks'] = True
302 record.data['takeaction'] = False
304 record.data['message'] = None
305 record.data['action'] = ['waitfortwoweeksaction']
306 return None # don't send if there's no action
# Ticket-driven wait: no mail is prepared in any of the visible
# sub-branches ('message' is always None); only the clock and log change.
308 elif 'ticket_waitforever' in record.data['stage']:
309 record.data['email'] = TECH
310 record.data['takeaction'] = True
# First time through this stage: tag the log line and start the clock.
311 if 'first-found' not in record.data:
312 record.data['first-found'] = True
313 record.data['log'] += " firstfound"
314 record.data['action'] = ['ticket_waitforever']
315 record.data['message'] = None
316 record.data['time'] = current_time
318 if delta >= 7*SPERDAY:
319 record.data['action'] = ['ticket_waitforever']
320 record.data['message'] = None
321 record.data['time'] = current_time # reset clock
323 record.data['action'] = ['ticket_waitforever']
324 record.data['message'] = None
327 elif 'waitforever' in record.data['stage']:
328 # more than 3 days since last action
329 # TODO: send only on weekdays.
330 # NOTE: expects that 'time' has been reset before entering waitforever stage
331 record.data['takeaction'] = True
# Re-send the third template every 3 days; otherwise nothing to send.
332 if delta >= 3*SPERDAY:
333 record.data['action'] = ['email-againwaitforever']
334 record.data['message'] = record.data['message'][2]
335 record.data['time'] = current_time # reset clock
337 record.data['action'] = ['waitforever']
338 record.data['message'] = None
339 return None # don't send if there's no action
342 # There is no action to be taken, possibly b/c the stage has
343 # already been performed, but diagnose picked it up again.
345 # 1. stage is unknown, or
346 # 2. delta is not big enough to bump it to the next stage.
347 # TODO: figure out which. for now assume 2.
348 print "UNKNOWN stage for %s; nothing done" % self.hostname
349 record.data['action'] = ['unknown']
350 record.data['message'] = record.data['message'][0]
# The statements below overwrite the 'action' and 'message' values set just
# above and restart the record at the first-week stage.
# NOTE(review): a line is skipped in the embedded numbering right before
# them, so they may sit under a condition not visible here -- confirm.
352 record.data['email'] = TECH
353 record.data['action'] = ['noop']
354 record.data['message'] = record.data['message'][0]
355 record.data['stage'] = 'stage_actinoneweek'
356 record.data['time'] = current_time # reset clock
357 record.data['takeaction'] = False
# The trailing comma keeps both prints on one output line (Python 2 print
# statement).
359 print "%s" % record.data['log'],
360 print "%15s" % record.data['action']