5 from www.printbadnodes import cmpCategoryVal
10 from policy import get_ticket_id, print_stats, close_rt_backoff, reboot_node
11 from rt import is_host_in_rt_tickets
14 # Time to enforce policy
17 # Where to email the summary
18 SUMTO = "soltesz@cs.princeton.edu"
22 from unified_model import *
24 class MonitorMergeDiagnoseSendEscellate:
# Constructor: remembers the hostname and resolves the site loginbase for it.
# NOTE(review): original line 27 is not visible in this excerpt; other methods
# read self.act, so it is presumably assigned there from `act` -- confirm.
25 def __init__(self, hostname, act):
26 self.hostname = hostname
# The attribute is set to None and tested for None on the very next line, so
# as written the "plcdb_hn2lb" database is loaded on every construction.
28 self.plcdb_hn2lb = None
29 if self.plcdb_hn2lb is None:
30 self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")
# Indexed directly by hostname; presumably raises KeyError for an unknown
# host if the loaded object is a plain dict -- confirm.
31 self.loginbase = self.plcdb_hn2lb[self.hostname]
# Load the "findbad" database and pick out this host's 'values' entry from
# fb['nodes'].
# Raises Exception when the hostname has no entry in fb['nodes'].
# NOTE(review): original lines 38 and 40-41 are not visible in this excerpt;
# presumably an `else:` precedes the raise and fbnode is returned -- confirm.
34 def getFBRecord(self):
35 fb = database.dbLoad("findbad")
36 if self.hostname in fb['nodes']:
37 fbnode = fb['nodes'][self.hostname]['values']
39 raise Exception("Hostname %s not in scan database"% self.hostname)
# Load the "act_all" database and take the first entry of this host's list,
# when the host is present and its list is non-empty.
# NOTE(review): original lines 47-51 are not visible in this excerpt, so the
# value used when no entry exists and the return statement cannot be
# confirmed from here.
42 def getActionRecord(self):
43 # update ticket status
44 act_all = database.dbLoad("act_all")
45 if self.hostname in act_all and len(act_all[self.hostname]) > 0:
# Index 0 is read here; add_and_save_act_all inserts new entries at index 0.
46 actnode = act_all[self.hostname][0]
# Called by mergeRecord with fbnode['kernel']; the result is stored back into
# fbnode['kernel'].
# NOTE(review): the body (original lines 53-58) is not visible in this
# excerpt, so how `unamestr` is transformed cannot be documented from here.
52 def getKernel(self, unamestr):
# Combine the fresh scan record (fbnode) with the stored action record
# (actnode) and attach the RT ticket status under actnode['rt'].
# NOTE(review): original lines 63-65, 68-70, 74 and 85 are not visible in
# this excerpt; the conditions that separate the two groups of assignments
# below, and the return statement, cannot be confirmed from here.
59 def mergeRecord(self, fbnode, actnode):
# Normalise the scan record before merging: kernel via getKernel, stage
# tagged "findbad", no message yet, both timestamps set to now.
60 fbnode['kernel'] = self.getKernel(fbnode['kernel'])
61 fbnode['stage'] = "findbad"
62 fbnode['message'] = None
66 fbnode['time'] = time.time()
67 fbnode['date_created'] = time.time()
# First group: actnode takes every fbnode key, with an empty ticket id and
# "NORECORD" as the previous category (presumably the no-prior-record case).
71 actnode.update(fbnode)
72 actnode['ticket_id'] = ""
73 actnode['prev_category'] = "NORECORD"
# Second group: the current category is saved as prev_category before the
# scan's fields overwrite the stored ones.
75 actnode['prev_category']= actnode['category']
76 actnode['comonstats'] = fbnode['comonstats']
77 actnode['category'] = fbnode['category']
78 actnode['state'] = fbnode['state']
79 actnode['kernel'] = fbnode['kernel']
80 actnode['bootcd'] = fbnode['bootcd']
81 actnode['plcnode'] = fbnode['plcnode']
# Ticket lookup; as shown, getTicketStatus is also called when ticket is None.
82 ticket = get_ticket_id(actnode)
83 if ticket is None: actnode['ticket_id'] = ""
84 actnode['rt'] = mailer.getTicketStatus(ticket)
86 #for key in actnode.keys():
87 # print "%10s %s %s " % (key, "==", actnode[key])
88 #print "----------------------------"
# Driver sequence: fetch the scan record and the stored action record, merge
# them, wrap the result in a Record, diagnose it, and act on the diagnosis
# only when self.act is truthy and diagnose returned something.
# NOTE(review): the `def` line for these statements (original line 92) is not
# visible in this excerpt, so the method's name and signature are unconfirmed.
93 fbnode = self.getFBRecord()
94 actnode= self.getActionRecord()
95 actrec = self.mergeRecord(fbnode, actnode)
96 record = Record(self.hostname, actrec)
97 diag = self.diagnose(record)
98 if self.act and diag is not None:
99 self.action(record,diag)
101 def diagnose(self, record):
103 diag = PersistFlags(record.hostname, 60*60*24, db='persist_diagnose_flags')
104 # NOTE: change record stage based on RT status.
105 diag.setFlag('ResetStage')
106 if record.stageIswaitforever():
107 ticket = record.data['rt']
108 if 'new' in ticket['Status']:
109 diag.setFlag('ResetStage')
111 if 'resolved' in ticket['Status']:
112 diag.setFlag('EndRecord')
114 # NOTE: take category, and prepare action
115 category = record.getCategory()
116 if category == "error":
117 diag.setFlag('SendNodedown')
118 record.data['message'] = emailTxt.mailtxt.newdown
119 record.data['log'] = self.getDownLog(record)
121 elif category == "prod":
122 state = record.getState()
124 diag.setFlag('SendThankyou')
125 record.data['message'] = emailTxt.mailtxt.newthankyou
126 record.data['log'] = self.getThankyouLog(record)
128 elif state == "debug":
131 print "unknown state %s for host %s" % (state, self.hostname)
133 print "unknown category: %s" % category
135 if diag.getFlag('ResetStage'):
136 print "resetting stage"
139 record = self.checkStageAndTime(diag,record)
141 print "checkStageAndTime Returned Valid Record"
142 site = PersistFlags(self.loginbase, 1, db='site_persistflags')
144 if site.status is not "good":
145 print "Setting site %s for 'squeeze'" % self.loginbase
146 diag.setFlag('Squeeze')
148 print "Setting site %s for 'backoff'" % self.loginbase
149 diag.setFlag('BackOff')
154 print "checkStageAndTime Returned NULL Record"
# Carry out the diagnosis: optionally send the email, record or close the RT
# ticket, and save the record into the act_all database.
# NOTE(review): original lines 160, 162-163, 167, 174, 176, 179-180 and
# 183-185 are not visible in this excerpt; in particular, what "taking
# action" actually does cannot be seen from here.
157 def action(self, record, diag):
# An improved record or an 'EndRecord' flag marks the RT ticket for closing.
158 if record.improved() or diag.getFlag('EndRecord'):
159 print "end record for %s" % self.hostname
161 diag.setFlag('CloseRT')
164 if self.getSendEmailFlag(record):
165 print "sending email"
166 message = record.getMessage(record.data['ticket_id'])
168 message.send(record.getContacts())
# A truthy ticket id on the message after sending is copied onto the record.
169 if message.rt.ticket_id:
170 print "setting record ticket_id"
171 record.data['ticket_id'] = message.rt.ticket_id
172 if diag.getFlag('CloseRT'):
173 message.rt.closeTicket()
175 print "NOT sending email : %s %s" % (config.mail, record.data['rt'])
177 if record.data['takeaction'] and diag.getFlag('Squeeze'):
178 print "taking action"
181 print "saving act_all db"
182 self.add_and_save_act_all(record)
# Decide whether an email should be sent for `record`.
# The visible condition requires an 'rt' entry with a 'Status' containing
# "open" and a 'Created' value below 60*60*24*30 (30 days in seconds).
# NOTE(review): the comment below says "longer than 30 days ago" while the
# comparison is `<`; whether that is a mismatch depends on what 'Created'
# holds -- confirm.
# NOTE(review): original lines 187-189 and 195-198 are not visible in this
# excerpt, so the values returned cannot be confirmed from here.
186 def getSendEmailFlag(self, record):
190 # resend if open & created longer than 30 days ago.
191 if 'rt' in record.data and \
192 'Status' in record.data['rt'] and \
193 "open" in record.data['rt']['Status'] and \
194 record.data['rt']['Created'] < 60*60*24*30:
def add_and_save_act_all(self, record):
    """Put ``record.data`` at the front of this host's history and save it.

    The "act_all" database is reloaded, the record's data is inserted at
    index 0 of the list stored under ``self.hostname``, and the whole
    database is written back.

    A host with no entry yet gets a new empty list first, so the insert
    does not raise ``KeyError``. ``getActionRecord`` tests for exactly
    that missing-host case, so it can occur.
    """
    self.act_all = database.dbLoad("act_all")
    # setdefault creates the per-host list on first use.
    self.act_all.setdefault(self.hostname, []).insert(0, record.data)
    database.dbDump("act_all", self.act_all)
# Fill in record.data['args'] and record.data['info'] for a down node and
# build the "DOWN:" log line for it.
# The last column of the log line is record.data['found_rt_ticket'] when
# ticket_id is the empty string; the other format call uses ticket_id.
# NOTE(review): original lines 215 and 218-219 are not visible in this
# excerpt; presumably an `else:` separates the two format calls and `log` is
# returned -- confirm.
204 def getDownLog(self, record):
206 record.data['args'] = {'nodename': self.hostname}
207 record.data['info'] = (self.hostname, Record.getStrDaysDown(record.data), "")
209 #for key in record.data.keys():
210 # print "%10s %s %s " % (key, "==", record.data[key])
# info[1:] drops the hostname and leaves the remaining tuple elements.
212 if record.data['ticket_id'] == "":
213 log = "DOWN: %20s : %-40s == %20s %s" % \
214 (self.loginbase, self.hostname, record.data['info'][1:], record.data['found_rt_ticket'])
216 log = "DOWN: %20s : %-40s == %20s %s" % \
217 (self.loginbase, self.hostname, record.data['info'][1:], record.data['ticket_id'])
# Fill in record.data['args'] and record.data['info'] for an improved node
# and build the "IMPR:" log line for it.
# NOTE(review): `state` and `category` are used in both format calls but are
# not assigned anywhere in the visible lines of this method; unless they are
# defined on the lines missing from this excerpt (221, 224) or at module
# level, this raises NameError -- confirm. record.getState() and
# record.getCategory() are used for these values in diagnose.
# NOTE(review): original lines 229 and 233-234 are not visible in this
# excerpt; presumably an `else:` separates the two format calls and `log` is
# returned -- confirm.
220 def getThankyouLog(self, record):
222 record.data['args'] = {'nodename': self.hostname}
223 record.data['info'] = (self.hostname, record.data['prev_category'], record.data['category'])
225 if record.data['ticket_id'] == "":
226 log = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
227 (self.loginbase, self.hostname, record.data['stage'],
228 state, category, record.data['found_rt_ticket'])
230 log = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
231 (self.loginbase, self.hostname, record.data['stage'],
232 state, category, record.data['ticket_id'])
# Advance the record through its escalation stages.
# The branch is chosen by substring tests on record.data['stage'], and
# `delta` is the number of seconds since record.data['time']. Each branch
# rewrites some of record.data's 'email', 'action', 'message', 'stage',
# 'time' and 'takeaction' entries. Branches with nothing to send return None.
# NOTE(review): a number of original lines (245, 252, 259, 274, 279, 294,
# 298, 308, 313, 316-317, 327, 331-332, 335, 342, 349 and anything after 351)
# are not visible in this excerpt; they presumably include the `else:` lines
# and the final return -- confirm before relying on the branch structure.
# NOTE(review): TECH, PI, USER and SPERDAY are not defined in the visible
# lines; SPERDAY is presumably seconds per day -- confirm.
235 def checkStageAndTime(self, diag, record):
236 current_time = time.time()
237 delta = current_time - record.data['time']
# Fresh scan record: pick the first message and start the one-week stage.
238 if 'findbad' in record.data['stage']:
239 # The node is bad, and there's no previous record of it.
240 record.data['email'] = TECH
241 record.data['action'] = ['noop']
242 record.data['takeaction'] = False
# 'message' is indexed here, so it must already hold a sequence of messages.
243 record.data['message'] = record.data['message'][0]
244 record.data['stage'] = 'stage_actinoneweek'
# After a reboot attempt: same settings as the fresh-scan branch.
246 elif 'reboot_node' in record.data['stage']:
247 record.data['email'] = TECH
248 record.data['action'] = ['noop']
249 record.data['message'] = record.data['message'][0]
250 record.data['stage'] = 'stage_actinoneweek'
251 record.data['takeaction'] = False
# Improvement: request closing the RT ticket and end the record.
253 elif 'improvement' in record.data['stage']:
254 print "backing off of %s" % self.hostname
255 record.data['action'] = ['close_rt']
256 record.data['takeaction'] = True
257 record.data['message'] = record.data['message'][0]
258 record.data['stage'] = 'monitor-end-record'
# One-week stage: escalate after 7 days, send one reminder after 3 days,
# otherwise do nothing.
260 elif 'actinoneweek' in record.data['stage']:
261 if delta >= 7 * SPERDAY:
262 record.data['email'] = TECH | PI
263 record.data['stage'] = 'stage_actintwoweeks'
264 record.data['message'] = record.data['message'][1]
265 record.data['action'] = ['nocreate' ]
266 record.data['time'] = current_time # reset clock for waitforever
267 record.data['takeaction'] = True
# The 'second-mail-at-oneweek' key limits the reminder to a single send.
268 elif delta >= 3* SPERDAY and not 'second-mail-at-oneweek' in record.data:
269 record.data['email'] = TECH
270 record.data['message'] = record.data['message'][0]
271 record.data['action'] = ['sendmailagain-waitforoneweekaction' ]
272 record.data['second-mail-at-oneweek'] = True
273 record.data['takeaction'] = False
275 record.data['message'] = None
276 record.data['action'] = ['waitforoneweekaction' ]
277 print "ignoring this record for: %s" % self.hostname
278 return None # don't send if there's no action
# Two-week stage: same pattern, escalating to 'stage_waitforever'.
280 elif 'actintwoweeks' in record.data['stage']:
281 if delta >= 7 * SPERDAY:
282 record.data['email'] = TECH | PI | USER
283 record.data['stage'] = 'stage_waitforever'
284 record.data['message'] = record.data['message'][2]
285 record.data['action'] = ['suspendslices']
286 record.data['time'] = current_time # reset clock for waitforever
287 record.data['takeaction'] = True
288 elif delta >= 3* SPERDAY and not 'second-mail-at-twoweeks' in record.data:
289 record.data['email'] = TECH | PI
290 record.data['message'] = record.data['message'][1]
291 record.data['action'] = ['sendmailagain-waitfortwoweeksaction' ]
292 record.data['second-mail-at-twoweeks'] = True
293 record.data['takeaction'] = False
295 record.data['message'] = None
296 record.data['action'] = ['waitfortwoweeksaction']
297 return None # don't send if there's no action
# This test must come before the plain 'waitforever' test below: the
# checks are substring matches and 'ticket_waitforever' contains
# 'waitforever'.
299 elif 'ticket_waitforever' in record.data['stage']:
300 record.data['email'] = TECH
301 record.data['takeaction'] = True
302 if 'first-found' not in record.data:
303 record.data['first-found'] = True
304 record.data['log'] += " firstfound"
305 record.data['action'] = ['ticket_waitforever']
306 record.data['message'] = None
307 record.data['time'] = current_time
309 if delta >= 7*SPERDAY:
310 record.data['action'] = ['ticket_waitforever']
311 record.data['message'] = None
312 record.data['time'] = current_time # reset clock
314 record.data['action'] = ['ticket_waitforever']
315 record.data['message'] = None
318 elif 'waitforever' in record.data['stage']:
319 # more than 3 days since last action
320 # TODO: send only on weekdays.
321 # NOTE: expects that 'time' has been reset before entering waitforever stage
322 record.data['takeaction'] = True
323 if delta >= 3*SPERDAY:
324 record.data['action'] = ['email-againwaitforever']
325 record.data['message'] = record.data['message'][2]
326 record.data['time'] = current_time # reset clock
328 record.data['action'] = ['waitforever']
329 record.data['message'] = None
330 return None # don't send if there's no action
# Remaining visible statements: the stage matched none of the tests above.
333 # There is no action to be taken, possibly b/c the stage has
334 # already been performed, but diagnose picked it up again.
336 # 1. stage is unknown, or
337 # 2. delta is not big enough to bump it to the next stage.
338 # TODO: figure out which. for now assume 2.
339 print "UNKNOWN stage for %s; nothing done" % self.hostname
340 record.data['action'] = ['unknown']
341 record.data['message'] = record.data['message'][0]
343 record.data['email'] = TECH
344 record.data['action'] = ['noop']
345 record.data['message'] = record.data['message'][0]
346 record.data['stage'] = 'stage_actinoneweek'
347 record.data['time'] = current_time # reset clock
348 record.data['takeaction'] = False
# Print the log text and the chosen action on one line; the trailing comma
# on the first print suppresses its newline.
350 print "%s" % record.data['log'],
351 print "%15s" % record.data['action']