1 from config import config
7 from www.printbadnodes import cmpCategoryVal
12 from policy import get_ticket_id, print_stats, close_rt_backoff, reboot_node
13 from rt import is_host_in_rt_tickets
16 # Time to enforce policy
19 # Where to email the summary
20 SUMTO = "soltesz@cs.princeton.edu"
24 from unified_model import *
26 class MonitorMergeDiagnoseSendEscellate:
def __init__(self, hostname, act):
    """Bind the monitor to one node and resolve the site it belongs to.

    hostname -- node name; looked up in the "plcdb_hn2lb" database.
    act      -- truthy when a diagnosis may be acted upon; kept as self.act.

    A hostname that is missing from the "plcdb_hn2lb" map propagates the
    lookup error raised by that mapping.
    """
    self.hostname = hostname
    # self.act is tested before any action is taken on a diagnosis, so it
    # must exist on every instance.
    self.act = act
    # The map is loaded unconditionally; the former "is None" guard directly
    # after assigning None could never be false.
    self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")
    self.loginbase = self.plcdb_hn2lb[self.hostname]
def getFBRecord(self):
    """Return the 'values' entry stored for this host in the "findbad" database.

    Raises Exception when the host has no entry under the database's 'nodes' key.
    """
    scan = database.dbLoad("findbad")
    # Guard clause: fail only when the host is missing, otherwise hand the
    # record back to the caller.
    if self.hostname not in scan['nodes']:
        raise Exception("Hostname %s not in scan database"% self.hostname)
    return scan['nodes'][self.hostname]['values']
def getActionRecord(self):
    """Return the first record stored for this host in "act_all", or None.

    "act_all" is indexed by hostname and holds a list per host; the element at
    index 0 is returned. None is returned when the host is absent or its list
    is empty.
    """
    act_all = database.dbLoad("act_all")
    if self.hostname not in act_all:
        return None
    history = act_all[self.hostname]
    if len(history) == 0:
        return None
    return history[0]
# Takes the scan record's 'kernel' string (unamestr); mergeRecord stores the
# result back into fbnode['kernel'].
# NOTE(review): the body of this method is not visible in this listing.
54 def getKernel(self, unamestr):
# Merge the fresh scan record (fbnode) with the stored action record (actnode).
# fbnode is stamped first (stage "findbad", cleared message, current time), then
# scan fields are copied onto actnode and the RT ticket status is attached.
# NOTE(review): the branch headers separating the "no previous record"
# assignments (ticket_id "", prev_category "NORECORD") from the "existing
# record" assignments are not visible in this listing, and neither is a return
# statement, although the caller binds this method's result -- confirm against
# the full file.
61 def mergeRecord(self, fbnode, actnode):
# Replace the raw kernel string with whatever getKernel derives from it.
62 fbnode['kernel'] = self.getKernel(fbnode['kernel'])
# Every freshly scanned record starts in the "findbad" stage with no message.
63 fbnode['stage'] = "findbad"
64 fbnode['message'] = None
# Both timestamps are taken now; 'time' is what checkStageAndTime measures from.
68 fbnode['time'] = time.time()
69 fbnode['date_created'] = time.time()
# Start from a copy of the scan fields, with no ticket and no known previous category.
73 actnode.update(fbnode)
74 actnode['ticket_id'] = ""
75 actnode['prev_category'] = "NORECORD"
# Remember the stored category before it is overwritten with the scanned one.
77 actnode['prev_category']= actnode['category']
78 actnode['comonstats'] = fbnode['comonstats']
79 actnode['category'] = fbnode['category']
80 actnode['state'] = fbnode['state']
81 actnode['kernel'] = fbnode['kernel']
82 actnode['bootcd'] = fbnode['bootcd']
83 actnode['plcnode'] = fbnode['plcnode']
# Look up the ticket for this record; a missing ticket is normalised to "".
84 ticket = get_ticket_id(actnode)
85 if ticket is None: actnode['ticket_id'] = ""
# getTicketStatus is called even when ticket is None.
86 actnode['rt'] = mailer.getTicketStatus(ticket)
# Disabled debug dump of the merged record.
88 #for key in actnode.keys():
89 # print "%10s %s %s " % (key, "==", actnode[key])
90 #print "----------------------------"
# NOTE(review): the `def` line of this method is not visible in this listing.
# Per-host pipeline: load the scan record, load the stored action record, merge
# the two, wrap the result in a Record and diagnose it.
95 fbnode = self.getFBRecord()
96 actnode= self.getActionRecord()
97 actrec = self.mergeRecord(fbnode, actnode)
98 record = Record(self.hostname, actrec)
99 diag = self.diagnose(record)
# Act only when acting is enabled for this run and diagnose produced a result.
100 if self.act and diag is not None:
101 self.action(record,diag)
# Inspect the record and collect the resulting decisions as flags on a
# PersistFlags object keyed by the record's hostname.
# The caller tests the result against None before acting on it.
# NOTE(review): several conditions, `else:` lines and the return statements are
# not visible in this listing -- confirm the branch structure against the full file.
103 def diagnose(self, record):
# Second argument is 60*60*24 (one day in seconds); presumably a lifetime for
# the persisted flags -- TODO confirm against PersistFlags.
105 diag = PersistFlags(record.hostname, 60*60*24, db='persist_diagnose_flags')
106 # NOTE: change record stage based on RT status.
# NOTE(review): 'ResetStage' is set here unconditionally and again below for
# 'new' tickets; confirm whether this first call was meant to clear the flag.
107 diag.setFlag('ResetStage')
108 if record.stageIswaitforever():
109 ticket = record.data['rt']
# Substring tests on the ticket's 'Status' value.
110 if 'new' in ticket['Status']:
111 diag.setFlag('ResetStage')
113 if 'resolved' in ticket['Status']:
114 diag.setFlag('EndRecord')
116 # NOTE: take category, and prepare action
117 category = record.getCategory()
118 if category == "error":
# Node is in the error category: queue the "node down" mail and its log line.
119 diag.setFlag('SendNodedown')
120 record.data['message'] = emailTxt.mailtxt.newdown
121 record.data['log'] = self.getDownLog(record)
123 elif category == "prod":
124 state = record.getState()
# Queue the thank-you mail and its log line.
126 diag.setFlag('SendThankyou')
127 record.data['message'] = emailTxt.mailtxt.newthankyou
128 record.data['log'] = self.getThankyouLog(record)
130 elif state == "debug":
133 print "unknown state %s for host %s" % (state, self.hostname)
135 print "unknown category: %s" % category
137 if diag.getFlag('ResetStage'):
138 print "resetting stage"
# checkStageAndTime may return None; both outcomes are reported below.
141 record = self.checkStageAndTime(diag,record)
143 print "checkStageAndTime Returned Valid Record"
# Per-site flags, keyed by loginbase.
144 site = PersistFlags(self.loginbase, 1, db='site_persistflags')
# NOTE(review): `is not` compares object identity, not string contents, so the
# outcome depends on string interning; `!=` is almost certainly what is meant.
146 if site.status is not "good":
147 print "Setting site %s for 'squeeze'" % self.loginbase
148 diag.setFlag('Squeeze')
150 print "Setting site %s for 'backoff'" % self.loginbase
151 diag.setFlag('BackOff')
156 print "checkStageAndTime Returned NULL Record"
# Carry out what diagnose decided: optionally mail the contacts, record or close
# the RT ticket, and persist the record in "act_all".
# NOTE(review): some `else:` lines and statements between the visible ones are
# not shown in this listing -- confirm the branch structure against the full file.
159 def action(self, record, diag):
# An improved node or an explicit 'EndRecord' flag ends the record and asks
# for the RT ticket to be closed.
160 if record.improved() or diag.getFlag('EndRecord'):
161 print "end record for %s" % self.hostname
163 diag.setFlag('CloseRT')
166 if self.getSendEmailFlag(record):
167 print "sending email"
# The message is built for the record's current ticket id and sent to the
# record's contacts.
168 message = record.getMessage(record.data['ticket_id'])
170 message.send(record.getContacts())
# After sending, copy a non-empty ticket id from the message back onto the record.
171 if message.rt.ticket_id:
172 print "setting record ticket_id"
173 record.data['ticket_id'] = message.rt.ticket_id
174 if diag.getFlag('CloseRT'):
175 message.rt.closeTicket()
177 print "NOT sending email : %s %s" % (config.mail, record.data['rt'])
# Further action requires both the record's 'takeaction' and the 'Squeeze' flag.
179 if record.data['takeaction'] and diag.getFlag('Squeeze'):
180 print "taking action"
183 print "saving act_all db"
184 self.add_and_save_act_all(record)
# Decide whether a mail should be sent for this record; action() treats the
# result as a boolean.
# NOTE(review): the return statements are not visible in this listing.
# NOTE(review): 'Created' is compared directly with a 30-day span in seconds.
# That matches the comment below only if 'Created' is an age in seconds rather
# than an absolute timestamp -- confirm against mailer.getTicketStatus.
188 def getSendEmailFlag(self, record):
192 # resend if open & created longer than 30 days ago.
193 if 'rt' in record.data and \
194 'Status' in record.data['rt'] and \
195 "open" in record.data['rt']['Status'] and \
196 record.data['rt']['Created'] < 60*60*24*30:
def add_and_save_act_all(self, record):
    """Prepend record.data to this host's list in "act_all" and write it back.

    The database is reloaded first and kept on self.act_all.
    """
    self.act_all = database.dbLoad("act_all")
    # A host without any stored record has no list yet; create it rather than
    # failing with KeyError on the first save.
    self.act_all.setdefault(self.hostname, []).insert(0, record.data)
    database.dbDump("act_all", self.act_all)
def getDownLog(self, record):
    """Build the "DOWN" log line for this node and prime the mail fields.

    Side effects on record.data: 'args' becomes {'nodename': hostname} and
    'info' becomes (hostname, Record.getStrDaysDown(record.data), "").

    Returns the formatted line. Its last column is record.data['ticket_id'],
    or record.data['found_rt_ticket'] when ticket_id is the empty string.
    """
    data = record.data
    data['args'] = {'nodename': self.hostname}
    data['info'] = (self.hostname, Record.getStrDaysDown(data), "")

    # Only the ticket column differs between the two cases, so pick it first
    # and format once.
    if data['ticket_id'] == "":
        ticket = data['found_rt_ticket']
    else:
        ticket = data['ticket_id']

    return "DOWN: %20s : %-40s == %20s %s" % (
        self.loginbase, self.hostname, data['info'][1:], ticket)
def getThankyouLog(self, record):
    """Build the "IMPR" log line for this node and prime the mail fields.

    Side effects on record.data: 'args' becomes {'nodename': hostname} and
    'info' becomes (hostname, previous category, current category).

    Returns the formatted line. Its last column is record.data['ticket_id'],
    or record.data['found_rt_ticket'] when ticket_id is the empty string.
    """
    data = record.data
    data['args'] = {'nodename': self.hostname}
    data['info'] = (self.hostname, data['prev_category'], data['category'])

    # Only the ticket column differs between the two cases, so pick it first
    # and format once.
    if data['ticket_id'] == "":
        ticket = data['found_rt_ticket']
    else:
        ticket = data['ticket_id']

    # The state and category columns come from the record itself; bare
    # `state` / `category` names are not defined in this method's scope.
    return "IMPR: %20s : %-40s == %20s %20s %s %s" % (
        self.loginbase, self.hostname, data['stage'],
        data['state'], data['category'], ticket)
# Advance the record through its escalation stages.
# The stage is matched by substring against record.data['stage']; each branch
# fills in 'email', 'action', 'message', 'takeaction' and, where the stage
# changes, the next 'stage'. `delta` is the time elapsed since
# record.data['time']. Branches with nothing to send return None.
# record.data['message'] is indexed throughout, so it must already hold a
# sequence of message templates when a branch that indexes it is reached.
# NOTE(review): several `else:` lines are not visible in this listing, and no
# final `return record` is visible although diagnose rebinds `record` to this
# method's result -- confirm against the full file.
237 def checkStageAndTime(self, diag, record):
238 current_time = time.time()
239 delta = current_time - record.data['time']
240 if 'findbad' in record.data['stage']:
241 # The node is bad, and there's no previous record of it.
242 record.data['email'] = TECH
243 record.data['action'] = ['noop']
244 record.data['takeaction'] = False
245 record.data['message'] = record.data['message'][0]
246 record.data['stage'] = 'stage_actinoneweek'
# 'reboot_node' is handled like a fresh record and moves to the one-week stage.
248 elif 'reboot_node' in record.data['stage']:
249 record.data['email'] = TECH
250 record.data['action'] = ['noop']
251 record.data['message'] = record.data['message'][0]
252 record.data['stage'] = 'stage_actinoneweek'
253 record.data['takeaction'] = False
# 'improvement' closes the RT ticket and ends the record.
255 elif 'improvement' in record.data['stage']:
256 print "backing off of %s" % self.hostname
257 record.data['action'] = ['close_rt']
258 record.data['takeaction'] = True
259 record.data['message'] = record.data['message'][0]
260 record.data['stage'] = 'monitor-end-record'
# One-week stage: escalate after 7 days, send one reminder after 3 days,
# otherwise do nothing.
262 elif 'actinoneweek' in record.data['stage']:
263 if delta >= 7 * SPERDAY:
# TECH | PI combines the two recipient constants with bitwise OR.
264 record.data['email'] = TECH | PI
265 record.data['stage'] = 'stage_actintwoweeks'
266 record.data['message'] = record.data['message'][1]
267 record.data['action'] = ['nocreate' ]
268 record.data['time'] = current_time # reset clock for waitforever
269 record.data['takeaction'] = True
# The 'second-mail-at-oneweek' key limits the reminder to a single mail.
270 elif delta >= 3* SPERDAY and not 'second-mail-at-oneweek' in record.data:
271 record.data['email'] = TECH
272 record.data['message'] = record.data['message'][0]
273 record.data['action'] = ['sendmailagain-waitforoneweekaction' ]
274 record.data['second-mail-at-oneweek'] = True
275 record.data['takeaction'] = False
277 record.data['message'] = None
278 record.data['action'] = ['waitforoneweekaction' ]
279 print "ignoring this record for: %s" % self.hostname
280 return None # don't send if there's no action
# Two-week stage: same shape as the one-week stage, one escalation level higher.
282 elif 'actintwoweeks' in record.data['stage']:
283 if delta >= 7 * SPERDAY:
284 record.data['email'] = TECH | PI | USER
285 record.data['stage'] = 'stage_waitforever'
286 record.data['message'] = record.data['message'][2]
287 record.data['action'] = ['suspendslices']
288 record.data['time'] = current_time # reset clock for waitforever
289 record.data['takeaction'] = True
290 elif delta >= 3* SPERDAY and not 'second-mail-at-twoweeks' in record.data:
291 record.data['email'] = TECH | PI
292 record.data['message'] = record.data['message'][1]
293 record.data['action'] = ['sendmailagain-waitfortwoweeksaction' ]
294 record.data['second-mail-at-twoweeks'] = True
295 record.data['takeaction'] = False
297 record.data['message'] = None
298 record.data['action'] = ['waitfortwoweeksaction']
299 return None # don't send if there's no action
# 'ticket_waitforever' must be tested before 'waitforever' below, because the
# latter is a substring of the former.
301 elif 'ticket_waitforever' in record.data['stage']:
302 record.data['email'] = TECH
303 record.data['takeaction'] = True
# First visit to this stage: mark the record and tag its log line.
304 if 'first-found' not in record.data:
305 record.data['first-found'] = True
306 record.data['log'] += " firstfound"
307 record.data['action'] = ['ticket_waitforever']
308 record.data['message'] = None
309 record.data['time'] = current_time
# Later visits: no message is sent; the clock is reset once 7 days have passed.
311 if delta >= 7*SPERDAY:
312 record.data['action'] = ['ticket_waitforever']
313 record.data['message'] = None
314 record.data['time'] = current_time # reset clock
316 record.data['action'] = ['ticket_waitforever']
317 record.data['message'] = None
320 elif 'waitforever' in record.data['stage']:
321 # more than 3 days since last action
322 # TODO: send only on weekdays.
323 # NOTE: expects that 'time' has been reset before entering waitforever stage
324 record.data['takeaction'] = True
325 if delta >= 3*SPERDAY:
326 record.data['action'] = ['email-againwaitforever']
327 record.data['message'] = record.data['message'][2]
328 record.data['time'] = current_time # reset clock
330 record.data['action'] = ['waitforever']
331 record.data['message'] = None
332 return None # don't send if there's no action
335 # There is no action to be taken, possibly b/c the stage has
336 # already been performed, but diagnose picked it up again.
338 # 1. stage is unknown, or
339 # 2. delta is not big enough to bump it to the next stage.
340 # TODO: figure out which. for now assume 2.
341 print "UNKNOWN stage for %s; nothing done" % self.hostname
342 record.data['action'] = ['unknown']
343 record.data['message'] = record.data['message'][0]
# NOTE(review): 'action' set just above is overwritten here, and 'message' is
# indexed with [0] a second time, so the second index is applied to the
# element already selected above -- confirm this is intended.
345 record.data['email'] = TECH
346 record.data['action'] = ['noop']
347 record.data['message'] = record.data['message'][0]
348 record.data['stage'] = 'stage_actinoneweek'
349 record.data['time'] = current_time # reset clock
350 record.data['takeaction'] = False
# Emit the log line and the chosen action on one output line (the trailing
# comma suppresses the newline after the first print).
352 print "%s" % record.data['log'],
353 print "%15s" % record.data['action']