f1249cf29a289c2ee09d04ec05ccbd564d195ef3
[monitor.git] / clean_policy.py
1 import config
2 import database 
3 import time
4 import mailer
5 from www.printbadnodes import cmpCategoryVal
6 import sys
7 import emailTxt
8 import string
9
10 from policy import get_ticket_id, print_stats, close_rt_backoff, reboot_node
11 from rt import is_host_in_rt_tickets
12 import plc
13
14 # Time to enforce policy
15 POLSLEEP = 7200
16
17 # Where to email the summary
18 SUMTO = "soltesz@cs.princeton.edu"
19
20 from const import *
21
22 from unified_model import *
23
class MonitorMergeDiagnoseSendEscellate:
    """Per-node policy pass: merge, diagnose, then (optionally) act.

    run() loads the node's latest "findbad" scan values and its newest
    "act_all" history entry, merges them into one record, lets diagnose()
    decide which flags/stage apply, and -- when the instance was created
    with a true `act` -- hands the result to action(), which sends mail,
    takes the record's action and prepends the record to "act_all".
    """

    def __init__(self, hostname, act):
        """Remember the target node and look up its site loginbase.

        hostname -- node to process; it must be a key of the "plcdb_hn2lb"
                    database, otherwise the KeyError propagates.
        act      -- when false, run() diagnoses but never calls action().
        """
        self.hostname = hostname
        self.act = act
        self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")
        self.loginbase = self.plcdb_hn2lb[self.hostname]

    def getFBRecord(self):
        """Return the 'values' dict stored for this host in the "findbad" db.

        Raises Exception when the host has no entry under fb['nodes'].
        """
        fb = database.dbLoad("findbad")
        if self.hostname not in fb['nodes']:
            raise Exception("Hostname %s not in scan database"% self.hostname)
        return fb['nodes'][self.hostname]['values']

    def getActionRecord(self):
        """Return the first "act_all" entry for this host, or None.

        add_and_save_act_all() inserts at index 0, so the first entry is the
        one this class wrote most recently.
        """
        act_all = database.dbLoad("act_all")
        if self.hostname in act_all and len(act_all[self.hostname]) > 0:
            return act_all[self.hostname][0]
        return None

    def getKernel(self, unamestr):
        """Return the third whitespace-separated field of `unamestr`.

        Returns "" when the string has fewer than three fields.
        """
        fields = unamestr.split()
        if len(fields) > 2:
            return fields[2]
        return ""

    def mergeRecord(self, fbnode, actnode):
        """Combine the scan values with the previous action record.

        fbnode  -- scan values dict; it is modified in place (kernel is
                   reduced via getKernel, bookkeeping keys are initialised).
        actnode -- previous action record, or None when there is none.

        Returns the record to work on: a fresh copy of `fbnode` when there
        is no history, otherwise `actnode` updated in place with the
        current scan fields and the ticket status.
        """
        now = time.time()
        fbnode['kernel'] = self.getKernel(fbnode['kernel'])
        fbnode['stage'] = "findbad"
        fbnode['message'] = None
        fbnode['args'] = None
        fbnode['info'] = None
        fbnode['log'] = None
        fbnode['time'] = now
        fbnode['date_created'] = now

        if actnode is None:
            # No history: start from the scan values.  Note that no 'rt'
            # key is set on this path.
            actnode = {}
            actnode.update(fbnode)
            actnode['ticket_id'] = ""
            actnode['prev_category'] = "NORECORD"
            return actnode

        # History exists: keep its stage/time, refresh the observed fields.
        actnode['prev_category'] = actnode['category']
        for key in ('comonstats', 'category', 'state', 'kernel',
                    'bootcd', 'plcnode'):
            actnode[key] = fbnode[key]

        ticket = get_ticket_id(actnode)
        if ticket is None:
            actnode['ticket_id'] = ""
        # NOTE(review): `ticket` may be None here; this relies on
        # mailer.getTicketStatus accepting None -- confirm.
        actnode['rt'] = mailer.getTicketStatus(ticket)
        return actnode

    def run(self):
        """Merge, diagnose and -- if enabled and warranted -- act."""
        fbnode = self.getFBRecord()
        actnode = self.getActionRecord()
        actrec = self.mergeRecord(fbnode, actnode)
        record = Record(self.hostname, actrec)
        diag = self.diagnose(record)
        if self.act and diag is not None:
            self.action(record, diag)

    def diagnose(self, record):
        """Set diagnosis flags for `record` and advance its stage.

        Returns the saved PersistFlags object when checkStageAndTime()
        yields a record to act on, otherwise None.
        """
        diag = PersistFlags(record.hostname, 60*60*24, db='persist_diagnose_flags')

        # NOTE: change record stage based on RT status.
        # NOTE(review): this flag is set unconditionally, so the stage is
        # reset on every pass and the conditional set below is redundant;
        # kept as found -- confirm whether this is intentional.
        diag.setFlag('ResetStage')
        if record.stageIswaitforever():
            ticket = record.data['rt']
            if 'new' in ticket['Status']:
                diag.setFlag('ResetStage')

            if 'resolved' in ticket['Status']:
                diag.setFlag('EndRecord')

        # NOTE: take category, and prepare action
        category = record.getCategory()
        if category == "error":
            diag.setFlag('SendNodedown')
            record.data['message'] = emailTxt.mailtxt.newdown
            record.data['log'] = self.getDownLog(record)

        elif category == "prod":
            state = record.getState()
            if state == "boot":
                diag.setFlag('SendThankyou')
                record.data['message'] = emailTxt.mailtxt.newthankyou
                record.data['log'] = self.getThankyouLog(record)

            elif state == "debug":
                pass
            else:
                print("unknown state %s for host %s" % (state, self.hostname))
        else:
            print("unknown category: %s" % category)

        if diag.getFlag('ResetStage'):
            print("resetting stage")
            record.reset_stage()

        record = self.checkStageAndTime(diag, record)
        if not record:
            print("checkStageAndTime Returned NULL Record")
            return None

        print("checkStageAndTime Returned Valid Record")
        site = PersistFlags(self.loginbase, 1, db='site_persistflags')

        # Compare by value: an identity test against a string literal
        # depends on interning and is not a reliable equality check.
        if site.status != "good":
            print("Setting site %s for 'squeeze'" % self.loginbase)
            diag.setFlag('Squeeze')
        else:
            print("Setting site %s for 'backoff'" % self.loginbase)
            diag.setFlag('BackOff')

        diag.save()
        return diag

    def action(self, record, diag):
        """Carry out what diagnose() decided for `record`.

        Ends the record when it improved or 'EndRecord' is flagged;
        otherwise optionally sends mail, takes the record's action when the
        site is flagged 'Squeeze', and saves the record to "act_all".
        """
        if record.improved() or diag.getFlag('EndRecord'):
            print("end record for %s" % self.hostname)
            record.end_record()
            # NOTE(review): the flag is set right before returning, so the
            # closeTicket() call further down is not reached on this path.
            diag.setFlag('CloseRT')
            return None

        if self.getSendEmailFlag(record):
            print("sending email")
            message = record.getMessage(record.data['ticket_id'])
            message.reset()
            message.send(record.getContacts())
            if message.rt.ticket_id:
                print("setting record ticket_id")
                record.data['ticket_id'] = message.rt.ticket_id
            if diag.getFlag('CloseRT'):
                message.rt.closeTicket()
        else:
            # 'rt' is absent for records without history (see mergeRecord),
            # so read it without raising KeyError.
            print("NOT sending email : %s %s" % (config.mail, record.data.get('rt')))

        if record.data['takeaction'] and diag.getFlag('Squeeze'):
            print("taking action")
            record.takeAction()

        print("saving act_all db")
        self.add_and_save_act_all(record)

        return

    def getSendEmailFlag(self, record):
        """Return True when mail should be sent for `record`.

        False when config.mail is off, or when the record carries an RT
        status containing "open" whose 'Created' value is below 30 days'
        worth of seconds.
        """
        if not config.mail:
            return False

        # resend if open & created longer than 30 days ago.
        # NOTE(review): this only works if 'Created' is an age in seconds;
        # if it is an absolute timestamp the test never matches -- confirm.
        if 'rt' in record.data:
            rt = record.data['rt']
            if ('Status' in rt and "open" in rt['Status']
                    and rt['Created'] < 60*60*24*30):
                return False

        return True

    def add_and_save_act_all(self, record):
        """Prepend record.data to this host's "act_all" history and save it."""
        self.act_all = database.dbLoad("act_all")
        # A host seen for the first time has no list yet; create it instead
        # of failing with KeyError.
        self.act_all.setdefault(self.hostname, []).insert(0, record.data)
        database.dbDump("act_all", self.act_all)

    def getDownLog(self, record):
        """Fill record 'args'/'info' for a down node and return its log line.

        The line ends with the record's ticket_id, or with
        'found_rt_ticket' when ticket_id is the empty string.
        """
        record.data['args'] = {'nodename': self.hostname}
        record.data['info'] = (self.hostname, Record.getStrDaysDown(record.data), "")

        if record.data['ticket_id'] == "":
            ticket = record.data['found_rt_ticket']
        else:
            ticket = record.data['ticket_id']
        return "DOWN: %20s : %-40s == %20s %s" % \
            (self.loginbase, self.hostname, record.data['info'][1:], ticket)

    def getThankyouLog(self, record):
        """Fill record 'args'/'info' for an improved node and return its log line.

        The line ends with the record's ticket_id, or with
        'found_rt_ticket' when ticket_id is the empty string.
        """
        record.data['args'] = {'nodename': self.hostname}
        record.data['info'] = (self.hostname, record.data['prev_category'], record.data['category'])

        # Read state/category from the record; they are not defined in this
        # scope otherwise.
        state = record.getState()
        category = record.getCategory()

        if record.data['ticket_id'] == "":
            ticket = record.data['found_rt_ticket']
        else:
            ticket = record.data['ticket_id']
        return "IMPR: %20s : %-40s == %20s %20s %s %s" % \
            (self.loginbase, self.hostname, record.data['stage'],
             state, category, ticket)

    def checkStageAndTime(self, diag, record):
        """Advance `record` through the escalation stages.

        The branch is chosen by substring match on record.data['stage'] and
        by the time elapsed since record.data['time'].  Each branch fills
        in 'email', 'action', 'message', 'stage' and 'takeaction' as
        applicable.  Branch order matters: 'ticket_waitforever' must be
        tested before 'waitforever'.

        Returns the updated record, or None when nothing is due yet.
        `diag` is accepted but not used here.
        """
        current_time = time.time()
        delta = current_time - record.data['time']
        stage = record.data['stage']

        if 'findbad' in stage:
            # The node is bad, and there's no previous record of it.
            record.data['email'] = TECH
            record.data['action'] = ['noop']
            record.data['takeaction'] = False
            record.data['message'] = record.data['message'][0]
            record.data['stage'] = 'stage_actinoneweek'

        elif 'reboot_node' in stage:
            record.data['email'] = TECH
            record.data['action'] = ['noop']
            record.data['message'] = record.data['message'][0]
            record.data['stage'] = 'stage_actinoneweek'
            record.data['takeaction'] = False

        elif 'improvement' in stage:
            print("backing off of %s" % self.hostname)
            record.data['action'] = ['close_rt']
            record.data['takeaction'] = True
            record.data['message'] = record.data['message'][0]
            record.data['stage'] = 'monitor-end-record'

        elif 'actinoneweek' in stage:
            if delta >= 7 * SPERDAY:
                record.data['email'] = TECH | PI
                record.data['stage'] = 'stage_actintwoweeks'
                record.data['message'] = record.data['message'][1]
                record.data['action'] = ['nocreate']
                record.data['time'] = current_time      # reset clock for waitforever
                record.data['takeaction'] = True
            elif delta >= 3 * SPERDAY and 'second-mail-at-oneweek' not in record.data:
                record.data['email'] = TECH
                record.data['message'] = record.data['message'][0]
                record.data['action'] = ['sendmailagain-waitforoneweekaction']
                record.data['second-mail-at-oneweek'] = True
                record.data['takeaction'] = False
            else:
                record.data['message'] = None
                record.data['action'] = ['waitforoneweekaction']
                print("ignoring this record for: %s" % self.hostname)
                return None     # don't send if there's no action

        elif 'actintwoweeks' in stage:
            if delta >= 7 * SPERDAY:
                record.data['email'] = TECH | PI | USER
                record.data['stage'] = 'stage_waitforever'
                record.data['message'] = record.data['message'][2]
                record.data['action'] = ['suspendslices']
                record.data['time'] = current_time      # reset clock for waitforever
                record.data['takeaction'] = True
            elif delta >= 3 * SPERDAY and 'second-mail-at-twoweeks' not in record.data:
                record.data['email'] = TECH | PI
                record.data['message'] = record.data['message'][1]
                record.data['action'] = ['sendmailagain-waitfortwoweeksaction']
                record.data['second-mail-at-twoweeks'] = True
                record.data['takeaction'] = False
            else:
                record.data['message'] = None
                record.data['action'] = ['waitfortwoweeksaction']
                return None     # don't send if there's no action

        elif 'ticket_waitforever' in stage:
            record.data['email'] = TECH
            record.data['takeaction'] = True
            record.data['action'] = ['ticket_waitforever']
            record.data['message'] = None
            if 'first-found' not in record.data:
                record.data['first-found'] = True
                # 'log' may still be None (mergeRecord initialises it so).
                record.data['log'] = (record.data['log'] or "") + " firstfound"
                record.data['time'] = current_time
            elif delta >= 7 * SPERDAY:
                record.data['time'] = current_time      # reset clock
            else:
                return None

        elif 'waitforever' in stage:
            # more than 3 days since last action
            # TODO: send only on weekdays.
            # NOTE: expects that 'time' has been reset before entering waitforever stage
            record.data['takeaction'] = True
            if delta >= 3 * SPERDAY:
                record.data['action'] = ['email-againwaitforever']
                record.data['message'] = record.data['message'][2]
                record.data['time'] = current_time      # reset clock
            else:
                record.data['action'] = ['waitforever']
                record.data['message'] = None
                return None     # don't send if there's no action

        else:
            # There is no action to be taken, possibly b/c the stage has
            # already been performed, but diagnose picked it up again.
            # two cases,
            #       1. stage is unknown, or
            #       2. delta is not big enough to bump it to the next stage.
            # TODO: figure out which. for now assume 2.
            print("UNKNOWN stage for %s; nothing done" % self.hostname)

            # Restart the escalation; the message is selected exactly once
            # so it is not indexed twice.
            record.data['email'] = TECH
            record.data['action'] = ['noop']
            record.data['message'] = record.data['message'][0]
            record.data['stage'] = 'stage_actinoneweek'
            record.data['time'] = current_time          # reset clock
            record.data['takeaction'] = False

        # One line: the log text, a space, then the action list.
        print("%s %15s" % (record.data['log'], record.data['action']))
        return record