www/printbadnodes.py
[monitor.git] / clean_policy.py
1 import config
2 import database 
3 import time
4 import mailer
5 from unified_model import cmpCategoryVal
6 import sys
7 import emailTxt
8 import string
9
10 from rt import is_host_in_rt_tickets
11 import plc
12
# Time to enforce policy.
# NOTE(review): presumably an interval in seconds (7200 s = 2 h); the constant
# is not referenced anywhere in the visible code -- confirm where it is used.
POLSLEEP = 7200

# Where to email the summary.
# NOTE(review): not referenced in the visible code -- confirm where it is used.
SUMTO = "soltesz@cs.princeton.edu"
18
19 from const import *
20
21 from unified_model import *
22
def get_ticket_id(record):
        """Return the RT ticket id stored on an action record, or None.

        record -- mapping that may carry 'ticket_id' and/or 'found_rt_ticket'.

        'ticket_id' is preferred; 'found_rt_ticket' is the fallback.  A key
        that is missing, None, or equal to the empty string counts as
        "no ticket".
        """
        for key in ('ticket_id', 'found_rt_ticket'):
                if key not in record:
                        continue
                value = record[key]
                # Compare by value, not identity: `value is not ""` only works
                # when the stored string happens to be the very same object as
                # the literal (it is not for e.g. u"" loaded from a pickle).
                if value is not None and value != "":
                        return value
        return None
32
33 class MonitorMergeDiagnoseSendEscellate:
34         def __init__(self, hostname, act):
35                 self.hostname = hostname
36                 self.act = act
37                 self.plcdb_hn2lb = None
38                 if self.plcdb_hn2lb is None:
39                         self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")
40                 self.loginbase = self.plcdb_hn2lb[self.hostname]
41                 return
42
43         def getFBRecord(self):
44                 fb = database.dbLoad("findbad")
45                 if self.hostname in fb['nodes']:
46                         fbnode = fb['nodes'][self.hostname]['values']
47                 else:
48                         raise Exception("Hostname %s not in scan database"% self.hostname)
49                 return fbnode
50
51         def getActionRecord(self):
52                 # update ticket status
53                 act_all = database.dbLoad("act_all")
54                 if self.hostname in act_all and len(act_all[self.hostname]) > 0:
55                         actnode = act_all[self.hostname][0]
56                 else:
57                         actnode = None
58                 del act_all
59                 return actnode
60
61         def getKernel(self, unamestr):
62                 s = unamestr.split()
63                 if len(s) > 2:
64                         return s[2]
65                 else:
66                         return ""
67
68         def mergeRecord(self, fbnode, actnode):
69                 fbnode['kernel'] = self.getKernel(fbnode['kernel'])
70                 fbnode['stage'] = "findbad"
71                 fbnode['message'] = None
72                 fbnode['args'] = None
73                 fbnode['info'] = None
74                 fbnode['log'] = None
75                 fbnode['time'] = time.time()
76                 fbnode['date_created'] = time.time()
77
78                 if actnode is None:
79                         actnode = {} 
80                         actnode.update(fbnode)
81                         actnode['ticket_id'] = ""
82                         actnode['prev_category'] = "NORECORD" 
83                 else:
84                         actnode['prev_category']= actnode['category']
85                         actnode['comonstats']   = fbnode['comonstats']
86                         actnode['category']             = fbnode['category']
87                         actnode['state']                = fbnode['state']
88                         actnode['kernel']               = fbnode['kernel']
89                         actnode['bootcd']               = fbnode['bootcd']
90                         actnode['plcnode']              = fbnode['plcnode']
91                         ticket = get_ticket_id(actnode)
92                         if ticket is None: actnode['ticket_id'] = ""
93                         actnode['rt'] = mailer.getTicketStatus(ticket)
94
95                         #for key in actnode.keys():
96                         #       print "%10s %s %s " % (key, "==", actnode[key])
97                         #print "----------------------------"
98
99                 return actnode
100
101         def run(self):
102                 fbnode = self.getFBRecord()
103                 actnode= self.getActionRecord()
104                 actrec = self.mergeRecord(fbnode, actnode)
105                 record = Record(self.hostname, actrec)
106                 diag   = self.diagnose(record)
107                 if self.act and diag is not None:
108                         self.action(record,diag)
109         
110         def diagnose(self, record):
111
112                 diag = PersistFlags(record.hostname, 60*60*24, db='persist_diagnose_flags')
113                 # NOTE: change record stage based on RT status.
114                 diag.setFlag('ResetStage')
115                 if record.stageIswaitforever():
116                         ticket = record.data['rt']
117                         if 'new' in ticket['Status']:
118                                 diag.setFlag('ResetStage')
119                                 
120                         if 'resolved' in ticket['Status']:
121                                 diag.setFlag('EndRecord')
122
123                 # NOTE: take category, and prepare action
124                 category = record.getCategory()
125                 if category == "error":
126                         diag.setFlag('SendNodedown')
127                         record.data['message'] = emailTxt.mailtxt.newdown
128                         record.data['log'] = self.getDownLog(record)
129
130                 elif category == "prod":
131                         state = record.getState()
132                         if state == "boot":
133                                 diag.setFlag('SendThankyou')
134                                 record.data['message'] = emailTxt.mailtxt.newthankyou
135                                 record.data['log'] = self.getThankyouLog(record)
136
137                         elif state == "debug":
138                                 pass
139                         else:
140                                 print "unknown state %s for host %s" % (state, self.hostname)
141                 else:
142                         print "unknown category: %s" % category
143
144                 if diag.getFlag('ResetStage'):
145                         print "resetting stage"
146                         record.reset_stage()
147
148                 record = self.checkStageAndTime(diag,record)
149                 if record:
150                         print "checkStageAndTime Returned Valid Record"
151                         site = PersistFlags(self.loginbase, 1, db='site_persistflags')
152
153                         if site.status is not "good":
154                                 print "Setting site %s for 'squeeze'" % self.loginbase
155                                 diag.setFlag('Squeeze')
156                         else:
157                                 print "Setting site %s for 'backoff'" % self.loginbase
158                                 diag.setFlag('BackOff')
159
160                         diag.save()
161                         return diag
162                 else:
163                         print "checkStageAndTime Returned NULL Record"
164                         return None
165
166         def action(self, record, diag):
167                 if record.improved() or diag.getFlag('EndRecord'):
168                         print "end record for %s" % self.hostname
169                         record.end_record()
170                         diag.setFlag('CloseRT')
171                         return None
172
173                 if self.getSendEmailFlag(record): 
174                         print "sending email"
175                         message = record.getMessage(record.data['ticket_id'])
176                         message.reset()
177                         message.send(record.getContacts())
178                         if message.rt.ticket_id:
179                                 print "setting record ticket_id"
180                                 record.data['ticket_id'] = message.rt.ticket_id
181                         if diag.getFlag('CloseRT'):
182                                 message.rt.closeTicket()
183                 else:
184                         print "NOT sending email : %s %s" % (config.mail, record.data['rt'])
185
186                 if record.data['takeaction'] and diag.getFlag('Squeeze'):
187                         print "taking action"
188                         record.takeAction()
189
190                 print "saving act_all db"
191                 self.add_and_save_act_all(record)
192
193                 return
194
195         def getSendEmailFlag(self, record):
196                 if not config.mail:
197                         return False
198
199                 # resend if open & created longer than 30 days ago.
200                 if  'rt' in record.data and \
201                         'Status' in record.data['rt'] and \
202                         "open" in record.data['rt']['Status'] and \
203                         record.data['rt']['Created'] < 60*60*24*30:
204                         return False
205
206                 return True
207
208         def add_and_save_act_all(self, record):
209                 self.act_all = database.dbLoad("act_all")
210                 self.act_all[self.hostname].insert(0,record.data)
211                 database.dbDump("act_all", self.act_all)
212                 
213         def getDownLog(self, record):
214
215                 record.data['args'] = {'nodename': self.hostname}
216                 record.data['info'] = (self.hostname, Record.getStrDaysDown(record.data), "")
217
218                 #for key in record.data.keys():
219                 #       print "%10s %s %s " % (key, "==", record.data[key])
220
221                 if record.data['ticket_id'] == "":
222                         log = "DOWN: %20s : %-40s == %20s %s" % \
223                                 (self.loginbase, self.hostname, record.data['info'][1:], record.data['found_rt_ticket'])
224                 else:
225                         log = "DOWN: %20s : %-40s == %20s %s" % \
226                                 (self.loginbase, self.hostname, record.data['info'][1:], record.data['ticket_id'])
227                 return log
228
229         def getThankyouLog(self, record):
230
231                 record.data['args'] = {'nodename': self.hostname}
232                 record.data['info'] = (self.hostname, record.data['prev_category'], record.data['category'])
233
234                 if record.data['ticket_id'] == "":
235                         log = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
236                                                 (self.loginbase, self.hostname, record.data['stage'], 
237                                                  state, category, record.data['found_rt_ticket'])
238                 else:
239                         log = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
240                                                 (self.loginbase, self.hostname, record.data['stage'], 
241                                                  state, category, record.data['ticket_id'])
242                 return log
243
244         def checkStageAndTime(self, diag, record):
245                 current_time = time.time()
246                 delta = current_time - record.data['time']
247                 if   'findbad' in record.data['stage']:
248                         # The node is bad, and there's no previous record of it.
249                         record.data['email'] = TECH
250                         record.data['action'] = ['noop']
251                         record.data['takeaction'] = False
252                         record.data['message'] = record.data['message'][0]
253                         record.data['stage'] = 'stage_actinoneweek'
254
255                 elif 'reboot_node' in record.data['stage']:
256                         record.data['email'] = TECH
257                         record.data['action'] = ['noop']
258                         record.data['message'] = record.data['message'][0]
259                         record.data['stage'] = 'stage_actinoneweek'
260                         record.data['takeaction'] = False
261                         
262                 elif 'improvement' in record.data['stage']:
263                         print "backing off of %s" % self.hostname
264                         record.data['action'] = ['close_rt']
265                         record.data['takeaction'] = True
266                         record.data['message'] = record.data['message'][0]
267                         record.data['stage'] = 'monitor-end-record'
268
269                 elif 'actinoneweek' in record.data['stage']:
270                         if delta >= 7 * SPERDAY: 
271                                 record.data['email'] = TECH | PI
272                                 record.data['stage'] = 'stage_actintwoweeks'
273                                 record.data['message'] = record.data['message'][1]
274                                 record.data['action'] = ['nocreate' ]
275                                 record.data['time'] = current_time              # reset clock for waitforever
276                                 record.data['takeaction'] = True
277                         elif delta >= 3* SPERDAY and not 'second-mail-at-oneweek' in record.data:
278                                 record.data['email'] = TECH 
279                                 record.data['message'] = record.data['message'][0]
280                                 record.data['action'] = ['sendmailagain-waitforoneweekaction' ]
281                                 record.data['second-mail-at-oneweek'] = True
282                                 record.data['takeaction'] = False
283                         else:
284                                 record.data['message'] = None
285                                 record.data['action'] = ['waitforoneweekaction' ]
286                                 print "ignoring this record for: %s" % self.hostname
287                                 return None                     # don't send if there's no action
288
289                 elif 'actintwoweeks' in record.data['stage']:
290                         if delta >= 7 * SPERDAY:
291                                 record.data['email'] = TECH | PI | USER
292                                 record.data['stage'] = 'stage_waitforever'
293                                 record.data['message'] = record.data['message'][2]
294                                 record.data['action'] = ['suspendslices']
295                                 record.data['time'] = current_time              # reset clock for waitforever
296                                 record.data['takeaction'] = True
297                         elif delta >= 3* SPERDAY and not 'second-mail-at-twoweeks' in record.data:
298                                 record.data['email'] = TECH | PI
299                                 record.data['message'] = record.data['message'][1]
300                                 record.data['action'] = ['sendmailagain-waitfortwoweeksaction' ]
301                                 record.data['second-mail-at-twoweeks'] = True
302                                 record.data['takeaction'] = False
303                         else:
304                                 record.data['message'] = None
305                                 record.data['action'] = ['waitfortwoweeksaction']
306                                 return None                     # don't send if there's no action
307
308                 elif 'ticket_waitforever' in record.data['stage']:
309                         record.data['email'] = TECH
310                         record.data['takeaction'] = True
311                         if 'first-found' not in record.data:
312                                 record.data['first-found'] = True
313                                 record.data['log'] += " firstfound"
314                                 record.data['action'] = ['ticket_waitforever']
315                                 record.data['message'] = None
316                                 record.data['time'] = current_time
317                         else:
318                                 if delta >= 7*SPERDAY:
319                                         record.data['action'] = ['ticket_waitforever']
320                                         record.data['message'] = None
321                                         record.data['time'] = current_time              # reset clock
322                                 else:
323                                         record.data['action'] = ['ticket_waitforever']
324                                         record.data['message'] = None
325                                         return None
326
327                 elif 'waitforever' in record.data['stage']:
328                         # more than 3 days since last action
329                         # TODO: send only on weekdays.
330                         # NOTE: expects that 'time' has been reset before entering waitforever stage
331                         record.data['takeaction'] = True
332                         if delta >= 3*SPERDAY:
333                                 record.data['action'] = ['email-againwaitforever']
334                                 record.data['message'] = record.data['message'][2]
335                                 record.data['time'] = current_time              # reset clock
336                         else:
337                                 record.data['action'] = ['waitforever']
338                                 record.data['message'] = None
339                                 return None                     # don't send if there's no action
340
341                 else:
342                         # There is no action to be taken, possibly b/c the stage has
343                         # already been performed, but diagnose picked it up again.
344                         # two cases, 
345                         #       1. stage is unknown, or 
346                         #       2. delta is not big enough to bump it to the next stage.
347                         # TODO: figure out which. for now assume 2.
348                         print "UNKNOWN stage for %s; nothing done" % self.hostname
349                         record.data['action'] = ['unknown']
350                         record.data['message'] = record.data['message'][0]
351
352                         record.data['email'] = TECH
353                         record.data['action'] = ['noop']
354                         record.data['message'] = record.data['message'][0]
355                         record.data['stage'] = 'stage_actinoneweek'
356                         record.data['time'] = current_time              # reset clock
357                         record.data['takeaction'] = False
358
359                 print "%s" % record.data['log'],
360                 print "%15s" % record.data['action']
361                 return record
362