AM nagios/plc2nagios.py
[monitor.git] / clean_policy.py
1 from config import config
2 #print "policy"
3 config = config()
4 import soltesz
5 import time
6 import mailer
7 from www.printbadnodes import cmpCategoryVal
8 import sys
9 import emailTxt
10 import string
11
12 from policy import get_ticket_id, print_stats, close_rt_backoff, reboot_node
13 from rt import is_host_in_rt_tickets
14 import plc
15
# Time to enforce policy.
# NOTE(review): presumably an interval in seconds (7200 s = 2 h); the value is
# not referenced anywhere in the visible part of this file -- confirm usage.
POLSLEEP = 7200

# Where to email the summary.
# NOTE(review): not referenced in the visible part of this file either.
SUMTO = "soltesz@cs.princeton.edu"
21
22 from const import *
23
24 from unified_model import *
25
26 class MonitorMergeDiagnoseSendEscellate:
27         def __init__(self, hostname, act):
28                 self.hostname = hostname
29                 self.act = act
30                 self.plcdb_hn2lb = None
31                 if self.plcdb_hn2lb is None:
32                         self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")
33                 self.loginbase = self.plcdb_hn2lb[self.hostname]
34                 return
35
36         def getFBRecord(self):
37                 fb = soltesz.dbLoad("findbad")
38                 if self.hostname in fb['nodes']:
39                         fbnode = fb['nodes'][self.hostname]['values']
40                 else:
41                         raise Exception("Hostname %s not in scan database"% self.hostname)
42                 return fbnode
43
44         def getActionRecord(self):
45                 # update ticket status
46                 act_all = soltesz.dbLoad("act_all")
47                 if self.hostname in act_all and len(act_all[self.hostname]) > 0:
48                         actnode = act_all[self.hostname][0]
49                 else:
50                         actnode = None
51                 del act_all
52                 return actnode
53
54         def getKernel(self, unamestr):
55                 s = unamestr.split()
56                 if len(s) > 2:
57                         return s[2]
58                 else:
59                         return ""
60
61         def mergeRecord(self, fbnode, actnode):
62                 fbnode['kernel'] = self.getKernel(fbnode['kernel'])
63                 fbnode['stage'] = "findbad"
64                 fbnode['message'] = None
65                 fbnode['args'] = None
66                 fbnode['info'] = None
67                 fbnode['log'] = None
68                 fbnode['time'] = time.time()
69                 fbnode['date_created'] = time.time()
70
71                 if actnode is None:
72                         actnode = {} 
73                         actnode.update(fbnode)
74                         actnode['ticket_id'] = ""
75                         actnode['prev_category'] = "NORECORD" 
76                 else:
77                         actnode['prev_category']= actnode['category']
78                         actnode['comonstats']   = fbnode['comonstats']
79                         actnode['category']             = fbnode['category']
80                         actnode['state']                = fbnode['state']
81                         actnode['kernel']               = fbnode['kernel']
82                         actnode['bootcd']               = fbnode['bootcd']
83                         actnode['plcnode']              = fbnode['plcnode']
84                         ticket = get_ticket_id(actnode)
85                         if ticket is None: actnode['ticket_id'] = ""
86                         actnode['rt'] = mailer.getTicketStatus(ticket)
87
88                         #for key in actnode.keys():
89                         #       print "%10s %s %s " % (key, "==", actnode[key])
90                         #print "----------------------------"
91
92                 return actnode
93
94         def run(self):
95                 fbnode = self.getFBRecord()
96                 actnode= self.getActionRecord()
97                 actrec = self.mergeRecord(fbnode, actnode)
98                 record = Record(self.hostname, actrec)
99                 diag   = self.diagnose(record)
100                 if self.act and diag is not None:
101                         self.action(record,diag)
102         
103         def diagnose(self, record):
104
105                 diag = PersistFlags(record.hostname, 60*60*24, db='persist_diagnose_flags')
106                 # NOTE: change record stage based on RT status.
107                 diag.setFlag('ResetStage')
108                 if record.stageIswaitforever():
109                         ticket = record.data['rt']
110                         if 'new' in ticket['Status']:
111                                 diag.setFlag('ResetStage')
112                                 
113                         if 'resolved' in ticket['Status']:
114                                 diag.setFlag('EndRecord')
115
116                 # NOTE: take category, and prepare action
117                 category = record.getCategory()
118                 if category == "error":
119                         diag.setFlag('SendNodedown')
120                         record.data['message'] = emailTxt.mailtxt.newdown
121                         record.data['log'] = self.getDownLog(record)
122
123                 elif category == "prod":
124                         state = diag.getState()
125                         if state == "boot":
126                                 diag.setFlag('SendThankyou')
127                                 record.data['message'] = emailTxt.mailtxt.newthankyou
128                                 record.data['log'] = self.getThankyouLog(record)
129
130                         elif state == "debug":
131                                 pass
132                         else:
133                                 print "unknown state %s for host %s" % (state, self.hostname)
134                 else:
135                         print "unknown category: %s" % category
136
137                 if diag.getFlag('ResetStage'):
138                         print "resetting stage"
139                         record.reset_stage()
140
141                 record = self.checkStageAndTime(diag,record)
142                 if record:
143                         print "checkStageAndTime Returned Valid Record"
144                         site = PersistFlags(self.loginbase, 1, db='site_persistflags')
145
146                         if site.status is not "good":
147                                 print "Setting site %s for 'squeeze'" % self.loginbase
148                                 diag.setFlag('Squeeze')
149                         else:
150                                 print "Setting site %s for 'backoff'" % self.loginbase
151                                 diag.setFlag('BackOff')
152
153                         diag.save()
154                         return diag
155                 else:
156                         print "checkStageAndTime Returned NULL Record"
157                         return None
158
159         def action(self, record, diag):
160                 if record.improved() or diag.getFlag('EndRecord'):
161                         print "end record for %s" % self.hostname
162                         record.end_record()
163                         diag.setFlag('CloseRT')
164                         return None
165
166                 if self.getSendEmailFlag(record): 
167                         print "sending email"
168                         message = record.getMessage(record.data['ticket_id'])
169                         message.reset()
170                         message.send(record.getContacts())
171                         if message.rt.ticket_id:
172                                 print "setting record ticket_id"
173                                 record.data['ticket_id'] = message.rt.ticket_id
174                         if diag.getFlag('CloseRT'):
175                                 message.rt.closeTicket()
176                 else:
177                         print "NOT sending email : %s %s" % (config.mail, record.data['rt'])
178
179                 if record.data['takeaction'] and diag.getFlag('Squeeze'):
180                         print "taking action"
181                         record.takeAction()
182
183                 print "saving act_all db"
184                 self.add_and_save_act_all(record)
185
186                 return
187
188         def getSendEmailFlag(self, record):
189                 if not config.mail:
190                         return False
191
192                 # resend if open & created longer than 30 days ago.
193                 if  'rt' in record.data and \
194                         'Status' in record.data['rt'] and \
195                         "open" in record.data['rt']['Status'] and \
196                         record.data['rt']['Created'] < 60*60*24*30:
197                         return False
198
199                 return True
200
201         def add_and_save_act_all(self, record):
202                 self.act_all = soltesz.dbLoad("act_all")
203                 self.act_all[self.hostname].insert(0,record.data)
204                 soltesz.dbDump("act_all", self.act_all)
205                 
206         def getDownLog(self, record):
207
208                 record.data['args'] = {'nodename': self.hostname}
209                 record.data['info'] = (self.hostname, Record.getStrDaysDown(record.data), "")
210
211                 #for key in record.data.keys():
212                 #       print "%10s %s %s " % (key, "==", record.data[key])
213
214                 if record.data['ticket_id'] == "":
215                         log = "DOWN: %20s : %-40s == %20s %s" % \
216                                 (self.loginbase, self.hostname, record.data['info'][1:], record.data['found_rt_ticket'])
217                 else:
218                         log = "DOWN: %20s : %-40s == %20s %s" % \
219                                 (self.loginbase, self.hostname, record.data['info'][1:], record.data['ticket_id'])
220                 return log
221
222         def getThankyouLog(self, record):
223
224                 record.data['args'] = {'nodename': self.hostname}
225                 record.data['info'] = (self.hostname, record.data['prev_category'], record.data['category'])
226
227                 if record.data['ticket_id'] == "":
228                         log = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
229                                                 (self.loginbase, self.hostname, record.data['stage'], 
230                                                  state, category, record.data['found_rt_ticket'])
231                 else:
232                         log = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
233                                                 (self.loginbase, self.hostname, record.data['stage'], 
234                                                  state, category, record.data['ticket_id'])
235                 return log
236
237         def checkStageAndTime(self, diag, record):
238                 current_time = time.time()
239                 delta = current_time - record.data['time']
240                 if   'findbad' in record.data['stage']:
241                         # The node is bad, and there's no previous record of it.
242                         record.data['email'] = TECH
243                         record.data['action'] = ['noop']
244                         record.data['takeaction'] = False
245                         record.data['message'] = record.data['message'][0]
246                         record.data['stage'] = 'stage_actinoneweek'
247
248                 elif 'reboot_node' in record.data['stage']:
249                         record.data['email'] = TECH
250                         record.data['action'] = ['noop']
251                         record.data['message'] = record.data['message'][0]
252                         record.data['stage'] = 'stage_actinoneweek'
253                         record.data['takeaction'] = False
254                         
255                 elif 'improvement' in record.data['stage']:
256                         print "backing off of %s" % self.hostname
257                         record.data['action'] = ['close_rt']
258                         record.data['takeaction'] = True
259                         record.data['message'] = record.data['message'][0]
260                         record.data['stage'] = 'monitor-end-record'
261
262                 elif 'actinoneweek' in record.data['stage']:
263                         if delta >= 7 * SPERDAY: 
264                                 record.data['email'] = TECH | PI
265                                 record.data['stage'] = 'stage_actintwoweeks'
266                                 record.data['message'] = record.data['message'][1]
267                                 record.data['action'] = ['nocreate' ]
268                                 record.data['time'] = current_time              # reset clock for waitforever
269                                 record.data['takeaction'] = True
270                         elif delta >= 3* SPERDAY and not 'second-mail-at-oneweek' in record.data:
271                                 record.data['email'] = TECH 
272                                 record.data['message'] = record.data['message'][0]
273                                 record.data['action'] = ['sendmailagain-waitforoneweekaction' ]
274                                 record.data['second-mail-at-oneweek'] = True
275                                 record.data['takeaction'] = False
276                         else:
277                                 record.data['message'] = None
278                                 record.data['action'] = ['waitforoneweekaction' ]
279                                 print "ignoring this record for: %s" % self.hostname
280                                 return None                     # don't send if there's no action
281
282                 elif 'actintwoweeks' in record.data['stage']:
283                         if delta >= 7 * SPERDAY:
284                                 record.data['email'] = TECH | PI | USER
285                                 record.data['stage'] = 'stage_waitforever'
286                                 record.data['message'] = record.data['message'][2]
287                                 record.data['action'] = ['suspendslices']
288                                 record.data['time'] = current_time              # reset clock for waitforever
289                                 record.data['takeaction'] = True
290                         elif delta >= 3* SPERDAY and not 'second-mail-at-twoweeks' in record.data:
291                                 record.data['email'] = TECH | PI
292                                 record.data['message'] = record.data['message'][1]
293                                 record.data['action'] = ['sendmailagain-waitfortwoweeksaction' ]
294                                 record.data['second-mail-at-twoweeks'] = True
295                                 record.data['takeaction'] = False
296                         else:
297                                 record.data['message'] = None
298                                 record.data['action'] = ['waitfortwoweeksaction']
299                                 return None                     # don't send if there's no action
300
301                 elif 'ticket_waitforever' in record.data['stage']:
302                         record.data['email'] = TECH
303                         record.data['takeaction'] = True
304                         if 'first-found' not in record.data:
305                                 record.data['first-found'] = True
306                                 record.data['log'] += " firstfound"
307                                 record.data['action'] = ['ticket_waitforever']
308                                 record.data['message'] = None
309                                 record.data['time'] = current_time
310                         else:
311                                 if delta >= 7*SPERDAY:
312                                         record.data['action'] = ['ticket_waitforever']
313                                         record.data['message'] = None
314                                         record.data['time'] = current_time              # reset clock
315                                 else:
316                                         record.data['action'] = ['ticket_waitforever']
317                                         record.data['message'] = None
318                                         return None
319
320                 elif 'waitforever' in record.data['stage']:
321                         # more than 3 days since last action
322                         # TODO: send only on weekdays.
323                         # NOTE: expects that 'time' has been reset before entering waitforever stage
324                         record.data['takeaction'] = True
325                         if delta >= 3*SPERDAY:
326                                 record.data['action'] = ['email-againwaitforever']
327                                 record.data['message'] = record.data['message'][2]
328                                 record.data['time'] = current_time              # reset clock
329                         else:
330                                 record.data['action'] = ['waitforever']
331                                 record.data['message'] = None
332                                 return None                     # don't send if there's no action
333
334                 else:
335                         # There is no action to be taken, possibly b/c the stage has
336                         # already been performed, but diagnose picked it up again.
337                         # two cases, 
338                         #       1. stage is unknown, or 
339                         #       2. delta is not big enough to bump it to the next stage.
340                         # TODO: figure out which. for now assume 2.
341                         print "UNKNOWN stage for %s; nothing done" % self.hostname
342                         record.data['action'] = ['unknown']
343                         record.data['message'] = record.data['message'][0]
344
345                         record.data['email'] = TECH
346                         record.data['action'] = ['noop']
347                         record.data['message'] = record.data['message'][0]
348                         record.data['stage'] = 'stage_actinoneweek'
349                         record.data['time'] = current_time              # reset clock
350                         record.data['takeaction'] = False
351
352                 print "%s" % record.data['log'],
353                 print "%15s" % record.data['action']
354                 return record
355