corrected a bug in reporting nmreset errors.
[monitor.git] / policy.py
1 #
2 # Copyright (c) 2004  The Trustees of Princeton University (Trustees).
3 #
4 # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
5 #
6 # $Id: policy.py,v 1.17 2007/08/29 17:26:50 soltesz Exp $
7 #
8 # Policy Engine.
9
10 #from monitor import *
11 from threading import *
12 import time
13 import logging
14 import mailer
15 import emailTxt
16 import pickle
17 import Queue
18 import plc
19 import sys
20 import reboot
21 import soltesz
22 import string
23 from www.printbadnodes import cmpCategoryVal
24 from config import config
25 print "policy"
26 config = config()
27
28 DAT="./monitor.dat"
29
30 logger = logging.getLogger("monitor")
31
32 # Time to enforce policy
33 POLSLEEP = 7200
34
35 # Where to email the summary
36 SUMTO = "soltesz@cs.princeton.edu"
37 TECHEMAIL="tech-%s@sites.planet-lab.org"
38 PIEMAIL="pi-%s@sites.planet-lab.org"
39 SLICEMAIL="%s@slices.planet-lab.org"
40 PLCEMAIL="support@planet-lab.org"
41
42 #Thresholds (DAYS)
43 SPERMIN = 60
44 SPERHOUR = 60*60
45 SPERDAY = 86400
46 PITHRESH = 7 * SPERDAY
47 SLICETHRESH = 7 * SPERDAY
48 # Days before attempting rins again
49 RINSTHRESH = 5 * SPERDAY
50
51 # Days before calling the node dead.
52 DEADTHRESH = 30 * SPERDAY
53 # Minimum number of nodes up before squeezing
54 MINUP = 2
55
56 TECH=1
57 PI=2
58 USER=4
59 ADMIN=8
60
61 # IF:
62 #  no SSH, down.
63 #  bad disk, down
64 #  DNS, kinda down (sick)
65 #  clock, kinda down (sick)
66 #  Full disk, going to be down
67
68 # Actions:
69 #  Email
70 #  suspend slice creation
71 #  kill slices
72 def array_to_priority_map(array):
73         """ Create a mapping where each entry of array is given a priority equal
74         to its position in the array.  This is useful for subsequent use in the
75         cmpMap() function."""
76         map = {}
77         count = 0
78         for i in array:
79                 map[i] = count
80                 count += 1
81         return map
82
83 def getdebug():
84         return config.debug
85
86 def print_stats(key, stats):
87         if key in stats: print "%20s : %d" % (key, stats[key])
88
89 class Merge(Thread):
90         def __init__(self, l_merge, toRT):
91                 self.toRT = toRT
92                 self.merge_list = l_merge
93                 # the hostname to loginbase mapping
94                 self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")
95
96                 # Previous actions taken on nodes.
97                 self.act_all = soltesz.if_cached_else(1, "act_all", lambda : {})
98                 self.findbad = soltesz.if_cached_else(1, "findbad", lambda : {})
99
100                 self.cache_all = soltesz.if_cached_else(1, "act_all", lambda : {})
101                 self.sickdb = {}
102                 self.mergedb = {}
103                 Thread.__init__(self)
104
105         def run(self):
106                 # populate sickdb
107                 self.accumSickSites()
108                 # read data from findbad and act_all
109                 self.mergeActionsAndBadDB()
110                 # pass node_records to RT
111                 self.sendToRT()
112
113         def accumSickSites(self):
114                 """
115                 Take all nodes, from l_diagnose, look them up in the act_all database, 
116                 and insert them into sickdb[] as:
117
118                         sickdb[loginbase][nodename] = fb_record
119                 """
120                 # look at all problems reported by findbad
121                 l_nodes = self.findbad['nodes'].keys()
122                 count = 0
123                 for nodename in l_nodes:
124                         if nodename not in self.merge_list:
125                                 continue                # skip this node, since it's not wanted
126
127                         count += 1
128                         loginbase = self.plcdb_hn2lb[nodename]
129                         values = self.findbad['nodes'][nodename]['values']
130
131                         fb_record = {}
132                         fb_record['nodename'] = nodename
133                         fb_record['category'] = values['category']
134                         fb_record['state'] = values['state']
135                         fb_record['comonstats'] = values['comonstats']
136                         fb_record['plcnode'] = values['plcnode']
137                         fb_record['kernel'] = self.getKernel(values['kernel'])
138                         fb_record['stage'] = "findbad"
139                         fb_record['message'] = None
140                         fb_record['bootcd'] = values['bootcd']
141                         fb_record['args'] = None
142                         fb_record['info'] = None
143                         fb_record['time'] = time.time()
144                         fb_record['date_created'] = time.time()
145
146                         if loginbase not in self.sickdb:
147                                 self.sickdb[loginbase] = {}
148
149                         self.sickdb[loginbase][nodename] = fb_record
150
151                 print "Found %d nodes" % count
152
153         def getKernel(self, unamestr):
154                 s = unamestr.split()
155                 if len(s) > 2:
156                         return s[2]
157                 else:
158                         return ""
159
160         def mergeActionsAndBadDB(self): 
161                 """
162                 - Look at the sick node_records as reported in findbad, 
163                 - Then look at the node_records in act_all.  
164
165                 There are four cases:
166                 1) Problem in findbad, no problem in act_all
167                         this ok, b/c it just means it's a new problem
168                 2) Problem in findbad, problem in act_all
169                         -Did the problem get better or worse?  
170                                 -If Same, or Worse, then continue looking for open tickets.
171                                 -If Better, or No problem, then "back-off" penalties.
172                                         This judgement may need to wait until 'Diagnose()'
173
174                 3) No problem in findbad, problem in act_all
175                         The the node is operational again according to Findbad()
176
177                 4) No problem in findbad, no problem in act_all
178                         There won't be a record in either db, so there's no code.
179                 """
180
181                 sorted_sites = self.sickdb.keys()
182                 sorted_sites.sort()
183                 # look at all problems reported by findbad
184                 for loginbase in sorted_sites:
185                         d_fb_nodes = self.sickdb[loginbase]
186                         sorted_nodes = d_fb_nodes.keys()
187                         sorted_nodes.sort()
188                         for nodename in sorted_nodes:
189                                 fb_record = self.sickdb[loginbase][nodename]
190                                 x = fb_record
191                                 if loginbase not in self.mergedb:
192                                         self.mergedb[loginbase] = {}
193
194                                 # We must compare findbad state with act_all state
195                                 if nodename not in self.act_all:
196                                         # 1) ok, b/c it's a new problem. set ticket_id to null
197                                         self.mergedb[loginbase][nodename] = {} 
198                                         self.mergedb[loginbase][nodename].update(x)
199                                         self.mergedb[loginbase][nodename]['ticket_id'] = ""
200                                         self.mergedb[loginbase][nodename]['prev_category'] = None
201                                 else: 
202                                         if len(self.act_all[nodename]) == 0:
203                                                 print "len(act_all[%s]) == 0, skipping %s %s" % (nodename, loginbase, nodename)
204                                                 continue
205
206                                         y = self.act_all[nodename][0]
207
208                                         # skip if end-stage
209                                         if 'stage' in y and "monitor-end-record" in y['stage']:
210                                                 # 1) ok, b/c it's a new problem. set ticket_id to null
211                                                 self.mergedb[loginbase][nodename] = {} 
212                                                 self.mergedb[loginbase][nodename].update(x)
213                                                 self.mergedb[loginbase][nodename]['ticket_id'] = ""
214                                                 self.mergedb[loginbase][nodename]['prev_category'] = None
215                                                 continue
216
217                                         ## for legacy actions
218                                         #if 'bucket' in y and y['bucket'][0] == 'dbg':
219                                         #       # Only bootcd debugs made it to the act_all db.
220                                         #       y['prev_category'] = "OLDBOOTCD"
221                                         #elif 'bucket' in y and y['bucket'][0] == 'down':
222                                         #       y['prev_category'] = "ERROR"
223                                         #elif 'bucket' not in y:
224                                         #       # for all other actions, just carry over the
225                                         #       # previous category
226                                         #       y['prev_category'] = y['category']
227                                         #else:
228                                         #       print "UNKNOWN state for record: %s" % y
229                                         #       sys.exit(1)
230
231                                         # determine through translation, if the buckets match
232                                         #if 'category' in y and x['category'] == y['category']:
233                                         #       b_match = True
234                                         #elif x['category'] == "OLDBOOTCD" and y['bucket'][0] == 'dbg':
235                                         #       b_match = True
236                                         #elif x['category'] == "ERROR" and y['bucket'][0] == 'down':
237                                         #       b_match = True
238                                         #else:
239                                         #       b_match = False
240
241                                         #if b_match: 
242                                         #       # 2b) ok, b/c they agree that there's still a problem..
243                                         #       # 2b) Comon & Monitor still agree; RT ticket?
244                                         #       y['prev_category'] = y['category']
245                                         #else:
246                                         #       # 2a) mismatch, need a policy for how to resolve
247                                         #       #     resolution will be handled in __diagnoseNode()
248                                         #       #         for now just record the two categories.
249                                         #       #if x['category'] == "PROD" and x['state'] == "BOOT" and \
250                                         #       # ( y['bucket'][0] == 'down' or  y['bucket'][0] == 'dbg'):
251                                         #       print "FINDBAD and MONITOR have a mismatch: %s vs %s" % \
252                                         #                               (x['category'], y['bucket'])
253
254
255                                         self.mergedb[loginbase][nodename] = {}
256                                         self.mergedb[loginbase][nodename].update(y)
257                                         self.mergedb[loginbase][nodename]['comonstats'] = x['comonstats']
258                                         self.mergedb[loginbase][nodename]['category']   = x['category']
259                                         self.mergedb[loginbase][nodename]['state'] = x['state']
260                                         self.mergedb[loginbase][nodename]['kernel']=x['kernel']
261                                         self.mergedb[loginbase][nodename]['bootcd']=x['bootcd']
262                                         self.mergedb[loginbase][nodename]['plcnode']=x['plcnode']
263                                         # delete the entry from cache_all to keep it out of case 3)
264                                         del self.cache_all[nodename]
265
266                 # 3) nodes that remin in cache_all were not identified by findbad.
267                 #        Do we keep them or not?
268                 #   NOTE: i think that since the categories are performed before this
269                 #               step now, and by a monitor-controlled agent.
270
271                 # TODO: This does not work correctly.  Do we need this? 
272                 #for hn in self.cache_all.keys():
273                 #       y = self.act_all[hn][0]
274                 #       if 'monitor' in y['bucket']:
275                 #               loginbase = self.plcdb_hn2lb[hn] 
276                 #               if loginbase not in self.sickdb:
277                 #                       self.sickdb[loginbase] = {}
278                 #               self.sickdb[loginbase][hn] = y
279                 #       else:
280                 #               del self.cache_all[hn]
281
282                 print "len of cache_all: %d" % len(self.cache_all.keys())
283                 return
284
285         def sendToRT(self):
286                 sorted_sites = self.mergedb.keys()
287                 sorted_sites.sort()
288                 # look at all problems reported by merge
289                 for loginbase in sorted_sites:
290                         d_merge_nodes = self.mergedb[loginbase]
291                         for nodename in d_merge_nodes.keys():
292                                 record = self.mergedb[loginbase][nodename]
293                                 self.toRT.put(record)
294
295                 # send signal to stop reading
296                 self.toRT.put(None)
297                 return
298
299 class Diagnose(Thread):
300         def __init__(self, fromRT):
301                 self.fromRT = fromRT
302                 self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")
303                 self.findbad = soltesz.if_cached_else(1, "findbad", lambda : {})
304
305                 self.diagnose_in = {}
306                 self.diagnose_out = {}
307                 Thread.__init__(self)
308
309
310         def run(self):
311                 self.accumSickSites()
312
313                 print "Accumulated %d sick sites" % len(self.diagnose_in.keys())
314                 logger.debug("Accumulated %d sick sites" % len(self.diagnose_in.keys()))
315
316                 try:
317                         stats = self.diagnoseAll()
318                 except Exception, err:
319                         print "----------------"
320                         import traceback
321                         print traceback.print_exc()
322                         print err
323                         #if config.policysavedb:
324                         sys.exit(1)
325
326                 print_stats("sites_observed", stats)
327                 print_stats("sites_diagnosed", stats)
328                 print_stats("nodes_diagnosed", stats)
329
330                 if config.policysavedb:
331                         print "Saving Databases... diagnose_out"
332                         soltesz.dbDump("diagnose_out", self.diagnose_out)
333
334         def accumSickSites(self):
335                 """
336                 Take all nodes, from l_diagnose, look them up in the diagnose_out database, 
337                 and insert them into diagnose_in[] as:
338
339                         diagnose_in[loginbase] = [diag_node1, diag_node2, ...]
340                 """
341                 while 1:
342                         node_record = self.fromRT.get(block = True)
343                         if node_record == None:
344                                 break;
345
346                         nodename = node_record['nodename']
347                         loginbase = self.plcdb_hn2lb[nodename]
348
349                         if loginbase not in self.diagnose_in:
350                                 self.diagnose_in[loginbase] = {}
351
352                         self.diagnose_in[loginbase][nodename] = node_record
353
354                 return
355
356         def diagnoseAll(self):
357                 i_sites_observed = 0
358                 i_sites_diagnosed = 0
359                 i_nodes_diagnosed = 0
360                 i_nodes_actedon = 0
361                 i_sites_emailed = 0
362                 l_allsites = []
363
364                 sorted_sites = self.diagnose_in.keys()
365                 sorted_sites.sort()
366                 self.diagnose_out= {}
367                 for loginbase in sorted_sites:
368                         l_allsites += [loginbase]
369
370                         d_diag_nodes = self.diagnose_in[loginbase]
371                         d_act_records = self.__diagnoseSite(loginbase, d_diag_nodes)
372                         # store records in diagnose_out, for saving later.
373                         self.diagnose_out.update(d_act_records)
374                         
375                         if len(d_act_records[loginbase]['nodes'].keys()) > 0:
376                                 i_nodes_diagnosed += (len(d_act_records[loginbase]['nodes'].keys()))
377                                 i_sites_diagnosed += 1
378                         i_sites_observed += 1
379
380                 return {'sites_observed': i_sites_observed, 
381                                 'sites_diagnosed': i_sites_diagnosed, 
382                                 'nodes_diagnosed': i_nodes_diagnosed, 
383                                 'allsites':l_allsites}
384
385                 pass
386                 
387         def __getDaysDown(self, diag_record, nodename):
388                 daysdown = -1
389                 if diag_record['comonstats']['sshstatus'] != "null":
390                         daysdown = int(diag_record['comonstats']['sshstatus']) // (60*60*24)
391                 elif diag_record['comonstats']['lastcotop'] != "null":
392                         daysdown = int(diag_record['comonstats']['lastcotop']) // (60*60*24)
393                 else:
394                         now = time.time()
395                         last_contact = diag_record['plcnode']['last_contact']
396                         if last_contact == None:
397                                 # the node has never been up, so give it a break
398                                 daysdown = -1
399                         else:
400                                 diff = now - last_contact
401                                 daysdown = diff // (60*60*24)
402                 return daysdown
403
404         def __getStrDaysDown(self, diag_record, nodename):
405                 daysdown = self.__getDaysDown(diag_record, nodename)
406                 if daysdown > 0:
407                         return "(%d days down)"%daysdown
408                 else:
409                         return "Unknown number of days"
410
411         def __getCDVersion(self, diag_record, nodename):
412                 cdversion = ""
413                 #print "Getting kernel for: %s" % diag_record['nodename']
414                 cdversion = diag_record['kernel']
415                 return cdversion
416
417         def __diagnoseSite(self, loginbase, d_diag_nodes):
418                 """
419                 d_diag_nodes are diagnose_in entries.
420                 """
421                 d_diag_site = {loginbase : { 'config' : 
422                                                                                                 {'squeeze': False,
423                                                                                                  'email': False
424                                                                                                 }, 
425                                                                         'nodes': {}
426                                                                         }
427                                            }
428                 sorted_nodes = d_diag_nodes.keys()
429                 sorted_nodes.sort()
430                 for nodename in sorted_nodes:
431                         node_record = d_diag_nodes[nodename]
432                         diag_record = self.__diagnoseNode(loginbase, node_record)
433
434                         if diag_record != None:
435                                 d_diag_site[loginbase]['nodes'][nodename] = diag_record
436
437                                 # NOTE: improvement means, we need to act/squeeze and email.
438                                 #print "DIAG_RECORD", diag_record
439                                 if 'monitor-end-record' in diag_record['stage'] or \
440                                    'nmreset' in diag_record['stage']:
441                                 #       print "resetting loginbase!" 
442                                         d_diag_site[loginbase]['config']['squeeze'] = True
443                                         d_diag_site[loginbase]['config']['email'] = True
444                                 #else:
445                                 #       print "NO IMPROVEMENT!!!!"
446                         else:
447                                 pass # there is nothing to do for this node.
448
449                 # NOTE: these settings can be overridden by command line arguments,
450                 #       or the state of a record, i.e. if already in RT's Support Queue.
451                 nodes_up = self.getUpAtSite(loginbase, d_diag_site)
452                 if nodes_up < MINUP:
453                         d_diag_site[loginbase]['config']['squeeze'] = True
454
455                 max_slices = self.getMaxSlices(loginbase)
456                 num_nodes = self.getNumNodes(loginbase)
457                 # NOTE: when max_slices == 0, this is either a new site (the old way)
458                 #       or an old disabled site from previous monitor (before site['enabled'])
459                 if nodes_up < num_nodes and max_slices != 0:
460                         d_diag_site[loginbase]['config']['email'] = True
461
462                 if len(d_diag_site[loginbase]['nodes'].keys()) > 0:
463                         print "SITE: %20s : %d nodes up, at most" % (loginbase, nodes_up)
464
465                 return d_diag_site
466
467         def diagRecordByCategory(self, node_record):
468                 nodename = node_record['nodename']
469                 category = node_record['category']
470                 state    = node_record['state']
471                 loginbase = self.plcdb_hn2lb[nodename]
472                 diag_record = None
473
474                 if  "ERROR" in category:        # i.e. "DOWN"
475                         diag_record = {}
476                         diag_record.update(node_record)
477                         daysdown = self.__getDaysDown(diag_record, nodename) 
478                         if daysdown < 7:
479                                 format = "DIAG: %20s : %-40s Down only %s days  NOTHING DONE"
480                                 print format % (loginbase, nodename, daysdown)
481                                 return None
482
483                         s_daysdown = self.__getStrDaysDown(diag_record, nodename)
484                         diag_record['message'] = emailTxt.mailtxt.newdown
485                         diag_record['args'] = {'nodename': nodename}
486                         diag_record['info'] = (nodename, s_daysdown, "")
487                         if diag_record['ticket_id'] == "":
488                                 diag_record['log'] = "DOWN: %20s : %-40s == %20s %s" % \
489                                         (loginbase, nodename, diag_record['info'][1:], diag_record['found_rt_ticket'])
490                         else:
491                                 diag_record['log'] = "DOWN: %20s : %-40s == %20s %s" % \
492                                         (loginbase, nodename, diag_record['info'][1:], diag_record['ticket_id'])
493
494                 elif "OLDBOOTCD" in category:
495                         # V2 boot cds as determined by findbad
496                         s_daysdown = self.__getStrDaysDown(node_record, nodename)
497                         s_cdversion = self.__getCDVersion(node_record, nodename)
498                         diag_record = {}
499                         diag_record.update(node_record)
500                         #if "2.4" in diag_record['kernel'] or "v2" in diag_record['bootcd']:
501                         diag_record['message'] = emailTxt.mailtxt.newbootcd
502                         diag_record['args'] = {'nodename': nodename}
503                         diag_record['info'] = (nodename, s_daysdown, s_cdversion)
504                         if diag_record['ticket_id'] == "":
505                                 diag_record['log'] = "BTCD: %20s : %-40s == %20s %20s %s" % \
506                                                                         (loginbase, nodename, diag_record['kernel'], 
507                                                                          diag_record['bootcd'], diag_record['found_rt_ticket'])
508                         else:
509                                 diag_record['log'] = "BTCD: %20s : %-40s == %20s %20s %s" % \
510                                                                         (loginbase, nodename, diag_record['kernel'], 
511                                                                          diag_record['bootcd'], diag_record['ticket_id'])
512
513                 elif "PROD" in category:
514                         if "DEBUG" in state:
515                                 # Not sure what to do with these yet.  Probably need to
516                                 # reboot, and email.
517                                 print "DEBG: %20s : %-40s  NOTHING DONE" % (loginbase, nodename)
518                                 return None
519                         elif "BOOT" in state:
520                                 # no action needed.
521                                 # TODO: remove penalties, if any are applied.
522                                 now = time.time()
523                                 last_contact = node_record['plcnode']['last_contact']
524                                 if last_contact == None:
525                                         time_diff = 0
526                                 else:
527                                         time_diff = now - last_contact;
528
529                                 if 'improvement' in node_record['stage']:
530                                         # then we need to pass this on to 'action'
531                                         diag_record = {}
532                                         diag_record.update(node_record)
533                                         diag_record['message'] = emailTxt.mailtxt.newthankyou
534                                         diag_record['args'] = {'nodename': nodename}
535                                         diag_record['info'] = (nodename, node_record['prev_category'], 
536                                                                                                          node_record['category'])
537                                         if diag_record['ticket_id'] == "":
538                                                 diag_record['log'] = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
539                                                                         (loginbase, nodename, diag_record['stage'], 
540                                                                          state, category, diag_record['found_rt_ticket'])
541                                         else:
542                                                 diag_record['log'] = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
543                                                                         (loginbase, nodename, diag_record['stage'], 
544                                                                          state, category, diag_record['ticket_id'])
545                                         return diag_record
546                                 elif time_diff >= 6*SPERHOUR:
547                                         # heartbeat is older than 30 min.
548                                         # then reset NM.
549                                         #print "Possible NM problem!! %s - %s = %s" % (now, last_contact, time_diff)
550                                         diag_record = {}
551                                         diag_record.update(node_record)
552                                         diag_record['message'] = emailTxt.mailtxt.NMReset
553                                         diag_record['args'] = {'nodename': nodename}
554                                         diag_record['stage'] = "nmreset"
555                                         diag_record['info'] = (nodename, 
556                                                                                         node_record['prev_category'], 
557                                                                                         node_record['category'])
558                                         if diag_record['ticket_id'] == "":
559                                                 diag_record['log'] = "NM  : %20s : %-40s == %20s %20s %s %s" % \
560                                                                         (loginbase, nodename, diag_record['stage'], 
561                                                                          state, category, diag_record['found_rt_ticket'])
562                                         else:
563                                                 diag_record['log'] = "NM  : %20s : %-40s == %20s" % \
564                                                                         (loginbase, nodename, diag_record['stage'])
565
566                                         return diag_record
567                                 else:
568                                         return None
569                         else:
570                                 # unknown
571                                 pass
572                 elif "ALPHA"    in category:
573                         pass
574                 elif "clock_drift" in category:
575                         pass
576                 elif "dns"    in category:
577                         pass
578                 elif "filerw"    in category:
579                         pass
580                 else:
581                         print "Unknown category!!!! %s" % category
582                         sys.exit(1)
583
584                 return diag_record
585
586         def __diagnoseNode(self, loginbase, node_record):
587                 # TODO: change the format of the hostname in this 
588                 #               record to something more natural.
589                 nodename          = node_record['nodename']
590                 category          = node_record['category']
591                 prev_category = node_record['prev_category']
592                 state             = node_record['state']
593
594                 val = cmpCategoryVal(category, prev_category)
595                 if val == -1:
596                         # current category is worse than previous, carry on
597                         pass
598                 elif val == 1:
599                         # current category is better than previous
600                         # TODO: too generous for now, but will be handled correctly
601                         # TODO: if stage is currently ticket_waitforever, 
602                         if 'ticket_id' not in node_record:
603                                 print "ignoring: ", node_record['nodename']
604                                 return None
605                         else:
606                                 if node_record['ticket_id'] == "" or \
607                                    node_record['ticket_id'] == None:
608                                         print "closing: ", node_record['nodename']
609                                         node_record['action'] = ['close_rt']
610                                         node_record['message'] = None
611                                         node_record['stage'] = 'monitor-end-record'
612                                         return node_record
613                                         #return None
614                                 else:
615                                         node_record['stage'] = 'improvement'
616                 else:
617                         #values are equal, carry on.
618                         pass
619                         
620                 #### COMPARE category and prev_category
621                 # if not_equal
622                 #       then assign a stage based on relative priorities
623                 # else equal
624                 #       then check category for stats.
625                 diag_record = self.diagRecordByCategory(node_record)
626                 if diag_record == None:
627                         return None
628
629                 #### found_RT_ticket
630                 # TODO: need to record time found, and maybe add a stage for acting on it...
631                 if 'found_rt_ticket' in diag_record and \
632                         diag_record['found_rt_ticket'] is not None:
633                         if diag_record['stage'] is not 'improvement':
634                                 diag_record['stage'] = 'ticket_waitforever'
635                                 
636                 current_time = time.time()
637                 # take off four days, for the delay that database caused.
638                 # TODO: generalize delays at PLC, and prevent enforcement when there
639                 #               have been no emails.
640                 # NOTE: 7*SPERDAY exists to offset the 'bad week'
641                 #delta = current_time - diag_record['time'] - 7*SPERDAY
642                 delta = current_time - diag_record['time']
643
644                 message = diag_record['message']
645                 act_record = {}
646                 act_record.update(diag_record)
647
648                 #### DIAGNOSE STAGES 
649                 if   'findbad' in diag_record['stage']:
650                         # The node is bad, and there's no previous record of it.
651                         act_record['email'] = TECH
652                         act_record['action'] = ['noop']
653                         act_record['message'] = message[0]
654                         act_record['stage'] = 'stage_actinoneweek'
655
656                 elif 'nmreset' in diag_record['stage']:
657                         act_record['email']  = ADMIN 
658                         act_record['action'] = ['reset_nodemanager']
659                         act_record['message'] = message[0]
660                         act_record['stage']  = 'nmreset'
661                         return None
662                         
663                 elif 'improvement' in diag_record['stage']:
664                         # - backoff previous squeeze actions (slice suspend, nocreate)
665                         # TODO: add a backoff_squeeze section... Needs to runthrough
666                         act_record['action'] = ['close_rt']
667                         act_record['message'] = message[0]
668                         act_record['stage'] = 'monitor-end-record'
669
670                 elif 'actinoneweek' in diag_record['stage']:
671                         if delta >= 7 * SPERDAY: 
672                                 act_record['email'] = TECH | PI
673                                 act_record['stage'] = 'stage_actintwoweeks'
674                                 act_record['message'] = message[1]
675                                 act_record['action'] = ['nocreate' ]
676                                 act_record['time'] = current_time               # reset clock for waitforever
677                         elif delta >= 3* SPERDAY and not 'second-mail-at-oneweek' in act_record:
678                                 act_record['email'] = TECH 
679                                 act_record['message'] = message[0]
680                                 act_record['action'] = ['sendmailagain-waitforoneweekaction' ]
681                                 act_record['second-mail-at-oneweek'] = True
682                         else:
683                                 act_record['message'] = None
684                                 act_record['action'] = ['waitforoneweekaction' ]
685                                 return None                     # don't send if there's no action
686
687                 elif 'actintwoweeks' in diag_record['stage']:
688                         if delta >= 7 * SPERDAY:
689                                 act_record['email'] = TECH | PI | USER
690                                 act_record['stage'] = 'stage_waitforever'
691                                 act_record['message'] = message[2]
692                                 act_record['action'] = ['suspendslices']
693                                 act_record['time'] = current_time               # reset clock for waitforever
694                         elif delta >= 3* SPERDAY and not 'second-mail-at-twoweeks' in act_record:
695                                 act_record['email'] = TECH | PI
696                                 act_record['message'] = message[1]
697                                 act_record['action'] = ['sendmailagain-waitfortwoweeksaction' ]
698                                 act_record['second-mail-at-twoweeks'] = True
699                         else:
700                                 act_record['message'] = None
701                                 act_record['action'] = ['waitfortwoweeksaction']
702                                 return None                     # don't send if there's no action
703
704                 elif 'ticket_waitforever' in diag_record['stage']:
705                         act_record['email'] = TECH
706                         if 'first-found' not in act_record:
707                                 act_record['first-found'] = True
708                                 act_record['log'] += " firstfound"
709                                 act_record['action'] = ['ticket_waitforever']
710                                 act_record['message'] = None
711                                 act_record['time'] = current_time
712                         else:
713                                 if delta >= 7*SPERDAY:
714                                         act_record['action'] = ['ticket_waitforever']
715                                         act_record['message'] = None
716                                         act_record['time'] = current_time               # reset clock
717                                 else:
718                                         act_record['action'] = ['ticket_waitforever']
719                                         act_record['message'] = None
720                                         return None
721
722                 elif 'waitforever' in diag_record['stage']:
723                         # more than 3 days since last action
724                         # TODO: send only on weekdays.
725                         # NOTE: expects that 'time' has been reset before entering waitforever stage
726                         if delta >= 3*SPERDAY:
727                                 act_record['action'] = ['email-againwaitforever']
728                                 act_record['message'] = message[2]
729                                 act_record['time'] = current_time               # reset clock
730                         else:
731                                 act_record['action'] = ['waitforever']
732                                 act_record['message'] = None
733                                 return None                     # don't send if there's no action
734
735                 else:
736                         # There is no action to be taken, possibly b/c the stage has
737                         # already been performed, but diagnose picked it up again.
738                         # two cases, 
739                         #       1. stage is unknown, or 
740                         #       2. delta is not big enough to bump it to the next stage.
741                         # TODO: figure out which. for now assume 2.
742                         print "UNKNOWN!?!? %s" % nodename
743                         act_record['action'] = ['unknown']
744                         act_record['message'] = message[0]
745                         print "Exiting..."
746                         sys.exit(1)
747
748                 print "%s" % act_record['log'],
749                 print "%15s" % act_record['action']
750                 return act_record
751
752         def getMaxSlices(self, loginbase):
753                 # if sickdb has a loginbase, then it will have at least one node.
754                 site_stats = None
755
756                 for nodename in self.diagnose_in[loginbase].keys():
757                         if nodename in self.findbad['nodes']:
758                                 site_stats = self.findbad['nodes'][nodename]['values']['plcsite']
759                                 break
760
761                 if site_stats == None:
762                         raise Exception, "loginbase with no nodes in findbad"
763                 else:
764                         return site_stats['max_slices']
765
766         def getNumNodes(self, loginbase):
767                 # if sickdb has a loginbase, then it will have at least one node.
768                 site_stats = None
769
770                 for nodename in self.diagnose_in[loginbase].keys():
771                         if nodename in self.findbad['nodes']:
772                                 site_stats = self.findbad['nodes'][nodename]['values']['plcsite']
773                                 break
774
775                 if site_stats == None:
776                         raise Exception, "loginbase with no nodes in findbad"
777                 else:
778                         return site_stats['num_nodes']
779
780         """
781         Returns number of up nodes as the total number *NOT* in act_all with a
782         stage other than 'steady-state' .
783         """
784         def getUpAtSite(self, loginbase, d_diag_site):
785                 # TODO: THIS DOESN"T WORK!!! it misses all the 'debug' state nodes
786                 #               that aren't recorded yet.
787
788                 numnodes = self.getNumNodes(loginbase)
789                 # NOTE: assume nodes we have no record of are ok. (too conservative)
790                 # TODO: make the 'up' value more representative
791                 up = numnodes
792                 for nodename in d_diag_site[loginbase]['nodes'].keys():
793
794                         rec = d_diag_site[loginbase]['nodes'][nodename]
795                         if rec['stage'] != 'monitor-end-record':
796                                 up -= 1
797                         else:
798                                 pass # the node is assumed to be up.
799
800                 #if up != numnodes:
801                 #       print "ERROR: %s total nodes up and down != %d" % (loginbase, numnodes)
802
803                 return up
804
805
806 class SiteAction:
807         def __init__(self, parameter_names=['hostname', 'ticket_id']):
808                 self.parameter_names = parameter_names
809         def checkParam(self, args):
810                 for param in self.parameter_names:
811                         if param not in args:
812                                 raise Exception("Parameter %s not provided in args"%param)
813         def run(self, args):
814                 self.checkParam(args)
815                 return self._run(args)
816         def _run(self, args):
817                 pass
818
819 class SuspendAction(SiteAction):
820         def _run(self, args):
821                 return plc.suspendSlices(args['hostname'])
822
823 class RemoveSliceCreation(SiteAction):
824         def _run(self, args):
825                 return plc.removeSliceCreation(args['hostname'])
826
827 class BackoffActions(SiteAction):
828         def _run(self, args):
829                 plc.enableSlices(args['hostname'])
830                 plc.enableSliceCreation(args['hostname'])
831                 return True
832
833 # TODO: create class for each action below, 
834 #               allow for lists of actions to be performed...
835
836 def close_rt_backoff(args):
837         if 'ticket_id' in args and (args['ticket_id'] != "" and args['ticket_id'] != None):
838                 mailer.closeTicketViaRT(args['ticket_id'], 
839                                                                 "Ticket CLOSED automatically by SiteAssist.")
840                 plc.enableSlices(args['hostname'])
841                 plc.enableSliceCreation(args['hostname'])
842         return
843
844 def reset_nodemanager(args):
845         os.system("ssh root@%s /sbin/service nm restart" % nodename)
846         return
847
848 class Action(Thread):
849         def __init__(self, l_action):
850                 self.l_action = l_action
851
852                 # the hostname to loginbase mapping
853                 self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")
854
855                 # Actions to take.
856                 self.diagnose_db = soltesz.if_cached_else(1, "diagnose_out", lambda : {})
857                 # Actions taken.
858                 self.act_all   = soltesz.if_cached_else(1, "act_all", lambda : {})
859
860                 # A dict of actions to specific functions. PICKLE doesnt' like lambdas.
861                 self.actions = {}
862                 self.actions['suspendslices'] = lambda args: plc.suspendSlices(args['hostname'])
863                 self.actions['nocreate'] = lambda args: plc.removeSliceCreation(args['hostname'])
864                 self.actions['close_rt'] = lambda args: close_rt_backoff(args)
865                 self.actions['rins'] = lambda args: plc.nodeBootState(args['hostname'], "rins") 
866                 self.actions['noop'] = lambda args: args
867                 self.actions['reset_nodemanager'] = lambda args: args # reset_nodemanager(args)
868
869                 self.actions['ticket_waitforever'] = lambda args: args
870                 self.actions['waitforever'] = lambda args: args
871                 self.actions['unknown'] = lambda args: args
872                 self.actions['waitforoneweekaction'] = lambda args: args
873                 self.actions['waitfortwoweeksaction'] = lambda args: args
874                 self.actions['sendmailagain-waitforoneweekaction'] = lambda args: args
875                 self.actions['sendmailagain-waitfortwoweeksaction'] = lambda args: args
876                 self.actions['email-againwaitforever'] = lambda args: args
877                 self.actions['email-againticket_waitforever'] = lambda args: args
878                                 
879
880                 self.sickdb = {}
881                 Thread.__init__(self)
882
883         def run(self):
884                 self.accumSites()
885                 print "Accumulated %d sick sites" % len(self.sickdb.keys())
886                 logger.debug("Accumulated %d sick sites" % len(self.sickdb.keys()))
887
888                 try:
889                         stats = self.analyseSites()
890                 except Exception, err:
891                         print "----------------"
892                         import traceback
893                         print traceback.print_exc()
894                         print err
895                         if config.policysavedb:
896                                 print "Saving Databases... act_all"
897                                 soltesz.dbDump("act_all", self.act_all)
898                         sys.exit(1)
899
900                 print_stats("sites_observed", stats)
901                 print_stats("sites_diagnosed", stats)
902                 print_stats("nodes_diagnosed", stats)
903                 print_stats("sites_emailed", stats)
904                 print_stats("nodes_actedon", stats)
905                 print string.join(stats['allsites'], ",")
906
907                 if config.policysavedb:
908                         print "Saving Databases... act_all"
909                         #soltesz.dbDump("policy.eventlog", self.eventlog)
910                         # TODO: remove 'diagnose_out', 
911                         #       or at least the entries that were acted on.
912                         soltesz.dbDump("act_all", self.act_all)
913
914         def accumSites(self):
915                 """
916                 Take all nodes, from l_action, look them up in the diagnose_db database, 
917                 and insert them into sickdb[] as:
918
919                 This way only the given l_action nodes will be acted on regardless
920                 of how many from diagnose_db are available.
921
922                         sickdb[loginbase][nodename] = diag_record
923                 """
924                 # TODO: what if l_action == None ?
925                 for nodename in self.l_action:
926
927                         loginbase = self.plcdb_hn2lb[nodename]
928
929                         if loginbase in self.diagnose_db and \
930                                 nodename in self.diagnose_db[loginbase]['nodes']:
931
932                                 diag_record = self.diagnose_db[loginbase]['nodes'][nodename]
933
934                                 if loginbase not in self.sickdb:
935                                         self.sickdb[loginbase] = {'nodes' : {}}
936
937                                 # NOTE: don't copy all node records, since not all will be in l_action
938                                 self.sickdb[loginbase]['nodes'][nodename] = diag_record
939                                 # NOTE: but, we want to get the loginbase config settings, 
940                                 #               this is the easiest way.
941                                 self.sickdb[loginbase]['config'] = self.diagnose_db[loginbase]['config']
942                         #else:
943                                 #print "%s not in diagnose_db!!" % loginbase
944                 return
945
946         def __emailSite(self, loginbase, roles, message, args):
947                 """
948                 loginbase is the unique site abbreviation, prepended to slice names.
949                 roles contains TECH, PI, USER roles, and derive email aliases.
950                 record contains {'message': [<subj>,<body>], 'args': {...}} 
951                 """
952                 ticket_id = 0
953                 args.update({'loginbase':loginbase})
954
955                 if not config.mail and not config.debug and config.bcc:
956                         roles = ADMIN
957                 if config.mail and config.debug:
958                         roles = ADMIN
959
960                 # build targets
961                 contacts = []
962                 if ADMIN & roles:
963                         contacts += [config.email]
964                 if TECH & roles:
965                         contacts += [TECHEMAIL % loginbase]
966                 if PI & roles:
967                         contacts += [PIEMAIL % loginbase]
968                 if USER & roles:
969                         slices = plc.slices(loginbase)
970                         if len(slices) >= 1:
971                                 for slice in slices:
972                                         contacts += [SLICEMAIL % slice]
973                                 print "SLIC: %20s : %d slices" % (loginbase, len(slices))
974                         else:
975                                 print "SLIC: %20s : 0 slices" % loginbase
976
977                 try:
978                         subject = message[0] % args
979                         body = message[1] % args
980                         if ADMIN & roles:
981                                 # send only to admin
982                                 if 'ticket_id' in args:
983                                         subj = "Re: [PL #%s] %s" % (args['ticket_id'], subject)
984                                 else:
985                                         subj = "Re: [PL noticket] %s" % subject
986                                 mailer.email(subj, body, contacts)
987                                 ticket_id = args['ticket_id']
988                         else:
989                                 ticket_id = mailer.emailViaRT(subject, body, contacts, args['ticket_id'])
990                 except Exception, err:
991                         print "exception on message:"
992                         import traceback
993                         print traceback.print_exc()
994                         print message
995
996                 return ticket_id
997
998
999         def _format_diaginfo(self, diag_node):
1000                 info = diag_node['info']
1001                 if diag_node['stage'] == 'monitor-end-record':
1002                         hlist = "    %s went from '%s' to '%s'\n" % (info[0], info[1], info[2]) 
1003                 else:
1004                         hlist = "    %s %s - %s\n" % (info[0], info[2], info[1]) #(node,ver,daysdn)
1005                 return hlist
1006
1007
1008         def get_email_args(self, act_recordlist):
1009
1010                 email_args = {}
1011                 email_args['hostname_list'] = ""
1012
1013                 for act_record in act_recordlist:
1014                         email_args['hostname_list'] += act_record['msg_format']
1015                         email_args['hostname'] = act_record['nodename']
1016                         if 'ticket_id' in act_record:
1017                                 email_args['ticket_id'] = act_record['ticket_id']
1018
1019                 return email_args
1020
1021         def get_unique_issues(self, act_recordlist):
1022                 # NOTE: only send one email per site, per problem...
1023                 unique_issues = {}
1024                 for act_record in act_recordlist:
1025                         act_key = act_record['action'][0]
1026                         if act_key not in unique_issues:
1027                                 unique_issues[act_key] = []
1028                                 
1029                         unique_issues[act_key] += [act_record]
1030                         
1031                 return unique_issues
1032                         
1033
1034         def __actOnSite(self, loginbase, site_record):
1035                 i_nodes_actedon = 0
1036                 i_nodes_emailed = 0
1037
1038                 act_recordlist = []
1039
1040                 for nodename in site_record['nodes'].keys():
1041                         diag_record = site_record['nodes'][nodename]
1042                         act_record  = self.__actOnNode(diag_record)
1043                         #print "nodename: %s %s" % (nodename, act_record)
1044                         act_recordlist += [act_record]
1045
1046                 unique_issues = self.get_unique_issues(act_recordlist)
1047
1048                 for issue in unique_issues.keys():
1049                         print "\tworking on issue: %s" % issue
1050                         issue_record_list = unique_issues[issue]
1051                         email_args = self.get_email_args(issue_record_list)
1052                         
1053                         act_record = issue_record_list[0]
1054                         # send message before squeezing
1055                         print "\t\tconfig.email: %s and %s" % (act_record['message'] != None, 
1056                                                                                                 site_record['config']['email'])
1057                         if act_record['message'] != None and site_record['config']['email']:
1058                                 ticket_id = self.__emailSite(loginbase, act_record['email'], 
1059                                                                                          act_record['message'], email_args)
1060
1061                                 # Add ticket_id to ALL nodenames
1062                                 for act_record in issue_record_list:
1063                                         nodename = act_record['nodename']
1064                                         # update node record with RT ticket_id
1065                                         if nodename in self.act_all:
1066                                                 self.act_all[nodename][0]['ticket_id'] = "%s" % ticket_id
1067                                         if config.mail: i_nodes_emailed += 1
1068
1069                         print "\t\tconfig.squeeze: %s and %s" % (config.squeeze,
1070                                                                                                         site_record['config']['squeeze'])
1071                         if config.squeeze and site_record['config']['squeeze']:
1072                                 for act_key in act_record['action']:
1073                                         self.actions[act_key](email_args)
1074                                 i_nodes_actedon += 1
1075                 
1076                 if config.policysavedb:
1077                         print "Saving Databases... act_all, diagnose_out"
1078                         soltesz.dbDump("act_all", self.act_all)
1079                         # remove site record from diagnose_out, it's in act_all as done.
1080                         del self.diagnose_db[loginbase]
1081                         soltesz.dbDump("diagnose_out", self.diagnose_db)
1082
1083                 print "sleeping for 1 sec"
1084                 time.sleep(1)
1085                 #print "Hit enter to continue..."
1086                 #sys.stdout.flush()
1087                 #line = sys.stdin.readline()
1088
1089                 return (i_nodes_actedon, i_nodes_emailed)
1090
1091         def __actOnNode(self, diag_record):
1092                 nodename = diag_record['nodename']
1093                 message = diag_record['message']
1094
1095                 act_record = {}
1096                 act_record.update(diag_record)
1097                 act_record['nodename'] = nodename
1098                 act_record['msg_format'] = self._format_diaginfo(diag_record)
1099
1100                 print "%s" % act_record['log'],
1101                 print "%15s" % act_record['action']
1102
1103                 if act_record['stage'] is not 'monitor-end-record' and \
1104                    act_record['stage'] is not 'nmreset':
1105                         if nodename not in self.act_all: 
1106                                 self.act_all[nodename] = []
1107
1108                         self.act_all[nodename].insert(0,act_record)
1109                 else:
1110                         print "Not recording %s in act_all" % nodename
1111
1112                 return act_record
1113
1114         def analyseSites(self):
1115                 i_sites_observed = 0
1116                 i_sites_diagnosed = 0
1117                 i_nodes_diagnosed = 0
1118                 i_nodes_actedon = 0
1119                 i_sites_emailed = 0
1120                 l_allsites = []
1121
1122                 sorted_sites = self.sickdb.keys()
1123                 sorted_sites.sort()
1124                 for loginbase in sorted_sites:
1125                         site_record = self.sickdb[loginbase]
1126                         print "sites: %s" % loginbase
1127                         
1128                         i_nodes_diagnosed += len(site_record.keys())
1129                         i_sites_diagnosed += 1
1130
1131                         (na,ne) = self.__actOnSite(loginbase, site_record)
1132
1133                         i_sites_observed += 1
1134                         i_nodes_actedon += na
1135                         i_sites_emailed += ne
1136
1137                         l_allsites += [loginbase]
1138
1139                 return {'sites_observed': i_sites_observed, 
1140                                 'sites_diagnosed': i_sites_diagnosed, 
1141                                 'nodes_diagnosed': i_nodes_diagnosed, 
1142                                 'sites_emailed': i_sites_emailed, 
1143                                 'nodes_actedon': i_nodes_actedon, 
1144                                 'allsites':l_allsites}
1145
1146         def print_stats(self, key, stats):
1147                 print "%20s : %d" % (key, stats[key])
1148
1149
1150
1151         #"""
1152         #Prints, logs, and emails status of up nodes, down nodes, and buckets.
1153         #"""
1154         #def status(self):
1155         #       sub = "Monitor Summary"
1156         #       msg = "\nThe following nodes were acted upon:  \n\n"
1157         #       for (node, (type, date)) in self.emailed.items():
1158         #               # Print only things acted on today.
1159         #               if (time.gmtime(time.time())[2] == time.gmtime(date)[2]):
1160         #                       msg +="%s\t(%s)\t%s\n" %(node, type, time.ctime(date))
1161         #       msg +="\n\nThe following sites have been 'squeezed':\n\n"
1162         #       for (loginbase, (date, type)) in self.squeezed.items():
1163         #               # Print only things acted on today.
1164         #               if (time.gmtime(time.time())[2] == time.gmtime(date)[2]):
1165         #                       msg +="%s\t(%s)\t%s\n" %(loginbase, type, time.ctime(date))
1166         #       mailer.email(sub, msg, [SUMTO])
1167         #       logger.info(msg)
1168         #       return 
1169
1170         #"""
1171         #Store/Load state of emails.  When, where, what.
1172         #"""
1173         #def emailedStore(self, action):
1174         #       try:
1175         #               if action == "LOAD":
1176         #                       f = open(DAT, "r+")
1177         #                       logger.info("POLICY:  Found and reading " + DAT)
1178         #                       self.emailed.update(pickle.load(f))
1179         #               if action == "WRITE":
1180         #                       f = open(DAT, "w")
1181         #                       #logger.debug("Writing " + DAT)
1182         #                       pickle.dump(self.emailed, f)
1183         #               f.close()
1184         #       except Exception, err:
1185         #               logger.info("POLICY:  Problem with DAT, %s" %err)
1186
1187
1188 #class Policy(Thread):
1189
1190 def main():
1191         print "policy.py is a module, not a script for running directly."
1192
1193 if __name__ == '__main__':
1194         import os
1195         import plc
1196         try:
1197                 main()
1198         except KeyboardInterrupt:
1199                 print "Killed.  Exitting."
1200                 logger.info('Monitor Killed')
1201                 os._exit(0)