Policy.py includes updates to better handle PCUs
[monitor.git] / policy.py
1 #
2 # Copyright (c) 2004  The Trustees of Princeton University (Trustees).
3 #
4 # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
5 #
6 # $Id: policy.py,v 1.17 2007/08/29 17:26:50 soltesz Exp $
7 #
8 # Policy Engine.
9
10 #from monitor import *
11 from threading import *
12 import time
13 import logging
14 import mailer
15 import emailTxt
16 import pickle
17 import Queue
18 import plc
19 import sys
20 import os
21 import reboot
22 import soltesz
23 import string
24 from www.printbadnodes import cmpCategoryVal
25 from config import config
# Trace message written to stdout as a side effect of importing this module.
print "policy"
# Call the `config` object imported from the `config` module and rebind the
# module-level name `config` to whatever that call returns.
config = config()
28
# Path of the monitor data file, relative to the working directory.
DAT = "./monitor.dat"

# Shared logger; other code in this module logs through it.
logger = logging.getLogger("monitor")

# Seconds between policy enforcement passes (7200 s = 2 hours).
POLSLEEP = 7200

# Address that receives the summary mail.
SUMTO = "soltesz@cs.princeton.edu"
# Address templates; each "%s" is filled in by the caller
# (presumably a site loginbase or a slice name -- confirm against callers).
TECHEMAIL = "tech-%s@sites.planet-lab.org"
PIEMAIL = "pi-%s@sites.planet-lab.org"
SLICEMAIL = "%s@slices.planet-lab.org"
PLCEMAIL = "support@planet-lab.org"

# Time units, expressed in seconds.
SPERMIN = 60
SPERHOUR = 60 * SPERMIN
SPERDAY = 24 * SPERHOUR

# Thresholds.  NOTE: the values are stored in SECONDS; each one is a number
# of days multiplied by SPERDAY.
PITHRESH = 7 * SPERDAY
SLICETHRESH = 7 * SPERDAY
# Time to wait before attempting "rins" again
# (presumably a node reinstall -- confirm).
RINSTHRESH = 5 * SPERDAY

# Time after which a node is called dead.
DEADTHRESH = 30 * SPERDAY
# Minimum number of nodes up before squeezing
MINUP = 2

# Contact classes.  The values are distinct powers of two
# (presumably so they can be OR-ed together as flags -- confirm).
TECH = 1
PI = 2
USER = 4
ADMIN = 8
61
62 # IF:
63 #  no SSH, down.
64 #  bad disk, down
65 #  DNS, kinda down (sick)
66 #  clock, kinda down (sick)
67 #  Full disk, going to be down
68
69 # Actions:
70 #  Email
71 #  suspend slice creation
72 #  kill slices
def array_to_priority_map(array):
        """Map each entry of `array` to its position in the array.

        The position acts as the entry's priority: the first entry gets 0,
        the next 1, and so on.  If an entry occurs more than once, the
        position of its LAST occurrence wins.  An empty array yields an
        empty dict.

        The original note says the result is meant for a cmpMap() function;
        that function is not defined in this part of the file.
        """
        # Named `priorities` rather than `map` so the builtin is not shadowed.
        priorities = {}
        for position, entry in enumerate(array):
                priorities[entry] = position
        return priorities
83
def getdebug():
        """Return the `debug` attribute of the module-level `config` object."""
        return config.debug
86
def print_stats(key, stats):
        """Print one statistic as "<key right-aligned in 20 columns> : <integer>".

        `stats` is a mapping; when `key` is not in it, nothing is printed.
        The value is formatted with %d.
        """
        if key not in stats:
                return
        print("%20s : %d" % (key, stats[key]))
89
class Merge(Thread):
        """
        Thread that combines the current findbad observations with the
        previously recorded actions (act_all) and forwards one merged record
        per node to the `toRT` queue.

        l_merge -- collection of hostnames to consider; findbad nodes that
                   are not in it are ignored.
        toRT    -- queue-like object with a put() method.  It receives the
                   merged node records followed by a single None, which
                   tells the reader to stop.
        """
        def __init__(self, l_merge, toRT):
                self.toRT = toRT
                self.merge_list = l_merge
                # the hostname to loginbase mapping
                self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")

                # Previous actions taken on nodes.
                self.act_all = soltesz.if_cached_else(1, "act_all", lambda : {})
                self.findbad = soltesz.if_cached_else(1, "findbad", lambda : {})

                # A second load of the same "act_all" cache.  Entries are
                # removed from it as findbad reports on them, so what remains
                # are nodes with recorded actions that findbad did not report.
                self.cache_all = soltesz.if_cached_else(1, "act_all", lambda : {})
                self.sickdb = {}
                self.mergedb = {}
                Thread.__init__(self)

        def run(self):
                # populate sickdb
                self.accumSickSites()
                # read data from findbad and act_all
                self.mergeActionsAndBadDB()
                # pass node_records to RT
                self.sendToRT()

        def accumSickSites(self):
                """
                Take every node reported by findbad that is also listed in
                merge_list, build a fresh record from its findbad values, and
                store it as:

                        sickdb[loginbase][nodename] = fb_record

                Nodes whose findbad values carry no 'category' are dumped to
                stdout and skipped.  A findbad cache without a 'nodes' entry
                is treated as reporting no nodes.
                """
                # Build the lookup once; scanning a list for every node is O(n) each.
                wanted = set(self.merge_list)
                # look at all problems reported by findbad
                d_nodes = self.findbad.get('nodes', {})
                count = 0
                for nodename in d_nodes.keys():
                        if nodename not in wanted:
                                continue                # skip this node, since it's not wanted

                        loginbase = self.plcdb_hn2lb[nodename]
                        values = d_nodes[nodename]['values']

                        try:
                                category = values['category']
                        except (KeyError, TypeError):
                                # Incomplete findbad entry: show it and move on.
                                print(values)
                                print(nodename)
                                print(d_nodes[nodename])
                                continue

                        # One timestamp so 'time' and 'date_created' agree exactly.
                        now = time.time()
                        fb_record = {}
                        fb_record['nodename'] = nodename
                        fb_record['category'] = category
                        fb_record['state'] = values['state']
                        fb_record['comonstats'] = values['comonstats']
                        fb_record['plcnode'] = values['plcnode']
                        fb_record['kernel'] = self.getKernel(values['kernel'])
                        fb_record['stage'] = "findbad"
                        fb_record['message'] = None
                        fb_record['bootcd'] = values['bootcd']
                        fb_record['args'] = None
                        fb_record['info'] = None
                        fb_record['time'] = now
                        fb_record['date_created'] = now

                        self.sickdb.setdefault(loginbase, {})[nodename] = fb_record
                        count += 1

                print("Found %d nodes" % count)

        def getKernel(self, unamestr):
                """
                Return the third whitespace-separated field of `unamestr`, or
                "" when there are fewer than three fields or `unamestr` is
                empty or None.
                """
                if not unamestr:
                        return ""
                fields = unamestr.split()
                if len(fields) > 2:
                        return fields[2]
                return ""

        def mergeActionsAndBadDB(self):
                """
                - Look at the sick node_records as reported in findbad,
                - Then look at the node_records in act_all.

                There are four cases:
                1) Problem in findbad, no problem in act_all
                        this ok, b/c it just means it's a new problem
                2) Problem in findbad, problem in act_all
                        -Did the problem get better or worse?
                                -If Same, or Worse, then continue looking for open tickets.
                                -If Better, or No problem, then "back-off" penalties.
                                        This judgement may need to wait until 'Diagnose()'

                3) No problem in findbad, problem in act_all
                        The the node is operational again according to Findbad()

                4) No problem in findbad, no problem in act_all
                        There won't be a record in either db, so there's no code.

                Results are stored as mergedb[loginbase][nodename] = record.
                Only cases 1) and 2) produce records here.
                """
                # Fields of the merged record that always come from findbad.
                fresh_fields = ('comonstats', 'category', 'state',
                                'kernel', 'bootcd', 'plcnode')

                # look at all problems reported by findbad
                for loginbase in sorted(self.sickdb.keys()):
                        d_fb_nodes = self.sickdb[loginbase]
                        for nodename in sorted(d_fb_nodes.keys()):
                                x = d_fb_nodes[nodename]
                                site_merge = self.mergedb.setdefault(loginbase, {})

                                # We must compare findbad state with act_all state
                                if nodename not in self.act_all:
                                        # 1) ok, b/c it's a new problem. set ticket_id to null
                                        merged = dict(x)
                                        merged['ticket_id'] = ""
                                        merged['prev_category'] = "NORECORD"
                                        site_merge[nodename] = merged
                                        continue

                                history = self.act_all[nodename]
                                if len(history) == 0:
                                        print("len(act_all[%s]) == 0, skipping %s %s" % (nodename, loginbase, nodename))
                                        continue

                                # 2) only the first act_all record is consulted.
                                y = history[0]

                                # NOTE: translation of legacy 'bucket'-style records
                                #       and the findbad/monitor category comparison
                                #       used to live here and were disabled; any
                                #       mismatch is left for diagnosis to resolve.
                                # NOTE(review): this writes into the act_all record
                                #       itself, and requires it to have a 'category'.
                                y['prev_category'] = y['category']

                                merged = dict(y)
                                for field in fresh_fields:
                                        merged[field] = x[field]
                                site_merge[nodename] = merged

                                # delete the entry from cache_all to keep it out of case 3).
                                # pop() tolerates a cache_all that lacks this node.
                                self.cache_all.pop(nodename, None)

                # 3) nodes that remain in cache_all were not identified by findbad.
                #        Do we keep them or not?
                #   NOTE: i think that since the categories are performed before this
                #               step now, and by a monitor-controlled agent.
                # TODO: the former handling of these leftovers did not work
                #       correctly and was disabled.  Do we need this?

                print("len of cache_all: %d" % len(self.cache_all.keys()))
                return

        def sendToRT(self):
                """
                Put every merged record on the toRT queue, sites in sorted
                order, then put None to signal the end of the stream.
                """
                # look at all problems reported by merge
                for loginbase in sorted(self.mergedb.keys()):
                        d_merge_nodes = self.mergedb[loginbase]
                        for nodename in d_merge_nodes.keys():
                                self.toRT.put(d_merge_nodes[nodename])

                # send signal to stop reading
                self.toRT.put(None)
                return
305
306 class Diagnose(Thread):
307         def __init__(self, fromRT):
308                 self.fromRT = fromRT
309                 self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")
310                 self.findbad = soltesz.if_cached_else(1, "findbad", lambda : {})
311
312                 self.diagnose_in = {}
313                 self.diagnose_out = {}
314                 Thread.__init__(self)
315
316
317         def run(self):
318                 self.accumSickSites()
319
320                 print "Accumulated %d sick sites" % len(self.diagnose_in.keys())
321                 logger.debug("Accumulated %d sick sites" % len(self.diagnose_in.keys()))
322
323                 try:
324                         stats = self.diagnoseAll()
325                 except Exception, err:
326                         print "----------------"
327                         import traceback
328                         print traceback.print_exc()
329                         print err
330                         #if config.policysavedb:
331                         sys.exit(1)
332
333                 print_stats("sites_observed", stats)
334                 print_stats("sites_diagnosed", stats)
335                 print_stats("nodes_diagnosed", stats)
336
337                 if config.policysavedb:
338                         print "Saving Databases... diagnose_out"
339                         soltesz.dbDump("diagnose_out", self.diagnose_out)
340
341         def accumSickSites(self):
342                 """
343                 Take all nodes, from l_diagnose, look them up in the diagnose_out database, 
344                 and insert them into diagnose_in[] as:
345
346                         diagnose_in[loginbase] = [diag_node1, diag_node2, ...]
347                 """
348                 while 1:
349                         node_record = self.fromRT.get(block = True)
350                         if node_record == None:
351                                 break;
352
353                         nodename = node_record['nodename']
354                         loginbase = self.plcdb_hn2lb[nodename]
355
356                         if loginbase not in self.diagnose_in:
357                                 self.diagnose_in[loginbase] = {}
358
359                         self.diagnose_in[loginbase][nodename] = node_record
360
361                 return
362
363         def diagnoseAll(self):
364                 i_sites_observed = 0
365                 i_sites_diagnosed = 0
366                 i_nodes_diagnosed = 0
367                 i_nodes_actedon = 0
368                 i_sites_emailed = 0
369                 l_allsites = []
370
371                 sorted_sites = self.diagnose_in.keys()
372                 sorted_sites.sort()
373                 self.diagnose_out= {}
374                 for loginbase in sorted_sites:
375                         l_allsites += [loginbase]
376
377                         d_diag_nodes = self.diagnose_in[loginbase]
378                         d_act_records = self.__diagnoseSite(loginbase, d_diag_nodes)
379                         # store records in diagnose_out, for saving later.
380                         self.diagnose_out.update(d_act_records)
381                         
382                         if len(d_act_records[loginbase]['nodes'].keys()) > 0:
383                                 i_nodes_diagnosed += (len(d_act_records[loginbase]['nodes'].keys()))
384                                 i_sites_diagnosed += 1
385                         i_sites_observed += 1
386
387                 return {'sites_observed': i_sites_observed, 
388                                 'sites_diagnosed': i_sites_diagnosed, 
389                                 'nodes_diagnosed': i_nodes_diagnosed, 
390                                 'allsites':l_allsites}
391
392                 pass
393                 
394         def __getDaysDown(self, diag_record, nodename):
395                 daysdown = -1
396                 if diag_record['comonstats']['sshstatus'] != "null":
397                         daysdown = int(diag_record['comonstats']['sshstatus']) // (60*60*24)
398                 elif diag_record['comonstats']['lastcotop'] != "null":
399                         daysdown = int(diag_record['comonstats']['lastcotop']) // (60*60*24)
400                 else:
401                         now = time.time()
402                         last_contact = diag_record['plcnode']['last_contact']
403                         if last_contact == None:
404                                 # the node has never been up, so give it a break
405                                 daysdown = -1
406                         else:
407                                 diff = now - last_contact
408                                 daysdown = diff // (60*60*24)
409                 return daysdown
410
411         def __getStrDaysDown(self, diag_record, nodename):
412                 daysdown = self.__getDaysDown(diag_record, nodename)
413                 if daysdown > 0:
414                         return "(%d days down)"%daysdown
415                 else:
416                         return "Unknown number of days"
417
418         def __getCDVersion(self, diag_record, nodename):
419                 cdversion = ""
420                 #print "Getting kernel for: %s" % diag_record['nodename']
421                 cdversion = diag_record['kernel']
422                 return cdversion
423
424         def __diagnoseSite(self, loginbase, d_diag_nodes):
425                 """
426                 d_diag_nodes are diagnose_in entries.
427                 """
428                 d_diag_site = {loginbase : { 'config' : 
429                                                                                                 {'squeeze': False,
430                                                                                                  'email': False
431                                                                                                 }, 
432                                                                         'nodes': {}
433                                                                         }
434                                            }
435                 sorted_nodes = d_diag_nodes.keys()
436                 sorted_nodes.sort()
437                 for nodename in sorted_nodes:
438                         node_record = d_diag_nodes[nodename]
439                         diag_record = self.__diagnoseNode(loginbase, node_record)
440
441                         if diag_record != None:
442                                 d_diag_site[loginbase]['nodes'][nodename] = diag_record
443
444                                 # NOTE: improvement means, we need to act/squeeze and email.
445                                 #print "DIAG_RECORD", diag_record
446                                 if 'monitor-end-record' in diag_record['stage'] or \
447                                    'nmreset' in diag_record['stage']:
448                                 #       print "resetting loginbase!" 
449                                         d_diag_site[loginbase]['config']['squeeze'] = True
450                                         d_diag_site[loginbase]['config']['email'] = True
451                                 #else:
452                                 #       print "NO IMPROVEMENT!!!!"
453                         else:
454                                 pass # there is nothing to do for this node.
455
456                 # NOTE: these settings can be overridden by command line arguments,
457                 #       or the state of a record, i.e. if already in RT's Support Queue.
458                 nodes_up = self.getUpAtSite(loginbase, d_diag_site)
459                 if nodes_up < MINUP:
460                         d_diag_site[loginbase]['config']['squeeze'] = True
461
462                 max_slices = self.getMaxSlices(loginbase)
463                 num_nodes = self.getNumNodes(loginbase)
464                 # NOTE: when max_slices == 0, this is either a new site (the old way)
465                 #       or an old disabled site from previous monitor (before site['enabled'])
466                 if nodes_up < num_nodes and max_slices != 0:
467                         d_diag_site[loginbase]['config']['email'] = True
468
469                 if len(d_diag_site[loginbase]['nodes'].keys()) > 0:
470                         print "SITE: %20s : %d nodes up, at most" % (loginbase, nodes_up)
471
472                 return d_diag_site
473
474         def diagRecordByCategory(self, node_record):
475                 nodename = node_record['nodename']
476                 category = node_record['category']
477                 state    = node_record['state']
478                 loginbase = self.plcdb_hn2lb[nodename]
479                 diag_record = None
480
481                 if  "ERROR" in category:        # i.e. "DOWN"
482                         diag_record = {}
483                         diag_record.update(node_record)
484                         daysdown = self.__getDaysDown(diag_record, nodename) 
485                         if daysdown < 7:
486                                 format = "DIAG: %20s : %-40s Down only %s days  NOTHING DONE"
487                                 print format % (loginbase, nodename, daysdown)
488                                 return None
489
490                         s_daysdown = self.__getStrDaysDown(diag_record, nodename)
491                         diag_record['message'] = emailTxt.mailtxt.newdown
492                         diag_record['args'] = {'nodename': nodename}
493                         diag_record['info'] = (nodename, s_daysdown, "")
494
495                         if 'reboot_node_failed' in node_record:
496                                 # there was a previous attempt to use the PCU.
497                                 if node_record['reboot_node_failed'] == False:
498                                         # then the last attempt apparently, succeeded.
499                                         # But, the category is still 'ERROR'.  Therefore, the
500                                         # PCU-to-Node mapping is broken.
501                                         #print "Setting message for ERROR node to PCU2NodeMapping: %s" % nodename
502                                         diag_record['message'] = emailTxt.mailtxt.pcutonodemapping
503                                         diag_record['email_pcu'] = True
504
505                         if diag_record['ticket_id'] == "":
506                                 diag_record['log'] = "DOWN: %20s : %-40s == %20s %s" % \
507                                         (loginbase, nodename, diag_record['info'][1:], diag_record['found_rt_ticket'])
508                         else:
509                                 diag_record['log'] = "DOWN: %20s : %-40s == %20s %s" % \
510                                         (loginbase, nodename, diag_record['info'][1:], diag_record['ticket_id'])
511
512                 elif "OLDBOOTCD" in category:
513                         # V2 boot cds as determined by findbad
514                         s_daysdown = self.__getStrDaysDown(node_record, nodename)
515                         s_cdversion = self.__getCDVersion(node_record, nodename)
516                         diag_record = {}
517                         diag_record.update(node_record)
518                         #if "2.4" in diag_record['kernel'] or "v2" in diag_record['bootcd']:
519                         diag_record['message'] = emailTxt.mailtxt.newbootcd
520                         diag_record['args'] = {'nodename': nodename}
521                         diag_record['info'] = (nodename, s_daysdown, s_cdversion)
522                         if diag_record['ticket_id'] == "":
523                                 diag_record['log'] = "BTCD: %20s : %-40s == %20s %20s %s" % \
524                                                                         (loginbase, nodename, diag_record['kernel'], 
525                                                                          diag_record['bootcd'], diag_record['found_rt_ticket'])
526                         else:
527                                 diag_record['log'] = "BTCD: %20s : %-40s == %20s %20s %s" % \
528                                                                         (loginbase, nodename, diag_record['kernel'], 
529                                                                          diag_record['bootcd'], diag_record['ticket_id'])
530
531                 elif "PROD" in category:
532                         if "DEBUG" in state:
533                                 # Not sure what to do with these yet.  Probably need to
534                                 # reboot, and email.
535                                 print "DEBG: %20s : %-40s  NOTHING DONE" % (loginbase, nodename)
536                                 return None
537                         elif "BOOT" in state:
538                                 # no action needed.
539                                 # TODO: remove penalties, if any are applied.
540                                 now = time.time()
541                                 last_contact = node_record['plcnode']['last_contact']
542                                 if last_contact == None:
543                                         time_diff = 0
544                                 else:
545                                         time_diff = now - last_contact;
546
547                                 if 'improvement' in node_record['stage']:
548                                         # then we need to pass this on to 'action'
549                                         diag_record = {}
550                                         diag_record.update(node_record)
551                                         diag_record['message'] = emailTxt.mailtxt.newthankyou
552                                         diag_record['args'] = {'nodename': nodename}
553                                         diag_record['info'] = (nodename, node_record['prev_category'], 
554                                                                                                          node_record['category'])
555                                         if diag_record['ticket_id'] == "":
556                                                 diag_record['log'] = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
557                                                                         (loginbase, nodename, diag_record['stage'], 
558                                                                          state, category, diag_record['found_rt_ticket'])
559                                         else:
560                                                 diag_record['log'] = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
561                                                                         (loginbase, nodename, diag_record['stage'], 
562                                                                          state, category, diag_record['ticket_id'])
563                                         return diag_record
564                                 elif time_diff >= 6*SPERHOUR:
565                                         # heartbeat is older than 30 min.
566                                         # then reset NM.
567                                         #print "Possible NM problem!! %s - %s = %s" % (now, last_contact, time_diff)
568                                         diag_record = {}
569                                         diag_record.update(node_record)
570                                         diag_record['message'] = emailTxt.mailtxt.NMReset
571                                         diag_record['args'] = {'nodename': nodename}
572                                         diag_record['stage'] = "nmreset"
573                                         diag_record['info'] = (nodename, 
574                                                                                         node_record['prev_category'], 
575                                                                                         node_record['category'])
576                                         if diag_record['ticket_id'] == "":
577                                                 diag_record['log'] = "NM  : %20s : %-40s == %20s %20s %s %s" % \
578                                                                         (loginbase, nodename, diag_record['stage'], 
579                                                                          state, category, diag_record['found_rt_ticket'])
580                                         else:
581                                                 diag_record['log'] = "NM  : %20s : %-40s == %20s" % \
582                                                                         (loginbase, nodename, diag_record['stage'])
583
584                                         return diag_record
585                                 else:
586                                         return None
587                         else:
588                                 # unknown
589                                 pass
590                 elif "ALPHA"    in category:
591                         pass
592                 elif "clock_drift" in category:
593                         pass
594                 elif "dns"    in category:
595                         pass
596                 elif "filerw"    in category:
597                         pass
598                 else:
599                         print "Unknown category!!!! %s" % category
600                         sys.exit(1)
601
602                 return diag_record
603
604         def __diagnoseNode(self, loginbase, node_record):
605                 # TODO: change the format of the hostname in this 
606                 #               record to something more natural.
607                 nodename                = node_record['nodename']
608                 category                = node_record['category']
609                 prev_category   = node_record['prev_category']
610                 state                   = node_record['state']
611                 #if 'prev_category' in node_record:
612                 #       prev_category = node_record['prev_category']
613                 #else:
614                 #       prev_category = "ERROR"
615                 if node_record['prev_category'] != "NORECORD":
616                 
617                         val = cmpCategoryVal(category, prev_category)
618                         print "%s went from %s -> %s" % (nodename, prev_category, category)
619                         if val == 1:
620                                 # improved
621                                 if node_record['ticket_id'] == "" or node_record['ticket_id'] == None:
622                                         print "closing record with no ticket: ", node_record['nodename']
623                                         node_record['action'] = ['close_rt']
624                                         node_record['message'] = None
625                                         node_record['stage'] = 'monitor-end-record'
626                                         return node_record
627                                 else:
628                                         node_record['stage'] = 'improvement'
629
630                                 #if 'monitor-end-record' in node_record['stage']:
631                                 #       # just ignore it if it's already ended.
632                                 #       # otherwise, the status should be worse, and we won't get
633                                 #       # here.
634                                 #       print "monitor-end-record: ignoring ", node_record['nodename']
635                                 #       return None
636 #
637 #                                       #return None
638                         elif val == -1:
639                                 # current category is worse than previous, carry on
640                                 pass
641                         else:
642                                 #values are equal, carry on.
643                                 #print "why are we here?"
644                                 pass
645                         
646                 #### COMPARE category and prev_category
647                 # if not_equal
648                 #       then assign a stage based on relative priorities
649                 # else equal
650                 #       then check category for stats.
651                 diag_record = self.diagRecordByCategory(node_record)
652                 if diag_record == None:
653                         #print "diag_record == None"
654                         return None
655
656                 #### found_RT_ticket
657                 # TODO: need to record time found, and maybe add a stage for acting on it...
658                 if 'found_rt_ticket' in diag_record and \
659                         diag_record['found_rt_ticket'] is not None:
660                         if diag_record['stage'] is not 'improvement':
661                                 diag_record['stage'] = 'ticket_waitforever'
662                                 
663                 current_time = time.time()
664                 # take off four days, for the delay that database caused.
665                 # TODO: generalize delays at PLC, and prevent enforcement when there
666                 #               have been no emails.
667                 # NOTE: 7*SPERDAY exists to offset the 'bad week'
668                 #delta = current_time - diag_record['time'] - 7*SPERDAY
669                 delta = current_time - diag_record['time']
670
671                 message = diag_record['message']
672                 act_record = {}
673                 act_record.update(diag_record)
674
675                 #### DIAGNOSE STAGES 
676                 if   'findbad' in diag_record['stage']:
677                         # The node is bad, and there's no previous record of it.
678                         act_record['email'] = TECH
679                         act_record['action'] = ['noop']
680                         act_record['message'] = message[0]
681                         act_record['stage'] = 'stage_actinoneweek'
682
683                 elif 'nmreset' in diag_record['stage']:
684                         act_record['email']  = ADMIN 
685                         act_record['action'] = ['reset_nodemanager']
686                         act_record['message'] = message[0]
687                         act_record['stage']  = 'nmreset'
688                         return None
689
690                 elif 'reboot_node' in diag_record['stage']:
691                         act_record['email'] = TECH
692                         act_record['action'] = ['noop']
693                         act_record['message'] = message[0]
694                         act_record['stage'] = 'stage_actinoneweek'
695                         
696                 elif 'improvement' in diag_record['stage']:
697                         # - backoff previous squeeze actions (slice suspend, nocreate)
698                         # TODO: add a backoff_squeeze section... Needs to runthrough
699                         act_record['action'] = ['close_rt']
700                         act_record['message'] = message[0]
701                         act_record['stage'] = 'monitor-end-record'
702
703                 elif 'actinoneweek' in diag_record['stage']:
704                         if delta >= 7 * SPERDAY: 
705                                 act_record['email'] = TECH | PI
706                                 act_record['stage'] = 'stage_actintwoweeks'
707                                 act_record['message'] = message[1]
708                                 act_record['action'] = ['nocreate' ]
709                                 act_record['time'] = current_time               # reset clock for waitforever
710                         elif delta >= 3* SPERDAY and not 'second-mail-at-oneweek' in act_record:
711                                 act_record['email'] = TECH 
712                                 act_record['message'] = message[0]
713                                 act_record['action'] = ['sendmailagain-waitforoneweekaction' ]
714                                 act_record['second-mail-at-oneweek'] = True
715                         else:
716                                 act_record['message'] = None
717                                 act_record['action'] = ['waitforoneweekaction' ]
718                                 print "ignoring this record for: %s" % act_record['nodename']
719                                 return None                     # don't send if there's no action
720
721                 elif 'actintwoweeks' in diag_record['stage']:
722                         if delta >= 7 * SPERDAY:
723                                 act_record['email'] = TECH | PI | USER
724                                 act_record['stage'] = 'stage_waitforever'
725                                 act_record['message'] = message[2]
726                                 act_record['action'] = ['suspendslices']
727                                 act_record['time'] = current_time               # reset clock for waitforever
728                         elif delta >= 3* SPERDAY and not 'second-mail-at-twoweeks' in act_record:
729                                 act_record['email'] = TECH | PI
730                                 act_record['message'] = message[1]
731                                 act_record['action'] = ['sendmailagain-waitfortwoweeksaction' ]
732                                 act_record['second-mail-at-twoweeks'] = True
733                         else:
734                                 act_record['message'] = None
735                                 act_record['action'] = ['waitfortwoweeksaction']
736                                 return None                     # don't send if there's no action
737
738                 elif 'ticket_waitforever' in diag_record['stage']:
739                         act_record['email'] = TECH
740                         if 'first-found' not in act_record:
741                                 act_record['first-found'] = True
742                                 act_record['log'] += " firstfound"
743                                 act_record['action'] = ['ticket_waitforever']
744                                 act_record['message'] = None
745                                 act_record['time'] = current_time
746                         else:
747                                 if delta >= 7*SPERDAY:
748                                         act_record['action'] = ['ticket_waitforever']
749                                         act_record['message'] = None
750                                         act_record['time'] = current_time               # reset clock
751                                 else:
752                                         act_record['action'] = ['ticket_waitforever']
753                                         act_record['message'] = None
754                                         return None
755
756                 elif 'waitforever' in diag_record['stage']:
757                         # more than 3 days since last action
758                         # TODO: send only on weekdays.
759                         # NOTE: expects that 'time' has been reset before entering waitforever stage
760                         if delta >= 3*SPERDAY:
761                                 act_record['action'] = ['email-againwaitforever']
762                                 act_record['message'] = message[2]
763                                 act_record['time'] = current_time               # reset clock
764                         else:
765                                 act_record['action'] = ['waitforever']
766                                 act_record['message'] = None
767                                 return None                     # don't send if there's no action
768
769                 else:
770                         # There is no action to be taken, possibly b/c the stage has
771                         # already been performed, but diagnose picked it up again.
772                         # two cases, 
773                         #       1. stage is unknown, or 
774                         #       2. delta is not big enough to bump it to the next stage.
775                         # TODO: figure out which. for now assume 2.
776                         print "UNKNOWN stage for %s; nothing done" % nodename
777                         act_record['action'] = ['unknown']
778                         act_record['message'] = message[0]
779                         #print "Exiting..."
780                         return None
781                         #sys.exit(1)
782
783                 print "%s" % act_record['log'],
784                 print "%15s" % act_record['action']
785                 return act_record
786
787         def getMaxSlices(self, loginbase):
788                 # if sickdb has a loginbase, then it will have at least one node.
789                 site_stats = None
790
791                 for nodename in self.diagnose_in[loginbase].keys():
792                         if nodename in self.findbad['nodes']:
793                                 site_stats = self.findbad['nodes'][nodename]['values']['plcsite']
794                                 break
795
796                 if site_stats == None:
797                         raise Exception, "loginbase with no nodes in findbad"
798                 else:
799                         return site_stats['max_slices']
800
801         def getNumNodes(self, loginbase):
802                 # if sickdb has a loginbase, then it will have at least one node.
803                 site_stats = None
804
805                 for nodename in self.diagnose_in[loginbase].keys():
806                         if nodename in self.findbad['nodes']:
807                                 site_stats = self.findbad['nodes'][nodename]['values']['plcsite']
808                                 break
809
810                 if site_stats == None:
811                         raise Exception, "loginbase with no nodes in findbad"
812                 else:
813                         return site_stats['num_nodes']
814
815         """
        Returns the number of up nodes: the site's node count minus every node
        in d_diag_site whose stage is not 'monitor-end-record'.
818         """
819         def getUpAtSite(self, loginbase, d_diag_site):
820                 # TODO: THIS DOESN"T WORK!!! it misses all the 'debug' state nodes
821                 #               that aren't recorded yet.
822
823                 numnodes = self.getNumNodes(loginbase)
824                 # NOTE: assume nodes we have no record of are ok. (too conservative)
825                 # TODO: make the 'up' value more representative
826                 up = numnodes
827                 for nodename in d_diag_site[loginbase]['nodes'].keys():
828
829                         rec = d_diag_site[loginbase]['nodes'][nodename]
830                         if rec['stage'] != 'monitor-end-record':
831                                 up -= 1
832                         else:
833                                 pass # the node is assumed to be up.
834
835                 #if up != numnodes:
836                 #       print "ERROR: %s total nodes up and down != %d" % (loginbase, numnodes)
837
838                 return up
839
840
class SiteAction:
        """
        Base class for an action that is driven by a dict of arguments.

        run() first checks that every name in parameter_names is present in the
        argument dict, then hands the dict to _run(), which subclasses override.
        """
        def __init__(self, parameter_names=None):
                """
                parameter_names lists the keys that run() requires in its args;
                it defaults to ['hostname', 'ticket_id'].
                """
                # Build the default per instance: a list written in the signature
                # is created once and would be shared by every instance.
                if parameter_names is None:
                        parameter_names = ['hostname', 'ticket_id']
                self.parameter_names = parameter_names
        def checkParam(self, args):
                """Raise Exception naming the first required key missing from args."""
                for param in self.parameter_names:
                        if param not in args:
                                raise Exception("Parameter %s not provided in args"%param)
        def run(self, args):
                """Validate args, then return whatever _run(args) returns."""
                self.checkParam(args)
                return self._run(args)
        def _run(self, args):
                """Do the actual work; the base implementation does nothing."""
                pass
853
class SuspendAction(SiteAction):
        """Action that calls plc.suspendSlices() for args['hostname']."""
        def _run(self, args):
                hostname = args['hostname']
                return plc.suspendSlices(hostname)
857
class RemoveSliceCreation(SiteAction):
        """Action that calls plc.removeSliceCreation() for args['hostname']."""
        def _run(self, args):
                hostname = args['hostname']
                return plc.removeSliceCreation(hostname)
861
class BackoffActions(SiteAction):
        """
        Action that calls plc.enableSlices() and then plc.enableSliceCreation()
        for args['hostname']; always returns True.
        """
        def _run(self, args):
                for enable in (plc.enableSlices, plc.enableSliceCreation):
                        enable(args['hostname'])
                return True
867
868 # TODO: create class for each action below, 
869 #               allow for lists of actions to be performed...
870
def close_rt_backoff(args):
        """
        Close the RT ticket named by args['ticket_id'] and re-enable slices and
        slice creation for args['hostname'].

        Does nothing when 'ticket_id' is absent, empty or None.  Returns None.
        """
        ticket = args.get('ticket_id')
        if ticket is None or ticket == "":
                return

        mailer.closeTicketViaRT(ticket,
                                                        "Ticket CLOSED automatically by SiteAssist.")
        plc.enableSlices(args['hostname'])
        plc.enableSliceCreation(args['hostname'])
878
def reboot_node(args):
        """
        Reboot args['hostname'] through reboot.reboot_new() and return its result.

        The second argument is always True and the third is config.debug.
        """
        return reboot.reboot_new(args['hostname'], True, config.debug)
882
def reset_nodemanager(args):
        """
        Restart the 'nm' service on args['hostname'] over ssh as root.

        Returns None; the exit status of the ssh command is not checked.
        """
        # The host comes from the args dict, like the other action helpers;
        # the bare name 'nodename' was never defined in this scope.
        hostname = args['hostname']
        os.system("ssh root@%s /sbin/service nm restart" % hostname)
        return
886
887 class Action(Thread):
888         def __init__(self, l_action):
889                 self.l_action = l_action
890
891                 # the hostname to loginbase mapping
892                 self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")
893
894                 # Actions to take.
895                 self.diagnose_db = soltesz.if_cached_else(1, "diagnose_out", lambda : {})
896                 # Actions taken.
897                 self.act_all   = soltesz.if_cached_else(1, "act_all", lambda : {})
898
899                 # A dict of actions to specific functions. PICKLE doesnt' like lambdas.
900                 self.actions = {}
901                 self.actions['suspendslices'] = lambda args: plc.suspendSlices(args['hostname'])
902                 self.actions['nocreate'] = lambda args: plc.removeSliceCreation(args['hostname'])
903                 self.actions['close_rt'] = lambda args: close_rt_backoff(args)
904                 self.actions['rins'] = lambda args: plc.nodeBootState(args['hostname'], "rins") 
905                 self.actions['noop'] = lambda args: args
906                 self.actions['reboot_node'] = lambda args: reboot_node(args)
907                 self.actions['reset_nodemanager'] = lambda args: args # reset_nodemanager(args)
908
909                 self.actions['ticket_waitforever'] = lambda args: args
910                 self.actions['waitforever'] = lambda args: args
911                 self.actions['unknown'] = lambda args: args
912                 self.actions['waitforoneweekaction'] = lambda args: args
913                 self.actions['waitfortwoweeksaction'] = lambda args: args
914                 self.actions['sendmailagain-waitforoneweekaction'] = lambda args: args
915                 self.actions['sendmailagain-waitfortwoweeksaction'] = lambda args: args
916                 self.actions['email-againwaitforever'] = lambda args: args
917                 self.actions['email-againticket_waitforever'] = lambda args: args
918                                 
919
920                 self.sickdb = {}
921                 Thread.__init__(self)
922
923         def run(self):
924                 self.accumSites()
925                 print "Accumulated %d sick sites" % len(self.sickdb.keys())
926                 logger.debug("Accumulated %d sick sites" % len(self.sickdb.keys()))
927
928                 try:
929                         stats = self.analyseSites()
930                 except Exception, err:
931                         print "----------------"
932                         import traceback
933                         print traceback.print_exc()
934                         print err
935                         if config.policysavedb:
936                                 print "Saving Databases... act_all"
937                                 soltesz.dbDump("act_all", self.act_all)
938                         sys.exit(1)
939
940                 print_stats("sites_observed", stats)
941                 print_stats("sites_diagnosed", stats)
942                 print_stats("nodes_diagnosed", stats)
943                 print_stats("sites_emailed", stats)
944                 print_stats("nodes_actedon", stats)
945                 print string.join(stats['allsites'], ",")
946
947                 if config.policysavedb:
948                         print "Saving Databases... act_all"
949                         #soltesz.dbDump("policy.eventlog", self.eventlog)
950                         # TODO: remove 'diagnose_out', 
951                         #       or at least the entries that were acted on.
952                         soltesz.dbDump("act_all", self.act_all)
953
954         def accumSites(self):
955                 """
956                 Take all nodes, from l_action, look them up in the diagnose_db database, 
957                 and insert them into sickdb[] as:
958
959                 This way only the given l_action nodes will be acted on regardless
960                 of how many from diagnose_db are available.
961
962                         sickdb[loginbase][nodename] = diag_record
963                 """
964                 # TODO: what if l_action == None ?
965                 for nodename in self.l_action:
966
967                         loginbase = self.plcdb_hn2lb[nodename]
968
969                         if loginbase in self.diagnose_db and \
970                                 nodename in self.diagnose_db[loginbase]['nodes']:
971
972                                 diag_record = self.diagnose_db[loginbase]['nodes'][nodename]
973
974                                 if loginbase not in self.sickdb:
975                                         self.sickdb[loginbase] = {'nodes' : {}}
976
977                                 # NOTE: don't copy all node records, since not all will be in l_action
978                                 self.sickdb[loginbase]['nodes'][nodename] = diag_record
979                                 # NOTE: but, we want to get the loginbase config settings, 
980                                 #               this is the easiest way.
981                                 self.sickdb[loginbase]['config'] = self.diagnose_db[loginbase]['config']
982                         #else:
983                                 #print "%s not in diagnose_db!!" % loginbase
984                 return
985
986         def __emailSite(self, loginbase, roles, message, args):
987                 """
988                 loginbase is the unique site abbreviation, prepended to slice names.
989                 roles contains TECH, PI, USER roles, and derive email aliases.
990                 record contains {'message': [<subj>,<body>], 'args': {...}} 
991                 """
992                 ticket_id = 0
993                 args.update({'loginbase':loginbase})
994
995                 if not config.mail and not config.debug and config.bcc:
996                         roles = ADMIN
997                 if config.mail and config.debug:
998                         roles = ADMIN
999
1000                 # build targets
1001                 contacts = []
1002                 if ADMIN & roles:
1003                         contacts += [config.email]
1004                 if TECH & roles:
1005                         contacts += [TECHEMAIL % loginbase]
1006                 if PI & roles:
1007                         contacts += [PIEMAIL % loginbase]
1008                 if USER & roles:
1009                         slices = plc.slices(loginbase)
1010                         if len(slices) >= 1:
1011                                 for slice in slices:
1012                                         contacts += [SLICEMAIL % slice]
1013                                 print "SLIC: %20s : %d slices" % (loginbase, len(slices))
1014                         else:
1015                                 print "SLIC: %20s : 0 slices" % loginbase
1016
1017                 try:
1018                         subject = message[0] % args
1019                         body = message[1] % args
1020                         if ADMIN & roles:
1021                                 # send only to admin
1022                                 if 'ticket_id' in args:
1023                                         subj = "Re: [PL #%s] %s" % (args['ticket_id'], subject)
1024                                 else:
1025                                         subj = "Re: [PL noticket] %s" % subject
1026                                 mailer.email(subj, body, contacts)
1027                                 ticket_id = args['ticket_id']
1028                         else:
1029                                 ticket_id = mailer.emailViaRT(subject, body, contacts, args['ticket_id'])
1030                 except Exception, err:
1031                         print "exception on message:"
1032                         import traceback
1033                         print traceback.print_exc()
1034                         print message
1035
1036                 return ticket_id
1037
1038
1039         def _format_diaginfo(self, diag_node):
1040                 info = diag_node['info']
1041                 if diag_node['stage'] == 'monitor-end-record':
1042                         hlist = "    %s went from '%s' to '%s'\n" % (info[0], info[1], info[2]) 
1043                 else:
1044                         hlist = "    %s %s - %s\n" % (info[0], info[2], info[1]) #(node,ver,daysdn)
1045                 return hlist
1046
1047
1048         def get_email_args(self, act_recordlist):
1049
1050                 email_args = {}
1051                 email_args['hostname_list'] = ""
1052
1053                 for act_record in act_recordlist:
1054                         email_args['hostname_list'] += act_record['msg_format']
1055                         email_args['hostname'] = act_record['nodename']
1056                         if  'plcnode' in act_record and \
1057                                 'pcu_ids' in act_record['plcnode'] and \
1058                                 len(act_record['plcnode']['pcu_ids']) > 0:
1059                                 print "setting 'pcu_id' for email_args %s"%email_args['hostname']
1060                                 email_args['pcu_id'] = act_record['plcnode']['pcu_ids'][0]
1061                         else:
1062                                 email_args['pcu_id'] = "-1"
1063                                         
1064                         if 'ticket_id' in act_record:
1065                                 email_args['ticket_id'] = act_record['ticket_id']
1066
1067                 return email_args
1068
1069         def get_unique_issues(self, act_recordlist):
1070                 # NOTE: only send one email per site, per problem...
1071                 unique_issues = {}
1072                 for act_record in act_recordlist:
1073                         act_key = act_record['action'][0]
1074                         if act_key not in unique_issues:
1075                                 unique_issues[act_key] = []
1076                                 
1077                         unique_issues[act_key] += [act_record]
1078                         
1079                 return unique_issues
1080                         
1081
1082         def __actOnSite(self, loginbase, site_record):
1083                 i_nodes_actedon = 0
1084                 i_nodes_emailed = 0
1085
1086                 act_recordlist = []
1087
1088                 for nodename in site_record['nodes'].keys():
1089                         diag_record = site_record['nodes'][nodename]
1090                         act_record  = self.__actOnNode(diag_record)
1091                         #print "nodename: %s %s" % (nodename, act_record)
1092                         if act_record is not None:
1093                                 act_recordlist += [act_record]
1094
1095                 unique_issues = self.get_unique_issues(act_recordlist)
1096
1097                 for issue in unique_issues.keys():
1098                         print "\tworking on issue: %s" % issue
1099                         issue_record_list = unique_issues[issue]
1100                         email_args = self.get_email_args(issue_record_list)
1101
1102                         # for each record.
1103                         for act_record in issue_record_list:
1104                                 # if there's a pcu record and email config is set
1105                                 if 'email_pcu' in act_record:
1106                                         if act_record['email_pcu'] and \
1107                                                 site_record['config']['email']:
1108
1109                                                 email_args['hostname'] = act_record['nodename']
1110                                                 ticket_id = self.__emailSite(loginbase, 
1111                                                                                         act_record['email'], 
1112                                                                                         emailTxt.mailtxt.pcudown[0],
1113                                                                                         email_args)
1114                                                 email_args['ticket_id'] = ticket_id
1115
1116                         
1117                         act_record = issue_record_list[0]
1118                         # send message before squeezing
1119                         print "\t\tconfig.email: %s and %s" % (act_record['message'] != None, 
1120                                                                                                 site_record['config']['email'])
1121                         if act_record['message'] != None and site_record['config']['email']:
1122                                 ticket_id = self.__emailSite(loginbase, act_record['email'], 
1123                                                                                          act_record['message'], email_args)
1124
1125                                 # Add ticket_id to ALL nodenames
1126                                 for act_record in issue_record_list:
1127                                         nodename = act_record['nodename']
1128                                         # update node record with RT ticket_id
1129                                         if nodename in self.act_all:
1130                                                 self.act_all[nodename][0]['ticket_id'] = "%s" % ticket_id
1131                                         if config.mail: i_nodes_emailed += 1
1132
1133                         print "\t\tconfig.squeeze: %s and %s" % (config.squeeze,
1134                                                                                                         site_record['config']['squeeze'])
1135                         if config.squeeze and site_record['config']['squeeze']:
1136                                 for act_key in act_record['action']:
1137                                         self.actions[act_key](email_args)
1138                                 i_nodes_actedon += 1
1139                 
1140                 if config.policysavedb:
1141                         print "Saving Databases... act_all, diagnose_out"
1142                         soltesz.dbDump("act_all", self.act_all)
1143                         # remove site record from diagnose_out, it's in act_all as done.
1144                         del self.diagnose_db[loginbase]
1145                         soltesz.dbDump("diagnose_out", self.diagnose_db)
1146
1147                 #print "sleeping for 1 sec"
1148                 #time.sleep(1)
1149                 print "Hit enter to continue..."
1150                 sys.stdout.flush()
1151                 line = sys.stdin.readline()
1152
1153                 return (i_nodes_actedon, i_nodes_emailed)
1154
1155         def __actOnNode(self, diag_record):
1156                 nodename = diag_record['nodename']
1157                 message = diag_record['message']
1158
1159                 act_record = {}
1160                 act_record.update(diag_record)
1161                 act_record['nodename'] = nodename
1162                 act_record['msg_format'] = self._format_diaginfo(diag_record)
1163                 print "act_record['stage'] == %s " % act_record['stage']
1164
1165                 # avoid end records, and nmreset records                                        
1166                 # reboot_node_failed, is set below, so don't reboot repeatedly.
1167
1168                 if 'monitor-end-record' not in act_record['stage'] and \
1169                    'nmreset' not in act_record['stage'] and \
1170                    'reboot_node_failed' not in act_record:
1171
1172                         if "DOWN" in act_record['log'] and \
1173                                         'pcu_ids' in act_record['plcnode'] and \
1174                                         len(act_record['plcnode']['pcu_ids']) > 0:
1175
1176                                 print "%s" % act_record['log'],
1177                                 print "%15s" % (['reboot_node'],)
1178                                 # Set node to re-install
1179                                 plc.nodeBootState(act_record['nodename'], "rins")       
1180                                 try:
1181                                         ret = reboot_node({'hostname': act_record['nodename']})
1182                                 except Exception, exc:
1183                                         print "exception on reboot_node:"
1184                                         import traceback
1185                                         print traceback.print_exc()
1186                                         ret = False
1187
1188                                 if ret: # and ( 'reboot_node_failed' not in act_record or act_record['reboot_node_failed'] == False):
1189                                         # Reboot Succeeded
1190                                         print "reboot succeeded for %s" % act_record['nodename']
1191                                         act_record2 = {}
1192                                         act_record2.update(act_record)
1193                                         act_record2['action'] = ['reboot_node']
1194                                         act_record2['stage'] = "reboot_node"
1195                                         act_record2['reboot_node_failed'] = False
1196                                         act_record2['email_pcu'] = False
1197
1198                                         if nodename not in self.act_all: 
1199                                                 self.act_all[nodename] = []
1200                                         print "inserting 'reboot_node' record into act_all"
1201                                         self.act_all[nodename].insert(0,act_record2)
1202
1203                                         # return None to avoid further action
1204                                         print "Taking no further action"
1205                                         return None
1206                                 else:
1207                                         print "reboot failed for %s" % act_record['nodename']
1208                                         # set email_pcu to also send pcu notice for this record.
1209                                         act_record['reboot_node_failed'] = True
1210                                         act_record['email_pcu'] = True
1211
1212                         print "%s" % act_record['log'],
1213                         print "%15s" % act_record['action']
1214
1215                 if act_record['stage'] is not 'monitor-end-record' and \
1216                    act_record['stage'] is not 'nmreset':
1217                         if nodename not in self.act_all: 
1218                                 self.act_all[nodename] = []
1219
1220                         self.act_all[nodename].insert(0,act_record)
1221                 else:
1222                         print "Not recording %s in act_all" % nodename
1223
1224                 return act_record
1225
1226         def analyseSites(self):
1227                 i_sites_observed = 0
1228                 i_sites_diagnosed = 0
1229                 i_nodes_diagnosed = 0
1230                 i_nodes_actedon = 0
1231                 i_sites_emailed = 0
1232                 l_allsites = []
1233
1234                 sorted_sites = self.sickdb.keys()
1235                 sorted_sites.sort()
1236                 for loginbase in sorted_sites:
1237                         site_record = self.sickdb[loginbase]
1238                         print "sites: %s" % loginbase
1239                         
1240                         i_nodes_diagnosed += len(site_record.keys())
1241                         i_sites_diagnosed += 1
1242
1243                         (na,ne) = self.__actOnSite(loginbase, site_record)
1244
1245                         i_sites_observed += 1
1246                         i_nodes_actedon += na
1247                         i_sites_emailed += ne
1248
1249                         l_allsites += [loginbase]
1250
1251                 return {'sites_observed': i_sites_observed, 
1252                                 'sites_diagnosed': i_sites_diagnosed, 
1253                                 'nodes_diagnosed': i_nodes_diagnosed, 
1254                                 'sites_emailed': i_sites_emailed, 
1255                                 'nodes_actedon': i_nodes_actedon, 
1256                                 'allsites':l_allsites}
1257
1258         def print_stats(self, key, stats):
1259                 print "%20s : %d" % (key, stats[key])
1260
1261
1262
1263         #"""
1264         #Prints, logs, and emails status of up nodes, down nodes, and buckets.
1265         #"""
1266         #def status(self):
1267         #       sub = "Monitor Summary"
1268         #       msg = "\nThe following nodes were acted upon:  \n\n"
1269         #       for (node, (type, date)) in self.emailed.items():
1270         #               # Print only things acted on today.
1271         #               if (time.gmtime(time.time())[2] == time.gmtime(date)[2]):
1272         #                       msg +="%s\t(%s)\t%s\n" %(node, type, time.ctime(date))
1273         #       msg +="\n\nThe following sites have been 'squeezed':\n\n"
1274         #       for (loginbase, (date, type)) in self.squeezed.items():
1275         #               # Print only things acted on today.
1276         #               if (time.gmtime(time.time())[2] == time.gmtime(date)[2]):
1277         #                       msg +="%s\t(%s)\t%s\n" %(loginbase, type, time.ctime(date))
1278         #       mailer.email(sub, msg, [SUMTO])
1279         #       logger.info(msg)
1280         #       return 
1281
1282         #"""
1283         #Store/Load state of emails.  When, where, what.
1284         #"""
1285         #def emailedStore(self, action):
1286         #       try:
1287         #               if action == "LOAD":
1288         #                       f = open(DAT, "r+")
1289         #                       logger.info("POLICY:  Found and reading " + DAT)
1290         #                       self.emailed.update(pickle.load(f))
1291         #               if action == "WRITE":
1292         #                       f = open(DAT, "w")
1293         #                       #logger.debug("Writing " + DAT)
1294         #                       pickle.dump(self.emailed, f)
1295         #               f.close()
1296         #       except Exception, err:
1297         #               logger.info("POLICY:  Problem with DAT, %s" %err)
1298
1299
1300 #class Policy(Thread):
1301
def main():
        """Tell the user that policy.py is meant to be imported, not executed."""
        notice = "policy.py is a module, not a script for running directly."
        print(notice)
1304
if __name__ == '__main__':
        # Script entry point: run main() and turn Ctrl-C into a logged exit.
        import os
        import plc
        try:
                main()
        except KeyboardInterrupt:
                print("Killed.  Exitting.")
                logger.info('Monitor Killed')
                # os._exit() ends the process at once, skipping normal
                # interpreter cleanup handlers.
                os._exit(0)