better support for PCUs
[monitor.git] / policy.py
1 #
2 # Copyright (c) 2004  The Trustees of Princeton University (Trustees).
3 #
4 # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
5 #
6 # $Id: policy.py,v 1.17 2007/08/29 17:26:50 soltesz Exp $
7 #
8 # Policy Engine.
9
10 #from monitor import *
11 from threading import *
12 import time
13 import logging
14 import mailer
15 import emailTxt
16 import pickle
17 import Queue
18 import plc
19 import sys
20 import os
21 import reboot
22 import soltesz
23 import string
24 from www.printbadnodes import cmpCategoryVal
25 from config import config
# Import-time trace marker: printed once when this module is loaded.
print("policy")
# Call the imported `config` and rebind the module-level name to the result.
config = config()

DAT = "./monitor.dat"

logger = logging.getLogger("monitor")

# Time to enforce policy (7200; presumably seconds between policy
# passes -- TODO confirm against the code that sleeps on it).
POLSLEEP = 2 * 60 * 60

# Where to email the summary
SUMTO = "soltesz@cs.princeton.edu"
# Address templates; the single %s is filled in by the caller.
TECHEMAIL = "tech-%s@sites.planet-lab.org"
PIEMAIL = "pi-%s@sites.planet-lab.org"
SLICEMAIL = "%s@slices.planet-lab.org"
PLCEMAIL = "support@planet-lab.org"

# Time units, expressed in seconds.
SPERMIN = 60
SPERHOUR = 60 * SPERMIN
SPERDAY = 24 * SPERHOUR

# Thresholds: a number of days, stored as seconds via SPERDAY.
PITHRESH = 7 * SPERDAY
SLICETHRESH = 7 * SPERDAY
# Days before attempting rins again
RINSTHRESH = 5 * SPERDAY

# Days before calling the node dead.
DEADTHRESH = 30 * SPERDAY
# Minimum number of nodes up before squeezing
MINUP = 2

# Distinct power-of-two values (NOTE(review): look like bit flags for
# recipient roles -- confirm against their users).
TECH, PI, USER, ADMIN = 1, 2, 4, 8
61
62 # IF:
63 #  no SSH, down.
64 #  bad disk, down
65 #  DNS, kinda down (sick)
66 #  clock, kinda down (sick)
67 #  Full disk, going to be down
68
69 # Actions:
70 #  Email
71 #  suspend slice creation
72 #  kill slices
def array_to_priority_map(array):
        """ Create a mapping where each entry of array is given a priority equal
        to its position in the array.  This is useful for subsequent use in the
        cmpMap() function.

        array   -- iterable of hashable entries.
        returns -- dict of entry -> zero-based position.  When an entry occurs
                   more than once, its last position wins.
        """
        # `priorities` rather than `map`, so the builtin is not shadowed.
        priorities = {}
        for position, entry in enumerate(array):
                priorities[entry] = position
        return priorities
83
def getdebug():
        """Return the `debug` attribute of the module-level `config` object."""
        debug_setting = config.debug
        return debug_setting
86
def print_stats(key, stats):
        """
        Print one "<key> : <value>" line for `key`, formatting the value
        with %d.  Nothing is printed when `key` is not in `stats`.
        """
        if key not in stats:
                return
        line = "%20s : %d" % (key, stats[key])
        print(line)
89
class Merge(Thread):
        """
        Thread that combines the current findbad observations with the
        previously recorded actions (act_all) and pushes one merged record
        per node onto the `toRT` queue, followed by a None sentinel.
        """

        def __init__(self, l_merge, toRT):
                """
                l_merge -- collection of hostnames to process; findbad nodes
                           not contained in it are skipped.
                toRT    -- object with a put() method that receives the merged
                           records and, last, None.
                """
                self.toRT = toRT
                self.merge_list = l_merge
                # the hostname to loginbase mapping
                self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")

                # Previous actions taken on nodes.
                self.act_all = soltesz.if_cached_else(1, "act_all", lambda : {})
                self.findbad = soltesz.if_cached_else(1, "findbad", lambda : {})

                # Loaded from the same "act_all" key as self.act_all; entries
                # are removed from it as nodes are merged (see case 3 below).
                # NOTE(review): whether this is a separate object from
                # self.act_all depends on soltesz.if_cached_else -- confirm.
                self.cache_all = soltesz.if_cached_else(1, "act_all", lambda : {})
                self.sickdb = {}
                self.mergedb = {}
                Thread.__init__(self)

        def run(self):
                """Run the three stages in order: accumulate, merge, send."""
                # populate sickdb
                self.accumSickSites()
                # read data from findbad and act_all
                self.mergeActionsAndBadDB()
                # pass node_records to RT
                self.sendToRT()

        def accumSickSites(self):
                """
                Take every findbad node that is listed in merge_list and
                insert a fresh record for it into sickdb[] as:

                        sickdb[loginbase][nodename] = fb_record

                Nodes whose findbad values carry no 'category' are printed
                and skipped.
                """
                findbad_nodes = self.findbad['nodes']
                count = 0
                # look at all problems reported by findbad
                for nodename in list(findbad_nodes.keys()):
                        if nodename not in self.merge_list:
                                continue                # skip this node, since it's not wanted

                        loginbase = self.plcdb_hn2lb[nodename]
                        values = findbad_nodes[nodename]['values']

                        try:
                                category = values['category']
                        except (KeyError, TypeError):
                                # Only a missing key / non-subscriptable value
                                # means "no category"; anything else (e.g.
                                # KeyboardInterrupt) must not be swallowed.
                                print(values)
                                print(nodename)
                                print(findbad_nodes[nodename])
                                continue

                        count += 1
                        fb_record = {
                                'nodename': nodename,
                                'category': category,
                                'state': values['state'],
                                'comonstats': values['comonstats'],
                                'plcnode': values['plcnode'],
                                'kernel': self.getKernel(values['kernel']),
                                'stage': "findbad",
                                'message': None,
                                'bootcd': values['bootcd'],
                                'args': None,
                                'info': None,
                                'time': time.time(),
                                'date_created': time.time(),
                        }
                        self.sickdb.setdefault(loginbase, {})[nodename] = fb_record

                print("Found %d nodes" % count)

        def getKernel(self, unamestr):
                """
                Return the third whitespace-separated field of `unamestr`,
                or "" when it has fewer than three fields.
                """
                fields = unamestr.split()
                if len(fields) > 2:
                        return fields[2]
                return ""

        def mergeActionsAndBadDB(self):
                """
                - Look at the sick node_records as reported in findbad,
                - Then look at the node_records in act_all.

                There are four cases:
                1) Problem in findbad, no problem in act_all
                        this ok, b/c it just means it's a new problem
                2) Problem in findbad, problem in act_all
                        -Did the problem get better or worse?
                                -If Same, or Worse, then continue looking for open tickets.
                                -If Better, or No problem, then "back-off" penalties.
                                        This judgement may need to wait until 'Diagnose()'

                3) No problem in findbad, problem in act_all
                        Then the node is operational again according to Findbad()

                4) No problem in findbad, no problem in act_all
                        There won't be a record in either db, so there's no code.

                The result is stored as mergedb[loginbase][nodename].
                """
                # look at all problems reported by findbad
                for loginbase in sorted(self.sickdb):
                        d_fb_nodes = self.sickdb[loginbase]
                        for nodename in sorted(d_fb_nodes):
                                x = d_fb_nodes[nodename]
                                site_records = self.mergedb.setdefault(loginbase, {})

                                # We must compare findbad state with act_all state
                                if nodename not in self.act_all:
                                        # 1) ok, b/c it's a new problem. set ticket_id to null
                                        merged = {}
                                        merged.update(x)
                                        merged['ticket_id'] = ""
                                        merged['prev_category'] = "NORECORD"
                                        site_records[nodename] = merged
                                        continue

                                if len(self.act_all[nodename]) == 0:
                                        print("len(act_all[%s]) == 0, skipping %s %s" % (nodename, loginbase, nodename))
                                        continue

                                # 2) Start from the first act_all record and
                                #    overlay the fresh findbad observations.
                                #    Any category mismatch is only recorded
                                #    here, as prev_category vs. category.
                                y = self.act_all[nodename][0]
                                # NOTE: this writes into the act_all record itself.
                                y['prev_category'] = y['category']
                                merged = {}
                                merged.update(y)
                                for field in ('comonstats', 'category', 'state',
                                              'kernel', 'bootcd', 'plcnode'):
                                        merged[field] = x[field]
                                site_records[nodename] = merged
                                # drop the entry from cache_all to keep it out
                                # of case 3); tolerate it being absent already.
                                self.cache_all.pop(nodename, None)

                # 3) nodes that remain in cache_all were not identified by findbad.
                #        Do we keep them or not?
                #   NOTE: i think that since the categories are performed before this
                #               step now, and by a monitor-controlled agent.
                # TODO: re-inserting those leftovers into sickdb did not work
                #       correctly and is disabled.  Do we need this?

                print("len of cache_all: %d" % len(self.cache_all.keys()))
                return

        def sendToRT(self):
                """
                Put every merged record on toRT, visiting sites in sorted
                order, then put None to signal the end of the stream.
                """
                # look at all problems reported by merge
                for loginbase in sorted(self.mergedb):
                        for record in self.mergedb[loginbase].values():
                                self.toRT.put(record)

                # send signal to stop reading
                self.toRT.put(None)
                return
305
306 class Diagnose(Thread):
307         def __init__(self, fromRT):
308                 self.fromRT = fromRT
309                 self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")
310                 self.findbad = soltesz.if_cached_else(1, "findbad", lambda : {})
311
312                 self.diagnose_in = {}
313                 self.diagnose_out = {}
314                 Thread.__init__(self)
315
316
317         def run(self):
318                 self.accumSickSites()
319
320                 print "Accumulated %d sick sites" % len(self.diagnose_in.keys())
321                 logger.debug("Accumulated %d sick sites" % len(self.diagnose_in.keys()))
322
323                 try:
324                         stats = self.diagnoseAll()
325                 except Exception, err:
326                         print "----------------"
327                         import traceback
328                         print traceback.print_exc()
329                         print err
330                         #if config.policysavedb:
331                         sys.exit(1)
332
333                 print_stats("sites_observed", stats)
334                 print_stats("sites_diagnosed", stats)
335                 print_stats("nodes_diagnosed", stats)
336
337                 if config.policysavedb:
338                         print "Saving Databases... diagnose_out"
339                         soltesz.dbDump("diagnose_out", self.diagnose_out)
340
341         def accumSickSites(self):
342                 """
343                 Take all nodes, from l_diagnose, look them up in the diagnose_out database, 
344                 and insert them into diagnose_in[] as:
345
346                         diagnose_in[loginbase] = [diag_node1, diag_node2, ...]
347                 """
348                 while 1:
349                         node_record = self.fromRT.get(block = True)
350                         if node_record == None:
351                                 break;
352
353                         nodename = node_record['nodename']
354                         loginbase = self.plcdb_hn2lb[nodename]
355
356                         if loginbase not in self.diagnose_in:
357                                 self.diagnose_in[loginbase] = {}
358
359                         self.diagnose_in[loginbase][nodename] = node_record
360
361                 return
362
363         def diagnoseAll(self):
364                 i_sites_observed = 0
365                 i_sites_diagnosed = 0
366                 i_nodes_diagnosed = 0
367                 i_nodes_actedon = 0
368                 i_sites_emailed = 0
369                 l_allsites = []
370
371                 sorted_sites = self.diagnose_in.keys()
372                 sorted_sites.sort()
373                 self.diagnose_out= {}
374                 for loginbase in sorted_sites:
375                         l_allsites += [loginbase]
376
377                         d_diag_nodes = self.diagnose_in[loginbase]
378                         d_act_records = self.__diagnoseSite(loginbase, d_diag_nodes)
379                         # store records in diagnose_out, for saving later.
380                         self.diagnose_out.update(d_act_records)
381                         
382                         if len(d_act_records[loginbase]['nodes'].keys()) > 0:
383                                 i_nodes_diagnosed += (len(d_act_records[loginbase]['nodes'].keys()))
384                                 i_sites_diagnosed += 1
385                         i_sites_observed += 1
386
387                 return {'sites_observed': i_sites_observed, 
388                                 'sites_diagnosed': i_sites_diagnosed, 
389                                 'nodes_diagnosed': i_nodes_diagnosed, 
390                                 'allsites':l_allsites}
391
392                 pass
393                 
394         def __getDaysDown(self, diag_record, nodename):
395                 daysdown = -1
396                 if diag_record['comonstats']['sshstatus'] != "null":
397                         daysdown = int(diag_record['comonstats']['sshstatus']) // (60*60*24)
398                 elif diag_record['comonstats']['lastcotop'] != "null":
399                         daysdown = int(diag_record['comonstats']['lastcotop']) // (60*60*24)
400                 else:
401                         now = time.time()
402                         last_contact = diag_record['plcnode']['last_contact']
403                         if last_contact == None:
404                                 # the node has never been up, so give it a break
405                                 daysdown = -1
406                         else:
407                                 diff = now - last_contact
408                                 daysdown = diff // (60*60*24)
409                 return daysdown
410
411         def __getStrDaysDown(self, diag_record, nodename):
412                 daysdown = self.__getDaysDown(diag_record, nodename)
413                 if daysdown > 0:
414                         return "(%d days down)"%daysdown
415                 else:
416                         return "Unknown number of days"
417
418         def __getCDVersion(self, diag_record, nodename):
419                 cdversion = ""
420                 #print "Getting kernel for: %s" % diag_record['nodename']
421                 cdversion = diag_record['kernel']
422                 return cdversion
423
424         def __diagnoseSite(self, loginbase, d_diag_nodes):
425                 """
426                 d_diag_nodes are diagnose_in entries.
427                 """
428                 d_diag_site = {loginbase : { 'config' : 
429                                                                                                 {'squeeze': False,
430                                                                                                  'email': False
431                                                                                                 }, 
432                                                                         'nodes': {}
433                                                                         }
434                                            }
435                 sorted_nodes = d_diag_nodes.keys()
436                 sorted_nodes.sort()
437                 for nodename in sorted_nodes:
438                         node_record = d_diag_nodes[nodename]
439                         diag_record = self.__diagnoseNode(loginbase, node_record)
440
441                         if diag_record != None:
442                                 d_diag_site[loginbase]['nodes'][nodename] = diag_record
443
444                                 # NOTE: improvement means, we need to act/squeeze and email.
445                                 #print "DIAG_RECORD", diag_record
446                                 if 'monitor-end-record' in diag_record['stage'] or \
447                                    'nmreset' in diag_record['stage']:
448                                 #       print "resetting loginbase!" 
449                                         d_diag_site[loginbase]['config']['squeeze'] = True
450                                         d_diag_site[loginbase]['config']['email'] = True
451                                 #else:
452                                 #       print "NO IMPROVEMENT!!!!"
453                         else:
454                                 pass # there is nothing to do for this node.
455
456                 # NOTE: these settings can be overridden by command line arguments,
457                 #       or the state of a record, i.e. if already in RT's Support Queue.
458                 nodes_up = self.getUpAtSite(loginbase, d_diag_site)
459                 if nodes_up < MINUP:
460                         d_diag_site[loginbase]['config']['squeeze'] = True
461
462                 max_slices = self.getMaxSlices(loginbase)
463                 num_nodes = self.getNumNodes(loginbase)
464                 # NOTE: when max_slices == 0, this is either a new site (the old way)
465                 #       or an old disabled site from previous monitor (before site['enabled'])
466                 if nodes_up < num_nodes and max_slices != 0:
467                         d_diag_site[loginbase]['config']['email'] = True
468
469                 if len(d_diag_site[loginbase]['nodes'].keys()) > 0:
470                         print "SITE: %20s : %d nodes up, at most" % (loginbase, nodes_up)
471
472                 return d_diag_site
473
474         def diagRecordByCategory(self, node_record):
475                 nodename = node_record['nodename']
476                 category = node_record['category']
477                 state    = node_record['state']
478                 loginbase = self.plcdb_hn2lb[nodename]
479                 diag_record = None
480
481                 if  "ERROR" in category:        # i.e. "DOWN"
482                         diag_record = {}
483                         diag_record.update(node_record)
484                         daysdown = self.__getDaysDown(diag_record, nodename) 
485                         if daysdown < 7:
486                                 format = "DIAG: %20s : %-40s Down only %s days  NOTHING DONE"
487                                 print format % (loginbase, nodename, daysdown)
488                                 return None
489
490                         s_daysdown = self.__getStrDaysDown(diag_record, nodename)
491                         diag_record['message'] = emailTxt.mailtxt.newdown
492                         diag_record['args'] = {'nodename': nodename}
493                         diag_record['info'] = (nodename, s_daysdown, "")
494
495                         if 'reboot_node_failed' in node_record:
496                                 # there was a previous attempt to use the PCU.
497                                 if node_record['reboot_node_failed'] == False:
498                                         # then the last attempt apparently, succeeded.
499                                         # But, the category is still 'ERROR'.  Therefore, the
500                                         # PCU-to-Node mapping is broken.
501                                         #print "Setting message for ERROR node to PCU2NodeMapping: %s" % nodename
502                                         diag_record['message'] = emailTxt.mailtxt.pcutonodemapping
503                                         diag_record['email_pcu'] = True
504
505                         if diag_record['ticket_id'] == "":
506                                 diag_record['log'] = "DOWN: %20s : %-40s == %20s %s" % \
507                                         (loginbase, nodename, diag_record['info'][1:], diag_record['found_rt_ticket'])
508                         else:
509                                 diag_record['log'] = "DOWN: %20s : %-40s == %20s %s" % \
510                                         (loginbase, nodename, diag_record['info'][1:], diag_record['ticket_id'])
511
512                 elif "OLDBOOTCD" in category:
513                         # V2 boot cds as determined by findbad
514                         s_daysdown = self.__getStrDaysDown(node_record, nodename)
515                         s_cdversion = self.__getCDVersion(node_record, nodename)
516                         diag_record = {}
517                         diag_record.update(node_record)
518                         #if "2.4" in diag_record['kernel'] or "v2" in diag_record['bootcd']:
519                         diag_record['message'] = emailTxt.mailtxt.newbootcd
520                         diag_record['args'] = {'nodename': nodename}
521                         diag_record['info'] = (nodename, s_daysdown, s_cdversion)
522                         if diag_record['ticket_id'] == "":
523                                 diag_record['log'] = "BTCD: %20s : %-40s == %20s %20s %s" % \
524                                                                         (loginbase, nodename, diag_record['kernel'], 
525                                                                          diag_record['bootcd'], diag_record['found_rt_ticket'])
526                         else:
527                                 diag_record['log'] = "BTCD: %20s : %-40s == %20s %20s %s" % \
528                                                                         (loginbase, nodename, diag_record['kernel'], 
529                                                                          diag_record['bootcd'], diag_record['ticket_id'])
530
531                 elif "PROD" in category:
532                         if "DEBUG" in state:
533                                 # Not sure what to do with these yet.  Probably need to
534                                 # reboot, and email.
535                                 print "DEBG: %20s : %-40s  NOTHING DONE" % (loginbase, nodename)
536                                 return None
537                         elif "BOOT" in state:
538                                 # no action needed.
539                                 # TODO: remove penalties, if any are applied.
540                                 now = time.time()
541                                 last_contact = node_record['plcnode']['last_contact']
542                                 if last_contact == None:
543                                         time_diff = 0
544                                 else:
545                                         time_diff = now - last_contact;
546
547                                 if 'improvement' in node_record['stage']:
548                                         # then we need to pass this on to 'action'
549                                         diag_record = {}
550                                         diag_record.update(node_record)
551                                         diag_record['message'] = emailTxt.mailtxt.newthankyou
552                                         diag_record['args'] = {'nodename': nodename}
553                                         diag_record['info'] = (nodename, node_record['prev_category'], 
554                                                                                                          node_record['category'])
555                                         if 'email_pcu' in diag_record:
556                                                 if diag_record['email_pcu']:
557                                                         # previously, the pcu failed to reboot, so send
558                                                         # email. Now, reset these values to try the reboot
559                                                         # again.
560                                                         diag_record['email_pcu'] = False
561                                                         del diag_record['reboot_node_failed']
562
563                                         if diag_record['ticket_id'] == "":
564                                                 diag_record['log'] = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
565                                                                         (loginbase, nodename, diag_record['stage'], 
566                                                                          state, category, diag_record['found_rt_ticket'])
567                                         else:
568                                                 diag_record['log'] = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
569                                                                         (loginbase, nodename, diag_record['stage'], 
570                                                                          state, category, diag_record['ticket_id'])
571                                         return diag_record
572                                 elif time_diff >= 6*SPERHOUR:
573                                         # heartbeat is older than 30 min.
574                                         # then reset NM.
575                                         #print "Possible NM problem!! %s - %s = %s" % (now, last_contact, time_diff)
576                                         diag_record = {}
577                                         diag_record.update(node_record)
578                                         diag_record['message'] = emailTxt.mailtxt.NMReset
579                                         diag_record['args'] = {'nodename': nodename}
580                                         diag_record['stage'] = "nmreset"
581                                         diag_record['info'] = (nodename, 
582                                                                                         node_record['prev_category'], 
583                                                                                         node_record['category'])
584                                         if diag_record['ticket_id'] == "":
585                                                 diag_record['log'] = "NM  : %20s : %-40s == %20s %20s %s %s" % \
586                                                                         (loginbase, nodename, diag_record['stage'], 
587                                                                          state, category, diag_record['found_rt_ticket'])
588                                         else:
589                                                 diag_record['log'] = "NM  : %20s : %-40s == %20s" % \
590                                                                         (loginbase, nodename, diag_record['stage'])
591
592                                         return diag_record
593                                 else:
594                                         return None
595                         else:
596                                 # unknown
597                                 pass
598                 elif "ALPHA"    in category:
599                         pass
600                 elif "clock_drift" in category:
601                         pass
602                 elif "dns"    in category:
603                         pass
604                 elif "filerw"    in category:
605                         pass
606                 else:
607                         print "Unknown category!!!! %s" % category
608                         sys.exit(1)
609
610                 return diag_record
611
612         def __diagnoseNode(self, loginbase, node_record):
613                 # TODO: change the format of the hostname in this 
614                 #               record to something more natural.
615                 nodename                = node_record['nodename']
616                 category                = node_record['category']
617                 prev_category   = node_record['prev_category']
618                 state                   = node_record['state']
619                 #if 'prev_category' in node_record:
620                 #       prev_category = node_record['prev_category']
621                 #else:
622                 #       prev_category = "ERROR"
623                 if node_record['prev_category'] != "NORECORD":
624                 
625                         val = cmpCategoryVal(category, prev_category)
626                         print "%s went from %s -> %s" % (nodename, prev_category, category)
627                         if val == 1:
628                                 # improved
629                                 if node_record['ticket_id'] == "" or node_record['ticket_id'] == None:
630                                         print "closing record with no ticket: ", node_record['nodename']
631                                         node_record['action'] = ['close_rt']
632                                         node_record['message'] = None
633                                         node_record['stage'] = 'monitor-end-record'
634                                         return node_record
635                                 else:
636                                         node_record['stage'] = 'improvement'
637
638                                 #if 'monitor-end-record' in node_record['stage']:
639                                 #       # just ignore it if it's already ended.
640                                 #       # otherwise, the status should be worse, and we won't get
641                                 #       # here.
642                                 #       print "monitor-end-record: ignoring ", node_record['nodename']
643                                 #       return None
644 #
645 #                                       #return None
646                         elif val == -1:
647                                 # current category is worse than previous, carry on
648                                 pass
649                         else:
650                                 #values are equal, carry on.
651                                 #print "why are we here?"
652                                 pass
653                         
654                 #### COMPARE category and prev_category
655                 # if not_equal
656                 #       then assign a stage based on relative priorities
657                 # else equal
658                 #       then check category for stats.
659                 diag_record = self.diagRecordByCategory(node_record)
660                 if diag_record == None:
661                         #print "diag_record == None"
662                         return None
663
664                 #### found_RT_ticket
665                 # TODO: need to record time found, and maybe add a stage for acting on it...
666                 if 'found_rt_ticket' in diag_record and \
667                         diag_record['found_rt_ticket'] is not None:
668                         if diag_record['stage'] is not 'improvement':
669                                 diag_record['stage'] = 'ticket_waitforever'
670                                 
671                 current_time = time.time()
672                 # take off four days, for the delay that database caused.
673                 # TODO: generalize delays at PLC, and prevent enforcement when there
674                 #               have been no emails.
675                 # NOTE: 7*SPERDAY exists to offset the 'bad week'
676                 #delta = current_time - diag_record['time'] - 7*SPERDAY
677                 delta = current_time - diag_record['time']
678
679                 message = diag_record['message']
680                 act_record = {}
681                 act_record.update(diag_record)
682
683                 #### DIAGNOSE STAGES 
684                 if   'findbad' in diag_record['stage']:
685                         # The node is bad, and there's no previous record of it.
686                         act_record['email'] = TECH
687                         act_record['action'] = ['noop']
688                         act_record['message'] = message[0]
689                         act_record['stage'] = 'stage_actinoneweek'
690
691                 elif 'nmreset' in diag_record['stage']:
692                         act_record['email']  = ADMIN 
693                         act_record['action'] = ['reset_nodemanager']
694                         act_record['message'] = message[0]
695                         act_record['stage']  = 'nmreset'
696                         return None
697
698                 elif 'reboot_node' in diag_record['stage']:
699                         act_record['email'] = TECH
700                         act_record['action'] = ['noop']
701                         act_record['message'] = message[0]
702                         act_record['stage'] = 'stage_actinoneweek'
703                         
704                 elif 'improvement' in diag_record['stage']:
705                         # - backoff previous squeeze actions (slice suspend, nocreate)
706                         # TODO: add a backoff_squeeze section... Needs to runthrough
707                         act_record['action'] = ['close_rt']
708                         act_record['message'] = message[0]
709                         act_record['stage'] = 'monitor-end-record'
710
711                 elif 'actinoneweek' in diag_record['stage']:
712                         if delta >= 7 * SPERDAY: 
713                                 act_record['email'] = TECH | PI
714                                 act_record['stage'] = 'stage_actintwoweeks'
715                                 act_record['message'] = message[1]
716                                 act_record['action'] = ['nocreate' ]
717                                 act_record['time'] = current_time               # reset clock for waitforever
718                         elif delta >= 3* SPERDAY and not 'second-mail-at-oneweek' in act_record:
719                                 act_record['email'] = TECH 
720                                 act_record['message'] = message[0]
721                                 act_record['action'] = ['sendmailagain-waitforoneweekaction' ]
722                                 act_record['second-mail-at-oneweek'] = True
723                         else:
724                                 act_record['message'] = None
725                                 act_record['action'] = ['waitforoneweekaction' ]
726                                 print "ignoring this record for: %s" % act_record['nodename']
727                                 return None                     # don't send if there's no action
728
729                 elif 'actintwoweeks' in diag_record['stage']:
730                         if delta >= 7 * SPERDAY:
731                                 act_record['email'] = TECH | PI | USER
732                                 act_record['stage'] = 'stage_waitforever'
733                                 act_record['message'] = message[2]
734                                 act_record['action'] = ['suspendslices']
735                                 act_record['time'] = current_time               # reset clock for waitforever
736                         elif delta >= 3* SPERDAY and not 'second-mail-at-twoweeks' in act_record:
737                                 act_record['email'] = TECH | PI
738                                 act_record['message'] = message[1]
739                                 act_record['action'] = ['sendmailagain-waitfortwoweeksaction' ]
740                                 act_record['second-mail-at-twoweeks'] = True
741                         else:
742                                 act_record['message'] = None
743                                 act_record['action'] = ['waitfortwoweeksaction']
744                                 return None                     # don't send if there's no action
745
746                 elif 'ticket_waitforever' in diag_record['stage']:
747                         act_record['email'] = TECH
748                         if 'first-found' not in act_record:
749                                 act_record['first-found'] = True
750                                 act_record['log'] += " firstfound"
751                                 act_record['action'] = ['ticket_waitforever']
752                                 act_record['message'] = None
753                                 act_record['time'] = current_time
754                         else:
755                                 if delta >= 7*SPERDAY:
756                                         act_record['action'] = ['ticket_waitforever']
757                                         act_record['message'] = None
758                                         act_record['time'] = current_time               # reset clock
759                                 else:
760                                         act_record['action'] = ['ticket_waitforever']
761                                         act_record['message'] = None
762                                         return None
763
764                 elif 'waitforever' in diag_record['stage']:
765                         # more than 3 days since last action
766                         # TODO: send only on weekdays.
767                         # NOTE: expects that 'time' has been reset before entering waitforever stage
768                         if delta >= 3*SPERDAY:
769                                 act_record['action'] = ['email-againwaitforever']
770                                 act_record['message'] = message[2]
771                                 act_record['time'] = current_time               # reset clock
772                         else:
773                                 act_record['action'] = ['waitforever']
774                                 act_record['message'] = None
775                                 return None                     # don't send if there's no action
776
777                 else:
778                         # There is no action to be taken, possibly b/c the stage has
779                         # already been performed, but diagnose picked it up again.
780                         # two cases, 
781                         #       1. stage is unknown, or 
782                         #       2. delta is not big enough to bump it to the next stage.
783                         # TODO: figure out which. for now assume 2.
784                         print "UNKNOWN stage for %s; nothing done" % nodename
785                         act_record['action'] = ['unknown']
786                         act_record['message'] = message[0]
787
788                         act_record['email'] = TECH
789                         act_record['action'] = ['noop']
790                         act_record['message'] = message[0]
791                         act_record['stage'] = 'stage_actinoneweek'
792                         act_record['time'] = current_time               # reset clock
793                         #print "Exiting..."
794                         #return None
795                         #sys.exit(1)
796
797                 print "%s" % act_record['log'],
798                 print "%15s" % act_record['action']
799                 return act_record
800
801         def getMaxSlices(self, loginbase):
802                 # if sickdb has a loginbase, then it will have at least one node.
803                 site_stats = None
804
805                 for nodename in self.diagnose_in[loginbase].keys():
806                         if nodename in self.findbad['nodes']:
807                                 site_stats = self.findbad['nodes'][nodename]['values']['plcsite']
808                                 break
809
810                 if site_stats == None:
811                         raise Exception, "loginbase with no nodes in findbad"
812                 else:
813                         return site_stats['max_slices']
814
815         def getNumNodes(self, loginbase):
816                 # if sickdb has a loginbase, then it will have at least one node.
817                 site_stats = None
818
819                 for nodename in self.diagnose_in[loginbase].keys():
820                         if nodename in self.findbad['nodes']:
821                                 site_stats = self.findbad['nodes'][nodename]['values']['plcsite']
822                                 break
823
824                 if site_stats == None:
825                         raise Exception, "loginbase with no nodes in findbad"
826                 else:
827                         return site_stats['num_nodes']
828
829         """
830         Returns number of up nodes as the total number *NOT* in act_all with a
831         stage other than 'steady-state' .
832         """
833         def getUpAtSite(self, loginbase, d_diag_site):
834                 # TODO: THIS DOESN"T WORK!!! it misses all the 'debug' state nodes
835                 #               that aren't recorded yet.
836
837                 numnodes = self.getNumNodes(loginbase)
838                 # NOTE: assume nodes we have no record of are ok. (too conservative)
839                 # TODO: make the 'up' value more representative
840                 up = numnodes
841                 for nodename in d_diag_site[loginbase]['nodes'].keys():
842
843                         rec = d_diag_site[loginbase]['nodes'][nodename]
844                         if rec['stage'] != 'monitor-end-record':
845                                 up -= 1
846                         else:
847                                 pass # the node is assumed to be up.
848
849                 #if up != numnodes:
850                 #       print "ERROR: %s total nodes up and down != %d" % (loginbase, numnodes)
851
852                 return up
853
854
class SiteAction:
        """
        Base class for an action that is run with a dict of arguments.

        run() first checks that every name listed in parameter_names is present
        in the arguments and then hands them to _run(), which subclasses
        override.  The base _run() does nothing and returns None.
        """
        def __init__(self, parameter_names=None):
                # A list literal in the signature would be one object shared by all
                # instances built with the default; create a fresh one instead.
                if parameter_names is None:
                        parameter_names = ['hostname', 'ticket_id']
                self.parameter_names = parameter_names

        def checkParam(self, args):
                """Raise Exception naming the first required parameter missing from args."""
                for param in self.parameter_names:
                        if param not in args:
                                raise Exception("Parameter %s not provided in args"%param)

        def run(self, args):
                """Validate args, then return whatever _run(args) returns."""
                self.checkParam(args)
                return self._run(args)

        def _run(self, args):
                """Hook for subclasses; the default implementation is a no-op."""
                pass
867
class SuspendAction(SiteAction):
        """Action that passes args['hostname'] to plc.suspendSlices()."""
        def _run(self, args):
                outcome = plc.suspendSlices(args['hostname'])
                return outcome
871
class RemoveSliceCreation(SiteAction):
        """Action that passes args['hostname'] to plc.removeSliceCreation()."""
        def _run(self, args):
                outcome = plc.removeSliceCreation(args['hostname'])
                return outcome
875
class BackoffActions(SiteAction):
        """
        Action that calls plc.enableSlices() and then plc.enableSliceCreation()
        with args['hostname'].  Always reports True.
        """
        def _run(self, args):
                for restore in (plc.enableSlices, plc.enableSliceCreation):
                        restore(args['hostname'])
                return True
881
882 # TODO: create class for each action below, 
883 #               allow for lists of actions to be performed...
884
def close_rt_backoff(args):
        """
        Close the RT ticket named in args and lift the site restrictions.

        Nothing happens unless args has a 'ticket_id' that is neither "" nor
        None.  Otherwise the ticket is closed via mailer.closeTicketViaRT() and
        plc.enableSlices() / plc.enableSliceCreation() are called with
        args['hostname'].  Always returns None.
        """
        if 'ticket_id' not in args:
                return
        ticket = args['ticket_id']
        if ticket == "" or ticket is None:
                return

        mailer.closeTicketViaRT(ticket, "Ticket CLOSED automatically by SiteAssist.")
        plc.enableSlices(args['hostname'])
        plc.enableSliceCreation(args['hostname'])
        return
892
def reboot_node(args):
        """
        Hand args['hostname'] to reboot.reboot_policy(), together with True and
        the configured debug flag, and return its result.
        """
        target = args['hostname']
        outcome = reboot.reboot_policy(target, True, config.debug)
        return outcome
896
def reset_nodemanager(args):
        """
        Restart the 'nm' service on a node over ssh (as root).

        args -- dict; args['hostname'] is the node to contact.

        Returns None; the exit status of the ssh command is not reported.
        """
        # The host comes from args like in the sibling actions; the name used
        # here before was never defined and made every call fail.
        host = args['hostname']
        os.system("ssh root@%s /sbin/service nm restart" % host)
        return
900
901 class Action(Thread):
902         def __init__(self, l_action):
903                 self.l_action = l_action
904
905                 # the hostname to loginbase mapping
906                 self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")
907
908                 # Actions to take.
909                 self.diagnose_db = soltesz.if_cached_else(1, "diagnose_out", lambda : {})
910                 # Actions taken.
911                 self.act_all   = soltesz.if_cached_else(1, "act_all", lambda : {})
912
913                 # A dict of actions to specific functions. PICKLE doesnt' like lambdas.
914                 self.actions = {}
915                 self.actions['suspendslices'] = lambda args: plc.suspendSlices(args['hostname'])
916                 self.actions['nocreate'] = lambda args: plc.removeSliceCreation(args['hostname'])
917                 self.actions['close_rt'] = lambda args: close_rt_backoff(args)
918                 self.actions['rins'] = lambda args: plc.nodeBootState(args['hostname'], "rins") 
919                 self.actions['noop'] = lambda args: args
920                 self.actions['reboot_node'] = lambda args: reboot_node(args)
921                 self.actions['reset_nodemanager'] = lambda args: args # reset_nodemanager(args)
922
923                 self.actions['ticket_waitforever'] = lambda args: args
924                 self.actions['waitforever'] = lambda args: args
925                 self.actions['unknown'] = lambda args: args
926                 self.actions['waitforoneweekaction'] = lambda args: args
927                 self.actions['waitfortwoweeksaction'] = lambda args: args
928                 self.actions['sendmailagain-waitforoneweekaction'] = lambda args: args
929                 self.actions['sendmailagain-waitfortwoweeksaction'] = lambda args: args
930                 self.actions['email-againwaitforever'] = lambda args: args
931                 self.actions['email-againticket_waitforever'] = lambda args: args
932                                 
933
934                 self.sickdb = {}
935                 Thread.__init__(self)
936
937         def run(self):
938                 self.accumSites()
939                 print "Accumulated %d sick sites" % len(self.sickdb.keys())
940                 logger.debug("Accumulated %d sick sites" % len(self.sickdb.keys()))
941
942                 try:
943                         stats = self.analyseSites()
944                 except Exception, err:
945                         print "----------------"
946                         import traceback
947                         print traceback.print_exc()
948                         print err
949                         if config.policysavedb:
950                                 print "Saving Databases... act_all"
951                                 soltesz.dbDump("act_all", self.act_all)
952                         sys.exit(1)
953
954                 print_stats("sites_observed", stats)
955                 print_stats("sites_diagnosed", stats)
956                 print_stats("nodes_diagnosed", stats)
957                 print_stats("sites_emailed", stats)
958                 print_stats("nodes_actedon", stats)
959                 print string.join(stats['allsites'], ",")
960
961                 if config.policysavedb:
962                         print "Saving Databases... act_all"
963                         #soltesz.dbDump("policy.eventlog", self.eventlog)
964                         # TODO: remove 'diagnose_out', 
965                         #       or at least the entries that were acted on.
966                         soltesz.dbDump("act_all", self.act_all)
967
968         def accumSites(self):
969                 """
970                 Take all nodes, from l_action, look them up in the diagnose_db database, 
971                 and insert them into sickdb[] as:
972
973                 This way only the given l_action nodes will be acted on regardless
974                 of how many from diagnose_db are available.
975
976                         sickdb[loginbase][nodename] = diag_record
977                 """
978                 # TODO: what if l_action == None ?
979                 for nodename in self.l_action:
980
981                         loginbase = self.plcdb_hn2lb[nodename]
982
983                         if loginbase in self.diagnose_db and \
984                                 nodename in self.diagnose_db[loginbase]['nodes']:
985
986                                 diag_record = self.diagnose_db[loginbase]['nodes'][nodename]
987
988                                 if loginbase not in self.sickdb:
989                                         self.sickdb[loginbase] = {'nodes' : {}}
990
991                                 # NOTE: don't copy all node records, since not all will be in l_action
992                                 self.sickdb[loginbase]['nodes'][nodename] = diag_record
993                                 # NOTE: but, we want to get the loginbase config settings, 
994                                 #               this is the easiest way.
995                                 self.sickdb[loginbase]['config'] = self.diagnose_db[loginbase]['config']
996                         #else:
997                                 #print "%s not in diagnose_db!!" % loginbase
998                 return
999
1000         def __emailSite(self, loginbase, roles, message, args):
1001                 """
1002                 loginbase is the unique site abbreviation, prepended to slice names.
1003                 roles contains TECH, PI, USER roles, and derive email aliases.
1004                 record contains {'message': [<subj>,<body>], 'args': {...}} 
1005                 """
1006                 ticket_id = 0
1007                 args.update({'loginbase':loginbase})
1008
1009                 if not config.mail and not config.debug and config.bcc:
1010                         roles = ADMIN
1011                 if config.mail and config.debug:
1012                         roles = ADMIN
1013
1014                 # build targets
1015                 contacts = []
1016                 if ADMIN & roles:
1017                         contacts += [config.email]
1018                 if TECH & roles:
1019                         contacts += [TECHEMAIL % loginbase]
1020                 if PI & roles:
1021                         contacts += [PIEMAIL % loginbase]
1022                 if USER & roles:
1023                         slices = plc.slices(loginbase)
1024                         if len(slices) >= 1:
1025                                 for slice in slices:
1026                                         contacts += [SLICEMAIL % slice]
1027                                 print "SLIC: %20s : %d slices" % (loginbase, len(slices))
1028                         else:
1029                                 print "SLIC: %20s : 0 slices" % loginbase
1030
1031                 try:
1032                         subject = message[0] % args
1033                         body = message[1] % args
1034                         if ADMIN & roles:
1035                                 # send only to admin
1036                                 if 'ticket_id' in args:
1037                                         subj = "Re: [PL #%s] %s" % (args['ticket_id'], subject)
1038                                 else:
1039                                         subj = "Re: [PL noticket] %s" % subject
1040                                 mailer.email(subj, body, contacts)
1041                                 ticket_id = args['ticket_id']
1042                         else:
1043                                 ticket_id = mailer.emailViaRT(subject, body, contacts, args['ticket_id'])
1044                 except Exception, err:
1045                         print "exception on message:"
1046                         import traceback
1047                         print traceback.print_exc()
1048                         print message
1049
1050                 return ticket_id
1051
1052
1053         def _format_diaginfo(self, diag_node):
1054                 info = diag_node['info']
1055                 if diag_node['stage'] == 'monitor-end-record':
1056                         hlist = "    %s went from '%s' to '%s'\n" % (info[0], info[1], info[2]) 
1057                 else:
1058                         hlist = "    %s %s - %s\n" % (info[0], info[2], info[1]) #(node,ver,daysdn)
1059                 return hlist
1060
1061
1062         def get_email_args(self, act_recordlist, loginbase=None):
1063
1064                 email_args = {}
1065                 email_args['hostname_list'] = ""
1066
1067                 for act_record in act_recordlist:
1068                         email_args['hostname_list'] += act_record['msg_format']
1069                         email_args['hostname'] = act_record['nodename']
1070                         if  'plcnode' in act_record and \
1071                                 'pcu_ids' in act_record['plcnode'] and \
1072                                 len(act_record['plcnode']['pcu_ids']) > 0:
1073                                 print "setting 'pcu_id' for email_args %s"%email_args['hostname']
1074                                 email_args['pcu_id'] = act_record['plcnode']['pcu_ids'][0]
1075                         else:
1076                                 email_args['pcu_id'] = "-1"
1077                                         
1078                         if 'ticket_id' in act_record:
1079                                 if act_record['ticket_id'] == 0 or act_record['ticket_id'] == '0':
1080                                         print "Enter the ticket_id for %s @ %s" % (loginbase, act_record['nodename'])
1081                                         sys.stdout.flush()
1082                                         line = sys.stdin.readline()
1083                                         try:
1084                                                 ticket_id = int(line)
1085                                         except:
1086                                                 print "could not get ticket_id from stdin..."
1087                                                 os._exit(1)
1088                                 else:
1089                                         ticket_id = act_record['ticket_id']
1090                                         
1091                                 email_args['ticket_id'] = ticket_id
1092
1093                 return email_args
1094
1095         def get_unique_issues(self, act_recordlist):
1096                 # NOTE: only send one email per site, per problem...
1097                 unique_issues = {}
1098                 for act_record in act_recordlist:
1099                         act_key = act_record['action'][0]
1100                         if act_key not in unique_issues:
1101                                 unique_issues[act_key] = []
1102                                 
1103                         unique_issues[act_key] += [act_record]
1104                         
1105                 return unique_issues
1106                         
1107
1108         def __actOnSite(self, loginbase, site_record):
1109                 i_nodes_actedon = 0
1110                 i_nodes_emailed = 0
1111
1112                 act_recordlist = []
1113
1114                 for nodename in site_record['nodes'].keys():
1115                         diag_record = site_record['nodes'][nodename]
1116                         act_record  = self.__actOnNode(diag_record)
1117                         #print "nodename: %s %s" % (nodename, act_record)
1118                         if act_record is not None:
1119                                 act_recordlist += [act_record]
1120
1121                 unique_issues = self.get_unique_issues(act_recordlist)
1122
1123                 for issue in unique_issues.keys():
1124                         print "\tworking on issue: %s" % issue
1125                         issue_record_list = unique_issues[issue]
1126                         email_args = self.get_email_args(issue_record_list, loginbase)
1127
1128                         # for each record.
1129                         for act_record in issue_record_list:
1130                                 # if there's a pcu record and email config is set
1131                                 if 'email_pcu' in act_record:
1132                                         if act_record['message'] != None and act_record['email_pcu'] and site_record['config']['email']:
1133                                                 # and 'reboot_node' in act_record['stage']:
1134
1135                                                 email_args['hostname'] = act_record['nodename']
1136                                                 ticket_id = self.__emailSite(loginbase, 
1137                                                                                         act_record['email'], 
1138                                                                                         emailTxt.mailtxt.pcudown[0],
1139                                                                                         email_args)
1140                                                 if ticket_id == 0:
1141                                                         # error.
1142                                                         print "got a ticket_id == 0!!!! %s" % act_record['nodename']
1143                                                         os._exit(1)
1144                                                         pass
1145                                                 email_args['ticket_id'] = ticket_id
1146
1147                         
1148                         act_record = issue_record_list[0]
1149                         # send message before squeezing
1150                         print "\t\tconfig.email: %s and %s" % (act_record['message'] != None, 
1151                                                                                                 site_record['config']['email'])
1152                         if act_record['message'] != None and site_record['config']['email']:
1153                                 ticket_id = self.__emailSite(loginbase, act_record['email'], 
1154                                                                                          act_record['message'], email_args)
1155
1156                                 if ticket_id == 0:
1157                                         # error.
1158                                         print "ticket_id == 0 for %s %s" % (loginbase, act_record['nodename'])
1159                                         os._exit(1)
1160                                         pass
1161
1162                                 # Add ticket_id to ALL nodenames
1163                                 for act_record in issue_record_list:
1164                                         nodename = act_record['nodename']
1165                                         # update node record with RT ticket_id
1166                                         if nodename in self.act_all:
1167                                                 self.act_all[nodename][0]['ticket_id'] = "%s" % ticket_id
1168                                         if config.mail: i_nodes_emailed += 1
1169
1170                         print "\t\tconfig.squeeze: %s and %s" % (config.squeeze,
1171                                                                                                         site_record['config']['squeeze'])
1172                         if config.squeeze and site_record['config']['squeeze']:
1173                                 for act_key in act_record['action']:
1174                                         self.actions[act_key](email_args)
1175                                 i_nodes_actedon += 1
1176                 
1177                 if config.policysavedb:
1178                         print "Saving Databases... act_all, diagnose_out"
1179                         soltesz.dbDump("act_all", self.act_all)
1180                         # remove site record from diagnose_out, it's in act_all as done.
1181                         del self.diagnose_db[loginbase]
1182                         soltesz.dbDump("diagnose_out", self.diagnose_db)
1183
1184                 print "sleeping for 1 sec"
1185                 time.sleep(1)
1186                 #print "Hit enter to continue..."
1187                 #sys.stdout.flush()
1188                 #line = sys.stdin.readline()
1189
1190                 return (i_nodes_actedon, i_nodes_emailed)
1191
1192         def __actOnNode(self, diag_record):
1193                 nodename = diag_record['nodename']
1194                 message = diag_record['message']
1195
1196                 act_record = {}
1197                 act_record.update(diag_record)
1198                 act_record['nodename'] = nodename
1199                 act_record['msg_format'] = self._format_diaginfo(diag_record)
1200                 print "act_record['stage'] == %s " % act_record['stage']
1201
1202                 # avoid end records, and nmreset records                                        
1203                 # reboot_node_failed, is set below, so don't reboot repeatedly.
1204
1205                 if 'monitor-end-record' not in act_record['stage'] and \
1206                    'nmreset' not in act_record['stage'] and \
1207                    'reboot_node_failed' not in act_record:
1208
1209                         if "DOWN" in act_record['log'] and \
1210                                         'pcu_ids' in act_record['plcnode'] and \
1211                                         len(act_record['plcnode']['pcu_ids']) > 0:
1212
1213                                 print "%s" % act_record['log'],
1214                                 print "%15s" % (['reboot_node'],)
1215                                 # Set node to re-install
1216                                 plc.nodeBootState(act_record['nodename'], "rins")       
1217                                 try:
1218                                         ret = reboot_node({'hostname': act_record['nodename']})
1219                                 except Exception, exc:
1220                                         print "exception on reboot_node:"
1221                                         import traceback
1222                                         print traceback.print_exc()
1223                                         ret = False
1224
1225                                 if ret: # and ( 'reboot_node_failed' not in act_record or act_record['reboot_node_failed'] == False):
1226                                         # Reboot Succeeded
1227                                         print "reboot succeeded for %s" % act_record['nodename']
1228                                         act_record2 = {}
1229                                         act_record2.update(act_record)
1230                                         act_record2['action'] = ['reboot_node']
1231                                         act_record2['stage'] = "reboot_node"
1232                                         act_record2['reboot_node_failed'] = False
1233                                         act_record2['email_pcu'] = False
1234
1235                                         if nodename not in self.act_all: 
1236                                                 self.act_all[nodename] = []
1237                                         print "inserting 'reboot_node' record into act_all"
1238                                         self.act_all[nodename].insert(0,act_record2)
1239
1240                                         # return None to avoid further action
1241                                         print "Taking no further action"
1242                                         return None
1243                                 else:
1244                                         print "reboot failed for %s" % act_record['nodename']
1245                                         # set email_pcu to also send pcu notice for this record.
1246                                         act_record['reboot_node_failed'] = True
1247                                         act_record['email_pcu'] = True
1248
1249                         print "%s" % act_record['log'],
1250                         print "%15s" % act_record['action']
1251
1252                 if act_record['stage'] is not 'monitor-end-record' and \
1253                    act_record['stage'] is not 'nmreset':
1254                         if nodename not in self.act_all: 
1255                                 self.act_all[nodename] = []
1256
1257                         self.act_all[nodename].insert(0,act_record)
1258                 else:
1259                         print "Not recording %s in act_all" % nodename
1260
1261                 return act_record
1262
1263         def analyseSites(self):
1264                 i_sites_observed = 0
1265                 i_sites_diagnosed = 0
1266                 i_nodes_diagnosed = 0
1267                 i_nodes_actedon = 0
1268                 i_sites_emailed = 0
1269                 l_allsites = []
1270
1271                 sorted_sites = self.sickdb.keys()
1272                 sorted_sites.sort()
1273                 for loginbase in sorted_sites:
1274                         site_record = self.sickdb[loginbase]
1275                         print "sites: %s" % loginbase
1276                         
1277                         i_nodes_diagnosed += len(site_record.keys())
1278                         i_sites_diagnosed += 1
1279
1280                         (na,ne) = self.__actOnSite(loginbase, site_record)
1281
1282                         i_sites_observed += 1
1283                         i_nodes_actedon += na
1284                         i_sites_emailed += ne
1285
1286                         l_allsites += [loginbase]
1287
1288                 return {'sites_observed': i_sites_observed, 
1289                                 'sites_diagnosed': i_sites_diagnosed, 
1290                                 'nodes_diagnosed': i_nodes_diagnosed, 
1291                                 'sites_emailed': i_sites_emailed, 
1292                                 'nodes_actedon': i_nodes_actedon, 
1293                                 'allsites':l_allsites}
1294
1295         def print_stats(self, key, stats):
1296                 print "%20s : %d" % (key, stats[key])
1297
1298
1299
1300         #"""
1301         #Prints, logs, and emails status of up nodes, down nodes, and buckets.
1302         #"""
1303         #def status(self):
1304         #       sub = "Monitor Summary"
1305         #       msg = "\nThe following nodes were acted upon:  \n\n"
1306         #       for (node, (type, date)) in self.emailed.items():
1307         #               # Print only things acted on today.
1308         #               if (time.gmtime(time.time())[2] == time.gmtime(date)[2]):
1309         #                       msg +="%s\t(%s)\t%s\n" %(node, type, time.ctime(date))
1310         #       msg +="\n\nThe following sites have been 'squeezed':\n\n"
1311         #       for (loginbase, (date, type)) in self.squeezed.items():
1312         #               # Print only things acted on today.
1313         #               if (time.gmtime(time.time())[2] == time.gmtime(date)[2]):
1314         #                       msg +="%s\t(%s)\t%s\n" %(loginbase, type, time.ctime(date))
1315         #       mailer.email(sub, msg, [SUMTO])
1316         #       logger.info(msg)
1317         #       return 
1318
1319         #"""
1320         #Store/Load state of emails.  When, where, what.
1321         #"""
1322         #def emailedStore(self, action):
1323         #       try:
1324         #               if action == "LOAD":
1325         #                       f = open(DAT, "r+")
1326         #                       logger.info("POLICY:  Found and reading " + DAT)
1327         #                       self.emailed.update(pickle.load(f))
1328         #               if action == "WRITE":
1329         #                       f = open(DAT, "w")
1330         #                       #logger.debug("Writing " + DAT)
1331         #                       pickle.dump(self.emailed, f)
1332         #               f.close()
1333         #       except Exception, err:
1334         #               logger.info("POLICY:  Problem with DAT, %s" %err)
1335
1336
1337 #class Policy(Thread):
1338
1339 def main():
1340         print "policy.py is a module, not a script for running directly."
1341
1342 if __name__ == '__main__':
1343         import os
1344         import plc
1345         try:
1346                 main()
1347         except KeyboardInterrupt:
1348                 print "Killed.  Exitting."
1349                 logger.info('Monitor Killed')
1350                 os._exit(0)