3673b8e43fa6f2ec774e08b851f017ab27a63240
[monitor.git] / policy.py
1 #
2 # Copyright (c) 2004  The Trustees of Princeton University (Trustees).
3 #
4 # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
5 #
6 # $Id: policy.py,v 1.17 2007/08/29 17:26:50 soltesz Exp $
7 #
8 # Policy Engine.
9
10 #from monitor import *
11 from threading import *
12 import time
13 import logging
14 import mailer
15 import emailTxt
16 import pickle
17 import Queue
18 import plc
19 import sys
20 import os
21 import reboot
22 import soltesz
23 import string
24 from www.printbadnodes import cmpCategoryVal
25 from config import config
# NOTE(review): this print runs on every import of the module; it looks like
# leftover debugging output.
print "policy"
# Rebinds the imported name `config` to the object returned by calling it;
# from here on `config` is that object (read below as config.debug and
# config.policysavedb), not the imported callable.
config = config()

DAT="./monitor.dat"

logger = logging.getLogger("monitor")

# Time to enforce policy (presumably seconds, i.e. two hours -- TODO confirm
# against the code that sleeps on it).
POLSLEEP = 7200

# Where to email the summary
SUMTO = "soltesz@cs.princeton.edu"
# Address templates; each has a single %s placeholder to be filled in by the
# sender (presumably a site loginbase or slice name -- verify against callers).
TECHEMAIL="tech-%s@sites.planet-lab.org"
PIEMAIL="pi-%s@sites.planet-lab.org"
SLICEMAIL="%s@slices.planet-lab.org"
PLCEMAIL="support@planet-lab.org"

# Thresholds.  Each one is written as <days> * SPERDAY, so the stored values
# are in SECONDS, not days.
SPERMIN = 60
SPERHOUR = 60*60
SPERDAY = 86400
PITHRESH = 7 * SPERDAY
SLICETHRESH = 7 * SPERDAY
# Time (5 days, in seconds) before attempting rins again
RINSTHRESH = 5 * SPERDAY

# Time (30 days, in seconds) before calling the node dead.
DEADTHRESH = 30 * SPERDAY
# Minimum number of nodes up before squeezing: a site with fewer than MINUP
# nodes up gets its 'squeeze' flag set in Diagnose.__diagnoseSite().
MINUP = 2

# Distinct powers of two -- presumably bit flags selecting which contacts an
# email goes to, so they can be OR-ed together (TODO confirm at the use sites).
TECH=1
PI=2
USER=4
ADMIN=8
61
62 # IF:
63 #  no SSH, down.
64 #  bad disk, down
65 #  DNS, kinda down (sick)
66 #  clock, kinda down (sick)
67 #  Full disk, going to be down
68
69 # Actions:
70 #  Email
71 #  suspend slice creation
72 #  kill slices
def array_to_priority_map(array):
        """
        Build a lookup table mapping every entry of ``array`` to its index.

        The index acts as the entry's priority.  If an entry occurs more than
        once, its last position wins.  The table is meant for later use with
        the cmpMap() function.
        """
        return dict((entry, position) for position, entry in enumerate(array))
83
def getdebug():
        """Return the `debug` attribute of the module-level config object."""
        debug_setting = config.debug
        return debug_setting
86
def print_stats(key, stats):
        """
        Print one "name : value" line for ``key`` if it is present in ``stats``.

        The name is right-aligned in 20 columns and the value is formatted as
        an integer.  Nothing is printed when ``key`` is absent.
        """
        if key not in stats:
                return
        print("%20s : %d" % (key, stats[key]))
89
def get_ticket_id(record):
        """
        Return the ticket reference stored in ``record``, or None if there is none.

        'ticket_id' is preferred; 'found_rt_ticket' is the fallback.  A field
        counts as unset when it is missing, None, or the empty string.
        """
        for field in ('ticket_id', 'found_rt_ticket'):
                if field not in record:
                        continue
                value = record[field]
                # Compare by value: `is not ""` tests object identity, which only
                # happens to work when the interpreter shares one empty-string
                # object.
                if value is not None and value != "":
                        return value
        return None
99
100 class Merge(Thread):
101         def __init__(self, l_merge, toRT):
102                 self.toRT = toRT
103                 self.merge_list = l_merge
104                 # the hostname to loginbase mapping
105                 self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")
106
107                 # Previous actions taken on nodes.
108                 self.act_all = soltesz.if_cached_else(1, "act_all", lambda : {})
109                 self.findbad = soltesz.if_cached_else(1, "findbad", lambda : {})
110
111                 self.cache_all = soltesz.if_cached_else(1, "act_all", lambda : {})
112                 self.sickdb = {}
113                 self.mergedb = {}
114                 Thread.__init__(self)
115
116         def run(self):
117                 # populate sickdb
118                 self.accumSickSites()
119                 # read data from findbad and act_all
120                 self.mergeActionsAndBadDB()
121                 # pass node_records to RT
122                 self.sendToRT()
123
124         def accumSickSites(self):
125                 """
126                 Take all nodes, from l_diagnose, look them up in the act_all database, 
127                 and insert them into sickdb[] as:
128
129                         sickdb[loginbase][nodename] = fb_record
130                 """
131                 # look at all problems reported by findbad
132                 l_nodes = self.findbad['nodes'].keys()
133                 count = 0
134                 for nodename in l_nodes:
135                         if nodename not in self.merge_list:
136                                 continue                # skip this node, since it's not wanted
137
138                         count += 1
139                         loginbase = self.plcdb_hn2lb[nodename]
140                         values = self.findbad['nodes'][nodename]['values']
141
142                         fb_record = {}
143                         fb_record['nodename'] = nodename
144                         try:
145                                 fb_record['category'] = values['category']
146                         except:
147                                 print values
148                                 print nodename
149                                 print self.findbad['nodes'][nodename]
150                                 count -= 1
151                                 continue
152                         fb_record['state'] = values['state']
153                         fb_record['comonstats'] = values['comonstats']
154                         fb_record['plcnode'] = values['plcnode']
155                         fb_record['kernel'] = self.getKernel(values['kernel'])
156                         fb_record['stage'] = "findbad"
157                         fb_record['message'] = None
158                         fb_record['bootcd'] = values['bootcd']
159                         fb_record['args'] = None
160                         fb_record['info'] = None
161                         fb_record['time'] = time.time()
162                         fb_record['date_created'] = time.time()
163
164                         if loginbase not in self.sickdb:
165                                 self.sickdb[loginbase] = {}
166
167                         self.sickdb[loginbase][nodename] = fb_record
168
169                 print "Found %d nodes" % count
170
171         def getKernel(self, unamestr):
172                 s = unamestr.split()
173                 if len(s) > 2:
174                         return s[2]
175                 else:
176                         return ""
177
178         def mergeActionsAndBadDB(self): 
179                 """
180                 - Look at the sick node_records as reported in findbad, 
181                 - Then look at the node_records in act_all.  
182
183                 There are four cases:
184                 1) Problem in findbad, no problem in act_all
185                         this ok, b/c it just means it's a new problem
186                 2) Problem in findbad, problem in act_all
187                         -Did the problem get better or worse?  
188                                 -If Same, or Worse, then continue looking for open tickets.
189                                 -If Better, or No problem, then "back-off" penalties.
190                                         This judgement may need to wait until 'Diagnose()'
191
192                 3) No problem in findbad, problem in act_all
193                         The the node is operational again according to Findbad()
194
195                 4) No problem in findbad, no problem in act_all
196                         There won't be a record in either db, so there's no code.
197                 """
198
199                 sorted_sites = self.sickdb.keys()
200                 sorted_sites.sort()
201                 # look at all problems reported by findbad
202                 for loginbase in sorted_sites:
203                         d_fb_nodes = self.sickdb[loginbase]
204                         sorted_nodes = d_fb_nodes.keys()
205                         sorted_nodes.sort()
206                         for nodename in sorted_nodes:
207                                 fb_record = self.sickdb[loginbase][nodename]
208                                 x = fb_record
209                                 if loginbase not in self.mergedb:
210                                         self.mergedb[loginbase] = {}
211
212                                 # take the info either from act_all or fb-record.
213                                 # if node not in act_all
214                                 #       then take it from fbrecord, obviously.
215                                 # else node in act_all
216                                 #   if act_all == 0 length (no previous records)
217                                 #               then take it from fbrecord.
218                                 #   else
219                                 #           take it from act_all.
220                                 #   
221
222                                 # We must compare findbad state with act_all state
223                                 if nodename not in self.act_all:
224                                         # 1) ok, b/c it's a new problem. set ticket_id to null
225                                         self.mergedb[loginbase][nodename] = {} 
226                                         self.mergedb[loginbase][nodename].update(x)
227                                         self.mergedb[loginbase][nodename]['ticket_id'] = ""
228                                         self.mergedb[loginbase][nodename]['prev_category'] = "NORECORD" 
229                                 else: 
230                                         if len(self.act_all[nodename]) == 0:
231                                                 self.mergedb[loginbase][nodename] = {} 
232                                                 self.mergedb[loginbase][nodename].update(x)
233                                                 self.mergedb[loginbase][nodename]['ticket_id'] = ""
234                                                 self.mergedb[loginbase][nodename]['prev_category'] = "NORECORD" 
235                                         else:
236                                                 y = self.act_all[nodename][0]
237                                                 y['prev_category'] = y['category']
238
239                                                 self.mergedb[loginbase][nodename] = {}
240                                                 self.mergedb[loginbase][nodename].update(y)
241                                                 self.mergedb[loginbase][nodename]['comonstats'] = x['comonstats']
242                                                 self.mergedb[loginbase][nodename]['category']   = x['category']
243                                                 self.mergedb[loginbase][nodename]['state'] = x['state']
244                                                 self.mergedb[loginbase][nodename]['kernel']=x['kernel']
245                                                 self.mergedb[loginbase][nodename]['bootcd']=x['bootcd']
246                                                 self.mergedb[loginbase][nodename]['plcnode']=x['plcnode']
247                                                 ticket = get_ticket_id(self.mergedb[loginbase][nodename])
248                                                 self.mergedb[loginbase][nodename]['rt'] = mailer.getTicketStatus(ticket)
249
250                                         # delete the entry from cache_all to keep it out of case 3)
251                                         del self.cache_all[nodename]
252
253                 # 3) nodes that remin in cache_all were not identified by findbad.
254                 #        Do we keep them or not?
255                 #   NOTE: i think that since the categories are performed before this
256                 #               step now, and by a monitor-controlled agent.
257
258                 # TODO: This does not work correctly.  Do we need this? 
259                 #for hn in self.cache_all.keys():
260                 #       y = self.act_all[hn][0]
261                 #       if 'monitor' in y['bucket']:
262                 #               loginbase = self.plcdb_hn2lb[hn] 
263                 #               if loginbase not in self.sickdb:
264                 #                       self.sickdb[loginbase] = {}
265                 #               self.sickdb[loginbase][hn] = y
266                 #       else:
267                 #               del self.cache_all[hn]
268
269                 print "len of cache_all: %d" % len(self.cache_all.keys())
270                 return
271
272         def sendToRT(self):
273                 sorted_sites = self.mergedb.keys()
274                 sorted_sites.sort()
275                 # look at all problems reported by merge
276                 for loginbase in sorted_sites:
277                         d_merge_nodes = self.mergedb[loginbase]
278                         for nodename in d_merge_nodes.keys():
279                                 record = self.mergedb[loginbase][nodename]
280                                 self.toRT.put(record)
281
282                 # send signal to stop reading
283                 self.toRT.put(None)
284                 return
285
286 class Diagnose(Thread):
287         def __init__(self, fromRT):
288                 self.fromRT = fromRT
289                 self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")
290                 self.findbad = soltesz.if_cached_else(1, "findbad", lambda : {})
291
292                 self.diagnose_in = {}
293                 self.diagnose_out = {}
294                 Thread.__init__(self)
295
296
297         def run(self):
298                 self.accumSickSites()
299
300                 print "Accumulated %d sick sites" % len(self.diagnose_in.keys())
301                 logger.debug("Accumulated %d sick sites" % len(self.diagnose_in.keys()))
302
303                 try:
304                         stats = self.diagnoseAll()
305                 except Exception, err:
306                         print "----------------"
307                         import traceback
308                         print traceback.print_exc()
309                         print err
310                         #if config.policysavedb:
311                         sys.exit(1)
312
313                 print_stats("sites_observed", stats)
314                 print_stats("sites_diagnosed", stats)
315                 print_stats("nodes_diagnosed", stats)
316
317                 if config.policysavedb:
318                         print "Saving Databases... diagnose_out"
319                         soltesz.dbDump("diagnose_out", self.diagnose_out)
320
321         def accumSickSites(self):
322                 """
323                 Take all nodes, from l_diagnose, look them up in the diagnose_out database, 
324                 and insert them into diagnose_in[] as:
325
326                         diagnose_in[loginbase] = [diag_node1, diag_node2, ...]
327                 """
328                 while 1:
329                         node_record = self.fromRT.get(block = True)
330                         if node_record == None:
331                                 break;
332
333                         nodename = node_record['nodename']
334                         loginbase = self.plcdb_hn2lb[nodename]
335
336                         if loginbase not in self.diagnose_in:
337                                 self.diagnose_in[loginbase] = {}
338
339                         self.diagnose_in[loginbase][nodename] = node_record
340
341                 return
342
343         def diagnoseAll(self):
344                 i_sites_observed = 0
345                 i_sites_diagnosed = 0
346                 i_nodes_diagnosed = 0
347                 i_nodes_actedon = 0
348                 i_sites_emailed = 0
349                 l_allsites = []
350
351                 sorted_sites = self.diagnose_in.keys()
352                 sorted_sites.sort()
353                 self.diagnose_out= {}
354                 for loginbase in sorted_sites:
355                         l_allsites += [loginbase]
356
357                         d_diag_nodes = self.diagnose_in[loginbase]
358                         d_act_records = self.__diagnoseSite(loginbase, d_diag_nodes)
359                         # store records in diagnose_out, for saving later.
360                         self.diagnose_out.update(d_act_records)
361                         
362                         if len(d_act_records[loginbase]['nodes'].keys()) > 0:
363                                 i_nodes_diagnosed += (len(d_act_records[loginbase]['nodes'].keys()))
364                                 i_sites_diagnosed += 1
365                         i_sites_observed += 1
366
367                 return {'sites_observed': i_sites_observed, 
368                                 'sites_diagnosed': i_sites_diagnosed, 
369                                 'nodes_diagnosed': i_nodes_diagnosed, 
370                                 'allsites':l_allsites}
371
372                 pass
373                 
374         def __getDaysDown(self, diag_record, nodename):
375                 daysdown = -1
376                 if diag_record['comonstats']['sshstatus'] != "null":
377                         daysdown = int(diag_record['comonstats']['sshstatus']) // (60*60*24)
378                 elif diag_record['comonstats']['lastcotop'] != "null":
379                         daysdown = int(diag_record['comonstats']['lastcotop']) // (60*60*24)
380                 else:
381                         now = time.time()
382                         last_contact = diag_record['plcnode']['last_contact']
383                         if last_contact == None:
384                                 # the node has never been up, so give it a break
385                                 daysdown = -1
386                         else:
387                                 diff = now - last_contact
388                                 daysdown = diff // (60*60*24)
389                 return daysdown
390
391         def __getStrDaysDown(self, diag_record, nodename):
392                 daysdown = self.__getDaysDown(diag_record, nodename)
393                 if daysdown > 0:
394                         return "(%d days down)"%daysdown
395                 else:
396                         return "Unknown number of days"
397
398         def __getCDVersion(self, diag_record, nodename):
399                 cdversion = ""
400                 #print "Getting kernel for: %s" % diag_record['nodename']
401                 cdversion = diag_record['kernel']
402                 return cdversion
403
404         def __diagnoseSite(self, loginbase, d_diag_nodes):
405                 """
406                 d_diag_nodes are diagnose_in entries.
407                 """
408                 d_diag_site = {loginbase : { 'config' : 
409                                                                                                 {'squeeze': False,
410                                                                                                  'email': False
411                                                                                                 }, 
412                                                                         'nodes': {}
413                                                                         }
414                                            }
415                 sorted_nodes = d_diag_nodes.keys()
416                 sorted_nodes.sort()
417                 for nodename in sorted_nodes:
418                         node_record = d_diag_nodes[nodename]
419                         diag_record = self.__diagnoseNode(loginbase, node_record)
420
421                         if diag_record != None:
422                                 d_diag_site[loginbase]['nodes'][nodename] = diag_record
423
424                                 # NOTE: improvement means, we need to act/squeeze and email.
425                                 #print "DIAG_RECORD", diag_record
426                                 if 'monitor-end-record' in diag_record['stage'] or \
427                                    'nmreset' in diag_record['stage']:
428                                 #       print "resetting loginbase!" 
429                                         d_diag_site[loginbase]['config']['squeeze'] = True
430                                         d_diag_site[loginbase]['config']['email'] = True
431                                 #else:
432                                 #       print "NO IMPROVEMENT!!!!"
433                         else:
434                                 pass # there is nothing to do for this node.
435
436                 # NOTE: these settings can be overridden by command line arguments,
437                 #       or the state of a record, i.e. if already in RT's Support Queue.
438                 nodes_up = self.getUpAtSite(loginbase, d_diag_site)
439                 if nodes_up < MINUP:
440                         d_diag_site[loginbase]['config']['squeeze'] = True
441
442                 max_slices = self.getMaxSlices(loginbase)
443                 num_nodes = self.getNumNodes(loginbase)
444                 # NOTE: when max_slices == 0, this is either a new site (the old way)
445                 #       or an old disabled site from previous monitor (before site['enabled'])
446                 if nodes_up < num_nodes and max_slices != 0:
447                         d_diag_site[loginbase]['config']['email'] = True
448
449                 if len(d_diag_site[loginbase]['nodes'].keys()) > 0:
450                         print "SITE: %20s : %d nodes up, at most" % (loginbase, nodes_up)
451
452                 return d_diag_site
453
454         def diagRecordByCategory(self, node_record):
455                 nodename = node_record['nodename']
456                 category = node_record['category']
457                 state    = node_record['state']
458                 loginbase = self.plcdb_hn2lb[nodename]
459                 diag_record = None
460
461                 if  "ERROR" in category:        # i.e. "DOWN"
462                         diag_record = {}
463                         diag_record.update(node_record)
464                         daysdown = self.__getDaysDown(diag_record, nodename) 
465                         if daysdown < 7:
466                                 format = "DIAG: %20s : %-40s Down only %s days  NOTHING DONE"
467                                 print format % (loginbase, nodename, daysdown)
468                                 return None
469
470                         s_daysdown = self.__getStrDaysDown(diag_record, nodename)
471                         diag_record['message'] = emailTxt.mailtxt.newdown
472                         diag_record['args'] = {'nodename': nodename}
473                         diag_record['info'] = (nodename, s_daysdown, "")
474
475                         if 'reboot_node_failed' in node_record:
476                                 # there was a previous attempt to use the PCU.
477                                 if node_record['reboot_node_failed'] == False:
478                                         # then the last attempt apparently, succeeded.
479                                         # But, the category is still 'ERROR'.  Therefore, the
480                                         # PCU-to-Node mapping is broken.
481                                         #print "Setting message for ERROR node to PCU2NodeMapping: %s" % nodename
482                                         diag_record['message'] = emailTxt.mailtxt.pcutonodemapping
483                                         diag_record['email_pcu'] = True
484
485                         if diag_record['ticket_id'] == "":
486                                 diag_record['log'] = "DOWN: %20s : %-40s == %20s %s" % \
487                                         (loginbase, nodename, diag_record['info'][1:], diag_record['found_rt_ticket'])
488                         else:
489                                 diag_record['log'] = "DOWN: %20s : %-40s == %20s %s" % \
490                                         (loginbase, nodename, diag_record['info'][1:], diag_record['ticket_id'])
491
492                 elif "OLDBOOTCD" in category:
493                         # V2 boot cds as determined by findbad
494                         s_daysdown = self.__getStrDaysDown(node_record, nodename)
495                         s_cdversion = self.__getCDVersion(node_record, nodename)
496                         diag_record = {}
497                         diag_record.update(node_record)
498                         #if "2.4" in diag_record['kernel'] or "v2" in diag_record['bootcd']:
499                         diag_record['message'] = emailTxt.mailtxt.newbootcd
500                         diag_record['args'] = {'nodename': nodename}
501                         diag_record['info'] = (nodename, s_daysdown, s_cdversion)
502                         if diag_record['ticket_id'] == "":
503                                 diag_record['log'] = "BTCD: %20s : %-40s == %20s %20s %s" % \
504                                                                         (loginbase, nodename, diag_record['kernel'], 
505                                                                          diag_record['bootcd'], diag_record['found_rt_ticket'])
506                         else:
507                                 diag_record['log'] = "BTCD: %20s : %-40s == %20s %20s %s" % \
508                                                                         (loginbase, nodename, diag_record['kernel'], 
509                                                                          diag_record['bootcd'], diag_record['ticket_id'])
510
511                 elif "PROD" in category:
512                         if "DEBUG" in state:
513                                 # Not sure what to do with these yet.  Probably need to
514                                 # reboot, and email.
515                                 print "DEBG: %20s : %-40s  NOTHING DONE" % (loginbase, nodename)
516                                 return None
517                         elif "BOOT" in state:
518                                 # no action needed.
519                                 # TODO: remove penalties, if any are applied.
520                                 now = time.time()
521                                 last_contact = node_record['plcnode']['last_contact']
522                                 if last_contact == None:
523                                         time_diff = 0
524                                 else:
525                                         time_diff = now - last_contact;
526
527                                 if 'improvement' in node_record['stage']:
528                                         # then we need to pass this on to 'action'
529                                         diag_record = {}
530                                         diag_record.update(node_record)
531                                         diag_record['message'] = emailTxt.mailtxt.newthankyou
532                                         diag_record['args'] = {'nodename': nodename}
533                                         diag_record['info'] = (nodename, node_record['prev_category'], 
534                                                                                                          node_record['category'])
535                                         if 'email_pcu' in diag_record:
536                                                 if diag_record['email_pcu']:
537                                                         # previously, the pcu failed to reboot, so send
538                                                         # email. Now, reset these values to try the reboot
539                                                         # again.
540                                                         diag_record['email_pcu'] = False
541                                                         del diag_record['reboot_node_failed']
542
543                                         if diag_record['ticket_id'] == "":
544                                                 diag_record['log'] = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
545                                                                         (loginbase, nodename, diag_record['stage'], 
546                                                                          state, category, diag_record['found_rt_ticket'])
547                                         else:
548                                                 diag_record['log'] = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
549                                                                         (loginbase, nodename, diag_record['stage'], 
550                                                                          state, category, diag_record['ticket_id'])
551                                         return diag_record
552                                 #elif time_diff >= 6*SPERHOUR:
553                                 #       # heartbeat is older than 30 min.
554                                 #       # then reset NM.
555                                 #       #print "Possible NM problem!! %s - %s = %s" % (now, last_contact, time_diff)
556                                 #       diag_record = {}
557                                 #       diag_record.update(node_record)
558                                 #       diag_record['message'] = emailTxt.mailtxt.NMReset
559                                 #       diag_record['args'] = {'nodename': nodename}
560                                 #       diag_record['stage'] = "nmreset"
561                                 #       diag_record['info'] = (nodename, 
562                                 #                                                       node_record['prev_category'], 
563                                 #                                                       node_record['category'])
564                                 #       if diag_record['ticket_id'] == "":
565                                 #               diag_record['log'] = "NM  : %20s : %-40s == %20s %20s %s %s" % \
566                                 #                                       (loginbase, nodename, diag_record['stage'], 
567                                 #                                        state, category, diag_record['found_rt_ticket'])
568                                 #       else:
569                                 #               diag_record['log'] = "NM  : %20s : %-40s == %20s" % \
570                                 #                                       (loginbase, nodename, diag_record['stage'])
571 #
572 #                                       return diag_record
573                                 else:
574                                         return None
575                         else:
576                                 # unknown
577                                 pass
578                 elif "ALPHA"    in category:
579                         pass
580                 elif "clock_drift" in category:
581                         pass
582                 elif "dns"    in category:
583                         pass
584                 elif "filerw"    in category:
585                         pass
586                 else:
587                         print "Unknown category!!!! %s" % category
588                         sys.exit(1)
589
590                 return diag_record
591
592         def __diagnoseNode(self, loginbase, node_record):
593                 # TODO: change the format of the hostname in this 
594                 #               record to something more natural.
595                 nodename                = node_record['nodename']
596                 category                = node_record['category']
597                 prev_category   = node_record['prev_category']
598                 state                   = node_record['state']
599                 #if 'prev_category' in node_record:
600                 #       prev_category = node_record['prev_category']
601                 #else:
602                 #       prev_category = "ERROR"
603                 if node_record['prev_category'] != "NORECORD":
604                 
605                         val = cmpCategoryVal(category, prev_category)
606                         print "%s went from %s -> %s" % (nodename, prev_category, category)
607                         if val == 1:
608                                 # improved
609                                 if node_record['ticket_id'] == "" or node_record['ticket_id'] == None:
610                                         print "closing record with no ticket: ", node_record['nodename']
611                                         node_record['action'] = ['close_rt']
612                                         node_record['message'] = None
613                                         node_record['stage'] = 'monitor-end-record'
614                                         return node_record
615                                 else:
616                                         node_record['stage'] = 'improvement'
617
618                                 #if 'monitor-end-record' in node_record['stage']:
619                                 #       # just ignore it if it's already ended.
620                                 #       # otherwise, the status should be worse, and we won't get
621                                 #       # here.
622                                 #       print "monitor-end-record: ignoring ", node_record['nodename']
623                                 #       return None
624 #
625 #                                       #return None
626                         elif val == -1:
627                                 # current category is worse than previous, carry on
628                                 pass
629                         else:
630                                 #values are equal, carry on.
631                                 #print "why are we here?"
632                                 pass
633
634                 if 'rt' in node_record and 'Status' in node_record['rt']:
635                         if node_record['stage'] == 'ticket_waitforever':
636                                 if 'resolved' in node_record['rt']['Status']:
637                                         print "ending waitforever record for: ", node_record['nodename']
638                                         node_record['action'] = ['noop']
639                                         node_record['message'] = None
640                                         node_record['stage'] = 'monitor-end-record'
641                                         print "oldlog: %s" % node_record['log'],
642                                         print "%15s" % node_record['action']
643                                         return node_record
644                                 if 'new' in node_record['rt']['Status'] and \
645                                         'Queue' in node_record['rt'] and \
646                                         'Monitor' in node_record['rt']['Queue']:
647
648                                         print "RESETTING stage to findbad"
649                                         node_record['stage'] = 'findbad'
650                         
651                 #### COMPARE category and prev_category
652                 # if not_equal
653                 #       then assign a stage based on relative priorities
654                 # else equal
655                 #       then check category for stats.
656                 diag_record = self.diagRecordByCategory(node_record)
657                 if diag_record == None:
658                         #print "diag_record == None"
659                         return None
660
661                 #### found_RT_ticket
662                 # TODO: need to record time found, and maybe add a stage for acting on it...
663                 # NOTE: after found, if the support ticket is resolved, the block is
664                 #               not removed. How to remove the block on this?
665                 if 'found_rt_ticket' in diag_record and \
666                         diag_record['found_rt_ticket'] is not None:
667                         if diag_record['stage'] is not 'improvement':
668                                 diag_record['stage'] = 'ticket_waitforever'
669                                 
670                 current_time = time.time()
671                 # take off four days, for the delay that database caused.
672                 # TODO: generalize delays at PLC, and prevent enforcement when there
673                 #               have been no emails.
674                 # NOTE: 7*SPERDAY exists to offset the 'bad week'
675                 #delta = current_time - diag_record['time'] - 7*SPERDAY
676                 delta = current_time - diag_record['time']
677
678                 message = diag_record['message']
679                 act_record = {}
680                 act_record.update(diag_record)
681
682                 #### DIAGNOSE STAGES 
683                 if   'findbad' in diag_record['stage']:
684                         # The node is bad, and there's no previous record of it.
685                         act_record['email'] = TECH
686                         act_record['action'] = ['noop']
687                         act_record['message'] = message[0]
688                         act_record['stage'] = 'stage_actinoneweek'
689
690                 elif 'nmreset' in diag_record['stage']:
691                         act_record['email']  = ADMIN 
692                         act_record['action'] = ['reset_nodemanager']
693                         act_record['message'] = message[0]
694                         act_record['stage']  = 'nmreset'
695                         return None
696
697                 elif 'reboot_node' in diag_record['stage']:
698                         act_record['email'] = TECH
699                         act_record['action'] = ['noop']
700                         act_record['message'] = message[0]
701                         act_record['stage'] = 'stage_actinoneweek'
702                         
703                 elif 'improvement' in diag_record['stage']:
704                         # - backoff previous squeeze actions (slice suspend, nocreate)
705                         # TODO: add a backoff_squeeze section... Needs to runthrough
706                         print "backing off of %s" % nodename
707                         act_record['action'] = ['close_rt']
708                         act_record['message'] = message[0]
709                         act_record['stage'] = 'monitor-end-record'
710
711                 elif 'actinoneweek' in diag_record['stage']:
712                         if delta >= 7 * SPERDAY: 
713                                 act_record['email'] = TECH | PI
714                                 act_record['stage'] = 'stage_actintwoweeks'
715                                 act_record['message'] = message[1]
716                                 act_record['action'] = ['nocreate' ]
717                                 act_record['time'] = current_time               # reset clock for waitforever
718                         elif delta >= 3* SPERDAY and not 'second-mail-at-oneweek' in act_record:
719                                 act_record['email'] = TECH 
720                                 act_record['message'] = message[0]
721                                 act_record['action'] = ['sendmailagain-waitforoneweekaction' ]
722                                 act_record['second-mail-at-oneweek'] = True
723                         else:
724                                 act_record['message'] = None
725                                 act_record['action'] = ['waitforoneweekaction' ]
726                                 print "ignoring this record for: %s" % act_record['nodename']
727                                 return None                     # don't send if there's no action
728
729                 elif 'actintwoweeks' in diag_record['stage']:
730                         if delta >= 7 * SPERDAY:
731                                 act_record['email'] = TECH | PI | USER
732                                 act_record['stage'] = 'stage_waitforever'
733                                 act_record['message'] = message[2]
734                                 act_record['action'] = ['suspendslices']
735                                 act_record['time'] = current_time               # reset clock for waitforever
736                         elif delta >= 3* SPERDAY and not 'second-mail-at-twoweeks' in act_record:
737                                 act_record['email'] = TECH | PI
738                                 act_record['message'] = message[1]
739                                 act_record['action'] = ['sendmailagain-waitfortwoweeksaction' ]
740                                 act_record['second-mail-at-twoweeks'] = True
741                         else:
742                                 act_record['message'] = None
743                                 act_record['action'] = ['waitfortwoweeksaction']
744                                 return None                     # don't send if there's no action
745
746                 elif 'ticket_waitforever' in diag_record['stage']:
747                         act_record['email'] = TECH
748                         if 'first-found' not in act_record:
749                                 act_record['first-found'] = True
750                                 act_record['log'] += " firstfound"
751                                 act_record['action'] = ['ticket_waitforever']
752                                 act_record['message'] = None
753                                 act_record['time'] = current_time
754                         else:
755                                 if delta >= 7*SPERDAY:
756                                         act_record['action'] = ['ticket_waitforever']
757                                         act_record['message'] = None
758                                         act_record['time'] = current_time               # reset clock
759                                 else:
760                                         act_record['action'] = ['ticket_waitforever']
761                                         act_record['message'] = None
762                                         return None
763
764                 elif 'waitforever' in diag_record['stage']:
765                         # more than 3 days since last action
766                         # TODO: send only on weekdays.
767                         # NOTE: expects that 'time' has been reset before entering waitforever stage
768                         if delta >= 3*SPERDAY:
769                                 act_record['action'] = ['email-againwaitforever']
770                                 act_record['message'] = message[2]
771                                 act_record['time'] = current_time               # reset clock
772                         else:
773                                 act_record['action'] = ['waitforever']
774                                 act_record['message'] = None
775                                 return None                     # don't send if there's no action
776
777                 else:
778                         # There is no action to be taken, possibly b/c the stage has
779                         # already been performed, but diagnose picked it up again.
780                         # two cases, 
781                         #       1. stage is unknown, or 
782                         #       2. delta is not big enough to bump it to the next stage.
783                         # TODO: figure out which. for now assume 2.
784                         print "UNKNOWN stage for %s; nothing done" % nodename
785                         act_record['action'] = ['unknown']
786                         act_record['message'] = message[0]
787
788                         act_record['email'] = TECH
789                         act_record['action'] = ['noop']
790                         act_record['message'] = message[0]
791                         act_record['stage'] = 'stage_actinoneweek'
792                         act_record['time'] = current_time               # reset clock
793                         #print "Exiting..."
794                         #return None
795                         #sys.exit(1)
796
797                 print "%s" % act_record['log'],
798                 print "%15s" % act_record['action']
799                 return act_record
800
801         def getMaxSlices(self, loginbase):
802                 # if sickdb has a loginbase, then it will have at least one node.
803                 site_stats = None
804
805                 for nodename in self.diagnose_in[loginbase].keys():
806                         if nodename in self.findbad['nodes']:
807                                 site_stats = self.findbad['nodes'][nodename]['values']['plcsite']
808                                 break
809
810                 if site_stats == None:
811                         raise Exception, "loginbase with no nodes in findbad"
812                 else:
813                         return site_stats['max_slices']
814
815         def getNumNodes(self, loginbase):
816                 # if sickdb has a loginbase, then it will have at least one node.
817                 site_stats = None
818
819                 for nodename in self.diagnose_in[loginbase].keys():
820                         if nodename in self.findbad['nodes']:
821                                 site_stats = self.findbad['nodes'][nodename]['values']['plcsite']
822                                 break
823
824                 if site_stats == None:
825                         raise Exception, "loginbase with no nodes in findbad"
826                 else:
827                         return site_stats['num_nodes']
828
829         """
830         Returns number of up nodes as the total number *NOT* in act_all with a
831         stage other than 'steady-state' .
832         """
833         def getUpAtSite(self, loginbase, d_diag_site):
834                 # TODO: THIS DOESN"T WORK!!! it misses all the 'debug' state nodes
835                 #               that aren't recorded yet.
836
837                 numnodes = self.getNumNodes(loginbase)
838                 # NOTE: assume nodes we have no record of are ok. (too conservative)
839                 # TODO: make the 'up' value more representative
840                 up = numnodes
841                 for nodename in d_diag_site[loginbase]['nodes'].keys():
842
843                         rec = d_diag_site[loginbase]['nodes'][nodename]
844                         if rec['stage'] != 'monitor-end-record':
845                                 up -= 1
846                         else:
847                                 pass # the node is assumed to be up.
848
849                 #if up != numnodes:
850                 #       print "ERROR: %s total nodes up and down != %d" % (loginbase, numnodes)
851
852                 return up
853
854
class SiteAction:
        """
        Base class for an action that is run with a dict of arguments.

        Subclasses override _run(); callers use run(), which first checks
        that every name in parameter_names is present in the args.
        """
        def __init__(self, parameter_names=None):
                # Build the default list per instance; a list literal in the
                # signature would be one object shared by every instance.
                if parameter_names is None:
                        parameter_names = ['hostname', 'ticket_id']
                self.parameter_names = parameter_names

        def checkParam(self, args):
                """Raise Exception for the first required parameter missing from args."""
                for param in self.parameter_names:
                        if param not in args:
                                raise Exception("Parameter %s not provided in args"%param)

        def run(self, args):
                """Validate args, then return whatever _run(args) returns."""
                self.checkParam(args)
                return self._run(args)

        def _run(self, args):
                """Do the actual work; the base implementation does nothing."""
                pass
867
class SuspendAction(SiteAction):
        """Action that calls plc.suspendSlices() with args['hostname']."""
        def _run(self, args):
                hostname = args['hostname']
                return plc.suspendSlices(hostname)
871
class RemoveSliceCreation(SiteAction):
        """Action that calls plc.removeSliceCreation() with args['hostname']."""
        def _run(self, args):
                hostname = args['hostname']
                return plc.removeSliceCreation(hostname)
875
class BackoffActions(SiteAction):
        """
        Action that calls plc.enableSlices() and then
        plc.enableSliceCreation() with args['hostname'].  Always returns True.
        """
        def _run(self, args):
                hostname = args['hostname']
                plc.enableSlices(hostname)
                plc.enableSliceCreation(hostname)
                return True
881
882 # TODO: create class for each action below, 
883 #               allow for lists of actions to be performed...
884
def close_rt_backoff(args):
        """
        Close the RT ticket named in args and re-enable the host's slices.

        Does nothing when args has no 'ticket_id', or when it is "" or None.
        Otherwise closes the ticket through mailer.closeTicketViaRT() and
        calls plc.enableSlices() and plc.enableSliceCreation() with
        args['hostname'].  Returns None.
        """
        if 'ticket_id' not in args:
                return
        ticket_id = args['ticket_id']
        if ticket_id == "" or ticket_id is None:
                return

        mailer.closeTicketViaRT(ticket_id,
                                "Ticket CLOSED automatically by SiteAssist.")
        hostname = args['hostname']
        plc.enableSlices(hostname)
        plc.enableSliceCreation(hostname)
        return
892
def reboot_node(args):
        """
        Call reboot.reboot_policy() for args['hostname'], passing True and
        config.debug, and return its result.
        """
        target = args['hostname']
        return reboot.reboot_policy(target, True, config.debug)
896
def reset_nodemanager(args):
        """
        Run "/sbin/service nm restart" as root on args['hostname'] over ssh.

        The exit status of the command is ignored; returns None.
        Raises KeyError when args has no 'hostname'.
        """
        # The host comes from args like every other action here; the
        # previous code referenced an undefined name and raised NameError.
        os.system("ssh root@%s /sbin/service nm restart" % args['hostname'])
        return
900
901 class Action(Thread):
902         def __init__(self, l_action):
903                 self.l_action = l_action
904
905                 # the hostname to loginbase mapping
906                 self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")
907
908                 # Actions to take.
909                 self.diagnose_db = soltesz.if_cached_else(1, "diagnose_out", lambda : {})
910                 # Actions taken.
911                 self.act_all   = soltesz.if_cached_else(1, "act_all", lambda : {})
912
913                 # A dict of actions to specific functions. PICKLE doesnt' like lambdas.
914                 self.actions = {}
915                 self.actions['suspendslices'] = lambda args: plc.suspendSlices(args['hostname'])
916                 self.actions['nocreate'] = lambda args: plc.removeSliceCreation(args['hostname'])
917                 self.actions['close_rt'] = lambda args: close_rt_backoff(args)
918                 self.actions['rins'] = lambda args: plc.nodeBootState(args['hostname'], "rins") 
919                 self.actions['noop'] = lambda args: args
920                 self.actions['reboot_node'] = lambda args: reboot_node(args)
921                 self.actions['reset_nodemanager'] = lambda args: args # reset_nodemanager(args)
922
923                 self.actions['ticket_waitforever'] = lambda args: args
924                 self.actions['waitforever'] = lambda args: args
925                 self.actions['unknown'] = lambda args: args
926                 self.actions['waitforoneweekaction'] = lambda args: args
927                 self.actions['waitfortwoweeksaction'] = lambda args: args
928                 self.actions['sendmailagain-waitforoneweekaction'] = lambda args: args
929                 self.actions['sendmailagain-waitfortwoweeksaction'] = lambda args: args
930                 self.actions['email-againwaitforever'] = lambda args: args
931                 self.actions['email-againticket_waitforever'] = lambda args: args
932                                 
933
934                 self.sickdb = {}
935                 Thread.__init__(self)
936
937         def run(self):
938                 self.accumSites()
939                 print "Accumulated %d sick sites" % len(self.sickdb.keys())
940                 logger.debug("Accumulated %d sick sites" % len(self.sickdb.keys()))
941
942                 try:
943                         stats = self.analyseSites()
944                 except Exception, err:
945                         print "----------------"
946                         import traceback
947                         print traceback.print_exc()
948                         print err
949                         if config.policysavedb:
950                                 print "Saving Databases... act_all"
951                                 soltesz.dbDump("act_all", self.act_all)
952                         sys.exit(1)
953
954                 print_stats("sites_observed", stats)
955                 print_stats("sites_diagnosed", stats)
956                 print_stats("nodes_diagnosed", stats)
957                 print_stats("sites_emailed", stats)
958                 print_stats("nodes_actedon", stats)
959                 print string.join(stats['allsites'], ",")
960
961                 if config.policysavedb:
962                         print "Saving Databases... act_all"
963                         #soltesz.dbDump("policy.eventlog", self.eventlog)
964                         # TODO: remove 'diagnose_out', 
965                         #       or at least the entries that were acted on.
966                         soltesz.dbDump("act_all", self.act_all)
967
968         def accumSites(self):
969                 """
970                 Take all nodes, from l_action, look them up in the diagnose_db database, 
971                 and insert them into sickdb[] as:
972
973                 This way only the given l_action nodes will be acted on regardless
974                 of how many from diagnose_db are available.
975
976                         sickdb[loginbase][nodename] = diag_record
977                 """
978                 # TODO: what if l_action == None ?
979                 for nodename in self.l_action:
980
981                         loginbase = self.plcdb_hn2lb[nodename]
982
983                         if loginbase in self.diagnose_db and \
984                                 nodename in self.diagnose_db[loginbase]['nodes']:
985
986                                 diag_record = self.diagnose_db[loginbase]['nodes'][nodename]
987
988                                 if loginbase not in self.sickdb:
989                                         self.sickdb[loginbase] = {'nodes' : {}}
990
991                                 # NOTE: don't copy all node records, since not all will be in l_action
992                                 self.sickdb[loginbase]['nodes'][nodename] = diag_record
993                                 # NOTE: but, we want to get the loginbase config settings, 
994                                 #               this is the easiest way.
995                                 self.sickdb[loginbase]['config'] = self.diagnose_db[loginbase]['config']
996                         #else:
997                                 #print "%s not in diagnose_db!!" % loginbase
998                 return
999
1000         def __emailSite(self, loginbase, roles, message, args):
1001                 """
1002                 loginbase is the unique site abbreviation, prepended to slice names.
1003                 roles contains TECH, PI, USER roles, and derive email aliases.
1004                 record contains {'message': [<subj>,<body>], 'args': {...}} 
1005                 """
1006                 ticket_id = 0
1007                 args.update({'loginbase':loginbase})
1008
1009                 if not config.mail and not config.debug and config.bcc:
1010                         roles = ADMIN
1011                 if config.mail and config.debug:
1012                         roles = ADMIN
1013
1014                 # build targets
1015                 contacts = []
1016                 if ADMIN & roles:
1017                         contacts += [config.email]
1018                 if TECH & roles:
1019                         contacts += [TECHEMAIL % loginbase]
1020                 if PI & roles:
1021                         contacts += [PIEMAIL % loginbase]
1022                 if USER & roles:
1023                         slices = plc.slices(loginbase)
1024                         if len(slices) >= 1:
1025                                 for slice in slices:
1026                                         contacts += [SLICEMAIL % slice]
1027                                 print "SLIC: %20s : %d slices" % (loginbase, len(slices))
1028                         else:
1029                                 print "SLIC: %20s : 0 slices" % loginbase
1030
1031                 try:
1032                         subject = message[0] % args
1033                         body = message[1] % args
1034                         if ADMIN & roles:
1035                                 # send only to admin
1036                                 if 'ticket_id' in args:
1037                                         subj = "Re: [PL #%s] %s" % (args['ticket_id'], subject)
1038                                 else:
1039                                         subj = "Re: [PL noticket] %s" % subject
1040                                 mailer.email(subj, body, contacts)
1041                                 ticket_id = args['ticket_id']
1042                         else:
1043                                 ticket_id = mailer.emailViaRT(subject, body, contacts, args['ticket_id'])
1044                 except Exception, err:
1045                         print "exception on message:"
1046                         import traceback
1047                         print traceback.print_exc()
1048                         print message
1049
1050                 return ticket_id
1051
1052
1053         def _format_diaginfo(self, diag_node):
1054                 info = diag_node['info']
1055                 if diag_node['stage'] == 'monitor-end-record':
1056                         hlist = "    %s went from '%s' to '%s'\n" % (info[0], info[1], info[2]) 
1057                 else:
1058                         hlist = "    %s %s - %s\n" % (info[0], info[2], info[1]) #(node,ver,daysdn)
1059                 return hlist
1060
1061
1062         def get_email_args(self, act_recordlist, loginbase=None):
1063
1064                 email_args = {}
1065                 email_args['hostname_list'] = ""
1066
1067                 for act_record in act_recordlist:
1068                         email_args['hostname_list'] += act_record['msg_format']
1069                         email_args['hostname'] = act_record['nodename']
1070                         if  'plcnode' in act_record and \
1071                                 'pcu_ids' in act_record['plcnode'] and \
1072                                 len(act_record['plcnode']['pcu_ids']) > 0:
1073                                 print "setting 'pcu_id' for email_args %s"%email_args['hostname']
1074                                 email_args['pcu_id'] = act_record['plcnode']['pcu_ids'][0]
1075                         else:
1076                                 email_args['pcu_id'] = "-1"
1077                                         
1078                         if 'ticket_id' in act_record:
1079                                 if act_record['ticket_id'] == 0 or act_record['ticket_id'] == '0':
1080                                         print "Enter the ticket_id for %s @ %s" % (loginbase, act_record['nodename'])
1081                                         sys.stdout.flush()
1082                                         line = sys.stdin.readline()
1083                                         try:
1084                                                 ticket_id = int(line)
1085                                         except:
1086                                                 print "could not get ticket_id from stdin..."
1087                                                 os._exit(1)
1088                                 else:
1089                                         ticket_id = act_record['ticket_id']
1090                                         
1091                                 email_args['ticket_id'] = ticket_id
1092
1093                 return email_args
1094
1095         def get_unique_issues(self, act_recordlist):
1096                 # NOTE: only send one email per site, per problem...
1097                 unique_issues = {}
1098                 for act_record in act_recordlist:
1099                         act_key = act_record['action'][0]
1100                         if act_key not in unique_issues:
1101                                 unique_issues[act_key] = []
1102                                 
1103                         unique_issues[act_key] += [act_record]
1104                         
1105                 return unique_issues
1106                         
1107
1108         def __actOnSite(self, loginbase, site_record):
1109                 i_nodes_actedon = 0
1110                 i_nodes_emailed = 0
1111
1112                 act_recordlist = []
1113
1114                 for nodename in site_record['nodes'].keys():
1115                         diag_record = site_record['nodes'][nodename]
1116                         act_record  = self.__actOnNode(diag_record)
1117                         #print "nodename: %s %s" % (nodename, act_record)
1118                         if act_record is not None:
1119                                 act_recordlist += [act_record]
1120
1121                 unique_issues = self.get_unique_issues(act_recordlist)
1122
1123                 for issue in unique_issues.keys():
1124                         print "\tworking on issue: %s" % issue
1125                         issue_record_list = unique_issues[issue]
1126                         email_args = self.get_email_args(issue_record_list, loginbase)
1127
1128                         # for each record.
1129                         for act_record in issue_record_list:
1130                                 # if there's a pcu record and email config is set
1131                                 if 'email_pcu' in act_record:
1132                                         if act_record['message'] != None and act_record['email_pcu'] and site_record['config']['email']:
1133                                                 # and 'reboot_node' in act_record['stage']:
1134
1135                                                 email_args['hostname'] = act_record['nodename']
1136                                                 ticket_id = self.__emailSite(loginbase, 
1137                                                                                         act_record['email'], 
1138                                                                                         emailTxt.mailtxt.pcudown[0],
1139                                                                                         email_args)
1140                                                 if ticket_id == 0:
1141                                                         # error.
1142                                                         print "got a ticket_id == 0!!!! %s" % act_record['nodename']
1143                                                         os._exit(1)
1144                                                         pass
1145                                                 email_args['ticket_id'] = ticket_id
1146
1147                         
1148                         act_record = issue_record_list[0]
1149                         # send message before squeezing
1150                         print "\t\tconfig.email: %s and %s" % (act_record['message'] != None, 
1151                                                                                                 site_record['config']['email'])
1152                         if act_record['message'] != None and site_record['config']['email']:
1153                                 ticket_id = self.__emailSite(loginbase, act_record['email'], 
1154                                                                                          act_record['message'], email_args)
1155
1156                                 if ticket_id == 0:
1157                                         # error.
1158                                         print "ticket_id == 0 for %s %s" % (loginbase, act_record['nodename'])
1159                                         os._exit(1)
1160                                         pass
1161
1162                                 # Add ticket_id to ALL nodenames
1163                                 for act_record in issue_record_list:
1164                                         nodename = act_record['nodename']
1165                                         # update node record with RT ticket_id
1166                                         if nodename in self.act_all:
1167                                                 self.act_all[nodename][0]['ticket_id'] = "%s" % ticket_id
1168                                                 # if the ticket was previously resolved, reset it to new.
1169                                                 if 'rt' in act_record and \
1170                                                         'Status' in act_record['rt'] and \
1171                                                         act_record['rt']['Status'] == 'resolved':
1172                                                         mailer.setTicketStatus(ticket_id, "new")
1173                                                 status = mailer.getTicketStatus(ticket_id)
1174                                                 self.act_all[nodename][0]['rt'] = status
1175                                         if config.mail: i_nodes_emailed += 1
1176
1177                         print "\t\tconfig.squeeze: %s and %s" % (config.squeeze,
1178                                                                                                         site_record['config']['squeeze'])
1179                         if config.squeeze and site_record['config']['squeeze']:
1180                                 for act_key in act_record['action']:
1181                                         self.actions[act_key](email_args)
1182                                 i_nodes_actedon += 1
1183                 
1184                 if config.policysavedb:
1185                         print "Saving Databases... act_all, diagnose_out"
1186                         soltesz.dbDump("act_all", self.act_all)
1187                         # remove site record from diagnose_out, it's in act_all as done.
1188                         del self.diagnose_db[loginbase]
1189                         soltesz.dbDump("diagnose_out", self.diagnose_db)
1190
1191                 print "sleeping for 1 sec"
1192                 time.sleep(1)
1193                 #print "Hit enter to continue..."
1194                 #sys.stdout.flush()
1195                 #line = sys.stdin.readline()
1196
1197                 return (i_nodes_actedon, i_nodes_emailed)
1198
1199         def __actOnNode(self, diag_record):
1200                 nodename = diag_record['nodename']
1201                 message = diag_record['message']
1202
1203                 act_record = {}
1204                 act_record.update(diag_record)
1205                 act_record['nodename'] = nodename
1206                 act_record['msg_format'] = self._format_diaginfo(diag_record)
1207                 print "act_record['stage'] == %s " % act_record['stage']
1208
1209                 # avoid end records, and nmreset records                                        
1210                 # reboot_node_failed, is set below, so don't reboot repeatedly.
1211
1212                 if 'monitor-end-record' not in act_record['stage'] and \
1213                    'nmreset' not in act_record['stage'] and \
1214                    'reboot_node_failed' not in act_record:
1215
1216                         if "DOWN" in act_record['log'] and \
1217                                         'pcu_ids' in act_record['plcnode'] and \
1218                                         len(act_record['plcnode']['pcu_ids']) > 0:
1219
1220                                 print "%s" % act_record['log'],
1221                                 print "%15s" % (['reboot_node'],)
1222                                 # Set node to re-install
1223                                 plc.nodeBootState(act_record['nodename'], "rins")       
1224                                 try:
1225                                         ret = reboot_node({'hostname': act_record['nodename']})
1226                                 except Exception, exc:
1227                                         print "exception on reboot_node:"
1228                                         import traceback
1229                                         print traceback.print_exc()
1230                                         ret = False
1231
1232                                 if ret: # and ( 'reboot_node_failed' not in act_record or act_record['reboot_node_failed'] == False):
1233                                         # Reboot Succeeded
1234                                         print "reboot succeeded for %s" % act_record['nodename']
1235                                         act_record2 = {}
1236                                         act_record2.update(act_record)
1237                                         act_record2['action'] = ['reboot_node']
1238                                         act_record2['stage'] = "reboot_node"
1239                                         act_record2['reboot_node_failed'] = False
1240                                         act_record2['email_pcu'] = False
1241
1242                                         if nodename not in self.act_all: 
1243                                                 self.act_all[nodename] = []
1244                                         print "inserting 'reboot_node' record into act_all"
1245                                         self.act_all[nodename].insert(0,act_record2)
1246
1247                                         # return None to avoid further action
1248                                         print "Taking no further action"
1249                                         return None
1250                                 else:
1251                                         print "reboot failed for %s" % act_record['nodename']
1252                                         # set email_pcu to also send pcu notice for this record.
1253                                         act_record['reboot_node_failed'] = True
1254                                         act_record['email_pcu'] = True
1255
1256                         print "%s" % act_record['log'],
1257                         print "%15s" % act_record['action']
1258
1259                 if act_record['stage'] is not 'monitor-end-record' and \
1260                    act_record['stage'] is not 'nmreset':
1261                         if nodename not in self.act_all: 
1262                                 self.act_all[nodename] = []
1263
1264                         self.act_all[nodename].insert(0,act_record)
1265                 else:
1266                         print "Not recording %s in act_all" % nodename
1267
1268                 return act_record
1269
1270         def analyseSites(self):
1271                 i_sites_observed = 0
1272                 i_sites_diagnosed = 0
1273                 i_nodes_diagnosed = 0
1274                 i_nodes_actedon = 0
1275                 i_sites_emailed = 0
1276                 l_allsites = []
1277
1278                 sorted_sites = self.sickdb.keys()
1279                 sorted_sites.sort()
1280                 for loginbase in sorted_sites:
1281                         site_record = self.sickdb[loginbase]
1282                         print "sites: %s" % loginbase
1283                         
1284                         i_nodes_diagnosed += len(site_record.keys())
1285                         i_sites_diagnosed += 1
1286
1287                         (na,ne) = self.__actOnSite(loginbase, site_record)
1288
1289                         i_sites_observed += 1
1290                         i_nodes_actedon += na
1291                         i_sites_emailed += ne
1292
1293                         l_allsites += [loginbase]
1294
1295                 return {'sites_observed': i_sites_observed, 
1296                                 'sites_diagnosed': i_sites_diagnosed, 
1297                                 'nodes_diagnosed': i_nodes_diagnosed, 
1298                                 'sites_emailed': i_sites_emailed, 
1299                                 'nodes_actedon': i_nodes_actedon, 
1300                                 'allsites':l_allsites}
1301
1302         def print_stats(self, key, stats):
1303                 print "%20s : %d" % (key, stats[key])
1304
1305
1306
1307         #"""
1308         #Prints, logs, and emails status of up nodes, down nodes, and buckets.
1309         #"""
1310         #def status(self):
1311         #       sub = "Monitor Summary"
1312         #       msg = "\nThe following nodes were acted upon:  \n\n"
1313         #       for (node, (type, date)) in self.emailed.items():
1314         #               # Print only things acted on today.
1315         #               if (time.gmtime(time.time())[2] == time.gmtime(date)[2]):
1316         #                       msg +="%s\t(%s)\t%s\n" %(node, type, time.ctime(date))
1317         #       msg +="\n\nThe following sites have been 'squeezed':\n\n"
1318         #       for (loginbase, (date, type)) in self.squeezed.items():
1319         #               # Print only things acted on today.
1320         #               if (time.gmtime(time.time())[2] == time.gmtime(date)[2]):
1321         #                       msg +="%s\t(%s)\t%s\n" %(loginbase, type, time.ctime(date))
1322         #       mailer.email(sub, msg, [SUMTO])
1323         #       logger.info(msg)
1324         #       return 
1325
1326         #"""
1327         #Store/Load state of emails.  When, where, what.
1328         #"""
1329         #def emailedStore(self, action):
1330         #       try:
1331         #               if action == "LOAD":
1332         #                       f = open(DAT, "r+")
1333         #                       logger.info("POLICY:  Found and reading " + DAT)
1334         #                       self.emailed.update(pickle.load(f))
1335         #               if action == "WRITE":
1336         #                       f = open(DAT, "w")
1337         #                       #logger.debug("Writing " + DAT)
1338         #                       pickle.dump(self.emailed, f)
1339         #               f.close()
1340         #       except Exception, err:
1341         #               logger.info("POLICY:  Problem with DAT, %s" %err)
1342
1343
1344 #class Policy(Thread):
1345
def main():
        """Tell the user that this file is meant to be imported, not executed."""
        notice = "policy.py is a module, not a script for running directly."
        print(notice)
1348
if __name__ == '__main__':
        # Script entry point: run main() and turn Ctrl-C into a logged, immediate exit.
        import plc
        import os
        try:
                main()
        except KeyboardInterrupt:
                print("Killed.  Exitting.")
                logger.info('Monitor Killed')
                # os._exit() ends the process at once, without normal interpreter cleanup.
                os._exit(0)