allow RT module to be removed.
[monitor.git] / policy.py
1 #
2 # Copyright (c) 2004  The Trustees of Princeton University (Trustees).
3 #
4 # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
5 #
6 # $Id: policy.py,v 1.17 2007/08/29 17:26:50 soltesz Exp $
7 #
8 # Policy Engine.
9
10 #from monitor import *
11 from threading import *
12 import time
13 import logging
14 import mailer
15 import emailTxt
16 import pickle
17 import Queue
18 import plc
19 import sys
20 import os
21 import reboot
22 import soltesz
23 import string
24 from www.printbadnodes import cmpCategoryVal
25 from config import config
# Import-time trace message (Python 2 print statement).
print "policy"
# Rebinds the imported ``config`` name to an instance of it.  From here on
# ``config`` is that instance; getdebug() and Diagnose.run() read attributes
# (``debug``, ``policysavedb``) from it.
config = config()

# NOTE(review): not referenced in the visible part of this file -- presumably
# the path of a saved state file; confirm before relying on it.
DAT="./monitor.dat"

# Logger named "monitor", used for debug output by the policy threads.
logger = logging.getLogger("monitor")

# Time to enforce policy
# NOTE(review): the unit is not shown here; presumably seconds (7200 s = 2 h).
POLSLEEP = 7200

# Where to email the summary
SUMTO = "soltesz@cs.princeton.edu"
# Address templates with a single %s placeholder.
# NOTE(review): presumably filled with a site loginbase (TECH/PI) or a slice
# name (SLICE) -- confirm against the code that formats them.
TECHEMAIL="tech-%s@sites.planet-lab.org"
PIEMAIL="pi-%s@sites.planet-lab.org"
SLICEMAIL="%s@slices.planet-lab.org"
PLCEMAIL="support@planet-lab.org"

# Thresholds.  The values below are in SECONDS; each one is written as a
# number of days multiplied by SPERDAY.
SPERMIN = 60
SPERHOUR = 60*60
SPERDAY = 86400
PITHRESH = 7 * SPERDAY
SLICETHRESH = 7 * SPERDAY
# Time (5 days, in seconds) before attempting rins again
RINSTHRESH = 5 * SPERDAY

# Time (30 days, in seconds) before calling the node dead.
DEADTHRESH = 30 * SPERDAY
# Minimum number of nodes up before squeezing
# (a site with fewer nodes up than this gets config['squeeze'] = True).
MINUP = 2

# NOTE(review): distinct powers of two, so presumably bit flags naming who is
# contacted and meant to be OR-ed together -- confirm against the users.
TECH=1
PI=2
USER=4
ADMIN=8
61
62 # IF:
63 #  no SSH, down.
64 #  bad disk, down
65 #  DNS, kinda down (sick)
66 #  clock, kinda down (sick)
67 #  Full disk, going to be down
68
69 # Actions:
70 #  Email
71 #  suspend slice creation
72 #  kill slices
def array_to_priority_map(array):
        """ Map every entry of array to its position (0-based) in the array.

        The position acts as the entry's priority, as expected by a
        cmpMap()-style comparison (not shown in this part of the file).  If
        an entry occurs more than once, its last position wins."""
        priorities = {}
        for position, entry in enumerate(array):
                priorities[entry] = position
        return priorities
83
def getdebug():
        """Return the ``debug`` attribute of the module-level config object."""
        return config.debug
86
def print_stats(key, stats):
        """Print ``stats[key]`` as a right-aligned "name : number" line.

        Nothing is printed when key is not present in stats."""
        if key not in stats:
                return
        print("%20s : %d" % (key, stats[key]))
89
def get_ticket_id(record):
        """Return the ticket id stored in record, or None if there is none.

        record['ticket_id'] is preferred; record['found_rt_ticket'] is the
        fallback.  A field counts as unset when it is missing, None, or an
        empty string.

        The empty-string check uses equality rather than ``is not ""``:
        identity against a string literal only works when the value happens
        to be the very same object, so an empty unicode string (or any other
        distinct empty-string object) used to be returned as a ticket id.
        """
        for field in ('ticket_id', 'found_rt_ticket'):
                if field in record and record[field] not in ("", None):
                        return record[field]
        return None
99
class Merge(Thread):
        """
        Thread that combines the current findbad observations with the
        previously recorded actions (act_all) and puts one merged record per
        node on the toRT queue, followed by a None end-of-stream marker.
        """
        def __init__(self, l_merge, toRT):
                """
                l_merge -- hostnames to process; findbad nodes not listed are skipped.
                toRT    -- queue-like object that receives the merged records.
                """
                Thread.__init__(self)
                self.toRT = toRT
                self.merge_list = l_merge
                # the hostname to loginbase mapping
                self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")

                # Previous actions taken on nodes, and the latest findbad results.
                self.act_all = soltesz.if_cached_else(1, "act_all", lambda : {})
                self.findbad = soltesz.if_cached_else(1, "findbad", lambda : {})

                # "act_all" is loaded a second time; entries are deleted from this
                # copy as nodes are merged.
                # NOTE(review): presumably a separate object from self.act_all --
                # confirm that if_cached_else() returns a fresh copy per call.
                self.cache_all = soltesz.if_cached_else(1, "act_all", lambda : {})
                self.sickdb = {}
                self.mergedb = {}

        def run(self):
                """Run the three merge stages in order."""
                # populate sickdb
                self.accumSickSites()
                # read data from findbad and act_all
                self.mergeActionsAndBadDB()
                # pass node_records to RT
                self.sendToRT()

        def accumSickSites(self):
                """
                Take every findbad node that is named in merge_list and insert it
                into sickdb[] as:

                        sickdb[loginbase][nodename] = fb_record

                A node whose findbad values have no 'category' is dumped to stdout
                and left out.  The number of stored nodes is printed at the end.
                """
                fb_nodes = self.findbad['nodes']
                found = 0
                # look at all problems reported by findbad
                for nodename in fb_nodes.keys():
                        if nodename not in self.merge_list:
                                continue                # skip this node, since it's not wanted

                        loginbase = self.plcdb_hn2lb[nodename]
                        values = fb_nodes[nodename]['values']

                        try:
                                category = values['category']
                        except:
                                # Any failure to read the category means the record
                                # is unusable: show what findbad had and move on.
                                print(values)
                                print(nodename)
                                print(fb_nodes[nodename])
                                continue

                        fb_record = {
                                'nodename': nodename,
                                'category': category,
                                'state': values['state'],
                                'comonstats': values['comonstats'],
                                'plcnode': values['plcnode'],
                                'kernel': self.getKernel(values['kernel']),
                                'stage': "findbad",
                                'message': None,
                                'bootcd': values['bootcd'],
                                'args': None,
                                'info': None,
                                'time': time.time(),
                                'date_created': time.time(),
                        }
                        self.sickdb.setdefault(loginbase, {})[nodename] = fb_record
                        found += 1

                print("Found %d nodes" % found)

        def getKernel(self, unamestr):
                """Return the third whitespace-separated field of unamestr, or ""
                when there are fewer than three fields."""
                fields = unamestr.split()
                if len(fields) < 3:
                        return ""
                return fields[2]

        def mergeActionsAndBadDB(self):
                """
                - Look at the sick node_records as reported in findbad,
                - Then look at the node_records in act_all.

                There are four cases:
                1) Problem in findbad, no problem in act_all
                        this is ok, b/c it just means it's a new problem
                2) Problem in findbad, problem in act_all
                        -Did the problem get better or worse?
                                -If Same, or Worse, then continue looking for open tickets.
                                -If Better, or No problem, then "back-off" penalties.
                                        This judgement may need to wait until 'Diagnose()'

                3) No problem in findbad, problem in act_all
                        Then the node is operational again according to Findbad()

                4) No problem in findbad, no problem in act_all
                        There won't be a record in either db, so there's no code.

                The merged records are stored as mergedb[loginbase][nodename].
                """
                # Fields always taken from the fresh findbad record, even when an
                # earlier act_all record exists for the node.
                fresh_fields = ('comonstats', 'category', 'state',
                                'kernel', 'bootcd', 'plcnode')

                # look at all problems reported by findbad, in sorted order
                for loginbase in sorted(self.sickdb.keys()):
                        site_nodes = self.sickdb[loginbase]
                        for nodename in sorted(site_nodes.keys()):
                                fresh = site_nodes[nodename]
                                site_merged = self.mergedb.setdefault(loginbase, {})
                                in_act_all = nodename in self.act_all

                                # We must compare findbad state with act_all state
                                merged = site_merged[nodename] = {}
                                if in_act_all and len(self.act_all[nodename]) != 0:
                                        # 2) an earlier record exists: start from it (first
                                        #    entry of the node's list) and refresh the
                                        #    observed fields.
                                        previous = self.act_all[nodename][0]
                                        previous['prev_category'] = previous['category']

                                        merged.update(previous)
                                        for field in fresh_fields:
                                                merged[field] = fresh[field]
                                        ticket = get_ticket_id(merged)
                                        merged['rt'] = mailer.getTicketStatus(ticket)
                                else:
                                        # 1) ok, b/c it's a new problem (node unknown to
                                        #    act_all, or known with no previous records).
                                        #    set ticket_id to null
                                        merged.update(fresh)
                                        merged['ticket_id'] = ""
                                        merged['prev_category'] = "NORECORD"

                                if in_act_all:
                                        # delete the entry from cache_all to keep it out of case 3)
                                        del self.cache_all[nodename]

                # 3) nodes that remain in cache_all were not identified by findbad.
                #        Do we keep them or not?
                #   NOTE: i think that since the categories are performed before this
                #               step now, and by a monitor-controlled agent.

                # TODO: This does not work correctly.  Do we need this?
                #for hn in self.cache_all.keys():
                #       y = self.act_all[hn][0]
                #       if 'monitor' in y['bucket']:
                #               loginbase = self.plcdb_hn2lb[hn]
                #               if loginbase not in self.sickdb:
                #                       self.sickdb[loginbase] = {}
                #               self.sickdb[loginbase][hn] = y
                #       else:
                #               del self.cache_all[hn]

                print("len of cache_all: %d" % len(self.cache_all.keys()))

        def sendToRT(self):
                """Put every merged record on the toRT queue (sites in sorted
                order), then a final None telling the reader to stop."""
                # look at all problems reported by merge
                for loginbase in sorted(self.mergedb.keys()):
                        for record in self.mergedb[loginbase].values():
                                self.toRT.put(record)

                # send signal to stop reading
                self.toRT.put(None)
285
286 class Diagnose(Thread):
287         def __init__(self, fromRT):
288                 self.fromRT = fromRT
289                 self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")
290                 self.findbad = soltesz.if_cached_else(1, "findbad", lambda : {})
291
292                 self.diagnose_in = {}
293                 self.diagnose_out = {}
294                 Thread.__init__(self)
295
296
297         def run(self):
298                 self.accumSickSites()
299
300                 print "Accumulated %d sick sites" % len(self.diagnose_in.keys())
301                 logger.debug("Accumulated %d sick sites" % len(self.diagnose_in.keys()))
302
303                 try:
304                         stats = self.diagnoseAll()
305                 except Exception, err:
306                         print "----------------"
307                         import traceback
308                         print traceback.print_exc()
309                         print err
310                         #if config.policysavedb:
311                         sys.exit(1)
312
313                 print_stats("sites_observed", stats)
314                 print_stats("sites_diagnosed", stats)
315                 print_stats("nodes_diagnosed", stats)
316
317                 if config.policysavedb:
318                         print "Saving Databases... diagnose_out"
319                         soltesz.dbDump("diagnose_out", self.diagnose_out)
320
321         def accumSickSites(self):
322                 """
323                 Take all nodes, from l_diagnose, look them up in the diagnose_out database, 
324                 and insert them into diagnose_in[] as:
325
326                         diagnose_in[loginbase] = [diag_node1, diag_node2, ...]
327                 """
328                 while 1:
329                         node_record = self.fromRT.get(block = True)
330                         if node_record == None:
331                                 break;
332
333                         nodename = node_record['nodename']
334                         loginbase = self.plcdb_hn2lb[nodename]
335
336                         if loginbase not in self.diagnose_in:
337                                 self.diagnose_in[loginbase] = {}
338
339                         self.diagnose_in[loginbase][nodename] = node_record
340
341                 return
342
343         def diagnoseAll(self):
344                 i_sites_observed = 0
345                 i_sites_diagnosed = 0
346                 i_nodes_diagnosed = 0
347                 i_nodes_actedon = 0
348                 i_sites_emailed = 0
349                 l_allsites = []
350
351                 sorted_sites = self.diagnose_in.keys()
352                 sorted_sites.sort()
353                 self.diagnose_out= {}
354                 for loginbase in sorted_sites:
355                         l_allsites += [loginbase]
356
357                         d_diag_nodes = self.diagnose_in[loginbase]
358                         d_act_records = self.__diagnoseSite(loginbase, d_diag_nodes)
359                         # store records in diagnose_out, for saving later.
360                         self.diagnose_out.update(d_act_records)
361                         
362                         if len(d_act_records[loginbase]['nodes'].keys()) > 0:
363                                 i_nodes_diagnosed += (len(d_act_records[loginbase]['nodes'].keys()))
364                                 i_sites_diagnosed += 1
365                         i_sites_observed += 1
366
367                 return {'sites_observed': i_sites_observed, 
368                                 'sites_diagnosed': i_sites_diagnosed, 
369                                 'nodes_diagnosed': i_nodes_diagnosed, 
370                                 'allsites':l_allsites}
371
372                 pass
373                 
374         def __getDaysDown(self, diag_record, nodename):
375                 daysdown = -1
376                 if diag_record['comonstats']['sshstatus'] != "null":
377                         daysdown = int(diag_record['comonstats']['sshstatus']) // (60*60*24)
378                 elif diag_record['comonstats']['lastcotop'] != "null":
379                         daysdown = int(diag_record['comonstats']['lastcotop']) // (60*60*24)
380                 else:
381                         now = time.time()
382                         last_contact = diag_record['plcnode']['last_contact']
383                         if last_contact == None:
384                                 # the node has never been up, so give it a break
385                                 daysdown = -1
386                         else:
387                                 diff = now - last_contact
388                                 daysdown = diff // (60*60*24)
389                 return daysdown
390
391         def __getStrDaysDown(self, diag_record, nodename):
392                 daysdown = self.__getDaysDown(diag_record, nodename)
393                 if daysdown > 0:
394                         return "(%d days down)"%daysdown
395                 else:
396                         return "Unknown number of days"
397
398         def __getCDVersion(self, diag_record, nodename):
399                 cdversion = ""
400                 #print "Getting kernel for: %s" % diag_record['nodename']
401                 cdversion = diag_record['kernel']
402                 return cdversion
403
404         def __diagnoseSite(self, loginbase, d_diag_nodes):
405                 """
406                 d_diag_nodes are diagnose_in entries.
407                 """
408                 d_diag_site = {loginbase : { 'config' : 
409                                                                                                 {'squeeze': False,
410                                                                                                  'email': False
411                                                                                                 }, 
412                                                                         'nodes': {}
413                                                                         }
414                                            }
415                 sorted_nodes = d_diag_nodes.keys()
416                 sorted_nodes.sort()
417                 for nodename in sorted_nodes:
418                         node_record = d_diag_nodes[nodename]
419                         diag_record = self.__diagnoseNode(loginbase, node_record)
420
421                         if diag_record != None:
422                                 d_diag_site[loginbase]['nodes'][nodename] = diag_record
423
424                                 # NOTE: improvement means, we need to act/squeeze and email.
425                                 #print "DIAG_RECORD", diag_record
426                                 if 'monitor-end-record' in diag_record['stage'] or \
427                                    'nmreset' in diag_record['stage']:
428                                 #       print "resetting loginbase!" 
429                                         d_diag_site[loginbase]['config']['squeeze'] = True
430                                         d_diag_site[loginbase]['config']['email'] = True
431                                 #else:
432                                 #       print "NO IMPROVEMENT!!!!"
433                         else:
434                                 pass # there is nothing to do for this node.
435
436                 # NOTE: these settings can be overridden by command line arguments,
437                 #       or the state of a record, i.e. if already in RT's Support Queue.
438                 nodes_up = self.getUpAtSite(loginbase, d_diag_site)
439                 if nodes_up < MINUP:
440                         d_diag_site[loginbase]['config']['squeeze'] = True
441
442                 max_slices = self.getMaxSlices(loginbase)
443                 num_nodes = self.getNumNodes(loginbase)
444                 # NOTE: when max_slices == 0, this is either a new site (the old way)
445                 #       or an old disabled site from previous monitor (before site['enabled'])
446                 if nodes_up < num_nodes and max_slices != 0:
447                         d_diag_site[loginbase]['config']['email'] = True
448
449                 if len(d_diag_site[loginbase]['nodes'].keys()) > 0:
450                         print "SITE: %20s : %d nodes up, at most" % (loginbase, nodes_up)
451
452                 return d_diag_site
453
454         def diagRecordByCategory(self, node_record):
455                 nodename = node_record['nodename']
456                 category = node_record['category']
457                 state    = node_record['state']
458                 loginbase = self.plcdb_hn2lb[nodename]
459                 diag_record = None
460
461                 if  "ERROR" in category:        # i.e. "DOWN"
462                         diag_record = {}
463                         diag_record.update(node_record)
464                         daysdown = self.__getDaysDown(diag_record, nodename) 
465                         if daysdown < 7:
466                                 format = "DIAG: %20s : %-40s Down only %s days  NOTHING DONE"
467                                 print format % (loginbase, nodename, daysdown)
468                                 return None
469
470                         s_daysdown = self.__getStrDaysDown(diag_record, nodename)
471                         diag_record['message'] = emailTxt.mailtxt.newdown
472                         diag_record['args'] = {'nodename': nodename}
473                         diag_record['info'] = (nodename, s_daysdown, "")
474
475                         if 'reboot_node_failed' in node_record:
476                                 # there was a previous attempt to use the PCU.
477                                 if node_record['reboot_node_failed'] == False:
478                                         # then the last attempt apparently, succeeded.
479                                         # But, the category is still 'ERROR'.  Therefore, the
480                                         # PCU-to-Node mapping is broken.
481                                         #print "Setting message for ERROR node to PCU2NodeMapping: %s" % nodename
482                                         diag_record['message'] = emailTxt.mailtxt.pcutonodemapping
483                                         diag_record['email_pcu'] = True
484
485                         if 'ticket_id' in diag_record:
486                                 if diag_record['ticket_id'] == "":
487                                         if 'found_rt_ticket' in diag_record:
488                                                 ticket_id = diag_record['found_rt_ticket']
489                                         else:
490                                                 ticket_id = "None"
491                                 else:
492                                         ticket_id = diag_record['ticket_id']
493                         else:
494                                 ticket_id = "None"
495
496                         diag_record['log'] = "DOWN: %20s : %-40s == %20s %s" % \
497                                         (loginbase, nodename, diag_record['info'][1:], ticket_id)
498
499                 elif "OLDBOOTCD" in category:
500                         # V2 boot cds as determined by findbad
501                         s_daysdown = self.__getStrDaysDown(node_record, nodename)
502                         s_cdversion = self.__getCDVersion(node_record, nodename)
503                         diag_record = {}
504                         diag_record.update(node_record)
505                         #if "2.4" in diag_record['kernel'] or "v2" in diag_record['bootcd']:
506                         diag_record['message'] = emailTxt.mailtxt.newbootcd
507                         diag_record['args'] = {'nodename': nodename}
508                         diag_record['info'] = (nodename, s_daysdown, s_cdversion)
509                         if diag_record['ticket_id'] == "":
510                                 diag_record['log'] = "BTCD: %20s : %-40s == %20s %20s %s" % \
511                                                                         (loginbase, nodename, diag_record['kernel'], 
512                                                                          diag_record['bootcd'], diag_record['found_rt_ticket'])
513                         else:
514                                 diag_record['log'] = "BTCD: %20s : %-40s == %20s %20s %s" % \
515                                                                         (loginbase, nodename, diag_record['kernel'], 
516                                                                          diag_record['bootcd'], diag_record['ticket_id'])
517
518                 elif "PROD" in category:
519                         if "DEBUG" in state:
520                                 # Not sure what to do with these yet.  Probably need to
521                                 # reboot, and email.
522                                 print "DEBG: %20s : %-40s  NOTHING DONE" % (loginbase, nodename)
523                                 return None
524                         elif "BOOT" in state:
525                                 # no action needed.
526                                 # TODO: remove penalties, if any are applied.
527                                 now = time.time()
528                                 last_contact = node_record['plcnode']['last_contact']
529                                 if last_contact == None:
530                                         time_diff = 0
531                                 else:
532                                         time_diff = now - last_contact;
533
534                                 if 'improvement' in node_record['stage']:
535                                         # then we need to pass this on to 'action'
536                                         diag_record = {}
537                                         diag_record.update(node_record)
538                                         diag_record['message'] = emailTxt.mailtxt.newthankyou
539                                         diag_record['args'] = {'nodename': nodename}
540                                         diag_record['info'] = (nodename, node_record['prev_category'], 
541                                                                                                          node_record['category'])
542                                         if 'email_pcu' in diag_record:
543                                                 if diag_record['email_pcu']:
544                                                         # previously, the pcu failed to reboot, so send
545                                                         # email. Now, reset these values to try the reboot
546                                                         # again.
547                                                         diag_record['email_pcu'] = False
548                                                         del diag_record['reboot_node_failed']
549
550                                         if diag_record['ticket_id'] == "":
551                                                 diag_record['log'] = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
552                                                                         (loginbase, nodename, diag_record['stage'], 
553                                                                          state, category, diag_record['found_rt_ticket'])
554                                         else:
555                                                 diag_record['log'] = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
556                                                                         (loginbase, nodename, diag_record['stage'], 
557                                                                          state, category, diag_record['ticket_id'])
558                                         return diag_record
559                                 #elif time_diff >= 6*SPERHOUR:
560                                 #       # heartbeat is older than 30 min.
561                                 #       # then reset NM.
562                                 #       #print "Possible NM problem!! %s - %s = %s" % (now, last_contact, time_diff)
563                                 #       diag_record = {}
564                                 #       diag_record.update(node_record)
565                                 #       diag_record['message'] = emailTxt.mailtxt.NMReset
566                                 #       diag_record['args'] = {'nodename': nodename}
567                                 #       diag_record['stage'] = "nmreset"
568                                 #       diag_record['info'] = (nodename, 
569                                 #                                                       node_record['prev_category'], 
570                                 #                                                       node_record['category'])
571                                 #       if diag_record['ticket_id'] == "":
572                                 #               diag_record['log'] = "NM  : %20s : %-40s == %20s %20s %s %s" % \
573                                 #                                       (loginbase, nodename, diag_record['stage'], 
574                                 #                                        state, category, diag_record['found_rt_ticket'])
575                                 #       else:
576                                 #               diag_record['log'] = "NM  : %20s : %-40s == %20s" % \
577                                 #                                       (loginbase, nodename, diag_record['stage'])
578 #
579 #                                       return diag_record
580                                 else:
581                                         return None
582                         else:
583                                 # unknown
584                                 pass
585                 elif "ALPHA"    in category:
586                         pass
587                 elif "clock_drift" in category:
588                         pass
589                 elif "dns"    in category:
590                         pass
591                 elif "filerw"    in category:
592                         pass
593                 else:
594                         print "Unknown category!!!! %s" % category
595                         sys.exit(1)
596
597                 return diag_record
598
599         def __diagnoseNode(self, loginbase, node_record):
600                 # TODO: change the format of the hostname in this 
601                 #               record to something more natural.
602                 nodename                = node_record['nodename']
603                 category                = node_record['category']
604                 prev_category   = node_record['prev_category']
605                 state                   = node_record['state']
606                 #if 'prev_category' in node_record:
607                 #       prev_category = node_record['prev_category']
608                 #else:
609                 #       prev_category = "ERROR"
610                 if node_record['prev_category'] != "NORECORD":
611                 
612                         val = cmpCategoryVal(category, prev_category)
613                         print "%s went from %s -> %s" % (nodename, prev_category, category)
614                         if val == 1:
615                                 # improved
616                                 if node_record['ticket_id'] == "" or node_record['ticket_id'] == None:
617                                         print "closing record with no ticket: ", node_record['nodename']
618                                         node_record['action'] = ['close_rt']
619                                         node_record['message'] = None
620                                         node_record['stage'] = 'monitor-end-record'
621                                         return node_record
622                                 else:
623                                         node_record['stage'] = 'improvement'
624
625                                 #if 'monitor-end-record' in node_record['stage']:
626                                 #       # just ignore it if it's already ended.
627                                 #       # otherwise, the status should be worse, and we won't get
628                                 #       # here.
629                                 #       print "monitor-end-record: ignoring ", node_record['nodename']
630                                 #       return None
631 #
632 #                                       #return None
633                         elif val == -1:
634                                 # current category is worse than previous, carry on
635                                 pass
636                         else:
637                                 #values are equal, carry on.
638                                 #print "why are we here?"
639                                 pass
640
641                 if 'rt' in node_record and 'Status' in node_record['rt']:
642                         if node_record['stage'] == 'ticket_waitforever':
643                                 if 'resolved' in node_record['rt']['Status']:
644                                         print "ending waitforever record for: ", node_record['nodename']
645                                         node_record['action'] = ['noop']
646                                         node_record['message'] = None
647                                         node_record['stage'] = 'monitor-end-record'
648                                         print "oldlog: %s" % node_record['log'],
649                                         print "%15s" % node_record['action']
650                                         return node_record
651                                 if 'new' in node_record['rt']['Status'] and \
652                                         'Queue' in node_record['rt'] and \
653                                         'Monitor' in node_record['rt']['Queue']:
654
655                                         print "RESETTING stage to findbad"
656                                         node_record['stage'] = 'findbad'
657                         
658                 #### COMPARE category and prev_category
659                 # if not_equal
660                 #       then assign a stage based on relative priorities
661                 # else equal
662                 #       then check category for stats.
663                 diag_record = self.diagRecordByCategory(node_record)
664                 if diag_record == None:
665                         #print "diag_record == None"
666                         return None
667
668                 #### found_RT_ticket
669                 # TODO: need to record time found, and maybe add a stage for acting on it...
670                 # NOTE: after found, if the support ticket is resolved, the block is
671                 #               not removed. How to remove the block on this?
672                 if 'found_rt_ticket' in diag_record and \
673                         diag_record['found_rt_ticket'] is not None:
674                         if diag_record['stage'] is not 'improvement':
675                                 diag_record['stage'] = 'ticket_waitforever'
676                                 
677                 current_time = time.time()
678                 # take off four days, for the delay that database caused.
679                 # TODO: generalize delays at PLC, and prevent enforcement when there
680                 #               have been no emails.
681                 # NOTE: 7*SPERDAY exists to offset the 'bad week'
682                 #delta = current_time - diag_record['time'] - 7*SPERDAY
683                 delta = current_time - diag_record['time']
684
685                 message = diag_record['message']
686                 act_record = {}
687                 act_record.update(diag_record)
688
689                 #### DIAGNOSE STAGES 
690                 if   'findbad' in diag_record['stage']:
691                         # The node is bad, and there's no previous record of it.
692                         act_record['email'] = TECH
693                         act_record['action'] = ['noop']
694                         act_record['message'] = message[0]
695                         act_record['stage'] = 'stage_actinoneweek'
696
697                 elif 'nmreset' in diag_record['stage']:
698                         act_record['email']  = ADMIN 
699                         act_record['action'] = ['reset_nodemanager']
700                         act_record['message'] = message[0]
701                         act_record['stage']  = 'nmreset'
702                         return None
703
704                 elif 'reboot_node' in diag_record['stage']:
705                         act_record['email'] = TECH
706                         act_record['action'] = ['noop']
707                         act_record['message'] = message[0]
708                         act_record['stage'] = 'stage_actinoneweek'
709                         
710                 elif 'improvement' in diag_record['stage']:
711                         # - backoff previous squeeze actions (slice suspend, nocreate)
712                         # TODO: add a backoff_squeeze section... Needs to runthrough
713                         print "backing off of %s" % nodename
714                         act_record['action'] = ['close_rt']
715                         act_record['message'] = message[0]
716                         act_record['stage'] = 'monitor-end-record'
717
718                 elif 'actinoneweek' in diag_record['stage']:
719                         if delta >= 7 * SPERDAY: 
720                                 act_record['email'] = TECH | PI
721                                 act_record['stage'] = 'stage_actintwoweeks'
722                                 act_record['message'] = message[1]
723                                 act_record['action'] = ['nocreate' ]
724                                 act_record['time'] = current_time               # reset clock for waitforever
725                         elif delta >= 3* SPERDAY and not 'second-mail-at-oneweek' in act_record:
726                                 act_record['email'] = TECH 
727                                 act_record['message'] = message[0]
728                                 act_record['action'] = ['sendmailagain-waitforoneweekaction' ]
729                                 act_record['second-mail-at-oneweek'] = True
730                         else:
731                                 act_record['message'] = None
732                                 act_record['action'] = ['waitforoneweekaction' ]
733                                 print "ignoring this record for: %s" % act_record['nodename']
734                                 return None                     # don't send if there's no action
735
736                 elif 'actintwoweeks' in diag_record['stage']:
737                         if delta >= 7 * SPERDAY:
738                                 act_record['email'] = TECH | PI | USER
739                                 act_record['stage'] = 'stage_waitforever'
740                                 act_record['message'] = message[2]
741                                 act_record['action'] = ['suspendslices']
742                                 act_record['time'] = current_time               # reset clock for waitforever
743                         elif delta >= 3* SPERDAY and not 'second-mail-at-twoweeks' in act_record:
744                                 act_record['email'] = TECH | PI
745                                 act_record['message'] = message[1]
746                                 act_record['action'] = ['sendmailagain-waitfortwoweeksaction' ]
747                                 act_record['second-mail-at-twoweeks'] = True
748                         else:
749                                 act_record['message'] = None
750                                 act_record['action'] = ['waitfortwoweeksaction']
751                                 return None                     # don't send if there's no action
752
753                 elif 'ticket_waitforever' in diag_record['stage']:
754                         act_record['email'] = TECH
755                         if 'first-found' not in act_record:
756                                 act_record['first-found'] = True
757                                 act_record['log'] += " firstfound"
758                                 act_record['action'] = ['ticket_waitforever']
759                                 act_record['message'] = None
760                                 act_record['time'] = current_time
761                         else:
762                                 if delta >= 7*SPERDAY:
763                                         act_record['action'] = ['ticket_waitforever']
764                                         act_record['message'] = None
765                                         act_record['time'] = current_time               # reset clock
766                                 else:
767                                         act_record['action'] = ['ticket_waitforever']
768                                         act_record['message'] = None
769                                         return None
770
771                 elif 'waitforever' in diag_record['stage']:
772                         # more than 3 days since last action
773                         # TODO: send only on weekdays.
774                         # NOTE: expects that 'time' has been reset before entering waitforever stage
775                         if delta >= 3*SPERDAY:
776                                 act_record['action'] = ['email-againwaitforever']
777                                 act_record['message'] = message[2]
778                                 act_record['time'] = current_time               # reset clock
779                         else:
780                                 act_record['action'] = ['waitforever']
781                                 act_record['message'] = None
782                                 return None                     # don't send if there's no action
783
784                 else:
785                         # There is no action to be taken, possibly b/c the stage has
786                         # already been performed, but diagnose picked it up again.
787                         # two cases, 
788                         #       1. stage is unknown, or 
789                         #       2. delta is not big enough to bump it to the next stage.
790                         # TODO: figure out which. for now assume 2.
791                         print "UNKNOWN stage for %s; nothing done" % nodename
792                         act_record['action'] = ['unknown']
793                         act_record['message'] = message[0]
794
795                         act_record['email'] = TECH
796                         act_record['action'] = ['noop']
797                         act_record['message'] = message[0]
798                         act_record['stage'] = 'stage_actinoneweek'
799                         act_record['time'] = current_time               # reset clock
800                         #print "Exiting..."
801                         #return None
802                         #sys.exit(1)
803
804                 print "%s" % act_record['log'],
805                 print "%15s" % act_record['action']
806                 return act_record
807
808         def getMaxSlices(self, loginbase):
809                 # if sickdb has a loginbase, then it will have at least one node.
810                 site_stats = None
811
812                 for nodename in self.diagnose_in[loginbase].keys():
813                         if nodename in self.findbad['nodes']:
814                                 site_stats = self.findbad['nodes'][nodename]['values']['plcsite']
815                                 break
816
817                 if site_stats == None:
818                         raise Exception, "loginbase with no nodes in findbad"
819                 else:
820                         return site_stats['max_slices']
821
822         def getNumNodes(self, loginbase):
823                 # if sickdb has a loginbase, then it will have at least one node.
824                 site_stats = None
825
826                 for nodename in self.diagnose_in[loginbase].keys():
827                         if nodename in self.findbad['nodes']:
828                                 site_stats = self.findbad['nodes'][nodename]['values']['plcsite']
829                                 break
830
831                 if site_stats == None:
832                         raise Exception, "loginbase with no nodes in findbad"
833                 else:
834                         return site_stats['num_nodes']
835
836         """
837         Returns number of up nodes as the total number *NOT* in act_all with a
838         stage other than 'steady-state' .
839         """
840         def getUpAtSite(self, loginbase, d_diag_site):
841                 # TODO: THIS DOESN"T WORK!!! it misses all the 'debug' state nodes
842                 #               that aren't recorded yet.
843
844                 numnodes = self.getNumNodes(loginbase)
845                 # NOTE: assume nodes we have no record of are ok. (too conservative)
846                 # TODO: make the 'up' value more representative
847                 up = numnodes
848                 for nodename in d_diag_site[loginbase]['nodes'].keys():
849
850                         rec = d_diag_site[loginbase]['nodes'][nodename]
851                         if rec['stage'] != 'monitor-end-record':
852                                 up -= 1
853                         else:
854                                 pass # the node is assumed to be up.
855
856                 #if up != numnodes:
857                 #       print "ERROR: %s total nodes up and down != %d" % (loginbase, numnodes)
858
859                 return up
860
861
class SiteAction:
        """
        Base class for an action run against a dict of arguments.

        run() first checks that every name in parameter_names is a key of
        args, then delegates to _run(), which subclasses override.
        """
        def __init__(self, parameter_names=None):
                # Build the default list per instance; a list literal in the
                # signature would be one object shared by every instance.
                if parameter_names is None:
                        parameter_names = ['hostname', 'ticket_id']
                self.parameter_names = parameter_names

        def checkParam(self, args):
                """Raise Exception naming the first required parameter missing from args."""
                for param in self.parameter_names:
                        if param not in args:
                                raise Exception("Parameter %s not provided in args"%param)

        def run(self, args):
                """Validate args with checkParam, then return the result of _run(args)."""
                self.checkParam(args)
                return self._run(args)

        def _run(self, args):
                """Hook for subclasses; the base implementation does nothing."""
                pass
874
class SuspendAction(SiteAction):
        """Action that calls plc.suspendSlices for args['hostname']."""
        def _run(self, args):
                hostname = args['hostname']
                return plc.suspendSlices(hostname)
878
class RemoveSliceCreation(SiteAction):
        """Action that calls plc.removeSliceCreation for args['hostname']."""
        def _run(self, args):
                hostname = args['hostname']
                return plc.removeSliceCreation(hostname)
882
class BackoffActions(SiteAction):
        """
        Action that calls plc.enableSlices and then plc.enableSliceCreation
        for args['hostname'].  Always returns True.
        """
        def _run(self, args):
                hostname = args['hostname']
                plc.enableSlices(hostname)
                plc.enableSliceCreation(hostname)
                return True
888
889 # TODO: create class for each action below, 
890 #               allow for lists of actions to be performed...
891
def close_rt_backoff(args):
        """
        Close the RT ticket named by args['ticket_id'] and re-enable slices and
        slice creation for args['hostname'].

        Does nothing when args has no 'ticket_id' or when it is "" or None.
        Always returns None.
        """
        if 'ticket_id' not in args:
                return
        ticket_id = args['ticket_id']
        if ticket_id == "" or ticket_id is None:
                return

        mailer.closeTicketViaRT(ticket_id,
                                                        "Ticket CLOSED automatically by SiteAssist.")
        plc.enableSlices(args['hostname'])
        plc.enableSliceCreation(args['hostname'])
        return
899
def reboot_node(args):
        """
        Call reboot.reboot_policy for args['hostname'] and return its result.
        The remaining arguments passed are True and config.debug.
        """
        return reboot.reboot_policy(args['hostname'], True, config.debug)
903
def reset_nodemanager(args):
        """
        Run "/sbin/service nm restart" as root on args['hostname'] over ssh.

        The exit status of ssh is not checked; always returns None.
        Raises KeyError when args has no 'hostname'.
        """
        host = args['hostname']
        import subprocess
        # An argument list (no shell) keeps the hostname from being
        # interpreted by a local shell.
        subprocess.call(["ssh", "root@%s" % host, "/sbin/service", "nm", "restart"])
        return
907
908 class Action(Thread):
909         def __init__(self, l_action):
910                 self.l_action = l_action
911
912                 # the hostname to loginbase mapping
913                 self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")
914
915                 # Actions to take.
916                 self.diagnose_db = soltesz.if_cached_else(1, "diagnose_out", lambda : {})
917                 # Actions taken.
918                 self.act_all   = soltesz.if_cached_else(1, "act_all", lambda : {})
919
920                 # A dict of actions to specific functions. PICKLE doesnt' like lambdas.
921                 self.actions = {}
922                 self.actions['suspendslices'] = lambda args: plc.suspendSlices(args['hostname'])
923                 self.actions['nocreate'] = lambda args: plc.removeSliceCreation(args['hostname'])
924                 self.actions['close_rt'] = lambda args: close_rt_backoff(args)
925                 self.actions['rins'] = lambda args: plc.nodeBootState(args['hostname'], "rins") 
926                 self.actions['noop'] = lambda args: args
927                 self.actions['reboot_node'] = lambda args: reboot_node(args)
928                 self.actions['reset_nodemanager'] = lambda args: args # reset_nodemanager(args)
929
930                 self.actions['ticket_waitforever'] = lambda args: args
931                 self.actions['waitforever'] = lambda args: args
932                 self.actions['unknown'] = lambda args: args
933                 self.actions['waitforoneweekaction'] = lambda args: args
934                 self.actions['waitfortwoweeksaction'] = lambda args: args
935                 self.actions['sendmailagain-waitforoneweekaction'] = lambda args: args
936                 self.actions['sendmailagain-waitfortwoweeksaction'] = lambda args: args
937                 self.actions['email-againwaitforever'] = lambda args: args
938                 self.actions['email-againticket_waitforever'] = lambda args: args
939                                 
940
941                 self.sickdb = {}
942                 Thread.__init__(self)
943
944         def run(self):
945                 self.accumSites()
946                 print "Accumulated %d sick sites" % len(self.sickdb.keys())
947                 logger.debug("Accumulated %d sick sites" % len(self.sickdb.keys()))
948
949                 try:
950                         stats = self.analyseSites()
951                 except Exception, err:
952                         print "----------------"
953                         import traceback
954                         print traceback.print_exc()
955                         print err
956                         if config.policysavedb:
957                                 print "Saving Databases... act_all"
958                                 soltesz.dbDump("act_all", self.act_all)
959                         sys.exit(1)
960
961                 print_stats("sites_observed", stats)
962                 print_stats("sites_diagnosed", stats)
963                 print_stats("nodes_diagnosed", stats)
964                 print_stats("sites_emailed", stats)
965                 print_stats("nodes_actedon", stats)
966                 print string.join(stats['allsites'], ",")
967
968                 if config.policysavedb:
969                         print "Saving Databases... act_all"
970                         #soltesz.dbDump("policy.eventlog", self.eventlog)
971                         # TODO: remove 'diagnose_out', 
972                         #       or at least the entries that were acted on.
973                         soltesz.dbDump("act_all", self.act_all)
974
975         def accumSites(self):
976                 """
977                 Take all nodes, from l_action, look them up in the diagnose_db database, 
978                 and insert them into sickdb[] as:
979
980                 This way only the given l_action nodes will be acted on regardless
981                 of how many from diagnose_db are available.
982
983                         sickdb[loginbase][nodename] = diag_record
984                 """
985                 # TODO: what if l_action == None ?
986                 for nodename in self.l_action:
987
988                         loginbase = self.plcdb_hn2lb[nodename]
989
990                         if loginbase in self.diagnose_db and \
991                                 nodename in self.diagnose_db[loginbase]['nodes']:
992
993                                 diag_record = self.diagnose_db[loginbase]['nodes'][nodename]
994
995                                 if loginbase not in self.sickdb:
996                                         self.sickdb[loginbase] = {'nodes' : {}}
997
998                                 # NOTE: don't copy all node records, since not all will be in l_action
999                                 self.sickdb[loginbase]['nodes'][nodename] = diag_record
1000                                 # NOTE: but, we want to get the loginbase config settings, 
1001                                 #               this is the easiest way.
1002                                 self.sickdb[loginbase]['config'] = self.diagnose_db[loginbase]['config']
1003                         #else:
1004                                 #print "%s not in diagnose_db!!" % loginbase
1005                 return
1006
1007         def __emailSite(self, loginbase, roles, message, args):
1008                 """
1009                 loginbase is the unique site abbreviation, prepended to slice names.
1010                 roles contains TECH, PI, USER roles, and derive email aliases.
1011                 record contains {'message': [<subj>,<body>], 'args': {...}} 
1012                 """
1013                 ticket_id = 0
1014                 args.update({'loginbase':loginbase})
1015
1016                 if not config.mail and not config.debug and config.bcc:
1017                         roles = ADMIN
1018                 if config.mail and config.debug:
1019                         roles = ADMIN
1020
1021                 # build targets
1022                 contacts = []
1023                 if ADMIN & roles:
1024                         contacts += [config.email]
1025                 if TECH & roles:
1026                         contacts += [TECHEMAIL % loginbase]
1027                 if PI & roles:
1028                         contacts += [PIEMAIL % loginbase]
1029                 if USER & roles:
1030                         slices = plc.slices(loginbase)
1031                         if len(slices) >= 1:
1032                                 for slice in slices:
1033                                         contacts += [SLICEMAIL % slice]
1034                                 print "SLIC: %20s : %d slices" % (loginbase, len(slices))
1035                         else:
1036                                 print "SLIC: %20s : 0 slices" % loginbase
1037
1038                 try:
1039                         subject = message[0] % args
1040                         body = message[1] % args
1041                         if ADMIN & roles:
1042                                 # send only to admin
1043                                 if 'ticket_id' in args:
1044                                         subj = "Re: [PL #%s] %s" % (args['ticket_id'], subject)
1045                                 else:
1046                                         subj = "Re: [PL noticket] %s" % subject
1047                                 mailer.email(subj, body, contacts)
1048                                 ticket_id = args['ticket_id']
1049                         else:
1050                                 ticket_id = mailer.emailViaRT(subject, body, contacts, args['ticket_id'])
1051                 except Exception, err:
1052                         print "exception on message:"
1053                         import traceback
1054                         print traceback.print_exc()
1055                         print message
1056
1057                 return ticket_id
1058
1059
1060         def _format_diaginfo(self, diag_node):
1061                 info = diag_node['info']
1062                 if diag_node['stage'] == 'monitor-end-record':
1063                         hlist = "    %s went from '%s' to '%s'\n" % (info[0], info[1], info[2]) 
1064                 else:
1065                         hlist = "    %s %s - %s\n" % (info[0], info[2], info[1]) #(node,ver,daysdn)
1066                 return hlist
1067
1068
1069         def get_email_args(self, act_recordlist, loginbase=None):
1070
1071                 email_args = {}
1072                 email_args['hostname_list'] = ""
1073
1074                 for act_record in act_recordlist:
1075                         email_args['hostname_list'] += act_record['msg_format']
1076                         email_args['hostname'] = act_record['nodename']
1077                         if  'plcnode' in act_record and \
1078                                 'pcu_ids' in act_record['plcnode'] and \
1079                                 len(act_record['plcnode']['pcu_ids']) > 0:
1080                                 print "setting 'pcu_id' for email_args %s"%email_args['hostname']
1081                                 email_args['pcu_id'] = act_record['plcnode']['pcu_ids'][0]
1082                         else:
1083                                 email_args['pcu_id'] = "-1"
1084                                         
1085                         if 'ticket_id' in act_record:
1086                                 if act_record['ticket_id'] == 0 or act_record['ticket_id'] == '0':
1087                                         print "Enter the ticket_id for %s @ %s" % (loginbase, act_record['nodename'])
1088                                         sys.stdout.flush()
1089                                         line = sys.stdin.readline()
1090                                         try:
1091                                                 ticket_id = int(line)
1092                                         except:
1093                                                 print "could not get ticket_id from stdin..."
1094                                                 os._exit(1)
1095                                 else:
1096                                         ticket_id = act_record['ticket_id']
1097                                         
1098                                 email_args['ticket_id'] = ticket_id
1099
1100                 return email_args
1101
1102         def get_unique_issues(self, act_recordlist):
1103                 # NOTE: only send one email per site, per problem...
1104                 unique_issues = {}
1105                 for act_record in act_recordlist:
1106                         act_key = act_record['action'][0]
1107                         if act_key not in unique_issues:
1108                                 unique_issues[act_key] = []
1109                                 
1110                         unique_issues[act_key] += [act_record]
1111                         
1112                 return unique_issues
1113                         
1114
1115         def __actOnSite(self, loginbase, site_record):
1116                 i_nodes_actedon = 0
1117                 i_nodes_emailed = 0
1118
1119                 act_recordlist = []
1120
1121                 for nodename in site_record['nodes'].keys():
1122                         diag_record = site_record['nodes'][nodename]
1123                         act_record  = self.__actOnNode(diag_record)
1124                         #print "nodename: %s %s" % (nodename, act_record)
1125                         if act_record is not None:
1126                                 act_recordlist += [act_record]
1127
1128                 unique_issues = self.get_unique_issues(act_recordlist)
1129
1130                 for issue in unique_issues.keys():
1131                         print "\tworking on issue: %s" % issue
1132                         issue_record_list = unique_issues[issue]
1133                         email_args = self.get_email_args(issue_record_list, loginbase)
1134
1135                         # for each record.
1136                         for act_record in issue_record_list:
1137                                 # if there's a pcu record and email config is set
1138                                 if 'email_pcu' in act_record:
1139                                         if act_record['message'] != None and act_record['email_pcu'] and site_record['config']['email']:
1140                                                 # and 'reboot_node' in act_record['stage']:
1141
1142                                                 email_args['hostname'] = act_record['nodename']
1143                                                 ticket_id = self.__emailSite(loginbase, 
1144                                                                                         act_record['email'], 
1145                                                                                         emailTxt.mailtxt.pcudown[0],
1146                                                                                         email_args)
1147                                                 if ticket_id == 0:
1148                                                         # error.
1149                                                         print "got a ticket_id == 0!!!! %s" % act_record['nodename']
1150                                                         os._exit(1)
1151                                                         pass
1152                                                 email_args['ticket_id'] = ticket_id
1153
1154                         
1155                         act_record = issue_record_list[0]
1156                         # send message before squeezing
1157                         print "\t\tconfig.email: %s and %s" % (act_record['message'] != None, 
1158                                                                                                 site_record['config']['email'])
1159                         if act_record['message'] != None and site_record['config']['email']:
1160                                 ticket_id = self.__emailSite(loginbase, act_record['email'], 
1161                                                                                          act_record['message'], email_args)
1162
1163                                 if ticket_id == 0:
1164                                         # error.
1165                                         print "ticket_id == 0 for %s %s" % (loginbase, act_record['nodename'])
1166                                         os._exit(1)
1167                                         pass
1168
1169                                 # Add ticket_id to ALL nodenames
1170                                 for act_record in issue_record_list:
1171                                         nodename = act_record['nodename']
1172                                         # update node record with RT ticket_id
1173                                         if nodename in self.act_all:
1174                                                 self.act_all[nodename][0]['ticket_id'] = "%s" % ticket_id
1175                                                 # if the ticket was previously resolved, reset it to new.
1176                                                 if 'rt' in act_record and \
1177                                                         'Status' in act_record['rt'] and \
1178                                                         act_record['rt']['Status'] == 'resolved':
1179                                                         mailer.setTicketStatus(ticket_id, "new")
1180                                                 status = mailer.getTicketStatus(ticket_id)
1181                                                 self.act_all[nodename][0]['rt'] = status
1182                                         if config.mail: i_nodes_emailed += 1
1183
1184                         print "\t\tconfig.squeeze: %s and %s" % (config.squeeze,
1185                                                                                                         site_record['config']['squeeze'])
1186                         if config.squeeze and site_record['config']['squeeze']:
1187                                 for act_key in act_record['action']:
1188                                         self.actions[act_key](email_args)
1189                                 i_nodes_actedon += 1
1190                 
1191                 if config.policysavedb:
1192                         print "Saving Databases... act_all, diagnose_out"
1193                         soltesz.dbDump("act_all", self.act_all)
1194                         # remove site record from diagnose_out, it's in act_all as done.
1195                         del self.diagnose_db[loginbase]
1196                         soltesz.dbDump("diagnose_out", self.diagnose_db)
1197
1198                 print "sleeping for 1 sec"
1199                 time.sleep(1)
1200                 #print "Hit enter to continue..."
1201                 #sys.stdout.flush()
1202                 #line = sys.stdin.readline()
1203
1204                 return (i_nodes_actedon, i_nodes_emailed)
1205
1206         def __actOnNode(self, diag_record):
1207                 nodename = diag_record['nodename']
1208                 message = diag_record['message']
1209
1210                 act_record = {}
1211                 act_record.update(diag_record)
1212                 act_record['nodename'] = nodename
1213                 act_record['msg_format'] = self._format_diaginfo(diag_record)
1214                 print "act_record['stage'] == %s " % act_record['stage']
1215
1216                 # avoid end records, and nmreset records                                        
1217                 # reboot_node_failed, is set below, so don't reboot repeatedly.
1218
1219                 if 'monitor-end-record' not in act_record['stage'] and \
1220                    'nmreset' not in act_record['stage'] and \
1221                    'reboot_node_failed' not in act_record:
1222
1223                         if "DOWN" in act_record['log'] and \
1224                                         'pcu_ids' in act_record['plcnode'] and \
1225                                         len(act_record['plcnode']['pcu_ids']) > 0:
1226
1227                                 print "%s" % act_record['log'],
1228                                 print "%15s" % (['reboot_node'],)
1229                                 # Set node to re-install
1230                                 plc.nodeBootState(act_record['nodename'], "rins")       
1231                                 try:
1232                                         ret = reboot_node({'hostname': act_record['nodename']})
1233                                 except Exception, exc:
1234                                         print "exception on reboot_node:"
1235                                         import traceback
1236                                         print traceback.print_exc()
1237                                         ret = False
1238
1239                                 if ret: # and ( 'reboot_node_failed' not in act_record or act_record['reboot_node_failed'] == False):
1240                                         # Reboot Succeeded
1241                                         print "reboot succeeded for %s" % act_record['nodename']
1242                                         act_record2 = {}
1243                                         act_record2.update(act_record)
1244                                         act_record2['action'] = ['reboot_node']
1245                                         act_record2['stage'] = "reboot_node"
1246                                         act_record2['reboot_node_failed'] = False
1247                                         act_record2['email_pcu'] = False
1248
1249                                         if nodename not in self.act_all: 
1250                                                 self.act_all[nodename] = []
1251                                         print "inserting 'reboot_node' record into act_all"
1252                                         self.act_all[nodename].insert(0,act_record2)
1253
1254                                         # return None to avoid further action
1255                                         print "Taking no further action"
1256                                         return None
1257                                 else:
1258                                         print "reboot failed for %s" % act_record['nodename']
1259                                         # set email_pcu to also send pcu notice for this record.
1260                                         act_record['reboot_node_failed'] = True
1261                                         act_record['email_pcu'] = True
1262
1263                         print "%s" % act_record['log'],
1264                         print "%15s" % act_record['action']
1265
1266                 if act_record['stage'] is not 'monitor-end-record' and \
1267                    act_record['stage'] is not 'nmreset':
1268                         if nodename not in self.act_all: 
1269                                 self.act_all[nodename] = []
1270
1271                         self.act_all[nodename].insert(0,act_record)
1272                 else:
1273                         print "Not recording %s in act_all" % nodename
1274
1275                 return act_record
1276
1277         def analyseSites(self):
1278                 i_sites_observed = 0
1279                 i_sites_diagnosed = 0
1280                 i_nodes_diagnosed = 0
1281                 i_nodes_actedon = 0
1282                 i_sites_emailed = 0
1283                 l_allsites = []
1284
1285                 sorted_sites = self.sickdb.keys()
1286                 sorted_sites.sort()
1287                 for loginbase in sorted_sites:
1288                         site_record = self.sickdb[loginbase]
1289                         print "sites: %s" % loginbase
1290                         
1291                         i_nodes_diagnosed += len(site_record.keys())
1292                         i_sites_diagnosed += 1
1293
1294                         (na,ne) = self.__actOnSite(loginbase, site_record)
1295
1296                         i_sites_observed += 1
1297                         i_nodes_actedon += na
1298                         i_sites_emailed += ne
1299
1300                         l_allsites += [loginbase]
1301
1302                 return {'sites_observed': i_sites_observed, 
1303                                 'sites_diagnosed': i_sites_diagnosed, 
1304                                 'nodes_diagnosed': i_nodes_diagnosed, 
1305                                 'sites_emailed': i_sites_emailed, 
1306                                 'nodes_actedon': i_nodes_actedon, 
1307                                 'allsites':l_allsites}
1308
1309         def print_stats(self, key, stats):
1310                 print "%20s : %d" % (key, stats[key])
1311
1312
1313
1314         #"""
1315         #Prints, logs, and emails status of up nodes, down nodes, and buckets.
1316         #"""
1317         #def status(self):
1318         #       sub = "Monitor Summary"
1319         #       msg = "\nThe following nodes were acted upon:  \n\n"
1320         #       for (node, (type, date)) in self.emailed.items():
1321         #               # Print only things acted on today.
1322         #               if (time.gmtime(time.time())[2] == time.gmtime(date)[2]):
1323         #                       msg +="%s\t(%s)\t%s\n" %(node, type, time.ctime(date))
1324         #       msg +="\n\nThe following sites have been 'squeezed':\n\n"
1325         #       for (loginbase, (date, type)) in self.squeezed.items():
1326         #               # Print only things acted on today.
1327         #               if (time.gmtime(time.time())[2] == time.gmtime(date)[2]):
1328         #                       msg +="%s\t(%s)\t%s\n" %(loginbase, type, time.ctime(date))
1329         #       mailer.email(sub, msg, [SUMTO])
1330         #       logger.info(msg)
1331         #       return 
1332
1333         #"""
1334         #Store/Load state of emails.  When, where, what.
1335         #"""
1336         #def emailedStore(self, action):
1337         #       try:
1338         #               if action == "LOAD":
1339         #                       f = open(DAT, "r+")
1340         #                       logger.info("POLICY:  Found and reading " + DAT)
1341         #                       self.emailed.update(pickle.load(f))
1342         #               if action == "WRITE":
1343         #                       f = open(DAT, "w")
1344         #                       #logger.debug("Writing " + DAT)
1345         #                       pickle.dump(self.emailed, f)
1346         #               f.close()
1347         #       except Exception, err:
1348         #               logger.info("POLICY:  Problem with DAT, %s" %err)
1349
1350
1351 #class Policy(Thread):
1352
1353 def main():
1354         print "policy.py is a module, not a script for running directly."
1355
1356 if __name__ == '__main__':
1357         import os
1358         import plc
1359         try:
1360                 main()
1361         except KeyboardInterrupt:
1362                 print "Killed.  Exitting."
1363                 logger.info('Monitor Killed')
1364                 os._exit(0)