changes for 3.0
[monitor.git] / policy.py
1 #
2 # Copyright (c) 2004  The Trustees of Princeton University (Trustees).
3 #
4 # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
5 #
6 # $Id: policy.py,v 1.17 2007/08/29 17:26:50 soltesz Exp $
7 #
8 # Policy Engine.
9
10 #from monitor import *
11 from threading import *
12 import time
13 import logging
14 import mailer
15 import emailTxt
16 import pickle
17 import Queue
18 import plc
19 import sys
20 import os
21 import reboot
22 import database
23 import string
24 from unified_model import cmpCategoryVal
25 import config
26
# Path of the monitor data file.
DAT = "./monitor.dat"

logger = logging.getLogger("monitor")

# Seconds between policy enforcement runs.
POLSLEEP = 7200

# Where to email the summary
SUMTO = "soltesz@cs.princeton.edu"
# Address templates; "%s" is filled in by the caller.
TECHEMAIL = "tech-%s@sites.planet-lab.org"
PIEMAIL = "pi-%s@sites.planet-lab.org"
SLICEMAIL = "%s@slices.planet-lab.org"
PLCEMAIL = "support@planet-lab.org"

# Time units, in seconds.
SPERMIN = 60
SPERHOUR = 60 * SPERMIN
SPERDAY = 24 * SPERHOUR

# Thresholds.  Written as a number of days, but stored in SECONDS.
PITHRESH = 7 * SPERDAY
SLICETHRESH = 7 * SPERDAY
# Time before attempting rins again
RINSTHRESH = 5 * SPERDAY
# Time before calling the node dead.
DEADTHRESH = 30 * SPERDAY

# Minimum number of nodes up before squeezing
MINUP = 2

# Contact roles.  NOTE(review): the values are distinct powers of two, so
# they can presumably be OR-ed together -- confirm against the callers.
TECH = 1
PI = 2
USER = 4
ADMIN = 8
59
60 # IF:
61 #  no SSH, down.
62 #  bad disk, down
63 #  DNS, kinda down (sick)
64 #  clock, kinda down (sick)
65 #  Full disk, going to be down
66
67 # Actions:
68 #  Email
69 #  suspend slice creation
70 #  kill slices
def array_to_priority_map(array):
        """Map each entry of ``array`` to its position in the sequence.

        When an entry occurs more than once, the position of its last
        occurrence is the one kept.  The result is intended for subsequent
        use in the cmpMap() function.
        """
        return dict((entry, position) for position, entry in enumerate(array))
81
def getdebug():
        """Return the ``debug`` attribute of the imported ``config`` module."""
        return config.debug
84
def print_stats(key, stats):
        """Print ``stats[key]`` as a right-aligned "name : value" line.

        Nothing is printed when ``key`` is not present in ``stats``.
        """
        if key not in stats:
                return
        print("%20s : %d" % (key, stats[key]))
87
88
89 class Merge(Thread):
90         def __init__(self, l_merge, toRT):
91                 self.toRT = toRT
92                 self.merge_list = l_merge
93                 # the hostname to loginbase mapping
94                 self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")
95
96                 # Previous actions taken on nodes.
97                 self.act_all = database.if_cached_else(1, "act_all", lambda : {})
98                 self.findbad = database.if_cached_else(1, "findbad", lambda : {})
99
100                 self.cache_all = database.if_cached_else(1, "act_all", lambda : {})
101                 self.sickdb = {}
102                 self.mergedb = {}
103                 Thread.__init__(self)
104
105         def run(self):
106                 # populate sickdb
107                 self.accumSickSites()
108                 # read data from findbad and act_all
109                 self.mergeActionsAndBadDB()
110                 # pass node_records to RT
111                 self.sendToRT()
112
113         def accumSickSites(self):
114                 """
115                 Take all nodes, from l_diagnose, look them up in the act_all database, 
116                 and insert them into sickdb[] as:
117
118                         sickdb[loginbase][nodename] = fb_record
119                 """
120                 # look at all problems reported by findbad
121                 l_nodes = self.findbad['nodes'].keys()
122                 count = 0
123                 for nodename in l_nodes:
124                         if nodename not in self.merge_list:
125                                 continue                # skip this node, since it's not wanted
126
127                         count += 1
128                         loginbase = self.plcdb_hn2lb[nodename]
129                         values = self.findbad['nodes'][nodename]['values']
130
131                         fb_record = {}
132                         fb_record['nodename'] = nodename
133                         try:
134                                 fb_record['category'] = values['category']
135                         except:
136                                 print values
137                                 print nodename
138                                 print self.findbad['nodes'][nodename]
139                                 count -= 1
140                                 continue
141                         fb_record['state'] = values['state']
142                         fb_record['comonstats'] = values['comonstats']
143                         fb_record['plcnode'] = values['plcnode']
144                         fb_record['kernel'] = self.getKernel(values['kernel'])
145                         fb_record['stage'] = "findbad"
146                         fb_record['message'] = None
147                         fb_record['bootcd'] = values['bootcd']
148                         fb_record['args'] = None
149                         fb_record['info'] = None
150                         fb_record['time'] = time.time()
151                         fb_record['date_created'] = time.time()
152
153                         if loginbase not in self.sickdb:
154                                 self.sickdb[loginbase] = {}
155
156                         self.sickdb[loginbase][nodename] = fb_record
157
158                 print "Found %d nodes" % count
159
160         def getKernel(self, unamestr):
161                 s = unamestr.split()
162                 if len(s) > 2:
163                         return s[2]
164                 else:
165                         return ""
166
167         def mergeActionsAndBadDB(self): 
168                 """
169                 - Look at the sick node_records as reported in findbad, 
170                 - Then look at the node_records in act_all.  
171
172                 There are four cases:
173                 1) Problem in findbad, no problem in act_all
174                         this ok, b/c it just means it's a new problem
175                 2) Problem in findbad, problem in act_all
176                         -Did the problem get better or worse?  
177                                 -If Same, or Worse, then continue looking for open tickets.
178                                 -If Better, or No problem, then "back-off" penalties.
179                                         This judgement may need to wait until 'Diagnose()'
180
181                 3) No problem in findbad, problem in act_all
182                         The the node is operational again according to Findbad()
183
184                 4) No problem in findbad, no problem in act_all
185                         There won't be a record in either db, so there's no code.
186                 """
187
188                 sorted_sites = self.sickdb.keys()
189                 sorted_sites.sort()
190                 # look at all problems reported by findbad
191                 for loginbase in sorted_sites:
192                         d_fb_nodes = self.sickdb[loginbase]
193                         sorted_nodes = d_fb_nodes.keys()
194                         sorted_nodes.sort()
195                         for nodename in sorted_nodes:
196                                 fb_record = self.sickdb[loginbase][nodename]
197                                 x = fb_record
198                                 if loginbase not in self.mergedb:
199                                         self.mergedb[loginbase] = {}
200
201                                 # take the info either from act_all or fb-record.
202                                 # if node not in act_all
203                                 #       then take it from fbrecord, obviously.
204                                 # else node in act_all
205                                 #   if act_all == 0 length (no previous records)
206                                 #               then take it from fbrecord.
207                                 #   else
208                                 #           take it from act_all.
209                                 #   
210
211                                 # We must compare findbad state with act_all state
212                                 if nodename not in self.act_all:
213                                         # 1) ok, b/c it's a new problem. set ticket_id to null
214                                         self.mergedb[loginbase][nodename] = {} 
215                                         self.mergedb[loginbase][nodename].update(x)
216                                         self.mergedb[loginbase][nodename]['ticket_id'] = ""
217                                         self.mergedb[loginbase][nodename]['prev_category'] = "NORECORD" 
218                                 else: 
219                                         if len(self.act_all[nodename]) == 0:
220                                                 self.mergedb[loginbase][nodename] = {} 
221                                                 self.mergedb[loginbase][nodename].update(x)
222                                                 self.mergedb[loginbase][nodename]['ticket_id'] = ""
223                                                 self.mergedb[loginbase][nodename]['prev_category'] = "NORECORD" 
224                                         else:
225                                                 y = self.act_all[nodename][0]
226                                                 y['prev_category'] = y['category']
227
228                                                 self.mergedb[loginbase][nodename] = {}
229                                                 self.mergedb[loginbase][nodename].update(y)
230                                                 self.mergedb[loginbase][nodename]['comonstats'] = x['comonstats']
231                                                 self.mergedb[loginbase][nodename]['category']   = x['category']
232                                                 self.mergedb[loginbase][nodename]['state'] = x['state']
233                                                 self.mergedb[loginbase][nodename]['kernel']=x['kernel']
234                                                 self.mergedb[loginbase][nodename]['bootcd']=x['bootcd']
235                                                 self.mergedb[loginbase][nodename]['plcnode']=x['plcnode']
236                                                 ticket = get_ticket_id(self.mergedb[loginbase][nodename])
237                                                 self.mergedb[loginbase][nodename]['rt'] = mailer.getTicketStatus(ticket)
238
239                                         # delete the entry from cache_all to keep it out of case 3)
240                                         del self.cache_all[nodename]
241
242                 # 3) nodes that remin in cache_all were not identified by findbad.
243                 #        Do we keep them or not?
244                 #   NOTE: i think that since the categories are performed before this
245                 #               step now, and by a monitor-controlled agent.
246
247                 # TODO: This does not work correctly.  Do we need this? 
248                 #for hn in self.cache_all.keys():
249                 #       y = self.act_all[hn][0]
250                 #       if 'monitor' in y['bucket']:
251                 #               loginbase = self.plcdb_hn2lb[hn] 
252                 #               if loginbase not in self.sickdb:
253                 #                       self.sickdb[loginbase] = {}
254                 #               self.sickdb[loginbase][hn] = y
255                 #       else:
256                 #               del self.cache_all[hn]
257
258                 print "len of cache_all: %d" % len(self.cache_all.keys())
259                 return
260
261         def sendToRT(self):
262                 sorted_sites = self.mergedb.keys()
263                 sorted_sites.sort()
264                 # look at all problems reported by merge
265                 for loginbase in sorted_sites:
266                         d_merge_nodes = self.mergedb[loginbase]
267                         for nodename in d_merge_nodes.keys():
268                                 record = self.mergedb[loginbase][nodename]
269                                 self.toRT.put(record)
270
271                 # send signal to stop reading
272                 self.toRT.put(None)
273                 return
274
275 class Diagnose(Thread):
276         def __init__(self, fromRT):
277                 self.fromRT = fromRT
278                 self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")
279                 self.findbad = database.if_cached_else(1, "findbad", lambda : {})
280
281                 self.diagnose_in = {}
282                 self.diagnose_out = {}
283                 Thread.__init__(self)
284
285
286         def run(self):
287                 self.accumSickSites()
288
289                 print "Accumulated %d sick sites" % len(self.diagnose_in.keys())
290                 logger.debug("Accumulated %d sick sites" % len(self.diagnose_in.keys()))
291
292                 try:
293                         stats = self.diagnoseAll()
294                 except Exception, err:
295                         print "----------------"
296                         import traceback
297                         print traceback.print_exc()
298                         from nodecommon import email_exception
299                         email_exception()
300                         print err
301                         #if config.policysavedb:
302                         sys.exit(1)
303
304                 print_stats("sites_observed", stats)
305                 print_stats("sites_diagnosed", stats)
306                 print_stats("nodes_diagnosed", stats)
307
308                 if config.policysavedb:
309                         print "Saving Databases... diagnose_out"
310                         database.dbDump("diagnose_out", self.diagnose_out)
311
312         def accumSickSites(self):
313                 """
314                 Take all nodes, from l_diagnose, look them up in the diagnose_out database, 
315                 and insert them into diagnose_in[] as:
316
317                         diagnose_in[loginbase] = [diag_node1, diag_node2, ...]
318                 """
319                 while 1:
320                         node_record = self.fromRT.get(block = True)
321                         if node_record == None:
322                                 break;
323
324                         nodename = node_record['nodename']
325                         loginbase = self.plcdb_hn2lb[nodename]
326
327                         if loginbase not in self.diagnose_in:
328                                 self.diagnose_in[loginbase] = {}
329
330                         self.diagnose_in[loginbase][nodename] = node_record
331
332                 return
333
334         def diagnoseAll(self):
335                 i_sites_observed = 0
336                 i_sites_diagnosed = 0
337                 i_nodes_diagnosed = 0
338                 i_nodes_actedon = 0
339                 i_sites_emailed = 0
340                 l_allsites = []
341
342                 sorted_sites = self.diagnose_in.keys()
343                 sorted_sites.sort()
344                 self.diagnose_out= {}
345                 for loginbase in sorted_sites:
346                         l_allsites += [loginbase]
347
348                         d_diag_nodes = self.diagnose_in[loginbase]
349                         d_act_records = self.__diagnoseSite(loginbase, d_diag_nodes)
350                         # store records in diagnose_out, for saving later.
351                         self.diagnose_out.update(d_act_records)
352                         
353                         if len(d_act_records[loginbase]['nodes'].keys()) > 0:
354                                 i_nodes_diagnosed += (len(d_act_records[loginbase]['nodes'].keys()))
355                                 i_sites_diagnosed += 1
356                         i_sites_observed += 1
357
358                 return {'sites_observed': i_sites_observed, 
359                                 'sites_diagnosed': i_sites_diagnosed, 
360                                 'nodes_diagnosed': i_nodes_diagnosed, 
361                                 'allsites':l_allsites}
362
363                 pass
364                 
365         def getDaysDown(cls, diag_record):
366                 daysdown = -1
367                 last_contact = diag_record['plcnode']['last_contact']
368                 date_created = diag_record['plcnode']['date_created']
369
370                 if diag_record['comonstats']['uptime'] != "null" and diag_record['comonstats']['uptime'] != "-1":
371                         daysdown = - int(float(diag_record['comonstats']['uptime'])) // (60*60*24)
372                 elif last_contact is None:
373                         if date_created is not None:
374                                 now = time.time()
375                                 diff = now - date_created
376                                 daysdown = diff // (60*60*24)
377                         else:
378                                 daysdown = -1
379                 else:
380                         now = time.time()
381                         diff = now - last_contact
382                         daysdown = diff // (60*60*24)
383                 return daysdown
384         getDaysDown = classmethod(getDaysDown)
385
386         def getStrDaysDown(cls, diag_record):
387                 daysdown = "unknown"
388                 last_contact = diag_record['plcnode']['last_contact']
389                 date_created = diag_record['plcnode']['date_created']
390
391                 if      diag_record['comonstats']['uptime'] != "null" and \
392                         diag_record['comonstats']['uptime'] != "-1":
393                         daysdown = int(float(diag_record['comonstats']['uptime'])) // (60*60*24)
394                         daysdown = "%d days up" % daysdown
395
396                 elif last_contact is None:
397                         if date_created is not None:
398                                 now = time.time()
399                                 diff = now - date_created
400                                 daysdown = diff // (60*60*24)
401                                 daysdown = "Never contacted PLC, created %s days ago" % daysdown
402                         else:
403                                 daysdown = "Never contacted PLC"
404                 else:
405                         now = time.time()
406                         diff = now - last_contact
407                         daysdown = diff // (60*60*24)
408                         daysdown = "%s days down" % daysdown
409                 return daysdown
410         getStrDaysDown = classmethod(getStrDaysDown)
411         #def getStrDaysDown(cls, diag_record):
412         #       daysdown = cls.getDaysDown(diag_record)
413         #       if daysdown > -1:
414         #               return "%d days down"%daysdown
415         #       elif daysdown == -1:
416         #               return "Has never contacted PLC"
417         #       else:
418         #               return "%d days up"% -daysdown
419         #getStrDaysDown = classmethod(getStrDaysDown)
420
421         def __getCDVersion(self, diag_record, nodename):
422                 cdversion = ""
423                 #print "Getting kernel for: %s" % diag_record['nodename']
424                 cdversion = diag_record['kernel']
425                 return cdversion
426
427         def __diagnoseSite(self, loginbase, d_diag_nodes):
428                 """
429                 d_diag_nodes are diagnose_in entries.
430                 """
431                 d_diag_site = {loginbase : { 'config' : 
432                                                                                                 {'squeeze': False,
433                                                                                                  'email': False
434                                                                                                 }, 
435                                                                         'nodes': {}
436                                                                         }
437                                            }
438                 sorted_nodes = d_diag_nodes.keys()
439                 sorted_nodes.sort()
440                 for nodename in sorted_nodes:
441                         node_record = d_diag_nodes[nodename]
442                         diag_record = self.__diagnoseNode(loginbase, node_record)
443
444                         if diag_record != None:
445                                 d_diag_site[loginbase]['nodes'][nodename] = diag_record
446
447                                 # NOTE: improvement means, we need to act/squeeze and email.
448                                 #print "DIAG_RECORD", diag_record
449                                 if 'monitor-end-record' in diag_record['stage'] or \
450                                    'nmreset' in diag_record['stage']:
451                                 #       print "resetting loginbase!" 
452                                         d_diag_site[loginbase]['config']['squeeze'] = True
453                                         d_diag_site[loginbase]['config']['email'] = True
454                                 #else:
455                                 #       print "NO IMPROVEMENT!!!!"
456                         else:
457                                 pass # there is nothing to do for this node.
458
459                 # NOTE: these settings can be overridden by command line arguments,
460                 #       or the state of a record, i.e. if already in RT's Support Queue.
461                 nodes_up = self.getUpAtSite(loginbase, d_diag_site)
462                 if nodes_up < MINUP:
463                         d_diag_site[loginbase]['config']['squeeze'] = True
464
465                 max_slices = self.getMaxSlices(loginbase)
466                 num_nodes = self.getNumNodes(loginbase)
467                 # NOTE: when max_slices == 0, this is either a new site (the old way)
468                 #       or an old disabled site from previous monitor (before site['enabled'])
469                 if nodes_up < num_nodes and max_slices != 0:
470                         d_diag_site[loginbase]['config']['email'] = True
471
472                 if len(d_diag_site[loginbase]['nodes'].keys()) > 0:
473                         print "SITE: %20s : %d nodes up, at most" % (loginbase, nodes_up)
474
475                 return d_diag_site
476
477         def diagRecordByCategory(self, node_record):
478                 nodename = node_record['nodename']
479                 category = node_record['category']
480                 state    = node_record['state']
481                 loginbase = self.plcdb_hn2lb[nodename]
482                 diag_record = None
483
484                 if  "ERROR" in category:        # i.e. "DOWN"
485                         diag_record = {}
486                         diag_record.update(node_record)
487                         daysdown = self.getDaysDown(diag_record) 
488                         if daysdown < 7:
489                                 format = "DIAG: %20s : %-40s Down only %s days  NOTHING DONE"
490                                 print format % (loginbase, nodename, daysdown)
491                                 return None
492
493                         s_daysdown = self.getStrDaysDown(diag_record)
494                         diag_record['message'] = emailTxt.mailtxt.newdown
495                         diag_record['args'] = {'nodename': nodename}
496                         diag_record['info'] = (nodename, s_daysdown, "")
497
498                         if 'reboot_node_failed' in node_record:
499                                 # there was a previous attempt to use the PCU.
500                                 if node_record['reboot_node_failed'] == False:
501                                         # then the last attempt apparently, succeeded.
502                                         # But, the category is still 'ERROR'.  Therefore, the
503                                         # PCU-to-Node mapping is broken.
504                                         #print "Setting message for ERROR node to PCU2NodeMapping: %s" % nodename
505                                         diag_record['message'] = emailTxt.mailtxt.pcutonodemapping
506                                         diag_record['email_pcu'] = True
507
508                         if 'ticket_id' in diag_record:
509                                 if diag_record['ticket_id'] == "":
510                                         if 'found_rt_ticket' in diag_record:
511                                                 ticket_id = diag_record['found_rt_ticket']
512                                         else:
513                                                 ticket_id = "None"
514                                 else:
515                                         ticket_id = diag_record['ticket_id']
516                         else:
517                                 ticket_id = "None"
518
519                         diag_record['log'] = "DOWN: %20s : %-40s == %20s %s" % \
520                                         (loginbase, nodename, diag_record['info'][1:], ticket_id)
521
522                 elif "OLDBOOTCD" in category:
523                         # V2 boot cds as determined by findbad
524                         s_daysdown = self.getStrDaysDown(node_record)
525                         s_cdversion = self.__getCDVersion(node_record, nodename)
526                         diag_record = {}
527                         diag_record.update(node_record)
528                         #if "2.4" in diag_record['kernel'] or "v2" in diag_record['bootcd']:
529                         diag_record['message'] = emailTxt.mailtxt.newbootcd
530                         diag_record['args'] = {'nodename': nodename}
531                         diag_record['info'] = (nodename, s_daysdown, s_cdversion)
532                         if diag_record['ticket_id'] == "":
533                                 diag_record['log'] = "BTCD: %20s : %-40s == %20s %20s %s" % \
534                                                                         (loginbase, nodename, diag_record['kernel'], 
535                                                                          diag_record['bootcd'], diag_record['found_rt_ticket'])
536                         else:
537                                 diag_record['log'] = "BTCD: %20s : %-40s == %20s %20s %s" % \
538                                                                         (loginbase, nodename, diag_record['kernel'], 
539                                                                          diag_record['bootcd'], diag_record['ticket_id'])
540
541                 elif "PROD" in category:
542                         if "DEBUG" in state:
543                                 # Not sure what to do with these yet.  Probably need to
544                                 # reboot, and email.
545                                 print "DEBG: %20s : %-40s  NOTHING DONE" % (loginbase, nodename)
546                                 return None
547                         elif "BOOT" in state:
548                                 # no action needed.
549                                 # TODO: remove penalties, if any are applied.
550                                 now = time.time()
551                                 last_contact = node_record['plcnode']['last_contact']
552                                 if last_contact == None:
553                                         time_diff = 0
554                                 else:
555                                         time_diff = now - last_contact;
556
557                                 if 'improvement' in node_record['stage']:
558                                         # then we need to pass this on to 'action'
559                                         diag_record = {}
560                                         diag_record.update(node_record)
561                                         diag_record['message'] = emailTxt.mailtxt.newthankyou
562                                         diag_record['args'] = {'nodename': nodename}
563                                         diag_record['info'] = (nodename, node_record['prev_category'], 
564                                                                                                          node_record['category'])
565                                         if 'email_pcu' in diag_record:
566                                                 if diag_record['email_pcu']:
567                                                         # previously, the pcu failed to reboot, so send
568                                                         # email. Now, reset these values to try the reboot
569                                                         # again.
570                                                         diag_record['email_pcu'] = False
571                                                         del diag_record['reboot_node_failed']
572
573                                         if diag_record['ticket_id'] == "":
574                                                 diag_record['log'] = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
575                                                                         (loginbase, nodename, diag_record['stage'], 
576                                                                          state, category, diag_record['found_rt_ticket'])
577                                         else:
578                                                 diag_record['log'] = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
579                                                                         (loginbase, nodename, diag_record['stage'], 
580                                                                          state, category, diag_record['ticket_id'])
581                                         return diag_record
582                                 #elif time_diff >= 6*SPERHOUR:
583                                 #       # heartbeat is older than 30 min.
584                                 #       # then reset NM.
585                                 #       #print "Possible NM problem!! %s - %s = %s" % (now, last_contact, time_diff)
586                                 #       diag_record = {}
587                                 #       diag_record.update(node_record)
588                                 #       diag_record['message'] = emailTxt.mailtxt.NMReset
589                                 #       diag_record['args'] = {'nodename': nodename}
590                                 #       diag_record['stage'] = "nmreset"
591                                 #       diag_record['info'] = (nodename, 
592                                 #                                                       node_record['prev_category'], 
593                                 #                                                       node_record['category'])
594                                 #       if diag_record['ticket_id'] == "":
595                                 #               diag_record['log'] = "NM  : %20s : %-40s == %20s %20s %s %s" % \
596                                 #                                       (loginbase, nodename, diag_record['stage'], 
597                                 #                                        state, category, diag_record['found_rt_ticket'])
598                                 #       else:
599                                 #               diag_record['log'] = "NM  : %20s : %-40s == %20s" % \
600                                 #                                       (loginbase, nodename, diag_record['stage'])
601 #
602 #                                       return diag_record
603                                 else:
604                                         return None
605                         else:
606                                 # unknown
607                                 pass
608                 elif "ALPHA"    in category:
609                         pass
610                 elif "clock_drift" in category:
611                         pass
612                 elif "dns"    in category:
613                         pass
614                 elif "filerw"    in category:
615                         pass
616                 else:
617                         print "Unknown category!!!! %s" % category
618                         sys.exit(1)
619
620                 return diag_record
621
622         def __diagnoseNode(self, loginbase, node_record):
623                 # TODO: change the format of the hostname in this 
624                 #               record to something more natural.
625                 nodename                = node_record['nodename']
626                 category                = node_record['category']
627                 prev_category   = node_record['prev_category']
628                 state                   = node_record['state']
629                 #if 'prev_category' in node_record:
630                 #       prev_category = node_record['prev_category']
631                 #else:
632                 #       prev_category = "ERROR"
633                 if node_record['prev_category'] != "NORECORD":
634                 
635                         val = cmpCategoryVal(category, prev_category)
636                         print "%s went from %s -> %s" % (nodename, prev_category, category)
637                         if val == 1:
638                                 # improved
639                                 if node_record['ticket_id'] == "" or node_record['ticket_id'] == None:
640                                         print "closing record with no ticket: ", node_record['nodename']
641                                         node_record['action'] = ['close_rt']
642                                         node_record['message'] = None
643                                         node_record['stage'] = 'monitor-end-record'
644                                         return node_record
645                                 else:
646                                         node_record['stage'] = 'improvement'
647
648                                 #if 'monitor-end-record' in node_record['stage']:
649                                 #       # just ignore it if it's already ended.
650                                 #       # otherwise, the status should be worse, and we won't get
651                                 #       # here.
652                                 #       print "monitor-end-record: ignoring ", node_record['nodename']
653                                 #       return None
654 #
655 #                                       #return None
656                         elif val == -1:
657                                 # current category is worse than previous, carry on
658                                 pass
659                         else:
660                                 #values are equal, carry on.
661                                 #print "why are we here?"
662                                 pass
663
664                 if 'rt' in node_record and 'Status' in node_record['rt']:
665                         if node_record['stage'] == 'ticket_waitforever':
666                                 if 'resolved' in node_record['rt']['Status']:
667                                         print "ending waitforever record for: ", node_record['nodename']
668                                         node_record['action'] = ['noop']
669                                         node_record['message'] = None
670                                         node_record['stage'] = 'monitor-end-record'
671                                         print "oldlog: %s" % node_record['log'],
672                                         print "%15s" % node_record['action']
673                                         return node_record
674                                 if 'new' in node_record['rt']['Status'] and \
675                                         'Queue' in node_record['rt'] and \
676                                         'Monitor' in node_record['rt']['Queue']:
677
678                                         print "RESETTING stage to findbad"
679                                         node_record['stage'] = 'findbad'
680                         
681                 #### COMPARE category and prev_category
682                 # if not_equal
683                 #       then assign a stage based on relative priorities
684                 # else equal
685                 #       then check category for stats.
686                 diag_record = self.diagRecordByCategory(node_record)
687                 if diag_record == None:
688                         #print "diag_record == None"
689                         return None
690
691                 #### found_RT_ticket
692                 # TODO: need to record time found, and maybe add a stage for acting on it...
693                 # NOTE: after found, if the support ticket is resolved, the block is
694                 #               not removed. How to remove the block on this?
695                 if 'found_rt_ticket' in diag_record and \
696                         diag_record['found_rt_ticket'] is not None:
697                         if diag_record['stage'] is not 'improvement':
698                                 diag_record['stage'] = 'ticket_waitforever'
699                                 
700                 current_time = time.time()
701                 # take off four days, for the delay that database caused.
702                 # TODO: generalize delays at PLC, and prevent enforcement when there
703                 #               have been no emails.
704                 # NOTE: 7*SPERDAY exists to offset the 'bad week'
705                 #delta = current_time - diag_record['time'] - 7*SPERDAY
706                 delta = current_time - diag_record['time']
707
708                 message = diag_record['message']
709                 act_record = {}
710                 act_record.update(diag_record)
711
712                 #### DIAGNOSE STAGES 
713                 if   'findbad' in diag_record['stage']:
714                         # The node is bad, and there's no previous record of it.
715                         act_record['email'] = TECH
716                         act_record['action'] = ['noop']
717                         act_record['message'] = message[0]
718                         act_record['stage'] = 'stage_actinoneweek'
719
720                 elif 'nmreset' in diag_record['stage']:
721                         act_record['email']  = ADMIN 
722                         act_record['action'] = ['reset_nodemanager']
723                         act_record['message'] = message[0]
724                         act_record['stage']  = 'nmreset'
725                         return None
726
727                 elif 'reboot_node' in diag_record['stage']:
728                         act_record['email'] = TECH
729                         act_record['action'] = ['noop']
730                         act_record['message'] = message[0]
731                         act_record['stage'] = 'stage_actinoneweek'
732                         
733                 elif 'improvement' in diag_record['stage']:
734                         # - backoff previous squeeze actions (slice suspend, nocreate)
735                         # TODO: add a backoff_squeeze section... Needs to runthrough
736                         print "backing off of %s" % nodename
737                         act_record['action'] = ['close_rt']
738                         act_record['message'] = message[0]
739                         act_record['stage'] = 'monitor-end-record'
740
741                 elif 'actinoneweek' in diag_record['stage']:
742                         if delta >= 7 * SPERDAY: 
743                                 act_record['email'] = TECH | PI
744                                 act_record['stage'] = 'stage_actintwoweeks'
745                                 act_record['message'] = message[1]
746                                 act_record['action'] = ['nocreate' ]
747                                 act_record['time'] = current_time               # reset clock for waitforever
748                         elif delta >= 3* SPERDAY and not 'second-mail-at-oneweek' in act_record:
749                                 act_record['email'] = TECH 
750                                 act_record['message'] = message[0]
751                                 act_record['action'] = ['sendmailagain-waitforoneweekaction' ]
752                                 act_record['second-mail-at-oneweek'] = True
753                         else:
754                                 act_record['message'] = None
755                                 act_record['action'] = ['waitforoneweekaction' ]
756                                 print "ignoring this record for: %s" % act_record['nodename']
757                                 return None                     # don't send if there's no action
758
759                 elif 'actintwoweeks' in diag_record['stage']:
760                         if delta >= 7 * SPERDAY:
761                                 act_record['email'] = TECH | PI | USER
762                                 act_record['stage'] = 'stage_waitforever'
763                                 act_record['message'] = message[2]
764                                 act_record['action'] = ['suspendslices']
765                                 act_record['time'] = current_time               # reset clock for waitforever
766                         elif delta >= 3* SPERDAY and not 'second-mail-at-twoweeks' in act_record:
767                                 act_record['email'] = TECH | PI
768                                 act_record['message'] = message[1]
769                                 act_record['action'] = ['sendmailagain-waitfortwoweeksaction' ]
770                                 act_record['second-mail-at-twoweeks'] = True
771                         else:
772                                 act_record['message'] = None
773                                 act_record['action'] = ['waitfortwoweeksaction']
774                                 return None                     # don't send if there's no action
775
776                 elif 'ticket_waitforever' in diag_record['stage']:
777                         act_record['email'] = TECH
778                         if 'first-found' not in act_record:
779                                 act_record['first-found'] = True
780                                 act_record['log'] += " firstfound"
781                                 act_record['action'] = ['ticket_waitforever']
782                                 act_record['message'] = message[0]
783                                 act_record['time'] = current_time
784                         else:
785                                 if delta >= 7*SPERDAY:
786                                         act_record['action'] = ['ticket_waitforever']
787                                         if 'rt' in act_record and 'Status' in act_record['rt'] and \
788                                                         act_record['rt']['Status'] == 'new':
789                                                 act_record['message'] = message[0]
790                                         else:
791                                                 act_record['message'] = None
792                                                 
793                                         act_record['time'] = current_time               # reset clock
794                                 else:
795                                         act_record['action'] = ['ticket_waitforever']
796                                         act_record['message'] = None
797                                         return None
798
799                 elif 'waitforever' in diag_record['stage']:
800                         # more than 3 days since last action
801                         # TODO: send only on weekdays.
802                         # NOTE: expects that 'time' has been reset before entering waitforever stage
803                         if delta >= 3*SPERDAY:
804                                 act_record['action'] = ['email-againwaitforever']
805                                 act_record['message'] = message[2]
806                                 act_record['time'] = current_time               # reset clock
807                         else:
808                                 act_record['action'] = ['waitforever']
809                                 act_record['message'] = None
810                                 return None                     # don't send if there's no action
811
812                 else:
813                         # There is no action to be taken, possibly b/c the stage has
814                         # already been performed, but diagnose picked it up again.
815                         # two cases, 
816                         #       1. stage is unknown, or 
817                         #       2. delta is not big enough to bump it to the next stage.
818                         # TODO: figure out which. for now assume 2.
819                         print "UNKNOWN stage for %s; nothing done" % nodename
820                         act_record['action'] = ['unknown']
821                         act_record['message'] = message[0]
822
823                         act_record['email'] = TECH
824                         act_record['action'] = ['noop']
825                         act_record['message'] = message[0]
826                         act_record['stage'] = 'stage_actinoneweek'
827                         act_record['time'] = current_time               # reset clock
828                         #print "Exiting..."
829                         #return None
830                         #sys.exit(1)
831
832                 print "%s" % act_record['log'],
833                 print "%15s" % act_record['action']
834                 return act_record
835
836         def getMaxSlices(self, loginbase):
837                 # if sickdb has a loginbase, then it will have at least one node.
838                 site_stats = None
839
840                 for nodename in self.diagnose_in[loginbase].keys():
841                         if nodename in self.findbad['nodes']:
842                                 site_stats = self.findbad['nodes'][nodename]['values']['plcsite']
843                                 break
844
845                 if site_stats == None:
846                         raise Exception, "loginbase with no nodes in findbad"
847                 else:
848                         return site_stats['max_slices']
849
850         def getNumNodes(self, loginbase):
851                 # if sickdb has a loginbase, then it will have at least one node.
852                 site_stats = None
853
854                 for nodename in self.diagnose_in[loginbase].keys():
855                         if nodename in self.findbad['nodes']:
856                                 site_stats = self.findbad['nodes'][nodename]['values']['plcsite']
857                                 break
858
859                 if site_stats == None:
860                         raise Exception, "loginbase with no nodes in findbad"
861                 else:
862                         if 'num_nodes' in site_stats:
863                                 return site_stats['num_nodes']
864                         else:
865                                 return 0
866
867         """
868         Returns number of up nodes as the total number *NOT* in act_all with a
869         stage other than 'steady-state' .
870         """
871         def getUpAtSite(self, loginbase, d_diag_site):
872                 # TODO: THIS DOESN"T WORK!!! it misses all the 'debug' state nodes
873                 #               that aren't recorded yet.
874
875                 numnodes = self.getNumNodes(loginbase)
876                 # NOTE: assume nodes we have no record of are ok. (too conservative)
877                 # TODO: make the 'up' value more representative
878                 up = numnodes
879                 for nodename in d_diag_site[loginbase]['nodes'].keys():
880
881                         rec = d_diag_site[loginbase]['nodes'][nodename]
882                         if rec['stage'] != 'monitor-end-record':
883                                 up -= 1
884                         else:
885                                 pass # the node is assumed to be up.
886
887                 #if up != numnodes:
888                 #       print "ERROR: %s total nodes up and down != %d" % (loginbase, numnodes)
889
890                 return up
891
892
class SiteAction:
        """
        Base class for an action run against a site.

        run() checks that every name in parameter_names is present in the
        args dict and then delegates to _run(), which subclasses override.
        """
        def __init__(self, parameter_names=None):
                # A list literal as the default would be shared by every
                # instance; build a fresh one per instance instead.
                if parameter_names is None:
                        parameter_names = ['hostname', 'ticket_id']
                self.parameter_names = parameter_names

        def checkParam(self, args):
                """Raise Exception if a required parameter is missing from args."""
                for param in self.parameter_names:
                        if param not in args:
                                raise Exception("Parameter %s not provided in args"%param)

        def run(self, args):
                """Validate args, then return whatever _run(args) returns."""
                self.checkParam(args)
                return self._run(args)

        def _run(self, args):
                """Do the actual work; the base implementation does nothing."""
                pass
905
class SuspendAction(SiteAction):
        """Site action that calls plc.suspendSlices with args['hostname']."""
        def _run(self, args):
                hostname = args['hostname']
                return plc.suspendSlices(hostname)
909
class RemoveSliceCreation(SiteAction):
        """Site action that calls plc.removeSliceCreation with args['hostname']."""
        def _run(self, args):
                hostname = args['hostname']
                return plc.removeSliceCreation(hostname)
913
class BackoffActions(SiteAction):
        """
        Site action that calls plc.enableSlices and then
        plc.enableSliceCreation with args['hostname'].  Always returns True.
        """
        def _run(self, args):
                hostname = args['hostname']
                plc.enableSlices(hostname)
                plc.enableSliceCreation(hostname)
                return True
919
920 # TODO: create class for each action below, 
921 #               allow for lists of actions to be performed...
922
923
924
def reset_nodemanager(args):
        """
        Restart the 'nm' service on a node by running
        "/sbin/service nm restart" over ssh as root.

        args -- dict; args['hostname'] names the node, the same key the
                other action callables in this file read.

        Returns None; the exit status of the ssh command is not checked.
        """
        # The original referenced an undefined name here, which raised
        # NameError on every call.
        os.system("ssh root@%s /sbin/service nm restart" % args['hostname'])
        return
928
929 class Action(Thread):
930         def __init__(self, l_action):
931                 self.l_action = l_action
932
933                 # the hostname to loginbase mapping
934                 self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")
935
936                 # Actions to take.
937                 self.diagnose_db = database.if_cached_else(1, "diagnose_out", lambda : {})
938                 # Actions taken.
939                 self.act_all   = database.if_cached_else(1, "act_all", lambda : {})
940
941                 # A dict of actions to specific functions. PICKLE doesnt' like lambdas.
942                 self.actions = {}
943                 self.actions['suspendslices'] = lambda args: plc.suspendSlices(args['hostname'])
944                 self.actions['nocreate'] = lambda args: plc.removeSliceCreation(args['hostname'])
945                 self.actions['close_rt'] = lambda args: close_rt_backoff(args)
946                 self.actions['rins'] = lambda args: plc.nodeBootState(args['hostname'], "reinstall")    
947                 self.actions['noop'] = lambda args: args
948                 self.actions['reboot_node'] = lambda args: reboot_node(args)
949                 self.actions['reset_nodemanager'] = lambda args: args # reset_nodemanager(args)
950
951                 self.actions['ticket_waitforever'] = lambda args: args
952                 self.actions['waitforever'] = lambda args: args
953                 self.actions['unknown'] = lambda args: args
954                 self.actions['waitforoneweekaction'] = lambda args: args
955                 self.actions['waitfortwoweeksaction'] = lambda args: args
956                 self.actions['sendmailagain-waitforoneweekaction'] = lambda args: args
957                 self.actions['sendmailagain-waitfortwoweeksaction'] = lambda args: args
958                 self.actions['email-againwaitforever'] = lambda args: args
959                 self.actions['email-againticket_waitforever'] = lambda args: args
960                                 
961
962                 self.sickdb = {}
963                 Thread.__init__(self)
964
965         def run(self):
966                 self.accumSites()
967                 print "Accumulated %d sick sites" % len(self.sickdb.keys())
968                 logger.debug("Accumulated %d sick sites" % len(self.sickdb.keys()))
969
970                 try:
971                         stats = self.analyseSites()
972                 except Exception, err:
973                         print "----------------"
974                         import traceback
975                         print traceback.print_exc()
976                         print err
977                         if config.policysavedb:
978                                 print "Saving Databases... act_all"
979                                 database.dbDump("act_all", self.act_all)
980                         sys.exit(1)
981
982                 print_stats("sites_observed", stats)
983                 print_stats("sites_diagnosed", stats)
984                 print_stats("nodes_diagnosed", stats)
985                 print_stats("sites_emailed", stats)
986                 print_stats("nodes_actedon", stats)
987                 print string.join(stats['allsites'], ",")
988
989                 if config.policysavedb:
990                         print "Saving Databases... act_all"
991                         #database.dbDump("policy.eventlog", self.eventlog)
992                         # TODO: remove 'diagnose_out', 
993                         #       or at least the entries that were acted on.
994                         database.dbDump("act_all", self.act_all)
995
996         def accumSites(self):
997                 """
998                 Take all nodes, from l_action, look them up in the diagnose_db database, 
999                 and insert them into sickdb[] as:
1000
1001                 This way only the given l_action nodes will be acted on regardless
1002                 of how many from diagnose_db are available.
1003
1004                         sickdb[loginbase][nodename] = diag_record
1005                 """
1006                 # TODO: what if l_action == None ?
1007                 for nodename in self.l_action:
1008
1009                         loginbase = self.plcdb_hn2lb[nodename]
1010
1011                         if loginbase in self.diagnose_db and \
1012                                 nodename in self.diagnose_db[loginbase]['nodes']:
1013
1014                                 diag_record = self.diagnose_db[loginbase]['nodes'][nodename]
1015
1016                                 if loginbase not in self.sickdb:
1017                                         self.sickdb[loginbase] = {'nodes' : {}}
1018
1019                                 # NOTE: don't copy all node records, since not all will be in l_action
1020                                 self.sickdb[loginbase]['nodes'][nodename] = diag_record
1021                                 # NOTE: but, we want to get the loginbase config settings, 
1022                                 #               this is the easiest way.
1023                                 self.sickdb[loginbase]['config'] = self.diagnose_db[loginbase]['config']
1024                         #else:
1025                                 #print "%s not in diagnose_db!!" % loginbase
1026                 return
1027
1028         def __emailSite(self, loginbase, roles, message, args):
1029                 """
1030                 loginbase is the unique site abbreviation, prepended to slice names.
1031                 roles contains TECH, PI, USER roles, and derive email aliases.
1032                 record contains {'message': [<subj>,<body>], 'args': {...}} 
1033                 """
1034                 ticket_id = 0
1035                 args.update({'loginbase':loginbase})
1036
1037                 if not config.mail and not config.debug and config.bcc:
1038                         roles = ADMIN
1039                 if config.mail and config.debug:
1040                         roles = ADMIN
1041
1042                 # build targets
1043                 contacts = []
1044                 if ADMIN & roles:
1045                         contacts += [config.email]
1046                 if TECH & roles:
1047                         contacts += [TECHEMAIL % loginbase]
1048                 if PI & roles:
1049                         contacts += [PIEMAIL % loginbase]
1050                 if USER & roles:
1051                         slices = plc.slices(loginbase)
1052                         if len(slices) >= 1:
1053                                 for slice in slices:
1054                                         contacts += [SLICEMAIL % slice]
1055                                 print "SLIC: %20s : %d slices" % (loginbase, len(slices))
1056                         else:
1057                                 print "SLIC: %20s : 0 slices" % loginbase
1058
1059                 try:
1060                         subject = message[0] % args
1061                         body = message[1] % args
1062                         if ADMIN & roles:
1063                                 # send only to admin
1064                                 if 'ticket_id' in args:
1065                                         subj = "Re: [PL #%s] %s" % (args['ticket_id'], subject)
1066                                 else:
1067                                         subj = "Re: [PL noticket] %s" % subject
1068                                 mailer.email(subj, body, contacts)
1069                                 ticket_id = args['ticket_id']
1070                         else:
1071                                 ticket_id = mailer.emailViaRT(subject, body, contacts, args['ticket_id'])
1072                 except Exception, err:
1073                         print "exception on message:"
1074                         import traceback
1075                         print traceback.print_exc()
1076                         print message
1077
1078                 return ticket_id
1079
1080
1081         def _format_diaginfo(self, diag_node):
1082                 info = diag_node['info']
1083                 if diag_node['stage'] == 'monitor-end-record':
1084                         hlist = "    %s went from '%s' to '%s'\n" % (info[0], info[1], info[2]) 
1085                 else:
1086                         hlist = "    %s %s - %s\n" % (info[0], info[2], info[1]) #(node,ver,daysdn)
1087                 return hlist
1088
1089
1090         def get_email_args(self, act_recordlist, loginbase=None):
1091
1092                 email_args = {}
1093                 email_args['hostname_list'] = ""
1094
1095                 for act_record in act_recordlist:
1096                         email_args['hostname_list'] += act_record['msg_format']
1097                         email_args['hostname'] = act_record['nodename']
1098                         if  'plcnode' in act_record and \
1099                                 'pcu_ids' in act_record['plcnode'] and \
1100                                 len(act_record['plcnode']['pcu_ids']) > 0:
1101                                 print "setting 'pcu_id' for email_args %s"%email_args['hostname']
1102                                 email_args['pcu_id'] = act_record['plcnode']['pcu_ids'][0]
1103                         else:
1104                                 email_args['pcu_id'] = "-1"
1105                                         
1106                         if 'ticket_id' in act_record:
1107                                 if act_record['ticket_id'] == 0 or act_record['ticket_id'] == '0':
1108                                         print "Enter the ticket_id for %s @ %s" % (loginbase, act_record['nodename'])
1109                                         sys.stdout.flush()
1110                                         line = sys.stdin.readline()
1111                                         try:
1112                                                 ticket_id = int(line)
1113                                         except:
1114                                                 print "could not get ticket_id from stdin..."
1115                                                 os._exit(1)
1116                                 else:
1117                                         ticket_id = act_record['ticket_id']
1118                                         
1119                                 email_args['ticket_id'] = ticket_id
1120
1121                 return email_args
1122
1123         def get_unique_issues(self, act_recordlist):
1124                 # NOTE: only send one email per site, per problem...
1125                 unique_issues = {}
1126                 for act_record in act_recordlist:
1127                         act_key = act_record['action'][0]
1128                         if act_key not in unique_issues:
1129                                 unique_issues[act_key] = []
1130                                 
1131                         unique_issues[act_key] += [act_record]
1132                         
1133                 return unique_issues
1134                         
1135
1136         def __actOnSite(self, loginbase, site_record):
1137                 i_nodes_actedon = 0
1138                 i_nodes_emailed = 0
1139
1140                 act_recordlist = []
1141
1142                 for nodename in site_record['nodes'].keys():
1143                         diag_record = site_record['nodes'][nodename]
1144                         act_record  = self.__actOnNode(diag_record)
1145                         #print "nodename: %s %s" % (nodename, act_record)
1146                         if act_record is not None:
1147                                 act_recordlist += [act_record]
1148
1149                 unique_issues = self.get_unique_issues(act_recordlist)
1150
1151                 for issue in unique_issues.keys():
1152                         print "\tworking on issue: %s" % issue
1153                         issue_record_list = unique_issues[issue]
1154                         email_args = self.get_email_args(issue_record_list, loginbase)
1155
1156                         # for each record.
1157                         for act_record in issue_record_list:
1158                                 # if there's a pcu record and email config is set
1159                                 if 'email_pcu' in act_record:
1160                                         if act_record['message'] != None and act_record['email_pcu'] and site_record['config']['email']:
1161                                                 # and 'reboot_node' in act_record['stage']:
1162
1163                                                 email_args['hostname'] = act_record['nodename']
1164                                                 ticket_id = self.__emailSite(loginbase, 
1165                                                                                         act_record['email'], 
1166                                                                                         emailTxt.mailtxt.pcudown[0],
1167                                                                                         email_args)
1168                                                 if ticket_id == 0:
1169                                                         # error.
1170                                                         print "got a ticket_id == 0!!!! %s" % act_record['nodename']
1171                                                         os._exit(1)
1172                                                         pass
1173                                                 email_args['ticket_id'] = ticket_id
1174
1175                         
1176                         act_record = issue_record_list[0]
1177                         # send message before squeezing
1178                         print "\t\tconfig.email: %s and %s" % (act_record['message'] != None, 
1179                                                                                                 site_record['config']['email'])
1180                         if act_record['message'] != None and site_record['config']['email']:
1181                                 ticket_id = self.__emailSite(loginbase, act_record['email'], 
1182                                                                                          act_record['message'], email_args)
1183
1184                                 if ticket_id == 0:
1185                                         # error.
1186                                         print "ticket_id == 0 for %s %s" % (loginbase, act_record['nodename'])
1187                                         os._exit(1)
1188                                         pass
1189
1190                                 # Add ticket_id to ALL nodenames
1191                                 for act_record in issue_record_list:
1192                                         nodename = act_record['nodename']
1193                                         # update node record with RT ticket_id
1194                                         if nodename in self.act_all:
1195                                                 self.act_all[nodename][0]['ticket_id'] = "%s" % ticket_id
1196                                                 # if the ticket was previously resolved, reset it to new.
1197                                                 if 'rt' in act_record and \
1198                                                         'Status' in act_record['rt'] and \
1199                                                         act_record['rt']['Status'] == 'resolved':
1200                                                         mailer.setTicketStatus(ticket_id, "new")
1201                                                 status = mailer.getTicketStatus(ticket_id)
1202                                                 self.act_all[nodename][0]['rt'] = status
1203                                         if config.mail: i_nodes_emailed += 1
1204
1205                         print "\t\tconfig.squeeze: %s and %s" % (config.squeeze,
1206                                                                                                         site_record['config']['squeeze'])
1207                         if config.squeeze and site_record['config']['squeeze']:
1208                                 for act_key in act_record['action']:
1209                                         self.actions[act_key](email_args)
1210                                 i_nodes_actedon += 1
1211                 
1212                 if config.policysavedb:
1213                         print "Saving Databases... act_all, diagnose_out"
1214                         database.dbDump("act_all", self.act_all)
1215                         # remove site record from diagnose_out, it's in act_all as done.
1216                         del self.diagnose_db[loginbase]
1217                         database.dbDump("diagnose_out", self.diagnose_db)
1218
1219                 print "sleeping for 1 sec"
1220                 time.sleep(1)
1221                 #print "Hit enter to continue..."
1222                 #sys.stdout.flush()
1223                 #line = sys.stdin.readline()
1224
1225                 return (i_nodes_actedon, i_nodes_emailed)
1226
1227         def __actOnNode(self, diag_record):
1228                 nodename = diag_record['nodename']
1229                 message = diag_record['message']
1230
1231                 act_record = {}
1232                 act_record.update(diag_record)
1233                 act_record['nodename'] = nodename
1234                 act_record['msg_format'] = self._format_diaginfo(diag_record)
1235                 print "act_record['stage'] == %s " % act_record['stage']
1236
1237                 # avoid end records, and nmreset records                                        
1238                 # reboot_node_failed, is set below, so don't reboot repeatedly.
1239
1240                 if 'monitor-end-record' not in act_record['stage'] and \
1241                    'nmreset' not in act_record['stage'] and \
1242                    'reboot_node_failed' not in act_record:
1243
1244                         if "DOWN" in act_record['log'] and \
1245                                         'pcu_ids' in act_record['plcnode'] and \
1246                                         len(act_record['plcnode']['pcu_ids']) > 0:
1247
1248                                 print "%s" % act_record['log'],
1249                                 print "%15s" % (['reboot_node'],)
1250                                 # Set node to re-install
1251                                 plc.nodeBootState(act_record['nodename'], "reinstall")  
1252                                 try:
1253                                         ret = reboot_node({'hostname': act_record['nodename']})
1254                                 except Exception, exc:
1255                                         print "exception on reboot_node:"
1256                                         import traceback
1257                                         print traceback.print_exc()
1258                                         ret = False
1259
1260                                 if ret: # and ( 'reboot_node_failed' not in act_record or act_record['reboot_node_failed'] == False):
1261                                         # Reboot Succeeded
1262                                         print "reboot succeeded for %s" % act_record['nodename']
1263                                         act_record2 = {}
1264                                         act_record2.update(act_record)
1265                                         act_record2['action'] = ['reboot_node']
1266                                         act_record2['stage'] = "reboot_node"
1267                                         act_record2['reboot_node_failed'] = False
1268                                         act_record2['email_pcu'] = False
1269
1270                                         if nodename not in self.act_all: 
1271                                                 self.act_all[nodename] = []
1272                                         print "inserting 'reboot_node' record into act_all"
1273                                         self.act_all[nodename].insert(0,act_record2)
1274
1275                                         # return None to avoid further action
1276                                         print "Taking no further action"
1277                                         return None
1278                                 else:
1279                                         print "reboot failed for %s" % act_record['nodename']
1280                                         # set email_pcu to also send pcu notice for this record.
1281                                         act_record['reboot_node_failed'] = True
1282                                         act_record['email_pcu'] = True
1283
1284                         print "%s" % act_record['log'],
1285                         print "%15s" % act_record['action']
1286
1287                 if act_record['stage'] is not 'monitor-end-record' and \
1288                    act_record['stage'] is not 'nmreset':
1289                         if nodename not in self.act_all: 
1290                                 self.act_all[nodename] = []
1291
1292                         self.act_all[nodename].insert(0,act_record)
1293                 else:
1294                         print "Not recording %s in act_all" % nodename
1295
1296                 return act_record
1297
1298         def analyseSites(self):
1299                 i_sites_observed = 0
1300                 i_sites_diagnosed = 0
1301                 i_nodes_diagnosed = 0
1302                 i_nodes_actedon = 0
1303                 i_sites_emailed = 0
1304                 l_allsites = []
1305
1306                 sorted_sites = self.sickdb.keys()
1307                 sorted_sites.sort()
1308                 for loginbase in sorted_sites:
1309                         site_record = self.sickdb[loginbase]
1310                         print "sites: %s" % loginbase
1311                         
1312                         i_nodes_diagnosed += len(site_record.keys())
1313                         i_sites_diagnosed += 1
1314
1315                         (na,ne) = self.__actOnSite(loginbase, site_record)
1316
1317                         i_sites_observed += 1
1318                         i_nodes_actedon += na
1319                         i_sites_emailed += ne
1320
1321                         l_allsites += [loginbase]
1322
1323                 return {'sites_observed': i_sites_observed, 
1324                                 'sites_diagnosed': i_sites_diagnosed, 
1325                                 'nodes_diagnosed': i_nodes_diagnosed, 
1326                                 'sites_emailed': i_sites_emailed, 
1327                                 'nodes_actedon': i_nodes_actedon, 
1328                                 'allsites':l_allsites}
1329
1330         def print_stats(self, key, stats):
1331                 print "%20s : %d" % (key, stats[key])
1332
1333
1334
1335         #"""
1336         #Prints, logs, and emails status of up nodes, down nodes, and buckets.
1337         #"""
1338         #def status(self):
1339         #       sub = "Monitor Summary"
1340         #       msg = "\nThe following nodes were acted upon:  \n\n"
1341         #       for (node, (type, date)) in self.emailed.items():
1342         #               # Print only things acted on today.
1343         #               if (time.gmtime(time.time())[2] == time.gmtime(date)[2]):
1344         #                       msg +="%s\t(%s)\t%s\n" %(node, type, time.ctime(date))
1345         #       msg +="\n\nThe following sites have been 'squeezed':\n\n"
1346         #       for (loginbase, (date, type)) in self.squeezed.items():
1347         #               # Print only things acted on today.
1348         #               if (time.gmtime(time.time())[2] == time.gmtime(date)[2]):
1349         #                       msg +="%s\t(%s)\t%s\n" %(loginbase, type, time.ctime(date))
1350         #       mailer.email(sub, msg, [SUMTO])
1351         #       logger.info(msg)
1352         #       return 
1353
1354         #"""
1355         #Store/Load state of emails.  When, where, what.
1356         #"""
1357         #def emailedStore(self, action):
1358         #       try:
1359         #               if action == "LOAD":
1360         #                       f = open(DAT, "r+")
1361         #                       logger.info("POLICY:  Found and reading " + DAT)
1362         #                       self.emailed.update(pickle.load(f))
1363         #               if action == "WRITE":
1364         #                       f = open(DAT, "w")
1365         #                       #logger.debug("Writing " + DAT)
1366         #                       pickle.dump(self.emailed, f)
1367         #               f.close()
1368         #       except Exception, err:
1369         #               logger.info("POLICY:  Problem with DAT, %s" %err)
1370
1371
1372 #class Policy(Thread):
1373
1374 def main():
1375         print "policy.py is a module, not a script for running directly."
1376
1377 if __name__ == '__main__':
1378         import os
1379         import plc
1380         try:
1381                 main()
1382         except KeyboardInterrupt:
1383                 print "Killed.  Exitting."
1384                 logger.info('Monitor Killed')
1385                 os._exit(0)