027da355ca1a766d39b7de691bf01d4ec72d5511
[monitor.git] / policy.py
1 #
2 # Copyright (c) 2004  The Trustees of Princeton University (Trustees).
3 #
4 # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
5 #
6 # $Id: policy.py,v 1.17 2007/08/29 17:26:50 soltesz Exp $
7 #
8 # Policy Engine.
9
10 #from monitor import *
11 from threading import *
12 import time
13 import logging
14 import mailer
15 import emailTxt
16 import pickle
17 import Queue
18 import plc
19 import sys
20 import os
21 import reboot
22 import database
23 import string
24 from www.printbadnodes import cmpCategoryVal
25 import config
26
DAT = "./monitor.dat"

logger = logging.getLogger("monitor")

# Time to enforce policy
POLSLEEP = 7200

# Where to email the summary
SUMTO = "soltesz@cs.princeton.edu"

# Address templates; the %s is filled in by the caller.
TECHEMAIL = "tech-%s@sites.planet-lab.org"
PIEMAIL = "pi-%s@sites.planet-lab.org"
SLICEMAIL = "%s@slices.planet-lab.org"
PLCEMAIL = "support@planet-lab.org"

# Time units, in seconds.
SPERMIN = 60
SPERHOUR = 60 * SPERMIN
SPERDAY = 24 * SPERHOUR

# Thresholds: written as a number of days, stored in seconds.
PITHRESH = 7 * SPERDAY
SLICETHRESH = 7 * SPERDAY
# Days before attempting rins again
RINSTHRESH = 5 * SPERDAY
# Days before calling the node dead.
DEADTHRESH = 30 * SPERDAY

# Minimum number of nodes up before squeezing
MINUP = 2

# Bit values, one per kind of contact.
TECH, PI, USER, ADMIN = 1, 2, 4, 8
59
60 # IF:
61 #  no SSH, down.
62 #  bad disk, down
63 #  DNS, kinda down (sick)
64 #  clock, kinda down (sick)
65 #  Full disk, going to be down
66
67 # Actions:
68 #  Email
69 #  suspend slice creation
70 #  kill slices
def array_to_priority_map(array):
        """ Build a dict that maps every entry of array to its index in the
        array, so that earlier entries get smaller priority values.  When an
        entry occurs more than once, the index of its last occurrence is kept.
        The result is intended for use with the cmpMap() function."""
        return dict((entry, position) for position, entry in enumerate(array))
81
def getdebug():
        """ Return the 'debug' attribute of the config module. """
        debug_setting = config.debug
        return debug_setting
84
def print_stats(key, stats):
        """ Print one "key : value" line for stats[key], formatted as an
        integer; print nothing when key is not present in stats. """
        if key not in stats:
                return
        print("%20s : %d" % (key, stats[key]))
87
def get_ticket_id(record):
        """ Return the ticket identifier stored in record, or None.

        'ticket_id' is preferred; 'found_rt_ticket' is the fallback.  A field
        that is missing, None, or the empty string counts as "no ticket".
        Values are compared by equality, not identity, so any empty string is
        recognised regardless of how it was created.
        """
        for field in ('ticket_id', 'found_rt_ticket'):
                if field not in record:
                        continue
                value = record[field]
                if value is not None and value != "":
                        return value
        return None
97
class Merge(Thread):
        """
        Thread that combines the current findbad observations with the
        records previously stored in act_all, and puts one merged record per
        node on the toRT queue.

        Only hostnames contained in l_merge are considered.  After the last
        record a None is put on the queue to tell the reader to stop.
        """
        def __init__(self, l_merge, toRT):
                """
                l_merge -- collection of hostnames to process.
                toRT    -- queue-like object with a put() method that receives
                           the merged records.
                """
                self.toRT = toRT
                self.merge_list = l_merge
                # the hostname to loginbase mapping
                self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")

                # Previous actions taken on nodes.
                self.act_all = database.if_cached_else(1, "act_all", lambda : {})
                self.findbad = database.if_cached_else(1, "findbad", lambda : {})

                # Second load of "act_all"; entries are removed from it as
                # nodes are matched against findbad (see mergeActionsAndBadDB).
                self.cache_all = database.if_cached_else(1, "act_all", lambda : {})
                self.sickdb = {}
                self.mergedb = {}
                Thread.__init__(self)

        def run(self):
                """ Run the three merge stages in order. """
                # populate sickdb
                self.accumSickSites()
                # read data from findbad and act_all
                self.mergeActionsAndBadDB()
                # pass node_records to RT
                self.sendToRT()

        def accumSickSites(self):
                """
                Take every findbad node that is listed in merge_list and insert
                a fresh record for it into sickdb[] as:

                        sickdb[loginbase][nodename] = fb_record

                Nodes whose findbad values have no 'category' are dumped to
                stdout and skipped.  Prints the number of nodes recorded.
                """
                fb_nodes = self.findbad['nodes']
                count = 0
                for nodename in fb_nodes:
                        if nodename not in self.merge_list:
                                continue                # skip this node, since it's not wanted

                        loginbase = self.plcdb_hn2lb[nodename]
                        values = fb_nodes[nodename]['values']

                        try:
                                category = values['category']
                        except (KeyError, TypeError):
                                # Incomplete findbad entry: show what was found
                                # and move on without counting the node.
                                print(values)
                                print(nodename)
                                print(fb_nodes[nodename])
                                continue

                        now = time.time()
                        fb_record = {
                                'nodename'     : nodename,
                                'category'     : category,
                                'state'        : values['state'],
                                'comonstats'   : values['comonstats'],
                                'plcnode'      : values['plcnode'],
                                'kernel'       : self.getKernel(values['kernel']),
                                'stage'        : "findbad",
                                'message'      : None,
                                'bootcd'       : values['bootcd'],
                                'args'         : None,
                                'info'         : None,
                                'time'         : now,
                                'date_created' : now,
                        }

                        self.sickdb.setdefault(loginbase, {})[nodename] = fb_record
                        count += 1

                print("Found %d nodes" % count)

        def getKernel(self, unamestr):
                """ Return the third whitespace-separated field of unamestr, or
                "" when there are fewer than three fields. """
                fields = unamestr.split()
                if len(fields) > 2:
                        return fields[2]
                return ""

        def _newProblemRecord(self, fb_record):
                """ Copy fb_record and mark it as having no ticket and no
                previous category. """
                record = dict(fb_record)
                record['ticket_id'] = ""
                record['prev_category'] = "NORECORD"
                return record

        def _carryForward(self, fb_record, prev_record):
                """ Start from the most recent act_all record and overlay the
                fields that findbad has just observed, then look up the
                ticket status through mailer.getTicketStatus(). """
                # NOTE: this also writes 'prev_category' into the act_all
                #       record itself, not only into the merged copy.
                prev_record['prev_category'] = prev_record['category']

                record = dict(prev_record)
                for field in ('comonstats', 'category', 'state',
                                          'kernel', 'bootcd', 'plcnode'):
                        record[field] = fb_record[field]
                record['rt'] = mailer.getTicketStatus(get_ticket_id(record))
                return record

        def mergeActionsAndBadDB(self):
                """
                - Look at the sick node_records as reported in findbad,
                - Then look at the node_records in act_all.

                There are four cases:
                1) Problem in findbad, no problem in act_all
                        this is ok, b/c it just means it's a new problem
                2) Problem in findbad, problem in act_all
                        -Did the problem get better or worse?
                                -If Same, or Worse, then continue looking for open tickets.
                                -If Better, or No problem, then "back-off" penalties.
                                        This judgement may need to wait until 'Diagnose()'

                3) No problem in findbad, problem in act_all
                        Then the node is operational again according to Findbad()

                4) No problem in findbad, no problem in act_all
                        There won't be a record in either db, so there's no code.

                The result is stored as mergedb[loginbase][nodename].
                """
                # look at all problems reported by findbad
                for loginbase in sorted(self.sickdb):
                        site_nodes = self.sickdb[loginbase]
                        site_merged = self.mergedb.setdefault(loginbase, {})

                        for nodename in sorted(site_nodes):
                                fb_record = site_nodes[nodename]

                                # We must compare findbad state with act_all state
                                if nodename not in self.act_all:
                                        # 1) ok, b/c it's a new problem. set ticket_id to null
                                        site_merged[nodename] = self._newProblemRecord(fb_record)
                                        continue

                                history = self.act_all[nodename]
                                if len(history) == 0:
                                        # known node without previous records: treat as new.
                                        site_merged[nodename] = self._newProblemRecord(fb_record)
                                else:
                                        # 2) take the history from act_all, the state from findbad.
                                        site_merged[nodename] = self._carryForward(fb_record, history[0])

                                # drop the entry from cache_all to keep it out of case 3)
                                self.cache_all.pop(nodename, None)

                # 3) nodes that remain in cache_all were not identified by findbad.
                #    They are not processed: an earlier attempt to feed them
                #    back into sickdb did not work correctly and was disabled.
                #    TODO: decide whether these records need handling at all.

                print("len of cache_all: %d" % len(self.cache_all))
                return

        def sendToRT(self):
                """ Put every merged record on the toRT queue, site by site in
                sorted order, followed by None as the end-of-stream marker. """
                # look at all problems reported by merge
                for loginbase in sorted(self.mergedb):
                        for record in self.mergedb[loginbase].values():
                                self.toRT.put(record)

                # send signal to stop reading
                self.toRT.put(None)
                return
283
284 class Diagnose(Thread):
285         def __init__(self, fromRT):
286                 self.fromRT = fromRT
287                 self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")
288                 self.findbad = database.if_cached_else(1, "findbad", lambda : {})
289
290                 self.diagnose_in = {}
291                 self.diagnose_out = {}
292                 Thread.__init__(self)
293
294
295         def run(self):
296                 self.accumSickSites()
297
298                 print "Accumulated %d sick sites" % len(self.diagnose_in.keys())
299                 logger.debug("Accumulated %d sick sites" % len(self.diagnose_in.keys()))
300
301                 try:
302                         stats = self.diagnoseAll()
303                 except Exception, err:
304                         print "----------------"
305                         import traceback
306                         print traceback.print_exc()
307                         print err
308                         #if config.policysavedb:
309                         sys.exit(1)
310
311                 print_stats("sites_observed", stats)
312                 print_stats("sites_diagnosed", stats)
313                 print_stats("nodes_diagnosed", stats)
314
315                 if config.policysavedb:
316                         print "Saving Databases... diagnose_out"
317                         database.dbDump("diagnose_out", self.diagnose_out)
318
319         def accumSickSites(self):
320                 """
321                 Take all nodes, from l_diagnose, look them up in the diagnose_out database, 
322                 and insert them into diagnose_in[] as:
323
324                         diagnose_in[loginbase] = [diag_node1, diag_node2, ...]
325                 """
326                 while 1:
327                         node_record = self.fromRT.get(block = True)
328                         if node_record == None:
329                                 break;
330
331                         nodename = node_record['nodename']
332                         loginbase = self.plcdb_hn2lb[nodename]
333
334                         if loginbase not in self.diagnose_in:
335                                 self.diagnose_in[loginbase] = {}
336
337                         self.diagnose_in[loginbase][nodename] = node_record
338
339                 return
340
341         def diagnoseAll(self):
342                 i_sites_observed = 0
343                 i_sites_diagnosed = 0
344                 i_nodes_diagnosed = 0
345                 i_nodes_actedon = 0
346                 i_sites_emailed = 0
347                 l_allsites = []
348
349                 sorted_sites = self.diagnose_in.keys()
350                 sorted_sites.sort()
351                 self.diagnose_out= {}
352                 for loginbase in sorted_sites:
353                         l_allsites += [loginbase]
354
355                         d_diag_nodes = self.diagnose_in[loginbase]
356                         d_act_records = self.__diagnoseSite(loginbase, d_diag_nodes)
357                         # store records in diagnose_out, for saving later.
358                         self.diagnose_out.update(d_act_records)
359                         
360                         if len(d_act_records[loginbase]['nodes'].keys()) > 0:
361                                 i_nodes_diagnosed += (len(d_act_records[loginbase]['nodes'].keys()))
362                                 i_sites_diagnosed += 1
363                         i_sites_observed += 1
364
365                 return {'sites_observed': i_sites_observed, 
366                                 'sites_diagnosed': i_sites_diagnosed, 
367                                 'nodes_diagnosed': i_nodes_diagnosed, 
368                                 'allsites':l_allsites}
369
370                 pass
371                 
372         def getDaysDown(cls, diag_record):
373                 daysdown = -1
374                 last_contact = diag_record['plcnode']['last_contact']
375                 date_created = diag_record['plcnode']['date_created']
376
377                 if diag_record['comonstats']['uptime'] != "null" and diag_record['comonstats']['uptime'] != "-1":
378                         daysdown = - int(float(diag_record['comonstats']['uptime'])) // (60*60*24)
379                 elif last_contact is None:
380                         if date_created is not None:
381                                 now = time.time()
382                                 diff = now - date_created
383                                 daysdown = diff // (60*60*24)
384                         else:
385                                 daysdown = -1
386                 else:
387                         now = time.time()
388                         diff = now - last_contact
389                         daysdown = diff // (60*60*24)
390                 return daysdown
391         getDaysDown = classmethod(getDaysDown)
392
393         def getStrDaysDown(cls, diag_record):
394                 daysdown = "unknown"
395                 last_contact = diag_record['plcnode']['last_contact']
396                 date_created = diag_record['plcnode']['date_created']
397
398                 if      diag_record['comonstats']['uptime'] != "null" and \
399                         diag_record['comonstats']['uptime'] != "-1":
400                         daysdown = int(float(diag_record['comonstats']['uptime'])) // (60*60*24)
401                         daysdown = "%d days up" % daysdown
402
403                 elif last_contact is None:
404                         if date_created is not None:
405                                 now = time.time()
406                                 diff = now - date_created
407                                 daysdown = diff // (60*60*24)
408                                 daysdown = "Never contacted PLC, created %s days ago" % daysdown
409                         else:
410                                 daysdown = "Never contacted PLC"
411                 else:
412                         now = time.time()
413                         diff = now - last_contact
414                         daysdown = diff // (60*60*24)
415                         daysdown = "%s days down" % daysdown
416                 return daysdown
417         getStrDaysDown = classmethod(getStrDaysDown)
418         #def getStrDaysDown(cls, diag_record):
419         #       daysdown = cls.getDaysDown(diag_record)
420         #       if daysdown > -1:
421         #               return "%d days down"%daysdown
422         #       elif daysdown == -1:
423         #               return "Has never contacted PLC"
424         #       else:
425         #               return "%d days up"% -daysdown
426         #getStrDaysDown = classmethod(getStrDaysDown)
427
428         def __getCDVersion(self, diag_record, nodename):
429                 cdversion = ""
430                 #print "Getting kernel for: %s" % diag_record['nodename']
431                 cdversion = diag_record['kernel']
432                 return cdversion
433
434         def __diagnoseSite(self, loginbase, d_diag_nodes):
435                 """
436                 d_diag_nodes are diagnose_in entries.
437                 """
438                 d_diag_site = {loginbase : { 'config' : 
439                                                                                                 {'squeeze': False,
440                                                                                                  'email': False
441                                                                                                 }, 
442                                                                         'nodes': {}
443                                                                         }
444                                            }
445                 sorted_nodes = d_diag_nodes.keys()
446                 sorted_nodes.sort()
447                 for nodename in sorted_nodes:
448                         node_record = d_diag_nodes[nodename]
449                         diag_record = self.__diagnoseNode(loginbase, node_record)
450
451                         if diag_record != None:
452                                 d_diag_site[loginbase]['nodes'][nodename] = diag_record
453
454                                 # NOTE: improvement means, we need to act/squeeze and email.
455                                 #print "DIAG_RECORD", diag_record
456                                 if 'monitor-end-record' in diag_record['stage'] or \
457                                    'nmreset' in diag_record['stage']:
458                                 #       print "resetting loginbase!" 
459                                         d_diag_site[loginbase]['config']['squeeze'] = True
460                                         d_diag_site[loginbase]['config']['email'] = True
461                                 #else:
462                                 #       print "NO IMPROVEMENT!!!!"
463                         else:
464                                 pass # there is nothing to do for this node.
465
466                 # NOTE: these settings can be overridden by command line arguments,
467                 #       or the state of a record, i.e. if already in RT's Support Queue.
468                 nodes_up = self.getUpAtSite(loginbase, d_diag_site)
469                 if nodes_up < MINUP:
470                         d_diag_site[loginbase]['config']['squeeze'] = True
471
472                 max_slices = self.getMaxSlices(loginbase)
473                 num_nodes = self.getNumNodes(loginbase)
474                 # NOTE: when max_slices == 0, this is either a new site (the old way)
475                 #       or an old disabled site from previous monitor (before site['enabled'])
476                 if nodes_up < num_nodes and max_slices != 0:
477                         d_diag_site[loginbase]['config']['email'] = True
478
479                 if len(d_diag_site[loginbase]['nodes'].keys()) > 0:
480                         print "SITE: %20s : %d nodes up, at most" % (loginbase, nodes_up)
481
482                 return d_diag_site
483
484         def diagRecordByCategory(self, node_record):
485                 nodename = node_record['nodename']
486                 category = node_record['category']
487                 state    = node_record['state']
488                 loginbase = self.plcdb_hn2lb[nodename]
489                 diag_record = None
490
491                 if  "ERROR" in category:        # i.e. "DOWN"
492                         diag_record = {}
493                         diag_record.update(node_record)
494                         daysdown = self.getDaysDown(diag_record) 
495                         if daysdown < 7:
496                                 format = "DIAG: %20s : %-40s Down only %s days  NOTHING DONE"
497                                 print format % (loginbase, nodename, daysdown)
498                                 return None
499
500                         s_daysdown = self.getStrDaysDown(diag_record)
501                         diag_record['message'] = emailTxt.mailtxt.newdown
502                         diag_record['args'] = {'nodename': nodename}
503                         diag_record['info'] = (nodename, s_daysdown, "")
504
505                         if 'reboot_node_failed' in node_record:
506                                 # there was a previous attempt to use the PCU.
507                                 if node_record['reboot_node_failed'] == False:
508                                         # then the last attempt apparently, succeeded.
509                                         # But, the category is still 'ERROR'.  Therefore, the
510                                         # PCU-to-Node mapping is broken.
511                                         #print "Setting message for ERROR node to PCU2NodeMapping: %s" % nodename
512                                         diag_record['message'] = emailTxt.mailtxt.pcutonodemapping
513                                         diag_record['email_pcu'] = True
514
515                         if 'ticket_id' in diag_record:
516                                 if diag_record['ticket_id'] == "":
517                                         if 'found_rt_ticket' in diag_record:
518                                                 ticket_id = diag_record['found_rt_ticket']
519                                         else:
520                                                 ticket_id = "None"
521                                 else:
522                                         ticket_id = diag_record['ticket_id']
523                         else:
524                                 ticket_id = "None"
525
526                         diag_record['log'] = "DOWN: %20s : %-40s == %20s %s" % \
527                                         (loginbase, nodename, diag_record['info'][1:], ticket_id)
528
529                 elif "OLDBOOTCD" in category:
530                         # V2 boot cds as determined by findbad
531                         s_daysdown = self.getStrDaysDown(node_record)
532                         s_cdversion = self.__getCDVersion(node_record, nodename)
533                         diag_record = {}
534                         diag_record.update(node_record)
535                         #if "2.4" in diag_record['kernel'] or "v2" in diag_record['bootcd']:
536                         diag_record['message'] = emailTxt.mailtxt.newbootcd
537                         diag_record['args'] = {'nodename': nodename}
538                         diag_record['info'] = (nodename, s_daysdown, s_cdversion)
539                         if diag_record['ticket_id'] == "":
540                                 diag_record['log'] = "BTCD: %20s : %-40s == %20s %20s %s" % \
541                                                                         (loginbase, nodename, diag_record['kernel'], 
542                                                                          diag_record['bootcd'], diag_record['found_rt_ticket'])
543                         else:
544                                 diag_record['log'] = "BTCD: %20s : %-40s == %20s %20s %s" % \
545                                                                         (loginbase, nodename, diag_record['kernel'], 
546                                                                          diag_record['bootcd'], diag_record['ticket_id'])
547
548                 elif "PROD" in category:
549                         if "DEBUG" in state:
550                                 # Not sure what to do with these yet.  Probably need to
551                                 # reboot, and email.
552                                 print "DEBG: %20s : %-40s  NOTHING DONE" % (loginbase, nodename)
553                                 return None
554                         elif "BOOT" in state:
555                                 # no action needed.
556                                 # TODO: remove penalties, if any are applied.
557                                 now = time.time()
558                                 last_contact = node_record['plcnode']['last_contact']
559                                 if last_contact == None:
560                                         time_diff = 0
561                                 else:
562                                         time_diff = now - last_contact;
563
564                                 if 'improvement' in node_record['stage']:
565                                         # then we need to pass this on to 'action'
566                                         diag_record = {}
567                                         diag_record.update(node_record)
568                                         diag_record['message'] = emailTxt.mailtxt.newthankyou
569                                         diag_record['args'] = {'nodename': nodename}
570                                         diag_record['info'] = (nodename, node_record['prev_category'], 
571                                                                                                          node_record['category'])
572                                         if 'email_pcu' in diag_record:
573                                                 if diag_record['email_pcu']:
574                                                         # previously, the pcu failed to reboot, so send
575                                                         # email. Now, reset these values to try the reboot
576                                                         # again.
577                                                         diag_record['email_pcu'] = False
578                                                         del diag_record['reboot_node_failed']
579
580                                         if diag_record['ticket_id'] == "":
581                                                 diag_record['log'] = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
582                                                                         (loginbase, nodename, diag_record['stage'], 
583                                                                          state, category, diag_record['found_rt_ticket'])
584                                         else:
585                                                 diag_record['log'] = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
586                                                                         (loginbase, nodename, diag_record['stage'], 
587                                                                          state, category, diag_record['ticket_id'])
588                                         return diag_record
589                                 #elif time_diff >= 6*SPERHOUR:
590                                 #       # heartbeat is older than 30 min.
591                                 #       # then reset NM.
592                                 #       #print "Possible NM problem!! %s - %s = %s" % (now, last_contact, time_diff)
593                                 #       diag_record = {}
594                                 #       diag_record.update(node_record)
595                                 #       diag_record['message'] = emailTxt.mailtxt.NMReset
596                                 #       diag_record['args'] = {'nodename': nodename}
597                                 #       diag_record['stage'] = "nmreset"
598                                 #       diag_record['info'] = (nodename, 
599                                 #                                                       node_record['prev_category'], 
600                                 #                                                       node_record['category'])
601                                 #       if diag_record['ticket_id'] == "":
602                                 #               diag_record['log'] = "NM  : %20s : %-40s == %20s %20s %s %s" % \
603                                 #                                       (loginbase, nodename, diag_record['stage'], 
604                                 #                                        state, category, diag_record['found_rt_ticket'])
605                                 #       else:
606                                 #               diag_record['log'] = "NM  : %20s : %-40s == %20s" % \
607                                 #                                       (loginbase, nodename, diag_record['stage'])
608 #
609 #                                       return diag_record
610                                 else:
611                                         return None
612                         else:
613                                 # unknown
614                                 pass
615                 elif "ALPHA"    in category:
616                         pass
617                 elif "clock_drift" in category:
618                         pass
619                 elif "dns"    in category:
620                         pass
621                 elif "filerw"    in category:
622                         pass
623                 else:
624                         print "Unknown category!!!! %s" % category
625                         sys.exit(1)
626
627                 return diag_record
628
629         def __diagnoseNode(self, loginbase, node_record):
630                 # TODO: change the format of the hostname in this 
631                 #               record to something more natural.
632                 nodename                = node_record['nodename']
633                 category                = node_record['category']
634                 prev_category   = node_record['prev_category']
635                 state                   = node_record['state']
636                 #if 'prev_category' in node_record:
637                 #       prev_category = node_record['prev_category']
638                 #else:
639                 #       prev_category = "ERROR"
640                 if node_record['prev_category'] != "NORECORD":
641                 
642                         val = cmpCategoryVal(category, prev_category)
643                         print "%s went from %s -> %s" % (nodename, prev_category, category)
644                         if val == 1:
645                                 # improved
646                                 if node_record['ticket_id'] == "" or node_record['ticket_id'] == None:
647                                         print "closing record with no ticket: ", node_record['nodename']
648                                         node_record['action'] = ['close_rt']
649                                         node_record['message'] = None
650                                         node_record['stage'] = 'monitor-end-record'
651                                         return node_record
652                                 else:
653                                         node_record['stage'] = 'improvement'
654
655                                 #if 'monitor-end-record' in node_record['stage']:
656                                 #       # just ignore it if it's already ended.
657                                 #       # otherwise, the status should be worse, and we won't get
658                                 #       # here.
659                                 #       print "monitor-end-record: ignoring ", node_record['nodename']
660                                 #       return None
661 #
662 #                                       #return None
663                         elif val == -1:
664                                 # current category is worse than previous, carry on
665                                 pass
666                         else:
667                                 #values are equal, carry on.
668                                 #print "why are we here?"
669                                 pass
670
671                 if 'rt' in node_record and 'Status' in node_record['rt']:
672                         if node_record['stage'] == 'ticket_waitforever':
673                                 if 'resolved' in node_record['rt']['Status']:
674                                         print "ending waitforever record for: ", node_record['nodename']
675                                         node_record['action'] = ['noop']
676                                         node_record['message'] = None
677                                         node_record['stage'] = 'monitor-end-record'
678                                         print "oldlog: %s" % node_record['log'],
679                                         print "%15s" % node_record['action']
680                                         return node_record
681                                 if 'new' in node_record['rt']['Status'] and \
682                                         'Queue' in node_record['rt'] and \
683                                         'Monitor' in node_record['rt']['Queue']:
684
685                                         print "RESETTING stage to findbad"
686                                         node_record['stage'] = 'findbad'
687                         
688                 #### COMPARE category and prev_category
689                 # if not_equal
690                 #       then assign a stage based on relative priorities
691                 # else equal
692                 #       then check category for stats.
693                 diag_record = self.diagRecordByCategory(node_record)
694                 if diag_record == None:
695                         #print "diag_record == None"
696                         return None
697
698                 #### found_RT_ticket
699                 # TODO: need to record time found, and maybe add a stage for acting on it...
700                 # NOTE: after found, if the support ticket is resolved, the block is
701                 #               not removed. How to remove the block on this?
702                 if 'found_rt_ticket' in diag_record and \
703                         diag_record['found_rt_ticket'] is not None:
704                         if diag_record['stage'] is not 'improvement':
705                                 diag_record['stage'] = 'ticket_waitforever'
706                                 
707                 current_time = time.time()
708                 # take off four days, for the delay that database caused.
709                 # TODO: generalize delays at PLC, and prevent enforcement when there
710                 #               have been no emails.
711                 # NOTE: 7*SPERDAY exists to offset the 'bad week'
712                 #delta = current_time - diag_record['time'] - 7*SPERDAY
713                 delta = current_time - diag_record['time']
714
715                 message = diag_record['message']
716                 act_record = {}
717                 act_record.update(diag_record)
718
719                 #### DIAGNOSE STAGES 
720                 if   'findbad' in diag_record['stage']:
721                         # The node is bad, and there's no previous record of it.
722                         act_record['email'] = TECH
723                         act_record['action'] = ['noop']
724                         act_record['message'] = message[0]
725                         act_record['stage'] = 'stage_actinoneweek'
726
727                 elif 'nmreset' in diag_record['stage']:
728                         act_record['email']  = ADMIN 
729                         act_record['action'] = ['reset_nodemanager']
730                         act_record['message'] = message[0]
731                         act_record['stage']  = 'nmreset'
732                         return None
733
734                 elif 'reboot_node' in diag_record['stage']:
735                         act_record['email'] = TECH
736                         act_record['action'] = ['noop']
737                         act_record['message'] = message[0]
738                         act_record['stage'] = 'stage_actinoneweek'
739                         
740                 elif 'improvement' in diag_record['stage']:
741                         # - backoff previous squeeze actions (slice suspend, nocreate)
742                         # TODO: add a backoff_squeeze section... Needs to runthrough
743                         print "backing off of %s" % nodename
744                         act_record['action'] = ['close_rt']
745                         act_record['message'] = message[0]
746                         act_record['stage'] = 'monitor-end-record'
747
748                 elif 'actinoneweek' in diag_record['stage']:
749                         if delta >= 7 * SPERDAY: 
750                                 act_record['email'] = TECH | PI
751                                 act_record['stage'] = 'stage_actintwoweeks'
752                                 act_record['message'] = message[1]
753                                 act_record['action'] = ['nocreate' ]
754                                 act_record['time'] = current_time               # reset clock for waitforever
755                         elif delta >= 3* SPERDAY and not 'second-mail-at-oneweek' in act_record:
756                                 act_record['email'] = TECH 
757                                 act_record['message'] = message[0]
758                                 act_record['action'] = ['sendmailagain-waitforoneweekaction' ]
759                                 act_record['second-mail-at-oneweek'] = True
760                         else:
761                                 act_record['message'] = None
762                                 act_record['action'] = ['waitforoneweekaction' ]
763                                 print "ignoring this record for: %s" % act_record['nodename']
764                                 return None                     # don't send if there's no action
765
766                 elif 'actintwoweeks' in diag_record['stage']:
767                         if delta >= 7 * SPERDAY:
768                                 act_record['email'] = TECH | PI | USER
769                                 act_record['stage'] = 'stage_waitforever'
770                                 act_record['message'] = message[2]
771                                 act_record['action'] = ['suspendslices']
772                                 act_record['time'] = current_time               # reset clock for waitforever
773                         elif delta >= 3* SPERDAY and not 'second-mail-at-twoweeks' in act_record:
774                                 act_record['email'] = TECH | PI
775                                 act_record['message'] = message[1]
776                                 act_record['action'] = ['sendmailagain-waitfortwoweeksaction' ]
777                                 act_record['second-mail-at-twoweeks'] = True
778                         else:
779                                 act_record['message'] = None
780                                 act_record['action'] = ['waitfortwoweeksaction']
781                                 return None                     # don't send if there's no action
782
783                 elif 'ticket_waitforever' in diag_record['stage']:
784                         act_record['email'] = TECH
785                         if 'first-found' not in act_record:
786                                 act_record['first-found'] = True
787                                 act_record['log'] += " firstfound"
788                                 act_record['action'] = ['ticket_waitforever']
789                                 act_record['message'] = message[0]
790                                 act_record['time'] = current_time
791                         else:
792                                 if delta >= 7*SPERDAY:
793                                         act_record['action'] = ['ticket_waitforever']
794                                         if 'rt' in act_record and 'Status' in act_record['rt'] and \
795                                                         act_record['rt']['Status'] == 'new':
796                                                 act_record['message'] = message[0]
797                                         else:
798                                                 act_record['message'] = None
799                                                 
800                                         act_record['time'] = current_time               # reset clock
801                                 else:
802                                         act_record['action'] = ['ticket_waitforever']
803                                         act_record['message'] = None
804                                         return None
805
806                 elif 'waitforever' in diag_record['stage']:
807                         # more than 3 days since last action
808                         # TODO: send only on weekdays.
809                         # NOTE: expects that 'time' has been reset before entering waitforever stage
810                         if delta >= 3*SPERDAY:
811                                 act_record['action'] = ['email-againwaitforever']
812                                 act_record['message'] = message[2]
813                                 act_record['time'] = current_time               # reset clock
814                         else:
815                                 act_record['action'] = ['waitforever']
816                                 act_record['message'] = None
817                                 return None                     # don't send if there's no action
818
819                 else:
820                         # There is no action to be taken, possibly b/c the stage has
821                         # already been performed, but diagnose picked it up again.
822                         # two cases, 
823                         #       1. stage is unknown, or 
824                         #       2. delta is not big enough to bump it to the next stage.
825                         # TODO: figure out which. for now assume 2.
826                         print "UNKNOWN stage for %s; nothing done" % nodename
827                         act_record['action'] = ['unknown']
828                         act_record['message'] = message[0]
829
830                         act_record['email'] = TECH
831                         act_record['action'] = ['noop']
832                         act_record['message'] = message[0]
833                         act_record['stage'] = 'stage_actinoneweek'
834                         act_record['time'] = current_time               # reset clock
835                         #print "Exiting..."
836                         #return None
837                         #sys.exit(1)
838
839                 print "%s" % act_record['log'],
840                 print "%15s" % act_record['action']
841                 return act_record
842
843         def getMaxSlices(self, loginbase):
844                 # if sickdb has a loginbase, then it will have at least one node.
845                 site_stats = None
846
847                 for nodename in self.diagnose_in[loginbase].keys():
848                         if nodename in self.findbad['nodes']:
849                                 site_stats = self.findbad['nodes'][nodename]['values']['plcsite']
850                                 break
851
852                 if site_stats == None:
853                         raise Exception, "loginbase with no nodes in findbad"
854                 else:
855                         return site_stats['max_slices']
856
857         def getNumNodes(self, loginbase):
858                 # if sickdb has a loginbase, then it will have at least one node.
859                 site_stats = None
860
861                 for nodename in self.diagnose_in[loginbase].keys():
862                         if nodename in self.findbad['nodes']:
863                                 site_stats = self.findbad['nodes'][nodename]['values']['plcsite']
864                                 break
865
866                 if site_stats == None:
867                         raise Exception, "loginbase with no nodes in findbad"
868                 else:
869                         if 'num_nodes' in site_stats:
870                                 return site_stats['num_nodes']
871                         else:
872                                 return 0
873
        """
        Returns the number of nodes at the site assumed to be up: the site's
        total node count minus every node in d_diag_site whose stage is not
        'monitor-end-record'.
        """
878         def getUpAtSite(self, loginbase, d_diag_site):
879                 # TODO: THIS DOESN"T WORK!!! it misses all the 'debug' state nodes
880                 #               that aren't recorded yet.
881
882                 numnodes = self.getNumNodes(loginbase)
883                 # NOTE: assume nodes we have no record of are ok. (too conservative)
884                 # TODO: make the 'up' value more representative
885                 up = numnodes
886                 for nodename in d_diag_site[loginbase]['nodes'].keys():
887
888                         rec = d_diag_site[loginbase]['nodes'][nodename]
889                         if rec['stage'] != 'monitor-end-record':
890                                 up -= 1
891                         else:
892                                 pass # the node is assumed to be up.
893
894                 #if up != numnodes:
895                 #       print "ERROR: %s total nodes up and down != %d" % (loginbase, numnodes)
896
897                 return up
898
899
class SiteAction:
        """
        Base class for an action applied to a site.

        Subclasses override _run(); callers use run(), which first checks that
        every name in parameter_names is present in the args dict.
        """
        def __init__(self, parameter_names=None):
                # Build the default list per instance: a list literal in the
                # signature would be one object shared by every SiteAction.
                if parameter_names is None:
                        parameter_names = ['hostname', 'ticket_id']
                self.parameter_names = parameter_names

        def checkParam(self, args):
                """Raise Exception if a required parameter is missing from args."""
                for param in self.parameter_names:
                        if param not in args:
                                raise Exception("Parameter %s not provided in args"%param)

        def run(self, args):
                """Validate args, then return whatever _run(args) returns."""
                self.checkParam(args)
                return self._run(args)

        def _run(self, args):
                """Do the actual work; the base class does nothing."""
                pass
912
class SuspendAction(SiteAction):
        """Site action that calls plc.suspendSlices() for args['hostname']."""
        def _run(self, args):
                hostname = args['hostname']
                return plc.suspendSlices(hostname)
916
class RemoveSliceCreation(SiteAction):
        """Site action that calls plc.removeSliceCreation() for args['hostname']."""
        def _run(self, args):
                hostname = args['hostname']
                return plc.removeSliceCreation(hostname)
920
class BackoffActions(SiteAction):
        """
        Site action that calls plc.enableSlices() and then
        plc.enableSliceCreation() for args['hostname'].  Always returns True.
        """
        def _run(self, args):
                hostname = args['hostname']
                plc.enableSlices(hostname)
                plc.enableSliceCreation(hostname)
                return True
926
927 # TODO: create class for each action below, 
928 #               allow for lists of actions to be performed...
929
def close_rt_backoff(args):
        """
        Close the RT ticket named by args['ticket_id'] and re-enable slices and
        slice creation for args['hostname'].

        Nothing at all is done when args has no 'ticket_id', or when it is ""
        or None.  Always returns None.
        """
        if 'ticket_id' not in args:
                return
        ticket_id = args['ticket_id']
        if ticket_id == "" or ticket_id == None:
                return

        mailer.closeTicketViaRT(ticket_id,
                                                        "Ticket CLOSED automatically by SiteAssist.")
        plc.enableSlices(args['hostname'])
        plc.enableSliceCreation(args['hostname'])
        return
937
def reboot_node(args):
        """
        Call reboot.reboot_policy() for args['hostname'], passing True and
        config.debug, and return its result.
        """
        return reboot.reboot_policy(args['hostname'], True, config.debug)
941
def reset_nodemanager(args):
        """
        Restart the 'nm' service on args['hostname'] over ssh, as root.

        The original body referenced an undefined name ('nodename') and so
        raised NameError on every call; the host now comes from args, like the
        other action functions.  Returns None; the ssh exit status is ignored.
        """
        # local import: this is the only user of subprocess in the file.
        import subprocess

        # Argument list instead of a shell string, so the hostname is never
        # interpreted by a local shell.
        subprocess.call(["ssh", "root@%s" % args['hostname'],
                                         "/sbin/service", "nm", "restart"])
        return
945
946 class Action(Thread):
947         def __init__(self, l_action):
948                 self.l_action = l_action
949
950                 # the hostname to loginbase mapping
951                 self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")
952
953                 # Actions to take.
954                 self.diagnose_db = database.if_cached_else(1, "diagnose_out", lambda : {})
955                 # Actions taken.
956                 self.act_all   = database.if_cached_else(1, "act_all", lambda : {})
957
958                 # A dict of actions to specific functions. PICKLE doesnt' like lambdas.
959                 self.actions = {}
960                 self.actions['suspendslices'] = lambda args: plc.suspendSlices(args['hostname'])
961                 self.actions['nocreate'] = lambda args: plc.removeSliceCreation(args['hostname'])
962                 self.actions['close_rt'] = lambda args: close_rt_backoff(args)
963                 self.actions['rins'] = lambda args: plc.nodeBootState(args['hostname'], "rins") 
964                 self.actions['noop'] = lambda args: args
965                 self.actions['reboot_node'] = lambda args: reboot_node(args)
966                 self.actions['reset_nodemanager'] = lambda args: args # reset_nodemanager(args)
967
968                 self.actions['ticket_waitforever'] = lambda args: args
969                 self.actions['waitforever'] = lambda args: args
970                 self.actions['unknown'] = lambda args: args
971                 self.actions['waitforoneweekaction'] = lambda args: args
972                 self.actions['waitfortwoweeksaction'] = lambda args: args
973                 self.actions['sendmailagain-waitforoneweekaction'] = lambda args: args
974                 self.actions['sendmailagain-waitfortwoweeksaction'] = lambda args: args
975                 self.actions['email-againwaitforever'] = lambda args: args
976                 self.actions['email-againticket_waitforever'] = lambda args: args
977                                 
978
979                 self.sickdb = {}
980                 Thread.__init__(self)
981
982         def run(self):
983                 self.accumSites()
984                 print "Accumulated %d sick sites" % len(self.sickdb.keys())
985                 logger.debug("Accumulated %d sick sites" % len(self.sickdb.keys()))
986
987                 try:
988                         stats = self.analyseSites()
989                 except Exception, err:
990                         print "----------------"
991                         import traceback
992                         print traceback.print_exc()
993                         print err
994                         if config.policysavedb:
995                                 print "Saving Databases... act_all"
996                                 database.dbDump("act_all", self.act_all)
997                         sys.exit(1)
998
999                 print_stats("sites_observed", stats)
1000                 print_stats("sites_diagnosed", stats)
1001                 print_stats("nodes_diagnosed", stats)
1002                 print_stats("sites_emailed", stats)
1003                 print_stats("nodes_actedon", stats)
1004                 print string.join(stats['allsites'], ",")
1005
1006                 if config.policysavedb:
1007                         print "Saving Databases... act_all"
1008                         #database.dbDump("policy.eventlog", self.eventlog)
1009                         # TODO: remove 'diagnose_out', 
1010                         #       or at least the entries that were acted on.
1011                         database.dbDump("act_all", self.act_all)
1012
1013         def accumSites(self):
1014                 """
1015                 Take all nodes, from l_action, look them up in the diagnose_db database, 
1016                 and insert them into sickdb[] as:
1017
1018                 This way only the given l_action nodes will be acted on regardless
1019                 of how many from diagnose_db are available.
1020
1021                         sickdb[loginbase][nodename] = diag_record
1022                 """
1023                 # TODO: what if l_action == None ?
1024                 for nodename in self.l_action:
1025
1026                         loginbase = self.plcdb_hn2lb[nodename]
1027
1028                         if loginbase in self.diagnose_db and \
1029                                 nodename in self.diagnose_db[loginbase]['nodes']:
1030
1031                                 diag_record = self.diagnose_db[loginbase]['nodes'][nodename]
1032
1033                                 if loginbase not in self.sickdb:
1034                                         self.sickdb[loginbase] = {'nodes' : {}}
1035
1036                                 # NOTE: don't copy all node records, since not all will be in l_action
1037                                 self.sickdb[loginbase]['nodes'][nodename] = diag_record
1038                                 # NOTE: but, we want to get the loginbase config settings, 
1039                                 #               this is the easiest way.
1040                                 self.sickdb[loginbase]['config'] = self.diagnose_db[loginbase]['config']
1041                         #else:
1042                                 #print "%s not in diagnose_db!!" % loginbase
1043                 return
1044
1045         def __emailSite(self, loginbase, roles, message, args):
1046                 """
1047                 loginbase is the unique site abbreviation, prepended to slice names.
1048                 roles contains TECH, PI, USER roles, and derive email aliases.
1049                 record contains {'message': [<subj>,<body>], 'args': {...}} 
1050                 """
1051                 ticket_id = 0
1052                 args.update({'loginbase':loginbase})
1053
1054                 if not config.mail and not config.debug and config.bcc:
1055                         roles = ADMIN
1056                 if config.mail and config.debug:
1057                         roles = ADMIN
1058
1059                 # build targets
1060                 contacts = []
1061                 if ADMIN & roles:
1062                         contacts += [config.email]
1063                 if TECH & roles:
1064                         contacts += [TECHEMAIL % loginbase]
1065                 if PI & roles:
1066                         contacts += [PIEMAIL % loginbase]
1067                 if USER & roles:
1068                         slices = plc.slices(loginbase)
1069                         if len(slices) >= 1:
1070                                 for slice in slices:
1071                                         contacts += [SLICEMAIL % slice]
1072                                 print "SLIC: %20s : %d slices" % (loginbase, len(slices))
1073                         else:
1074                                 print "SLIC: %20s : 0 slices" % loginbase
1075
1076                 try:
1077                         subject = message[0] % args
1078                         body = message[1] % args
1079                         if ADMIN & roles:
1080                                 # send only to admin
1081                                 if 'ticket_id' in args:
1082                                         subj = "Re: [PL #%s] %s" % (args['ticket_id'], subject)
1083                                 else:
1084                                         subj = "Re: [PL noticket] %s" % subject
1085                                 mailer.email(subj, body, contacts)
1086                                 ticket_id = args['ticket_id']
1087                         else:
1088                                 ticket_id = mailer.emailViaRT(subject, body, contacts, args['ticket_id'])
1089                 except Exception, err:
1090                         print "exception on message:"
1091                         import traceback
1092                         print traceback.print_exc()
1093                         print message
1094
1095                 return ticket_id
1096
1097
1098         def _format_diaginfo(self, diag_node):
1099                 info = diag_node['info']
1100                 if diag_node['stage'] == 'monitor-end-record':
1101                         hlist = "    %s went from '%s' to '%s'\n" % (info[0], info[1], info[2]) 
1102                 else:
1103                         hlist = "    %s %s - %s\n" % (info[0], info[2], info[1]) #(node,ver,daysdn)
1104                 return hlist
1105
1106
1107         def get_email_args(self, act_recordlist, loginbase=None):
1108
1109                 email_args = {}
1110                 email_args['hostname_list'] = ""
1111
1112                 for act_record in act_recordlist:
1113                         email_args['hostname_list'] += act_record['msg_format']
1114                         email_args['hostname'] = act_record['nodename']
1115                         if  'plcnode' in act_record and \
1116                                 'pcu_ids' in act_record['plcnode'] and \
1117                                 len(act_record['plcnode']['pcu_ids']) > 0:
1118                                 print "setting 'pcu_id' for email_args %s"%email_args['hostname']
1119                                 email_args['pcu_id'] = act_record['plcnode']['pcu_ids'][0]
1120                         else:
1121                                 email_args['pcu_id'] = "-1"
1122                                         
1123                         if 'ticket_id' in act_record:
1124                                 if act_record['ticket_id'] == 0 or act_record['ticket_id'] == '0':
1125                                         print "Enter the ticket_id for %s @ %s" % (loginbase, act_record['nodename'])
1126                                         sys.stdout.flush()
1127                                         line = sys.stdin.readline()
1128                                         try:
1129                                                 ticket_id = int(line)
1130                                         except:
1131                                                 print "could not get ticket_id from stdin..."
1132                                                 os._exit(1)
1133                                 else:
1134                                         ticket_id = act_record['ticket_id']
1135                                         
1136                                 email_args['ticket_id'] = ticket_id
1137
1138                 return email_args
1139
1140         def get_unique_issues(self, act_recordlist):
1141                 # NOTE: only send one email per site, per problem...
1142                 unique_issues = {}
1143                 for act_record in act_recordlist:
1144                         act_key = act_record['action'][0]
1145                         if act_key not in unique_issues:
1146                                 unique_issues[act_key] = []
1147                                 
1148                         unique_issues[act_key] += [act_record]
1149                         
1150                 return unique_issues
1151                         
1152
1153         def __actOnSite(self, loginbase, site_record):
1154                 i_nodes_actedon = 0
1155                 i_nodes_emailed = 0
1156
1157                 act_recordlist = []
1158
1159                 for nodename in site_record['nodes'].keys():
1160                         diag_record = site_record['nodes'][nodename]
1161                         act_record  = self.__actOnNode(diag_record)
1162                         #print "nodename: %s %s" % (nodename, act_record)
1163                         if act_record is not None:
1164                                 act_recordlist += [act_record]
1165
1166                 unique_issues = self.get_unique_issues(act_recordlist)
1167
1168                 for issue in unique_issues.keys():
1169                         print "\tworking on issue: %s" % issue
1170                         issue_record_list = unique_issues[issue]
1171                         email_args = self.get_email_args(issue_record_list, loginbase)
1172
1173                         # for each record.
1174                         for act_record in issue_record_list:
1175                                 # if there's a pcu record and email config is set
1176                                 if 'email_pcu' in act_record:
1177                                         if act_record['message'] != None and act_record['email_pcu'] and site_record['config']['email']:
1178                                                 # and 'reboot_node' in act_record['stage']:
1179
1180                                                 email_args['hostname'] = act_record['nodename']
1181                                                 ticket_id = self.__emailSite(loginbase, 
1182                                                                                         act_record['email'], 
1183                                                                                         emailTxt.mailtxt.pcudown[0],
1184                                                                                         email_args)
1185                                                 if ticket_id == 0:
1186                                                         # error.
1187                                                         print "got a ticket_id == 0!!!! %s" % act_record['nodename']
1188                                                         os._exit(1)
1189                                                         pass
1190                                                 email_args['ticket_id'] = ticket_id
1191
1192                         
1193                         act_record = issue_record_list[0]
1194                         # send message before squeezing
1195                         print "\t\tconfig.email: %s and %s" % (act_record['message'] != None, 
1196                                                                                                 site_record['config']['email'])
1197                         if act_record['message'] != None and site_record['config']['email']:
1198                                 ticket_id = self.__emailSite(loginbase, act_record['email'], 
1199                                                                                          act_record['message'], email_args)
1200
1201                                 if ticket_id == 0:
1202                                         # error.
1203                                         print "ticket_id == 0 for %s %s" % (loginbase, act_record['nodename'])
1204                                         os._exit(1)
1205                                         pass
1206
1207                                 # Add ticket_id to ALL nodenames
1208                                 for act_record in issue_record_list:
1209                                         nodename = act_record['nodename']
1210                                         # update node record with RT ticket_id
1211                                         if nodename in self.act_all:
1212                                                 self.act_all[nodename][0]['ticket_id'] = "%s" % ticket_id
1213                                                 # if the ticket was previously resolved, reset it to new.
1214                                                 if 'rt' in act_record and \
1215                                                         'Status' in act_record['rt'] and \
1216                                                         act_record['rt']['Status'] == 'resolved':
1217                                                         mailer.setTicketStatus(ticket_id, "new")
1218                                                 status = mailer.getTicketStatus(ticket_id)
1219                                                 self.act_all[nodename][0]['rt'] = status
1220                                         if config.mail: i_nodes_emailed += 1
1221
1222                         print "\t\tconfig.squeeze: %s and %s" % (config.squeeze,
1223                                                                                                         site_record['config']['squeeze'])
1224                         if config.squeeze and site_record['config']['squeeze']:
1225                                 for act_key in act_record['action']:
1226                                         self.actions[act_key](email_args)
1227                                 i_nodes_actedon += 1
1228                 
1229                 if config.policysavedb:
1230                         print "Saving Databases... act_all, diagnose_out"
1231                         database.dbDump("act_all", self.act_all)
1232                         # remove site record from diagnose_out, it's in act_all as done.
1233                         del self.diagnose_db[loginbase]
1234                         database.dbDump("diagnose_out", self.diagnose_db)
1235
1236                 print "sleeping for 1 sec"
1237                 time.sleep(1)
1238                 #print "Hit enter to continue..."
1239                 #sys.stdout.flush()
1240                 #line = sys.stdin.readline()
1241
1242                 return (i_nodes_actedon, i_nodes_emailed)
1243
1244         def __actOnNode(self, diag_record):
1245                 nodename = diag_record['nodename']
1246                 message = diag_record['message']
1247
1248                 act_record = {}
1249                 act_record.update(diag_record)
1250                 act_record['nodename'] = nodename
1251                 act_record['msg_format'] = self._format_diaginfo(diag_record)
1252                 print "act_record['stage'] == %s " % act_record['stage']
1253
1254                 # avoid end records, and nmreset records                                        
1255                 # reboot_node_failed, is set below, so don't reboot repeatedly.
1256
1257                 if 'monitor-end-record' not in act_record['stage'] and \
1258                    'nmreset' not in act_record['stage'] and \
1259                    'reboot_node_failed' not in act_record:
1260
1261                         if "DOWN" in act_record['log'] and \
1262                                         'pcu_ids' in act_record['plcnode'] and \
1263                                         len(act_record['plcnode']['pcu_ids']) > 0:
1264
1265                                 print "%s" % act_record['log'],
1266                                 print "%15s" % (['reboot_node'],)
1267                                 # Set node to re-install
1268                                 plc.nodeBootState(act_record['nodename'], "rins")       
1269                                 try:
1270                                         ret = reboot_node({'hostname': act_record['nodename']})
1271                                 except Exception, exc:
1272                                         print "exception on reboot_node:"
1273                                         import traceback
1274                                         print traceback.print_exc()
1275                                         ret = False
1276
1277                                 if ret: # and ( 'reboot_node_failed' not in act_record or act_record['reboot_node_failed'] == False):
1278                                         # Reboot Succeeded
1279                                         print "reboot succeeded for %s" % act_record['nodename']
1280                                         act_record2 = {}
1281                                         act_record2.update(act_record)
1282                                         act_record2['action'] = ['reboot_node']
1283                                         act_record2['stage'] = "reboot_node"
1284                                         act_record2['reboot_node_failed'] = False
1285                                         act_record2['email_pcu'] = False
1286
1287                                         if nodename not in self.act_all: 
1288                                                 self.act_all[nodename] = []
1289                                         print "inserting 'reboot_node' record into act_all"
1290                                         self.act_all[nodename].insert(0,act_record2)
1291
1292                                         # return None to avoid further action
1293                                         print "Taking no further action"
1294                                         return None
1295                                 else:
1296                                         print "reboot failed for %s" % act_record['nodename']
1297                                         # set email_pcu to also send pcu notice for this record.
1298                                         act_record['reboot_node_failed'] = True
1299                                         act_record['email_pcu'] = True
1300
1301                         print "%s" % act_record['log'],
1302                         print "%15s" % act_record['action']
1303
1304                 if act_record['stage'] is not 'monitor-end-record' and \
1305                    act_record['stage'] is not 'nmreset':
1306                         if nodename not in self.act_all: 
1307                                 self.act_all[nodename] = []
1308
1309                         self.act_all[nodename].insert(0,act_record)
1310                 else:
1311                         print "Not recording %s in act_all" % nodename
1312
1313                 return act_record
1314
1315         def analyseSites(self):
1316                 i_sites_observed = 0
1317                 i_sites_diagnosed = 0
1318                 i_nodes_diagnosed = 0
1319                 i_nodes_actedon = 0
1320                 i_sites_emailed = 0
1321                 l_allsites = []
1322
1323                 sorted_sites = self.sickdb.keys()
1324                 sorted_sites.sort()
1325                 for loginbase in sorted_sites:
1326                         site_record = self.sickdb[loginbase]
1327                         print "sites: %s" % loginbase
1328                         
1329                         i_nodes_diagnosed += len(site_record.keys())
1330                         i_sites_diagnosed += 1
1331
1332                         (na,ne) = self.__actOnSite(loginbase, site_record)
1333
1334                         i_sites_observed += 1
1335                         i_nodes_actedon += na
1336                         i_sites_emailed += ne
1337
1338                         l_allsites += [loginbase]
1339
1340                 return {'sites_observed': i_sites_observed, 
1341                                 'sites_diagnosed': i_sites_diagnosed, 
1342                                 'nodes_diagnosed': i_nodes_diagnosed, 
1343                                 'sites_emailed': i_sites_emailed, 
1344                                 'nodes_actedon': i_nodes_actedon, 
1345                                 'allsites':l_allsites}
1346
1347         def print_stats(self, key, stats):
1348                 print "%20s : %d" % (key, stats[key])
1349
1350
1351
1352         #"""
1353         #Prints, logs, and emails status of up nodes, down nodes, and buckets.
1354         #"""
1355         #def status(self):
1356         #       sub = "Monitor Summary"
1357         #       msg = "\nThe following nodes were acted upon:  \n\n"
1358         #       for (node, (type, date)) in self.emailed.items():
1359         #               # Print only things acted on today.
1360         #               if (time.gmtime(time.time())[2] == time.gmtime(date)[2]):
1361         #                       msg +="%s\t(%s)\t%s\n" %(node, type, time.ctime(date))
1362         #       msg +="\n\nThe following sites have been 'squeezed':\n\n"
1363         #       for (loginbase, (date, type)) in self.squeezed.items():
1364         #               # Print only things acted on today.
1365         #               if (time.gmtime(time.time())[2] == time.gmtime(date)[2]):
1366         #                       msg +="%s\t(%s)\t%s\n" %(loginbase, type, time.ctime(date))
1367         #       mailer.email(sub, msg, [SUMTO])
1368         #       logger.info(msg)
1369         #       return 
1370
1371         #"""
1372         #Store/Load state of emails.  When, where, what.
1373         #"""
1374         #def emailedStore(self, action):
1375         #       try:
1376         #               if action == "LOAD":
1377         #                       f = open(DAT, "r+")
1378         #                       logger.info("POLICY:  Found and reading " + DAT)
1379         #                       self.emailed.update(pickle.load(f))
1380         #               if action == "WRITE":
1381         #                       f = open(DAT, "w")
1382         #                       #logger.debug("Writing " + DAT)
1383         #                       pickle.dump(self.emailed, f)
1384         #               f.close()
1385         #       except Exception, err:
1386         #               logger.info("POLICY:  Problem with DAT, %s" %err)
1387
1388
1389 #class Policy(Thread):
1390
def main():
        """Tell the user that policy.py is meant to be imported, not executed."""
        notice = "policy.py is a module, not a script for running directly."
        print(notice)
1393
# Script entry point: run main() and turn Ctrl-C into a logged, immediate exit.
if __name__ == '__main__':
        import os
        import plc
        try:
                main()
        except KeyboardInterrupt:
                print("Killed.  Exitting.")
                logger.info('Monitor Killed')
                # os._exit() ends the process at once, without normal interpreter cleanup.
                os._exit(0)