This commit changes the 'soltesz.py' module into 'moncommands.py' and
[monitor.git] / policy.py
1 #
2 # Copyright (c) 2004  The Trustees of Princeton University (Trustees).
3 #
4 # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
5 #
6 # $Id: policy.py,v 1.17 2007/08/29 17:26:50 soltesz Exp $
7 #
8 # Policy Engine.
9
10 #from monitor import *
11 from threading import *
12 import time
13 import logging
14 import mailer
15 import emailTxt
16 import pickle
17 import Queue
18 import plc
19 import sys
20 import os
21 import reboot
22 import database
23 import string
24 from www.printbadnodes import cmpCategoryVal
25 from config import config
26 #print "policy"
27 config = config()
28
29 DAT="./monitor.dat"
30
31 logger = logging.getLogger("monitor")
32
33 # Time to enforce policy
34 POLSLEEP = 7200
35
36 # Where to email the summary
37 SUMTO = "soltesz@cs.princeton.edu"
38 TECHEMAIL="tech-%s@sites.planet-lab.org"
39 PIEMAIL="pi-%s@sites.planet-lab.org"
40 SLICEMAIL="%s@slices.planet-lab.org"
41 PLCEMAIL="support@planet-lab.org"
42
43 #Thresholds (DAYS)
44 SPERMIN = 60
45 SPERHOUR = 60*60
46 SPERDAY = 86400
47 PITHRESH = 7 * SPERDAY
48 SLICETHRESH = 7 * SPERDAY
49 # Days before attempting rins again
50 RINSTHRESH = 5 * SPERDAY
51
52 # Days before calling the node dead.
53 DEADTHRESH = 30 * SPERDAY
54 # Minimum number of nodes up before squeezing
55 MINUP = 2
56
57 TECH=1
58 PI=2
59 USER=4
60 ADMIN=8
61
62 # IF:
63 #  no SSH, down.
64 #  bad disk, down
65 #  DNS, kinda down (sick)
66 #  clock, kinda down (sick)
67 #  Full disk, going to be down
68
69 # Actions:
70 #  Email
71 #  suspend slice creation
72 #  kill slices
73 def array_to_priority_map(array):
74         """ Create a mapping where each entry of array is given a priority equal
75         to its position in the array.  This is useful for subsequent use in the
76         cmpMap() function."""
77         map = {}
78         count = 0
79         for i in array:
80                 map[i] = count
81                 count += 1
82         return map
83
84 def getdebug():
85         return config.debug
86
87 def print_stats(key, stats):
88         if key in stats: print "%20s : %d" % (key, stats[key])
89
90 def get_ticket_id(record):
91         if 'ticket_id' in record and record['ticket_id'] is not "" and record['ticket_id'] is not None:
92                 return record['ticket_id']
93         elif            'found_rt_ticket' in record and \
94                  record['found_rt_ticket'] is not "" and \
95                  record['found_rt_ticket'] is not None:
96                 return record['found_rt_ticket']
97         else:
98                 return None
99
100 class Merge(Thread):
101         def __init__(self, l_merge, toRT):
102                 self.toRT = toRT
103                 self.merge_list = l_merge
104                 # the hostname to loginbase mapping
105                 self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")
106
107                 # Previous actions taken on nodes.
108                 self.act_all = database.if_cached_else(1, "act_all", lambda : {})
109                 self.findbad = database.if_cached_else(1, "findbad", lambda : {})
110
111                 self.cache_all = database.if_cached_else(1, "act_all", lambda : {})
112                 self.sickdb = {}
113                 self.mergedb = {}
114                 Thread.__init__(self)
115
116         def run(self):
117                 # populate sickdb
118                 self.accumSickSites()
119                 # read data from findbad and act_all
120                 self.mergeActionsAndBadDB()
121                 # pass node_records to RT
122                 self.sendToRT()
123
124         def accumSickSites(self):
125                 """
126                 Take all nodes, from l_diagnose, look them up in the act_all database, 
127                 and insert them into sickdb[] as:
128
129                         sickdb[loginbase][nodename] = fb_record
130                 """
131                 # look at all problems reported by findbad
132                 l_nodes = self.findbad['nodes'].keys()
133                 count = 0
134                 for nodename in l_nodes:
135                         if nodename not in self.merge_list:
136                                 continue                # skip this node, since it's not wanted
137
138                         count += 1
139                         loginbase = self.plcdb_hn2lb[nodename]
140                         values = self.findbad['nodes'][nodename]['values']
141
142                         fb_record = {}
143                         fb_record['nodename'] = nodename
144                         try:
145                                 fb_record['category'] = values['category']
146                         except:
147                                 print values
148                                 print nodename
149                                 print self.findbad['nodes'][nodename]
150                                 count -= 1
151                                 continue
152                         fb_record['state'] = values['state']
153                         fb_record['comonstats'] = values['comonstats']
154                         fb_record['plcnode'] = values['plcnode']
155                         fb_record['kernel'] = self.getKernel(values['kernel'])
156                         fb_record['stage'] = "findbad"
157                         fb_record['message'] = None
158                         fb_record['bootcd'] = values['bootcd']
159                         fb_record['args'] = None
160                         fb_record['info'] = None
161                         fb_record['time'] = time.time()
162                         fb_record['date_created'] = time.time()
163
164                         if loginbase not in self.sickdb:
165                                 self.sickdb[loginbase] = {}
166
167                         self.sickdb[loginbase][nodename] = fb_record
168
169                 print "Found %d nodes" % count
170
171         def getKernel(self, unamestr):
172                 s = unamestr.split()
173                 if len(s) > 2:
174                         return s[2]
175                 else:
176                         return ""
177
178         def mergeActionsAndBadDB(self): 
179                 """
180                 - Look at the sick node_records as reported in findbad, 
181                 - Then look at the node_records in act_all.  
182
183                 There are four cases:
184                 1) Problem in findbad, no problem in act_all
185                         this ok, b/c it just means it's a new problem
186                 2) Problem in findbad, problem in act_all
187                         -Did the problem get better or worse?  
188                                 -If Same, or Worse, then continue looking for open tickets.
189                                 -If Better, or No problem, then "back-off" penalties.
190                                         This judgement may need to wait until 'Diagnose()'
191
192                 3) No problem in findbad, problem in act_all
193                         The the node is operational again according to Findbad()
194
195                 4) No problem in findbad, no problem in act_all
196                         There won't be a record in either db, so there's no code.
197                 """
198
199                 sorted_sites = self.sickdb.keys()
200                 sorted_sites.sort()
201                 # look at all problems reported by findbad
202                 for loginbase in sorted_sites:
203                         d_fb_nodes = self.sickdb[loginbase]
204                         sorted_nodes = d_fb_nodes.keys()
205                         sorted_nodes.sort()
206                         for nodename in sorted_nodes:
207                                 fb_record = self.sickdb[loginbase][nodename]
208                                 x = fb_record
209                                 if loginbase not in self.mergedb:
210                                         self.mergedb[loginbase] = {}
211
212                                 # take the info either from act_all or fb-record.
213                                 # if node not in act_all
214                                 #       then take it from fbrecord, obviously.
215                                 # else node in act_all
216                                 #   if act_all == 0 length (no previous records)
217                                 #               then take it from fbrecord.
218                                 #   else
219                                 #           take it from act_all.
220                                 #   
221
222                                 # We must compare findbad state with act_all state
223                                 if nodename not in self.act_all:
224                                         # 1) ok, b/c it's a new problem. set ticket_id to null
225                                         self.mergedb[loginbase][nodename] = {} 
226                                         self.mergedb[loginbase][nodename].update(x)
227                                         self.mergedb[loginbase][nodename]['ticket_id'] = ""
228                                         self.mergedb[loginbase][nodename]['prev_category'] = "NORECORD" 
229                                 else: 
230                                         if len(self.act_all[nodename]) == 0:
231                                                 self.mergedb[loginbase][nodename] = {} 
232                                                 self.mergedb[loginbase][nodename].update(x)
233                                                 self.mergedb[loginbase][nodename]['ticket_id'] = ""
234                                                 self.mergedb[loginbase][nodename]['prev_category'] = "NORECORD" 
235                                         else:
236                                                 y = self.act_all[nodename][0]
237                                                 y['prev_category'] = y['category']
238
239                                                 self.mergedb[loginbase][nodename] = {}
240                                                 self.mergedb[loginbase][nodename].update(y)
241                                                 self.mergedb[loginbase][nodename]['comonstats'] = x['comonstats']
242                                                 self.mergedb[loginbase][nodename]['category']   = x['category']
243                                                 self.mergedb[loginbase][nodename]['state'] = x['state']
244                                                 self.mergedb[loginbase][nodename]['kernel']=x['kernel']
245                                                 self.mergedb[loginbase][nodename]['bootcd']=x['bootcd']
246                                                 self.mergedb[loginbase][nodename]['plcnode']=x['plcnode']
247                                                 ticket = get_ticket_id(self.mergedb[loginbase][nodename])
248                                                 self.mergedb[loginbase][nodename]['rt'] = mailer.getTicketStatus(ticket)
249
250                                         # delete the entry from cache_all to keep it out of case 3)
251                                         del self.cache_all[nodename]
252
253                 # 3) nodes that remin in cache_all were not identified by findbad.
254                 #        Do we keep them or not?
255                 #   NOTE: i think that since the categories are performed before this
256                 #               step now, and by a monitor-controlled agent.
257
258                 # TODO: This does not work correctly.  Do we need this? 
259                 #for hn in self.cache_all.keys():
260                 #       y = self.act_all[hn][0]
261                 #       if 'monitor' in y['bucket']:
262                 #               loginbase = self.plcdb_hn2lb[hn] 
263                 #               if loginbase not in self.sickdb:
264                 #                       self.sickdb[loginbase] = {}
265                 #               self.sickdb[loginbase][hn] = y
266                 #       else:
267                 #               del self.cache_all[hn]
268
269                 print "len of cache_all: %d" % len(self.cache_all.keys())
270                 return
271
272         def sendToRT(self):
273                 sorted_sites = self.mergedb.keys()
274                 sorted_sites.sort()
275                 # look at all problems reported by merge
276                 for loginbase in sorted_sites:
277                         d_merge_nodes = self.mergedb[loginbase]
278                         for nodename in d_merge_nodes.keys():
279                                 record = self.mergedb[loginbase][nodename]
280                                 self.toRT.put(record)
281
282                 # send signal to stop reading
283                 self.toRT.put(None)
284                 return
285
286 class Diagnose(Thread):
287         def __init__(self, fromRT):
288                 self.fromRT = fromRT
289                 self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")
290                 self.findbad = database.if_cached_else(1, "findbad", lambda : {})
291
292                 self.diagnose_in = {}
293                 self.diagnose_out = {}
294                 Thread.__init__(self)
295
296
297         def run(self):
298                 self.accumSickSites()
299
300                 print "Accumulated %d sick sites" % len(self.diagnose_in.keys())
301                 logger.debug("Accumulated %d sick sites" % len(self.diagnose_in.keys()))
302
303                 try:
304                         stats = self.diagnoseAll()
305                 except Exception, err:
306                         print "----------------"
307                         import traceback
308                         print traceback.print_exc()
309                         print err
310                         #if config.policysavedb:
311                         sys.exit(1)
312
313                 print_stats("sites_observed", stats)
314                 print_stats("sites_diagnosed", stats)
315                 print_stats("nodes_diagnosed", stats)
316
317                 if config.policysavedb:
318                         print "Saving Databases... diagnose_out"
319                         database.dbDump("diagnose_out", self.diagnose_out)
320
321         def accumSickSites(self):
322                 """
323                 Take all nodes, from l_diagnose, look them up in the diagnose_out database, 
324                 and insert them into diagnose_in[] as:
325
326                         diagnose_in[loginbase] = [diag_node1, diag_node2, ...]
327                 """
328                 while 1:
329                         node_record = self.fromRT.get(block = True)
330                         if node_record == None:
331                                 break;
332
333                         nodename = node_record['nodename']
334                         loginbase = self.plcdb_hn2lb[nodename]
335
336                         if loginbase not in self.diagnose_in:
337                                 self.diagnose_in[loginbase] = {}
338
339                         self.diagnose_in[loginbase][nodename] = node_record
340
341                 return
342
343         def diagnoseAll(self):
344                 i_sites_observed = 0
345                 i_sites_diagnosed = 0
346                 i_nodes_diagnosed = 0
347                 i_nodes_actedon = 0
348                 i_sites_emailed = 0
349                 l_allsites = []
350
351                 sorted_sites = self.diagnose_in.keys()
352                 sorted_sites.sort()
353                 self.diagnose_out= {}
354                 for loginbase in sorted_sites:
355                         l_allsites += [loginbase]
356
357                         d_diag_nodes = self.diagnose_in[loginbase]
358                         d_act_records = self.__diagnoseSite(loginbase, d_diag_nodes)
359                         # store records in diagnose_out, for saving later.
360                         self.diagnose_out.update(d_act_records)
361                         
362                         if len(d_act_records[loginbase]['nodes'].keys()) > 0:
363                                 i_nodes_diagnosed += (len(d_act_records[loginbase]['nodes'].keys()))
364                                 i_sites_diagnosed += 1
365                         i_sites_observed += 1
366
367                 return {'sites_observed': i_sites_observed, 
368                                 'sites_diagnosed': i_sites_diagnosed, 
369                                 'nodes_diagnosed': i_nodes_diagnosed, 
370                                 'allsites':l_allsites}
371
372                 pass
373                 
374         def getDaysDown(cls, diag_record):
375                 daysdown = -1
376                 last_contact = diag_record['plcnode']['last_contact']
377                 date_created = diag_record['plcnode']['date_created']
378
379                 if diag_record['comonstats']['uptime'] != "null" and diag_record['comonstats']['uptime'] != "-1":
380                         daysdown = - int(float(diag_record['comonstats']['uptime'])) // (60*60*24)
381                 elif last_contact is None:
382                         if date_created is not None:
383                                 now = time.time()
384                                 diff = now - date_created
385                                 daysdown = diff // (60*60*24)
386                         else:
387                                 daysdown = -1
388                 else:
389                         now = time.time()
390                         diff = now - last_contact
391                         daysdown = diff // (60*60*24)
392                 return daysdown
393         getDaysDown = classmethod(getDaysDown)
394
395         def getStrDaysDown(cls, diag_record):
396                 daysdown = "unknown"
397                 last_contact = diag_record['plcnode']['last_contact']
398                 date_created = diag_record['plcnode']['date_created']
399
400                 if      diag_record['comonstats']['uptime'] != "null" and \
401                         diag_record['comonstats']['uptime'] != "-1":
402                         daysdown = int(float(diag_record['comonstats']['uptime'])) // (60*60*24)
403                         daysdown = "%d days up" % daysdown
404
405                 elif last_contact is None:
406                         if date_created is not None:
407                                 now = time.time()
408                                 diff = now - date_created
409                                 daysdown = diff // (60*60*24)
410                                 daysdown = "Never contacted PLC, created %s days ago" % daysdown
411                         else:
412                                 daysdown = "Never contacted PLC"
413                 else:
414                         now = time.time()
415                         diff = now - last_contact
416                         daysdown = diff // (60*60*24)
417                         daysdown = "%s days down" % daysdown
418                 return daysdown
419         getStrDaysDown = classmethod(getStrDaysDown)
420         #def getStrDaysDown(cls, diag_record):
421         #       daysdown = cls.getDaysDown(diag_record)
422         #       if daysdown > -1:
423         #               return "%d days down"%daysdown
424         #       elif daysdown == -1:
425         #               return "Has never contacted PLC"
426         #       else:
427         #               return "%d days up"% -daysdown
428         #getStrDaysDown = classmethod(getStrDaysDown)
429
430         def __getCDVersion(self, diag_record, nodename):
431                 cdversion = ""
432                 #print "Getting kernel for: %s" % diag_record['nodename']
433                 cdversion = diag_record['kernel']
434                 return cdversion
435
436         def __diagnoseSite(self, loginbase, d_diag_nodes):
437                 """
438                 d_diag_nodes are diagnose_in entries.
439                 """
440                 d_diag_site = {loginbase : { 'config' : 
441                                                                                                 {'squeeze': False,
442                                                                                                  'email': False
443                                                                                                 }, 
444                                                                         'nodes': {}
445                                                                         }
446                                            }
447                 sorted_nodes = d_diag_nodes.keys()
448                 sorted_nodes.sort()
449                 for nodename in sorted_nodes:
450                         node_record = d_diag_nodes[nodename]
451                         diag_record = self.__diagnoseNode(loginbase, node_record)
452
453                         if diag_record != None:
454                                 d_diag_site[loginbase]['nodes'][nodename] = diag_record
455
456                                 # NOTE: improvement means, we need to act/squeeze and email.
457                                 #print "DIAG_RECORD", diag_record
458                                 if 'monitor-end-record' in diag_record['stage'] or \
459                                    'nmreset' in diag_record['stage']:
460                                 #       print "resetting loginbase!" 
461                                         d_diag_site[loginbase]['config']['squeeze'] = True
462                                         d_diag_site[loginbase]['config']['email'] = True
463                                 #else:
464                                 #       print "NO IMPROVEMENT!!!!"
465                         else:
466                                 pass # there is nothing to do for this node.
467
468                 # NOTE: these settings can be overridden by command line arguments,
469                 #       or the state of a record, i.e. if already in RT's Support Queue.
470                 nodes_up = self.getUpAtSite(loginbase, d_diag_site)
471                 if nodes_up < MINUP:
472                         d_diag_site[loginbase]['config']['squeeze'] = True
473
474                 max_slices = self.getMaxSlices(loginbase)
475                 num_nodes = self.getNumNodes(loginbase)
476                 # NOTE: when max_slices == 0, this is either a new site (the old way)
477                 #       or an old disabled site from previous monitor (before site['enabled'])
478                 if nodes_up < num_nodes and max_slices != 0:
479                         d_diag_site[loginbase]['config']['email'] = True
480
481                 if len(d_diag_site[loginbase]['nodes'].keys()) > 0:
482                         print "SITE: %20s : %d nodes up, at most" % (loginbase, nodes_up)
483
484                 return d_diag_site
485
486         def diagRecordByCategory(self, node_record):
487                 nodename = node_record['nodename']
488                 category = node_record['category']
489                 state    = node_record['state']
490                 loginbase = self.plcdb_hn2lb[nodename]
491                 diag_record = None
492
493                 if  "ERROR" in category:        # i.e. "DOWN"
494                         diag_record = {}
495                         diag_record.update(node_record)
496                         daysdown = self.getDaysDown(diag_record) 
497                         if daysdown < 7:
498                                 format = "DIAG: %20s : %-40s Down only %s days  NOTHING DONE"
499                                 print format % (loginbase, nodename, daysdown)
500                                 return None
501
502                         s_daysdown = self.getStrDaysDown(diag_record)
503                         diag_record['message'] = emailTxt.mailtxt.newdown
504                         diag_record['args'] = {'nodename': nodename}
505                         diag_record['info'] = (nodename, s_daysdown, "")
506
507                         if 'reboot_node_failed' in node_record:
508                                 # there was a previous attempt to use the PCU.
509                                 if node_record['reboot_node_failed'] == False:
510                                         # then the last attempt apparently, succeeded.
511                                         # But, the category is still 'ERROR'.  Therefore, the
512                                         # PCU-to-Node mapping is broken.
513                                         #print "Setting message for ERROR node to PCU2NodeMapping: %s" % nodename
514                                         diag_record['message'] = emailTxt.mailtxt.pcutonodemapping
515                                         diag_record['email_pcu'] = True
516
517                         if 'ticket_id' in diag_record:
518                                 if diag_record['ticket_id'] == "":
519                                         if 'found_rt_ticket' in diag_record:
520                                                 ticket_id = diag_record['found_rt_ticket']
521                                         else:
522                                                 ticket_id = "None"
523                                 else:
524                                         ticket_id = diag_record['ticket_id']
525                         else:
526                                 ticket_id = "None"
527
528                         diag_record['log'] = "DOWN: %20s : %-40s == %20s %s" % \
529                                         (loginbase, nodename, diag_record['info'][1:], ticket_id)
530
531                 elif "OLDBOOTCD" in category:
532                         # V2 boot cds as determined by findbad
533                         s_daysdown = self.getStrDaysDown(node_record)
534                         s_cdversion = self.__getCDVersion(node_record, nodename)
535                         diag_record = {}
536                         diag_record.update(node_record)
537                         #if "2.4" in diag_record['kernel'] or "v2" in diag_record['bootcd']:
538                         diag_record['message'] = emailTxt.mailtxt.newbootcd
539                         diag_record['args'] = {'nodename': nodename}
540                         diag_record['info'] = (nodename, s_daysdown, s_cdversion)
541                         if diag_record['ticket_id'] == "":
542                                 diag_record['log'] = "BTCD: %20s : %-40s == %20s %20s %s" % \
543                                                                         (loginbase, nodename, diag_record['kernel'], 
544                                                                          diag_record['bootcd'], diag_record['found_rt_ticket'])
545                         else:
546                                 diag_record['log'] = "BTCD: %20s : %-40s == %20s %20s %s" % \
547                                                                         (loginbase, nodename, diag_record['kernel'], 
548                                                                          diag_record['bootcd'], diag_record['ticket_id'])
549
550                 elif "PROD" in category:
551                         if "DEBUG" in state:
552                                 # Not sure what to do with these yet.  Probably need to
553                                 # reboot, and email.
554                                 print "DEBG: %20s : %-40s  NOTHING DONE" % (loginbase, nodename)
555                                 return None
556                         elif "BOOT" in state:
557                                 # no action needed.
558                                 # TODO: remove penalties, if any are applied.
559                                 now = time.time()
560                                 last_contact = node_record['plcnode']['last_contact']
561                                 if last_contact == None:
562                                         time_diff = 0
563                                 else:
564                                         time_diff = now - last_contact;
565
566                                 if 'improvement' in node_record['stage']:
567                                         # then we need to pass this on to 'action'
568                                         diag_record = {}
569                                         diag_record.update(node_record)
570                                         diag_record['message'] = emailTxt.mailtxt.newthankyou
571                                         diag_record['args'] = {'nodename': nodename}
572                                         diag_record['info'] = (nodename, node_record['prev_category'], 
573                                                                                                          node_record['category'])
574                                         if 'email_pcu' in diag_record:
575                                                 if diag_record['email_pcu']:
576                                                         # previously, the pcu failed to reboot, so send
577                                                         # email. Now, reset these values to try the reboot
578                                                         # again.
579                                                         diag_record['email_pcu'] = False
580                                                         del diag_record['reboot_node_failed']
581
582                                         if diag_record['ticket_id'] == "":
583                                                 diag_record['log'] = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
584                                                                         (loginbase, nodename, diag_record['stage'], 
585                                                                          state, category, diag_record['found_rt_ticket'])
586                                         else:
587                                                 diag_record['log'] = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
588                                                                         (loginbase, nodename, diag_record['stage'], 
589                                                                          state, category, diag_record['ticket_id'])
590                                         return diag_record
591                                 #elif time_diff >= 6*SPERHOUR:
592                                 #       # heartbeat is older than 30 min.
593                                 #       # then reset NM.
594                                 #       #print "Possible NM problem!! %s - %s = %s" % (now, last_contact, time_diff)
595                                 #       diag_record = {}
596                                 #       diag_record.update(node_record)
597                                 #       diag_record['message'] = emailTxt.mailtxt.NMReset
598                                 #       diag_record['args'] = {'nodename': nodename}
599                                 #       diag_record['stage'] = "nmreset"
600                                 #       diag_record['info'] = (nodename, 
601                                 #                                                       node_record['prev_category'], 
602                                 #                                                       node_record['category'])
603                                 #       if diag_record['ticket_id'] == "":
604                                 #               diag_record['log'] = "NM  : %20s : %-40s == %20s %20s %s %s" % \
605                                 #                                       (loginbase, nodename, diag_record['stage'], 
606                                 #                                        state, category, diag_record['found_rt_ticket'])
607                                 #       else:
608                                 #               diag_record['log'] = "NM  : %20s : %-40s == %20s" % \
609                                 #                                       (loginbase, nodename, diag_record['stage'])
610 #
611 #                                       return diag_record
612                                 else:
613                                         return None
614                         else:
615                                 # unknown
616                                 pass
617                 elif "ALPHA"    in category:
618                         pass
619                 elif "clock_drift" in category:
620                         pass
621                 elif "dns"    in category:
622                         pass
623                 elif "filerw"    in category:
624                         pass
625                 else:
626                         print "Unknown category!!!! %s" % category
627                         sys.exit(1)
628
629                 return diag_record
630
631         def __diagnoseNode(self, loginbase, node_record):
632                 # TODO: change the format of the hostname in this 
633                 #               record to something more natural.
634                 nodename                = node_record['nodename']
635                 category                = node_record['category']
636                 prev_category   = node_record['prev_category']
637                 state                   = node_record['state']
638                 #if 'prev_category' in node_record:
639                 #       prev_category = node_record['prev_category']
640                 #else:
641                 #       prev_category = "ERROR"
642                 if node_record['prev_category'] != "NORECORD":
643                 
644                         val = cmpCategoryVal(category, prev_category)
645                         print "%s went from %s -> %s" % (nodename, prev_category, category)
646                         if val == 1:
647                                 # improved
648                                 if node_record['ticket_id'] == "" or node_record['ticket_id'] == None:
649                                         print "closing record with no ticket: ", node_record['nodename']
650                                         node_record['action'] = ['close_rt']
651                                         node_record['message'] = None
652                                         node_record['stage'] = 'monitor-end-record'
653                                         return node_record
654                                 else:
655                                         node_record['stage'] = 'improvement'
656
657                                 #if 'monitor-end-record' in node_record['stage']:
658                                 #       # just ignore it if it's already ended.
659                                 #       # otherwise, the status should be worse, and we won't get
660                                 #       # here.
661                                 #       print "monitor-end-record: ignoring ", node_record['nodename']
662                                 #       return None
663 #
664 #                                       #return None
665                         elif val == -1:
666                                 # current category is worse than previous, carry on
667                                 pass
668                         else:
669                                 #values are equal, carry on.
670                                 #print "why are we here?"
671                                 pass
672
673                 if 'rt' in node_record and 'Status' in node_record['rt']:
674                         if node_record['stage'] == 'ticket_waitforever':
675                                 if 'resolved' in node_record['rt']['Status']:
676                                         print "ending waitforever record for: ", node_record['nodename']
677                                         node_record['action'] = ['noop']
678                                         node_record['message'] = None
679                                         node_record['stage'] = 'monitor-end-record'
680                                         print "oldlog: %s" % node_record['log'],
681                                         print "%15s" % node_record['action']
682                                         return node_record
683                                 if 'new' in node_record['rt']['Status'] and \
684                                         'Queue' in node_record['rt'] and \
685                                         'Monitor' in node_record['rt']['Queue']:
686
687                                         print "RESETTING stage to findbad"
688                                         node_record['stage'] = 'findbad'
689                         
690                 #### COMPARE category and prev_category
691                 # if not_equal
692                 #       then assign a stage based on relative priorities
693                 # else equal
694                 #       then check category for stats.
695                 diag_record = self.diagRecordByCategory(node_record)
696                 if diag_record == None:
697                         #print "diag_record == None"
698                         return None
699
700                 #### found_RT_ticket
701                 # TODO: need to record time found, and maybe add a stage for acting on it...
702                 # NOTE: after found, if the support ticket is resolved, the block is
703                 #               not removed. How to remove the block on this?
704                 if 'found_rt_ticket' in diag_record and \
705                         diag_record['found_rt_ticket'] is not None:
706                         if diag_record['stage'] is not 'improvement':
707                                 diag_record['stage'] = 'ticket_waitforever'
708                                 
709                 current_time = time.time()
710                 # take off four days, for the delay that database caused.
711                 # TODO: generalize delays at PLC, and prevent enforcement when there
712                 #               have been no emails.
713                 # NOTE: 7*SPERDAY exists to offset the 'bad week'
714                 #delta = current_time - diag_record['time'] - 7*SPERDAY
715                 delta = current_time - diag_record['time']
716
717                 message = diag_record['message']
718                 act_record = {}
719                 act_record.update(diag_record)
720
721                 #### DIAGNOSE STAGES 
722                 if   'findbad' in diag_record['stage']:
723                         # The node is bad, and there's no previous record of it.
724                         act_record['email'] = TECH
725                         act_record['action'] = ['noop']
726                         act_record['message'] = message[0]
727                         act_record['stage'] = 'stage_actinoneweek'
728
729                 elif 'nmreset' in diag_record['stage']:
730                         act_record['email']  = ADMIN 
731                         act_record['action'] = ['reset_nodemanager']
732                         act_record['message'] = message[0]
733                         act_record['stage']  = 'nmreset'
734                         return None
735
736                 elif 'reboot_node' in diag_record['stage']:
737                         act_record['email'] = TECH
738                         act_record['action'] = ['noop']
739                         act_record['message'] = message[0]
740                         act_record['stage'] = 'stage_actinoneweek'
741                         
742                 elif 'improvement' in diag_record['stage']:
743                         # - backoff previous squeeze actions (slice suspend, nocreate)
744                         # TODO: add a backoff_squeeze section... Needs to runthrough
745                         print "backing off of %s" % nodename
746                         act_record['action'] = ['close_rt']
747                         act_record['message'] = message[0]
748                         act_record['stage'] = 'monitor-end-record'
749
750                 elif 'actinoneweek' in diag_record['stage']:
751                         if delta >= 7 * SPERDAY: 
752                                 act_record['email'] = TECH | PI
753                                 act_record['stage'] = 'stage_actintwoweeks'
754                                 act_record['message'] = message[1]
755                                 act_record['action'] = ['nocreate' ]
756                                 act_record['time'] = current_time               # reset clock for waitforever
757                         elif delta >= 3* SPERDAY and not 'second-mail-at-oneweek' in act_record:
758                                 act_record['email'] = TECH 
759                                 act_record['message'] = message[0]
760                                 act_record['action'] = ['sendmailagain-waitforoneweekaction' ]
761                                 act_record['second-mail-at-oneweek'] = True
762                         else:
763                                 act_record['message'] = None
764                                 act_record['action'] = ['waitforoneweekaction' ]
765                                 print "ignoring this record for: %s" % act_record['nodename']
766                                 return None                     # don't send if there's no action
767
768                 elif 'actintwoweeks' in diag_record['stage']:
769                         if delta >= 7 * SPERDAY:
770                                 act_record['email'] = TECH | PI | USER
771                                 act_record['stage'] = 'stage_waitforever'
772                                 act_record['message'] = message[2]
773                                 act_record['action'] = ['suspendslices']
774                                 act_record['time'] = current_time               # reset clock for waitforever
775                         elif delta >= 3* SPERDAY and not 'second-mail-at-twoweeks' in act_record:
776                                 act_record['email'] = TECH | PI
777                                 act_record['message'] = message[1]
778                                 act_record['action'] = ['sendmailagain-waitfortwoweeksaction' ]
779                                 act_record['second-mail-at-twoweeks'] = True
780                         else:
781                                 act_record['message'] = None
782                                 act_record['action'] = ['waitfortwoweeksaction']
783                                 return None                     # don't send if there's no action
784
785                 elif 'ticket_waitforever' in diag_record['stage']:
786                         act_record['email'] = TECH
787                         if 'first-found' not in act_record:
788                                 act_record['first-found'] = True
789                                 act_record['log'] += " firstfound"
790                                 act_record['action'] = ['ticket_waitforever']
791                                 act_record['message'] = message[0]
792                                 act_record['time'] = current_time
793                         else:
794                                 if delta >= 7*SPERDAY:
795                                         act_record['action'] = ['ticket_waitforever']
796                                         if 'rt' in act_record and 'Status' in act_record['rt'] and \
797                                                         act_record['rt']['Status'] == 'new':
798                                                 act_record['message'] = message[0]
799                                         else:
800                                                 act_record['message'] = None
801                                                 
802                                         act_record['time'] = current_time               # reset clock
803                                 else:
804                                         act_record['action'] = ['ticket_waitforever']
805                                         act_record['message'] = None
806                                         return None
807
808                 elif 'waitforever' in diag_record['stage']:
809                         # more than 3 days since last action
810                         # TODO: send only on weekdays.
811                         # NOTE: expects that 'time' has been reset before entering waitforever stage
812                         if delta >= 3*SPERDAY:
813                                 act_record['action'] = ['email-againwaitforever']
814                                 act_record['message'] = message[2]
815                                 act_record['time'] = current_time               # reset clock
816                         else:
817                                 act_record['action'] = ['waitforever']
818                                 act_record['message'] = None
819                                 return None                     # don't send if there's no action
820
821                 else:
822                         # There is no action to be taken, possibly b/c the stage has
823                         # already been performed, but diagnose picked it up again.
824                         # two cases, 
825                         #       1. stage is unknown, or 
826                         #       2. delta is not big enough to bump it to the next stage.
827                         # TODO: figure out which. for now assume 2.
828                         print "UNKNOWN stage for %s; nothing done" % nodename
829                         act_record['action'] = ['unknown']
830                         act_record['message'] = message[0]
831
832                         act_record['email'] = TECH
833                         act_record['action'] = ['noop']
834                         act_record['message'] = message[0]
835                         act_record['stage'] = 'stage_actinoneweek'
836                         act_record['time'] = current_time               # reset clock
837                         #print "Exiting..."
838                         #return None
839                         #sys.exit(1)
840
841                 print "%s" % act_record['log'],
842                 print "%15s" % act_record['action']
843                 return act_record
844
845         def getMaxSlices(self, loginbase):
846                 # if sickdb has a loginbase, then it will have at least one node.
847                 site_stats = None
848
849                 for nodename in self.diagnose_in[loginbase].keys():
850                         if nodename in self.findbad['nodes']:
851                                 site_stats = self.findbad['nodes'][nodename]['values']['plcsite']
852                                 break
853
854                 if site_stats == None:
855                         raise Exception, "loginbase with no nodes in findbad"
856                 else:
857                         return site_stats['max_slices']
858
859         def getNumNodes(self, loginbase):
860                 # if sickdb has a loginbase, then it will have at least one node.
861                 site_stats = None
862
863                 for nodename in self.diagnose_in[loginbase].keys():
864                         if nodename in self.findbad['nodes']:
865                                 site_stats = self.findbad['nodes'][nodename]['values']['plcsite']
866                                 break
867
868                 if site_stats == None:
869                         raise Exception, "loginbase with no nodes in findbad"
870                 else:
871                         if 'num_nodes' in site_stats:
872                                 return site_stats['num_nodes']
873                         else:
874                                 return 0
875
876         """
877         Returns number of up nodes as the total number *NOT* in act_all with a
878         stage other than 'steady-state' .
879         """
880         def getUpAtSite(self, loginbase, d_diag_site):
881                 # TODO: THIS DOESN"T WORK!!! it misses all the 'debug' state nodes
882                 #               that aren't recorded yet.
883
884                 numnodes = self.getNumNodes(loginbase)
885                 # NOTE: assume nodes we have no record of are ok. (too conservative)
886                 # TODO: make the 'up' value more representative
887                 up = numnodes
888                 for nodename in d_diag_site[loginbase]['nodes'].keys():
889
890                         rec = d_diag_site[loginbase]['nodes'][nodename]
891                         if rec['stage'] != 'monitor-end-record':
892                                 up -= 1
893                         else:
894                                 pass # the node is assumed to be up.
895
896                 #if up != numnodes:
897                 #       print "ERROR: %s total nodes up and down != %d" % (loginbase, numnodes)
898
899                 return up
900
901
902 class SiteAction:
903         def __init__(self, parameter_names=['hostname', 'ticket_id']):
904                 self.parameter_names = parameter_names
905         def checkParam(self, args):
906                 for param in self.parameter_names:
907                         if param not in args:
908                                 raise Exception("Parameter %s not provided in args"%param)
909         def run(self, args):
910                 self.checkParam(args)
911                 return self._run(args)
912         def _run(self, args):
913                 pass
914
915 class SuspendAction(SiteAction):
916         def _run(self, args):
917                 return plc.suspendSlices(args['hostname'])
918
919 class RemoveSliceCreation(SiteAction):
920         def _run(self, args):
921                 return plc.removeSliceCreation(args['hostname'])
922
923 class BackoffActions(SiteAction):
924         def _run(self, args):
925                 plc.enableSlices(args['hostname'])
926                 plc.enableSliceCreation(args['hostname'])
927                 return True
928
929 # TODO: create class for each action below, 
930 #               allow for lists of actions to be performed...
931
932 def close_rt_backoff(args):
933         if 'ticket_id' in args and (args['ticket_id'] != "" and args['ticket_id'] != None):
934                 mailer.closeTicketViaRT(args['ticket_id'], 
935                                                                 "Ticket CLOSED automatically by SiteAssist.")
936                 plc.enableSlices(args['hostname'])
937                 plc.enableSliceCreation(args['hostname'])
938         return
939
940 def reboot_node(args):
941         host = args['hostname']
942         return reboot.reboot_policy(host, True, config.debug)
943
944 def reset_nodemanager(args):
945         os.system("ssh root@%s /sbin/service nm restart" % nodename)
946         return
947
948 class Action(Thread):
949         def __init__(self, l_action):
950                 self.l_action = l_action
951
952                 # the hostname to loginbase mapping
953                 self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")
954
955                 # Actions to take.
956                 self.diagnose_db = database.if_cached_else(1, "diagnose_out", lambda : {})
957                 # Actions taken.
958                 self.act_all   = database.if_cached_else(1, "act_all", lambda : {})
959
960                 # A dict of actions to specific functions. PICKLE doesnt' like lambdas.
961                 self.actions = {}
962                 self.actions['suspendslices'] = lambda args: plc.suspendSlices(args['hostname'])
963                 self.actions['nocreate'] = lambda args: plc.removeSliceCreation(args['hostname'])
964                 self.actions['close_rt'] = lambda args: close_rt_backoff(args)
965                 self.actions['rins'] = lambda args: plc.nodeBootState(args['hostname'], "rins") 
966                 self.actions['noop'] = lambda args: args
967                 self.actions['reboot_node'] = lambda args: reboot_node(args)
968                 self.actions['reset_nodemanager'] = lambda args: args # reset_nodemanager(args)
969
970                 self.actions['ticket_waitforever'] = lambda args: args
971                 self.actions['waitforever'] = lambda args: args
972                 self.actions['unknown'] = lambda args: args
973                 self.actions['waitforoneweekaction'] = lambda args: args
974                 self.actions['waitfortwoweeksaction'] = lambda args: args
975                 self.actions['sendmailagain-waitforoneweekaction'] = lambda args: args
976                 self.actions['sendmailagain-waitfortwoweeksaction'] = lambda args: args
977                 self.actions['email-againwaitforever'] = lambda args: args
978                 self.actions['email-againticket_waitforever'] = lambda args: args
979                                 
980
981                 self.sickdb = {}
982                 Thread.__init__(self)
983
984         def run(self):
985                 self.accumSites()
986                 print "Accumulated %d sick sites" % len(self.sickdb.keys())
987                 logger.debug("Accumulated %d sick sites" % len(self.sickdb.keys()))
988
989                 try:
990                         stats = self.analyseSites()
991                 except Exception, err:
992                         print "----------------"
993                         import traceback
994                         print traceback.print_exc()
995                         print err
996                         if config.policysavedb:
997                                 print "Saving Databases... act_all"
998                                 database.dbDump("act_all", self.act_all)
999                         sys.exit(1)
1000
1001                 print_stats("sites_observed", stats)
1002                 print_stats("sites_diagnosed", stats)
1003                 print_stats("nodes_diagnosed", stats)
1004                 print_stats("sites_emailed", stats)
1005                 print_stats("nodes_actedon", stats)
1006                 print string.join(stats['allsites'], ",")
1007
1008                 if config.policysavedb:
1009                         print "Saving Databases... act_all"
1010                         #database.dbDump("policy.eventlog", self.eventlog)
1011                         # TODO: remove 'diagnose_out', 
1012                         #       or at least the entries that were acted on.
1013                         database.dbDump("act_all", self.act_all)
1014
1015         def accumSites(self):
1016                 """
1017                 Take all nodes, from l_action, look them up in the diagnose_db database, 
1018                 and insert them into sickdb[] as:
1019
1020                 This way only the given l_action nodes will be acted on regardless
1021                 of how many from diagnose_db are available.
1022
1023                         sickdb[loginbase][nodename] = diag_record
1024                 """
1025                 # TODO: what if l_action == None ?
1026                 for nodename in self.l_action:
1027
1028                         loginbase = self.plcdb_hn2lb[nodename]
1029
1030                         if loginbase in self.diagnose_db and \
1031                                 nodename in self.diagnose_db[loginbase]['nodes']:
1032
1033                                 diag_record = self.diagnose_db[loginbase]['nodes'][nodename]
1034
1035                                 if loginbase not in self.sickdb:
1036                                         self.sickdb[loginbase] = {'nodes' : {}}
1037
1038                                 # NOTE: don't copy all node records, since not all will be in l_action
1039                                 self.sickdb[loginbase]['nodes'][nodename] = diag_record
1040                                 # NOTE: but, we want to get the loginbase config settings, 
1041                                 #               this is the easiest way.
1042                                 self.sickdb[loginbase]['config'] = self.diagnose_db[loginbase]['config']
1043                         #else:
1044                                 #print "%s not in diagnose_db!!" % loginbase
1045                 return
1046
1047         def __emailSite(self, loginbase, roles, message, args):
1048                 """
1049                 loginbase is the unique site abbreviation, prepended to slice names.
1050                 roles contains TECH, PI, USER roles, and derive email aliases.
1051                 record contains {'message': [<subj>,<body>], 'args': {...}} 
1052                 """
1053                 ticket_id = 0
1054                 args.update({'loginbase':loginbase})
1055
1056                 if not config.mail and not config.debug and config.bcc:
1057                         roles = ADMIN
1058                 if config.mail and config.debug:
1059                         roles = ADMIN
1060
1061                 # build targets
1062                 contacts = []
1063                 if ADMIN & roles:
1064                         contacts += [config.email]
1065                 if TECH & roles:
1066                         contacts += [TECHEMAIL % loginbase]
1067                 if PI & roles:
1068                         contacts += [PIEMAIL % loginbase]
1069                 if USER & roles:
1070                         slices = plc.slices(loginbase)
1071                         if len(slices) >= 1:
1072                                 for slice in slices:
1073                                         contacts += [SLICEMAIL % slice]
1074                                 print "SLIC: %20s : %d slices" % (loginbase, len(slices))
1075                         else:
1076                                 print "SLIC: %20s : 0 slices" % loginbase
1077
1078                 try:
1079                         subject = message[0] % args
1080                         body = message[1] % args
1081                         if ADMIN & roles:
1082                                 # send only to admin
1083                                 if 'ticket_id' in args:
1084                                         subj = "Re: [PL #%s] %s" % (args['ticket_id'], subject)
1085                                 else:
1086                                         subj = "Re: [PL noticket] %s" % subject
1087                                 mailer.email(subj, body, contacts)
1088                                 ticket_id = args['ticket_id']
1089                         else:
1090                                 ticket_id = mailer.emailViaRT(subject, body, contacts, args['ticket_id'])
1091                 except Exception, err:
1092                         print "exception on message:"
1093                         import traceback
1094                         print traceback.print_exc()
1095                         print message
1096
1097                 return ticket_id
1098
1099
1100         def _format_diaginfo(self, diag_node):
1101                 info = diag_node['info']
1102                 if diag_node['stage'] == 'monitor-end-record':
1103                         hlist = "    %s went from '%s' to '%s'\n" % (info[0], info[1], info[2]) 
1104                 else:
1105                         hlist = "    %s %s - %s\n" % (info[0], info[2], info[1]) #(node,ver,daysdn)
1106                 return hlist
1107
1108
1109         def get_email_args(self, act_recordlist, loginbase=None):
1110
1111                 email_args = {}
1112                 email_args['hostname_list'] = ""
1113
1114                 for act_record in act_recordlist:
1115                         email_args['hostname_list'] += act_record['msg_format']
1116                         email_args['hostname'] = act_record['nodename']
1117                         if  'plcnode' in act_record and \
1118                                 'pcu_ids' in act_record['plcnode'] and \
1119                                 len(act_record['plcnode']['pcu_ids']) > 0:
1120                                 print "setting 'pcu_id' for email_args %s"%email_args['hostname']
1121                                 email_args['pcu_id'] = act_record['plcnode']['pcu_ids'][0]
1122                         else:
1123                                 email_args['pcu_id'] = "-1"
1124                                         
1125                         if 'ticket_id' in act_record:
1126                                 if act_record['ticket_id'] == 0 or act_record['ticket_id'] == '0':
1127                                         print "Enter the ticket_id for %s @ %s" % (loginbase, act_record['nodename'])
1128                                         sys.stdout.flush()
1129                                         line = sys.stdin.readline()
1130                                         try:
1131                                                 ticket_id = int(line)
1132                                         except:
1133                                                 print "could not get ticket_id from stdin..."
1134                                                 os._exit(1)
1135                                 else:
1136                                         ticket_id = act_record['ticket_id']
1137                                         
1138                                 email_args['ticket_id'] = ticket_id
1139
1140                 return email_args
1141
1142         def get_unique_issues(self, act_recordlist):
1143                 # NOTE: only send one email per site, per problem...
1144                 unique_issues = {}
1145                 for act_record in act_recordlist:
1146                         act_key = act_record['action'][0]
1147                         if act_key not in unique_issues:
1148                                 unique_issues[act_key] = []
1149                                 
1150                         unique_issues[act_key] += [act_record]
1151                         
1152                 return unique_issues
1153                         
1154
1155         def __actOnSite(self, loginbase, site_record):
1156                 i_nodes_actedon = 0
1157                 i_nodes_emailed = 0
1158
1159                 act_recordlist = []
1160
1161                 for nodename in site_record['nodes'].keys():
1162                         diag_record = site_record['nodes'][nodename]
1163                         act_record  = self.__actOnNode(diag_record)
1164                         #print "nodename: %s %s" % (nodename, act_record)
1165                         if act_record is not None:
1166                                 act_recordlist += [act_record]
1167
1168                 unique_issues = self.get_unique_issues(act_recordlist)
1169
1170                 for issue in unique_issues.keys():
1171                         print "\tworking on issue: %s" % issue
1172                         issue_record_list = unique_issues[issue]
1173                         email_args = self.get_email_args(issue_record_list, loginbase)
1174
1175                         # for each record.
1176                         for act_record in issue_record_list:
1177                                 # if there's a pcu record and email config is set
1178                                 if 'email_pcu' in act_record:
1179                                         if act_record['message'] != None and act_record['email_pcu'] and site_record['config']['email']:
1180                                                 # and 'reboot_node' in act_record['stage']:
1181
1182                                                 email_args['hostname'] = act_record['nodename']
1183                                                 ticket_id = self.__emailSite(loginbase, 
1184                                                                                         act_record['email'], 
1185                                                                                         emailTxt.mailtxt.pcudown[0],
1186                                                                                         email_args)
1187                                                 if ticket_id == 0:
1188                                                         # error.
1189                                                         print "got a ticket_id == 0!!!! %s" % act_record['nodename']
1190                                                         os._exit(1)
1191                                                         pass
1192                                                 email_args['ticket_id'] = ticket_id
1193
1194                         
1195                         act_record = issue_record_list[0]
1196                         # send message before squeezing
1197                         print "\t\tconfig.email: %s and %s" % (act_record['message'] != None, 
1198                                                                                                 site_record['config']['email'])
1199                         if act_record['message'] != None and site_record['config']['email']:
1200                                 ticket_id = self.__emailSite(loginbase, act_record['email'], 
1201                                                                                          act_record['message'], email_args)
1202
1203                                 if ticket_id == 0:
1204                                         # error.
1205                                         print "ticket_id == 0 for %s %s" % (loginbase, act_record['nodename'])
1206                                         os._exit(1)
1207                                         pass
1208
1209                                 # Add ticket_id to ALL nodenames
1210                                 for act_record in issue_record_list:
1211                                         nodename = act_record['nodename']
1212                                         # update node record with RT ticket_id
1213                                         if nodename in self.act_all:
1214                                                 self.act_all[nodename][0]['ticket_id'] = "%s" % ticket_id
1215                                                 # if the ticket was previously resolved, reset it to new.
1216                                                 if 'rt' in act_record and \
1217                                                         'Status' in act_record['rt'] and \
1218                                                         act_record['rt']['Status'] == 'resolved':
1219                                                         mailer.setTicketStatus(ticket_id, "new")
1220                                                 status = mailer.getTicketStatus(ticket_id)
1221                                                 self.act_all[nodename][0]['rt'] = status
1222                                         if config.mail: i_nodes_emailed += 1
1223
1224                         print "\t\tconfig.squeeze: %s and %s" % (config.squeeze,
1225                                                                                                         site_record['config']['squeeze'])
1226                         if config.squeeze and site_record['config']['squeeze']:
1227                                 for act_key in act_record['action']:
1228                                         self.actions[act_key](email_args)
1229                                 i_nodes_actedon += 1
1230                 
1231                 if config.policysavedb:
1232                         print "Saving Databases... act_all, diagnose_out"
1233                         database.dbDump("act_all", self.act_all)
1234                         # remove site record from diagnose_out, it's in act_all as done.
1235                         del self.diagnose_db[loginbase]
1236                         database.dbDump("diagnose_out", self.diagnose_db)
1237
1238                 print "sleeping for 1 sec"
1239                 time.sleep(1)
1240                 #print "Hit enter to continue..."
1241                 #sys.stdout.flush()
1242                 #line = sys.stdin.readline()
1243
1244                 return (i_nodes_actedon, i_nodes_emailed)
1245
1246         def __actOnNode(self, diag_record):
1247                 nodename = diag_record['nodename']
1248                 message = diag_record['message']
1249
1250                 act_record = {}
1251                 act_record.update(diag_record)
1252                 act_record['nodename'] = nodename
1253                 act_record['msg_format'] = self._format_diaginfo(diag_record)
1254                 print "act_record['stage'] == %s " % act_record['stage']
1255
1256                 # avoid end records, and nmreset records                                        
1257                 # reboot_node_failed, is set below, so don't reboot repeatedly.
1258
1259                 if 'monitor-end-record' not in act_record['stage'] and \
1260                    'nmreset' not in act_record['stage'] and \
1261                    'reboot_node_failed' not in act_record:
1262
1263                         if "DOWN" in act_record['log'] and \
1264                                         'pcu_ids' in act_record['plcnode'] and \
1265                                         len(act_record['plcnode']['pcu_ids']) > 0:
1266
1267                                 print "%s" % act_record['log'],
1268                                 print "%15s" % (['reboot_node'],)
1269                                 # Set node to re-install
1270                                 plc.nodeBootState(act_record['nodename'], "rins")       
1271                                 try:
1272                                         ret = reboot_node({'hostname': act_record['nodename']})
1273                                 except Exception, exc:
1274                                         print "exception on reboot_node:"
1275                                         import traceback
1276                                         print traceback.print_exc()
1277                                         ret = False
1278
1279                                 if ret: # and ( 'reboot_node_failed' not in act_record or act_record['reboot_node_failed'] == False):
1280                                         # Reboot Succeeded
1281                                         print "reboot succeeded for %s" % act_record['nodename']
1282                                         act_record2 = {}
1283                                         act_record2.update(act_record)
1284                                         act_record2['action'] = ['reboot_node']
1285                                         act_record2['stage'] = "reboot_node"
1286                                         act_record2['reboot_node_failed'] = False
1287                                         act_record2['email_pcu'] = False
1288
1289                                         if nodename not in self.act_all: 
1290                                                 self.act_all[nodename] = []
1291                                         print "inserting 'reboot_node' record into act_all"
1292                                         self.act_all[nodename].insert(0,act_record2)
1293
1294                                         # return None to avoid further action
1295                                         print "Taking no further action"
1296                                         return None
1297                                 else:
1298                                         print "reboot failed for %s" % act_record['nodename']
1299                                         # set email_pcu to also send pcu notice for this record.
1300                                         act_record['reboot_node_failed'] = True
1301                                         act_record['email_pcu'] = True
1302
1303                         print "%s" % act_record['log'],
1304                         print "%15s" % act_record['action']
1305
1306                 if act_record['stage'] is not 'monitor-end-record' and \
1307                    act_record['stage'] is not 'nmreset':
1308                         if nodename not in self.act_all: 
1309                                 self.act_all[nodename] = []
1310
1311                         self.act_all[nodename].insert(0,act_record)
1312                 else:
1313                         print "Not recording %s in act_all" % nodename
1314
1315                 return act_record
1316
1317         def analyseSites(self):
1318                 i_sites_observed = 0
1319                 i_sites_diagnosed = 0
1320                 i_nodes_diagnosed = 0
1321                 i_nodes_actedon = 0
1322                 i_sites_emailed = 0
1323                 l_allsites = []
1324
1325                 sorted_sites = self.sickdb.keys()
1326                 sorted_sites.sort()
1327                 for loginbase in sorted_sites:
1328                         site_record = self.sickdb[loginbase]
1329                         print "sites: %s" % loginbase
1330                         
1331                         i_nodes_diagnosed += len(site_record.keys())
1332                         i_sites_diagnosed += 1
1333
1334                         (na,ne) = self.__actOnSite(loginbase, site_record)
1335
1336                         i_sites_observed += 1
1337                         i_nodes_actedon += na
1338                         i_sites_emailed += ne
1339
1340                         l_allsites += [loginbase]
1341
1342                 return {'sites_observed': i_sites_observed, 
1343                                 'sites_diagnosed': i_sites_diagnosed, 
1344                                 'nodes_diagnosed': i_nodes_diagnosed, 
1345                                 'sites_emailed': i_sites_emailed, 
1346                                 'nodes_actedon': i_nodes_actedon, 
1347                                 'allsites':l_allsites}
1348
1349         def print_stats(self, key, stats):
1350                 print "%20s : %d" % (key, stats[key])
1351
1352
1353
1354         #"""
1355         #Prints, logs, and emails status of up nodes, down nodes, and buckets.
1356         #"""
1357         #def status(self):
1358         #       sub = "Monitor Summary"
1359         #       msg = "\nThe following nodes were acted upon:  \n\n"
1360         #       for (node, (type, date)) in self.emailed.items():
1361         #               # Print only things acted on today.
1362         #               if (time.gmtime(time.time())[2] == time.gmtime(date)[2]):
1363         #                       msg +="%s\t(%s)\t%s\n" %(node, type, time.ctime(date))
1364         #       msg +="\n\nThe following sites have been 'squeezed':\n\n"
1365         #       for (loginbase, (date, type)) in self.squeezed.items():
1366         #               # Print only things acted on today.
1367         #               if (time.gmtime(time.time())[2] == time.gmtime(date)[2]):
1368         #                       msg +="%s\t(%s)\t%s\n" %(loginbase, type, time.ctime(date))
1369         #       mailer.email(sub, msg, [SUMTO])
1370         #       logger.info(msg)
1371         #       return 
1372
1373         #"""
1374         #Store/Load state of emails.  When, where, what.
1375         #"""
1376         #def emailedStore(self, action):
1377         #       try:
1378         #               if action == "LOAD":
1379         #                       f = open(DAT, "r+")
1380         #                       logger.info("POLICY:  Found and reading " + DAT)
1381         #                       self.emailed.update(pickle.load(f))
1382         #               if action == "WRITE":
1383         #                       f = open(DAT, "w")
1384         #                       #logger.debug("Writing " + DAT)
1385         #                       pickle.dump(self.emailed, f)
1386         #               f.close()
1387         #       except Exception, err:
1388         #               logger.info("POLICY:  Problem with DAT, %s" %err)
1389
1390
1391 #class Policy(Thread):
1392
1393 def main():
1394         print "policy.py is a module, not a script for running directly."
1395
1396 if __name__ == '__main__':
1397         import os
1398         import plc
1399         try:
1400                 main()
1401         except KeyboardInterrupt:
1402                 print "Killed.  Exitting."
1403                 logger.info('Monitor Killed')
1404                 os._exit(0)