Massive commit. Just put all local changes into svn.
[monitor.git] / policy.py
1 #
2 # Copyright (c) 2004  The Trustees of Princeton University (Trustees).
3 #
4 # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
5 #
6 # $Id: policy.py,v 1.17 2007/08/29 17:26:50 soltesz Exp $
7 #
8 # Policy Engine.
9
10 #from monitor import *
11 from threading import *
12 import time
13 import logging
14 import mailer
15 import emailTxt
16 import pickle
17 import Queue
18 import plc
19 import sys
20 import os
21 import reboot
22 import soltesz
23 import string
24 from www.printbadnodes import cmpCategoryVal
25 from config import config
26 #print "policy"
27 config = config()
28
# Path of the monitor data file.
DAT = "./monitor.dat"

logger = logging.getLogger("monitor")

# Interval between policy enforcement passes
# (presumably seconds, like the other durations below -- TODO confirm).
POLSLEEP = 7200

# Where to email the summary, plus address templates that are filled in
# with a loginbase / slice name via the %s placeholder.
SUMTO = "soltesz@cs.princeton.edu"
TECHEMAIL = "tech-%s@sites.planet-lab.org"
PIEMAIL = "pi-%s@sites.planet-lab.org"
SLICEMAIL = "%s@slices.planet-lab.org"
PLCEMAIL = "support@planet-lab.org"

# Time unit helpers, expressed in seconds.
SPERMIN = 60
SPERHOUR = 60 * SPERMIN
SPERDAY = 24 * SPERHOUR

# Thresholds: a number of days multiplied out by SPERDAY.
PITHRESH = 7 * SPERDAY
SLICETHRESH = 7 * SPERDAY
# Days before attempting rins again
RINSTHRESH = 5 * SPERDAY

# Days before calling the node dead.
DEADTHRESH = 30 * SPERDAY
# Minimum number of nodes up before squeezing
MINUP = 2

# Contact classes; the values are distinct powers of two
# (presumably so they can be OR-ed together -- TODO confirm).
TECH = 1
PI = 2
USER = 4
ADMIN = 8
61
62 # IF:
63 #  no SSH, down.
64 #  bad disk, down
65 #  DNS, kinda down (sick)
66 #  clock, kinda down (sick)
67 #  Full disk, going to be down
68
69 # Actions:
70 #  Email
71 #  suspend slice creation
72 #  kill slices
def array_to_priority_map(array):
        """Map every entry of `array` to its position in the sequence.

        The position doubles as a priority (the first entry gets 0).  When an
        entry occurs more than once, the position of its last occurrence is
        the one that is kept.  The original note says the result is meant for
        a cmpMap() function, which is not part of this section of the file.
        """
        return dict((entry, position) for position, entry in enumerate(array))
83
def getdebug():
        """Return the `debug` attribute of the module-level `config` object."""
        debug_flag = config.debug
        return debug_flag
86
def print_stats(key, stats):
        """Print one "name : value" line for `key`, if `stats` contains it.

        The value is formatted with %d; a key that is absent from `stats`
        prints nothing.
        """
        if key not in stats:
                return
        print("%20s : %d" % (key, stats[key]))
89
def get_ticket_id(record):
        """Return the ticket id stored in `record`, or None if there is none.

        'ticket_id' is consulted first and 'found_rt_ticket' second.  A field
        counts as unset when the key is missing, or when its value is None or
        the empty string.

        Values are compared with != rather than `is not`: identity against a
        string literal only works when the interpreter happens to share the
        string object, so an empty string built at run time could otherwise
        be returned as if it were a real ticket id.
        """
        for key in ('ticket_id', 'found_rt_ticket'):
                if key not in record:
                        continue
                value = record[key]
                if value is not None and value != "":
                        return value
        return None
99
class Merge(Thread):
        """Thread that merges findbad observations with earlier action records.

        run() executes three steps in order: accumSickSites() builds sickdb
        from the findbad data, mergeActionsAndBadDB() combines it with the
        act_all history into mergedb, and sendToRT() pushes every merged
        record onto the `toRT` queue, followed by None as an end marker.
        """

        def __init__(self, l_merge, toRT):
                """Load the cached databases and remember the work list.

                l_merge -- container of node names to process; nodes that are
                           not in it are skipped.
                toRT    -- queue-like object; only its put() method is used.
                """
                self.toRT = toRT
                self.merge_list = l_merge
                # the hostname to loginbase mapping
                self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")

                # Previous actions taken on nodes.
                self.act_all = soltesz.if_cached_else(1, "act_all", lambda : {})
                self.findbad = soltesz.if_cached_else(1, "findbad", lambda : {})

                # Second load of "act_all"; entries are removed from it as
                # nodes are merged (see mergeActionsAndBadDB).
                self.cache_all = soltesz.if_cached_else(1, "act_all", lambda : {})
                self.sickdb = {}
                self.mergedb = {}
                Thread.__init__(self)

        def run(self):
                """Run the three merge steps in order."""
                # populate sickdb
                self.accumSickSites()
                # read data from findbad and act_all
                self.mergeActionsAndBadDB()
                # pass node_records to RT
                self.sendToRT()

        def accumSickSites(self):
                """
                Take every node reported by findbad that is also in merge_list and
                insert a fresh record for it into sickdb[] as:

                        sickdb[loginbase][nodename] = fb_record

                Nodes whose findbad 'values' have no usable 'category' are
                printed for inspection and skipped.
                """
                fb_nodes = self.findbad['nodes']
                count = 0
                for nodename in fb_nodes.keys():
                        if nodename not in self.merge_list:
                                continue                # skip this node, since it's not wanted

                        loginbase = self.plcdb_hn2lb[nodename]
                        values = fb_nodes[nodename]['values']

                        try:
                                category = values['category']
                        except (KeyError, TypeError):
                                # 'category' is missing, or 'values' is not a
                                # mapping at all: show what we got and move on.
                                print(values)
                                print(nodename)
                                print(fb_nodes[nodename])
                                continue

                        count += 1
                        # One timestamp for both fields, so they agree exactly.
                        now = time.time()
                        fb_record = {
                                'nodename': nodename,
                                'category': category,
                                'state': values['state'],
                                'comonstats': values['comonstats'],
                                'plcnode': values['plcnode'],
                                'kernel': self.getKernel(values['kernel']),
                                'stage': "findbad",
                                'message': None,
                                'bootcd': values['bootcd'],
                                'args': None,
                                'info': None,
                                'time': now,
                                'date_created': now,
                        }

                        self.sickdb.setdefault(loginbase, {})[nodename] = fb_record

                print("Found %d nodes" % count)

        def getKernel(self, unamestr):
                """Return the third whitespace-separated field of `unamestr`.

                Returns "" when the string has fewer than three fields.
                """
                fields = unamestr.split()
                if len(fields) > 2:
                        return fields[2]
                return ""

        def _newProblemRecord(self, fb_record):
                """Return a copy of `fb_record` marked as having no history."""
                record = {}
                record.update(fb_record)
                record['ticket_id'] = ""
                record['prev_category'] = "NORECORD"
                return record

        def mergeActionsAndBadDB(self):
                """
                - Look at the sick node_records as reported in findbad,
                - Then look at the node_records in act_all.

                There are four cases:
                1) Problem in findbad, no problem in act_all
                        this ok, b/c it just means it's a new problem
                2) Problem in findbad, problem in act_all
                        -Did the problem get better or worse?
                                -If Same, or Worse, then continue looking for open tickets.
                                -If Better, or No problem, then "back-off" penalties.
                                        This judgement may need to wait until 'Diagnose()'

                3) No problem in findbad, problem in act_all
                        Then the node is operational again according to Findbad()

                4) No problem in findbad, no problem in act_all
                        There won't be a record in either db, so there's no code.
                """
                # Fields always refreshed from the current findbad record.
                fresh_fields = ('comonstats', 'category', 'state',
                                                'kernel', 'bootcd', 'plcnode')

                # look at all problems reported by findbad
                for loginbase in sorted(self.sickdb.keys()):
                        site_nodes = self.sickdb[loginbase]
                        for nodename in sorted(site_nodes.keys()):
                                fb_record = site_nodes[nodename]
                                site_merged = self.mergedb.setdefault(loginbase, {})

                                # We must compare findbad state with act_all state
                                if nodename not in self.act_all:
                                        # 1) ok, b/c it's a new problem. set ticket_id to null
                                        site_merged[nodename] = self._newProblemRecord(fb_record)
                                        continue

                                history = self.act_all[nodename]
                                if len(history) == 0:
                                        # known node, but no previous records:
                                        # treat it like a new problem.
                                        site_merged[nodename] = self._newProblemRecord(fb_record)
                                else:
                                        # 2) start from the first stored record
                                        #    and overlay the fresh observations.
                                        last_action = history[0]
                                        # NOTE: this writes into the act_all record itself.
                                        last_action['prev_category'] = last_action['category']

                                        merged = {}
                                        site_merged[nodename] = merged
                                        merged.update(last_action)
                                        for field in fresh_fields:
                                                merged[field] = fb_record[field]
                                        ticket = get_ticket_id(merged)
                                        merged['rt'] = mailer.getTicketStatus(ticket)

                                # remove the entry from cache_all to keep it out of case 3);
                                # tolerate it being absent already.
                                self.cache_all.pop(nodename, None)

                # 3) nodes that remain in cache_all were not identified by findbad.
                #        Do we keep them or not?
                #   NOTE: i think that since the categories are performed before this
                #               step now, and by a monitor-controlled agent.

                # TODO: This does not work correctly.  Do we need this?
                #for hn in self.cache_all.keys():
                #       y = self.act_all[hn][0]
                #       if 'monitor' in y['bucket']:
                #               loginbase = self.plcdb_hn2lb[hn]
                #               if loginbase not in self.sickdb:
                #                       self.sickdb[loginbase] = {}
                #               self.sickdb[loginbase][hn] = y
                #       else:
                #               del self.cache_all[hn]

                print("len of cache_all: %d" % len(self.cache_all.keys()))
                return

        def sendToRT(self):
                """Put every merged record on the toRT queue, then a None marker.

                Sites are visited in sorted order.
                """
                # look at all problems reported by merge
                for loginbase in sorted(self.mergedb.keys()):
                        site_records = self.mergedb[loginbase]
                        for nodename in site_records.keys():
                                self.toRT.put(site_records[nodename])

                # send signal to stop reading
                self.toRT.put(None)
                return
285
286 class Diagnose(Thread):
287         def __init__(self, fromRT):
288                 self.fromRT = fromRT
289                 self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")
290                 self.findbad = soltesz.if_cached_else(1, "findbad", lambda : {})
291
292                 self.diagnose_in = {}
293                 self.diagnose_out = {}
294                 Thread.__init__(self)
295
296
297         def run(self):
298                 self.accumSickSites()
299
300                 print "Accumulated %d sick sites" % len(self.diagnose_in.keys())
301                 logger.debug("Accumulated %d sick sites" % len(self.diagnose_in.keys()))
302
303                 try:
304                         stats = self.diagnoseAll()
305                 except Exception, err:
306                         print "----------------"
307                         import traceback
308                         print traceback.print_exc()
309                         print err
310                         #if config.policysavedb:
311                         sys.exit(1)
312
313                 print_stats("sites_observed", stats)
314                 print_stats("sites_diagnosed", stats)
315                 print_stats("nodes_diagnosed", stats)
316
317                 if config.policysavedb:
318                         print "Saving Databases... diagnose_out"
319                         soltesz.dbDump("diagnose_out", self.diagnose_out)
320
321         def accumSickSites(self):
322                 """
323                 Take all nodes, from l_diagnose, look them up in the diagnose_out database, 
324                 and insert them into diagnose_in[] as:
325
326                         diagnose_in[loginbase] = [diag_node1, diag_node2, ...]
327                 """
328                 while 1:
329                         node_record = self.fromRT.get(block = True)
330                         if node_record == None:
331                                 break;
332
333                         nodename = node_record['nodename']
334                         loginbase = self.plcdb_hn2lb[nodename]
335
336                         if loginbase not in self.diagnose_in:
337                                 self.diagnose_in[loginbase] = {}
338
339                         self.diagnose_in[loginbase][nodename] = node_record
340
341                 return
342
343         def diagnoseAll(self):
344                 i_sites_observed = 0
345                 i_sites_diagnosed = 0
346                 i_nodes_diagnosed = 0
347                 i_nodes_actedon = 0
348                 i_sites_emailed = 0
349                 l_allsites = []
350
351                 sorted_sites = self.diagnose_in.keys()
352                 sorted_sites.sort()
353                 self.diagnose_out= {}
354                 for loginbase in sorted_sites:
355                         l_allsites += [loginbase]
356
357                         d_diag_nodes = self.diagnose_in[loginbase]
358                         d_act_records = self.__diagnoseSite(loginbase, d_diag_nodes)
359                         # store records in diagnose_out, for saving later.
360                         self.diagnose_out.update(d_act_records)
361                         
362                         if len(d_act_records[loginbase]['nodes'].keys()) > 0:
363                                 i_nodes_diagnosed += (len(d_act_records[loginbase]['nodes'].keys()))
364                                 i_sites_diagnosed += 1
365                         i_sites_observed += 1
366
367                 return {'sites_observed': i_sites_observed, 
368                                 'sites_diagnosed': i_sites_diagnosed, 
369                                 'nodes_diagnosed': i_nodes_diagnosed, 
370                                 'allsites':l_allsites}
371
372                 pass
373                 
374         def getDaysDown(cls, diag_record):
375                 daysdown = -1
376                 if diag_record['comonstats']['uptime'] != "null":
377                         #print "uptime %s" % (int(float(diag_record['comonstats']['uptime'])) // (60*60*24))
378                         daysdown = - int(float(diag_record['comonstats']['uptime'])) // (60*60*24)
379                 elif diag_record['comonstats']['sshstatus'] != "null":
380                         daysdown = int(diag_record['comonstats']['sshstatus']) // (60*60*24)
381                 elif diag_record['comonstats']['lastcotop'] != "null":
382                         daysdown = int(diag_record['comonstats']['lastcotop']) // (60*60*24)
383                 else:
384                         now = time.time()
385                         last_contact = diag_record['plcnode']['last_contact']
386                         if last_contact == None:
387                                 # the node has never been up, so give it a break
388                                 daysdown = -1
389                         else:
390                                 diff = now - last_contact
391                                 daysdown = diff // (60*60*24)
392                 return daysdown
393         getDaysDown = classmethod(getDaysDown)
394
395         def getStrDaysDown(cls, diag_record):
396                 daysdown = cls.getDaysDown(diag_record)
397                 if daysdown > 0:
398                         return "%d days down"%daysdown
399                 elif daysdown == -1:
400                         return "Unknown number of days"
401                 else:
402                         return "%d days up"% -daysdown
403         getStrDaysDown = classmethod(getStrDaysDown)
404
405         def __getCDVersion(self, diag_record, nodename):
406                 cdversion = ""
407                 #print "Getting kernel for: %s" % diag_record['nodename']
408                 cdversion = diag_record['kernel']
409                 return cdversion
410
411         def __diagnoseSite(self, loginbase, d_diag_nodes):
412                 """
413                 d_diag_nodes are diagnose_in entries.
414                 """
415                 d_diag_site = {loginbase : { 'config' : 
416                                                                                                 {'squeeze': False,
417                                                                                                  'email': False
418                                                                                                 }, 
419                                                                         'nodes': {}
420                                                                         }
421                                            }
422                 sorted_nodes = d_diag_nodes.keys()
423                 sorted_nodes.sort()
424                 for nodename in sorted_nodes:
425                         node_record = d_diag_nodes[nodename]
426                         diag_record = self.__diagnoseNode(loginbase, node_record)
427
428                         if diag_record != None:
429                                 d_diag_site[loginbase]['nodes'][nodename] = diag_record
430
431                                 # NOTE: improvement means, we need to act/squeeze and email.
432                                 #print "DIAG_RECORD", diag_record
433                                 if 'monitor-end-record' in diag_record['stage'] or \
434                                    'nmreset' in diag_record['stage']:
435                                 #       print "resetting loginbase!" 
436                                         d_diag_site[loginbase]['config']['squeeze'] = True
437                                         d_diag_site[loginbase]['config']['email'] = True
438                                 #else:
439                                 #       print "NO IMPROVEMENT!!!!"
440                         else:
441                                 pass # there is nothing to do for this node.
442
443                 # NOTE: these settings can be overridden by command line arguments,
444                 #       or the state of a record, i.e. if already in RT's Support Queue.
445                 nodes_up = self.getUpAtSite(loginbase, d_diag_site)
446                 if nodes_up < MINUP:
447                         d_diag_site[loginbase]['config']['squeeze'] = True
448
449                 max_slices = self.getMaxSlices(loginbase)
450                 num_nodes = self.getNumNodes(loginbase)
451                 # NOTE: when max_slices == 0, this is either a new site (the old way)
452                 #       or an old disabled site from previous monitor (before site['enabled'])
453                 if nodes_up < num_nodes and max_slices != 0:
454                         d_diag_site[loginbase]['config']['email'] = True
455
456                 if len(d_diag_site[loginbase]['nodes'].keys()) > 0:
457                         print "SITE: %20s : %d nodes up, at most" % (loginbase, nodes_up)
458
459                 return d_diag_site
460
461         def diagRecordByCategory(self, node_record):
462                 nodename = node_record['nodename']
463                 category = node_record['category']
464                 state    = node_record['state']
465                 loginbase = self.plcdb_hn2lb[nodename]
466                 diag_record = None
467
468                 if  "ERROR" in category:        # i.e. "DOWN"
469                         diag_record = {}
470                         diag_record.update(node_record)
471                         daysdown = self.getDaysDown(diag_record) 
472                         if daysdown < 7:
473                                 format = "DIAG: %20s : %-40s Down only %s days  NOTHING DONE"
474                                 print format % (loginbase, nodename, daysdown)
475                                 return None
476
477                         s_daysdown = self.getStrDaysDown(diag_record)
478                         diag_record['message'] = emailTxt.mailtxt.newdown
479                         diag_record['args'] = {'nodename': nodename}
480                         diag_record['info'] = (nodename, s_daysdown, "")
481
482                         if 'reboot_node_failed' in node_record:
483                                 # there was a previous attempt to use the PCU.
484                                 if node_record['reboot_node_failed'] == False:
485                                         # then the last attempt apparently, succeeded.
486                                         # But, the category is still 'ERROR'.  Therefore, the
487                                         # PCU-to-Node mapping is broken.
488                                         #print "Setting message for ERROR node to PCU2NodeMapping: %s" % nodename
489                                         diag_record['message'] = emailTxt.mailtxt.pcutonodemapping
490                                         diag_record['email_pcu'] = True
491
492                         if 'ticket_id' in diag_record:
493                                 if diag_record['ticket_id'] == "":
494                                         if 'found_rt_ticket' in diag_record:
495                                                 ticket_id = diag_record['found_rt_ticket']
496                                         else:
497                                                 ticket_id = "None"
498                                 else:
499                                         ticket_id = diag_record['ticket_id']
500                         else:
501                                 ticket_id = "None"
502
503                         diag_record['log'] = "DOWN: %20s : %-40s == %20s %s" % \
504                                         (loginbase, nodename, diag_record['info'][1:], ticket_id)
505
506                 elif "OLDBOOTCD" in category:
507                         # V2 boot cds as determined by findbad
508                         s_daysdown = self.getStrDaysDown(node_record)
509                         s_cdversion = self.__getCDVersion(node_record, nodename)
510                         diag_record = {}
511                         diag_record.update(node_record)
512                         #if "2.4" in diag_record['kernel'] or "v2" in diag_record['bootcd']:
513                         diag_record['message'] = emailTxt.mailtxt.newbootcd
514                         diag_record['args'] = {'nodename': nodename}
515                         diag_record['info'] = (nodename, s_daysdown, s_cdversion)
516                         if diag_record['ticket_id'] == "":
517                                 diag_record['log'] = "BTCD: %20s : %-40s == %20s %20s %s" % \
518                                                                         (loginbase, nodename, diag_record['kernel'], 
519                                                                          diag_record['bootcd'], diag_record['found_rt_ticket'])
520                         else:
521                                 diag_record['log'] = "BTCD: %20s : %-40s == %20s %20s %s" % \
522                                                                         (loginbase, nodename, diag_record['kernel'], 
523                                                                          diag_record['bootcd'], diag_record['ticket_id'])
524
525                 elif "PROD" in category:
526                         if "DEBUG" in state:
527                                 # Not sure what to do with these yet.  Probably need to
528                                 # reboot, and email.
529                                 print "DEBG: %20s : %-40s  NOTHING DONE" % (loginbase, nodename)
530                                 return None
531                         elif "BOOT" in state:
532                                 # no action needed.
533                                 # TODO: remove penalties, if any are applied.
534                                 now = time.time()
535                                 last_contact = node_record['plcnode']['last_contact']
536                                 if last_contact == None:
537                                         time_diff = 0
538                                 else:
539                                         time_diff = now - last_contact;
540
541                                 if 'improvement' in node_record['stage']:
542                                         # then we need to pass this on to 'action'
543                                         diag_record = {}
544                                         diag_record.update(node_record)
545                                         diag_record['message'] = emailTxt.mailtxt.newthankyou
546                                         diag_record['args'] = {'nodename': nodename}
547                                         diag_record['info'] = (nodename, node_record['prev_category'], 
548                                                                                                          node_record['category'])
549                                         if 'email_pcu' in diag_record:
550                                                 if diag_record['email_pcu']:
551                                                         # previously, the pcu failed to reboot, so send
552                                                         # email. Now, reset these values to try the reboot
553                                                         # again.
554                                                         diag_record['email_pcu'] = False
555                                                         del diag_record['reboot_node_failed']
556
557                                         if diag_record['ticket_id'] == "":
558                                                 diag_record['log'] = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
559                                                                         (loginbase, nodename, diag_record['stage'], 
560                                                                          state, category, diag_record['found_rt_ticket'])
561                                         else:
562                                                 diag_record['log'] = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
563                                                                         (loginbase, nodename, diag_record['stage'], 
564                                                                          state, category, diag_record['ticket_id'])
565                                         return diag_record
566                                 #elif time_diff >= 6*SPERHOUR:
567                                 #       # heartbeat is older than 30 min.
568                                 #       # then reset NM.
569                                 #       #print "Possible NM problem!! %s - %s = %s" % (now, last_contact, time_diff)
570                                 #       diag_record = {}
571                                 #       diag_record.update(node_record)
572                                 #       diag_record['message'] = emailTxt.mailtxt.NMReset
573                                 #       diag_record['args'] = {'nodename': nodename}
574                                 #       diag_record['stage'] = "nmreset"
575                                 #       diag_record['info'] = (nodename, 
576                                 #                                                       node_record['prev_category'], 
577                                 #                                                       node_record['category'])
578                                 #       if diag_record['ticket_id'] == "":
579                                 #               diag_record['log'] = "NM  : %20s : %-40s == %20s %20s %s %s" % \
580                                 #                                       (loginbase, nodename, diag_record['stage'], 
581                                 #                                        state, category, diag_record['found_rt_ticket'])
582                                 #       else:
583                                 #               diag_record['log'] = "NM  : %20s : %-40s == %20s" % \
584                                 #                                       (loginbase, nodename, diag_record['stage'])
585 #
586 #                                       return diag_record
587                                 else:
588                                         return None
589                         else:
590                                 # unknown
591                                 pass
592                 elif "ALPHA"    in category:
593                         pass
594                 elif "clock_drift" in category:
595                         pass
596                 elif "dns"    in category:
597                         pass
598                 elif "filerw"    in category:
599                         pass
600                 else:
601                         print "Unknown category!!!! %s" % category
602                         sys.exit(1)
603
604                 return diag_record
605
606         def __diagnoseNode(self, loginbase, node_record):
607                 # TODO: change the format of the hostname in this 
608                 #               record to something more natural.
609                 nodename                = node_record['nodename']
610                 category                = node_record['category']
611                 prev_category   = node_record['prev_category']
612                 state                   = node_record['state']
613                 #if 'prev_category' in node_record:
614                 #       prev_category = node_record['prev_category']
615                 #else:
616                 #       prev_category = "ERROR"
617                 if node_record['prev_category'] != "NORECORD":
618                 
619                         val = cmpCategoryVal(category, prev_category)
620                         print "%s went from %s -> %s" % (nodename, prev_category, category)
621                         if val == 1:
622                                 # improved
623                                 if node_record['ticket_id'] == "" or node_record['ticket_id'] == None:
624                                         print "closing record with no ticket: ", node_record['nodename']
625                                         node_record['action'] = ['close_rt']
626                                         node_record['message'] = None
627                                         node_record['stage'] = 'monitor-end-record'
628                                         return node_record
629                                 else:
630                                         node_record['stage'] = 'improvement'
631
632                                 #if 'monitor-end-record' in node_record['stage']:
633                                 #       # just ignore it if it's already ended.
634                                 #       # otherwise, the status should be worse, and we won't get
635                                 #       # here.
636                                 #       print "monitor-end-record: ignoring ", node_record['nodename']
637                                 #       return None
638 #
639 #                                       #return None
640                         elif val == -1:
641                                 # current category is worse than previous, carry on
642                                 pass
643                         else:
644                                 #values are equal, carry on.
645                                 #print "why are we here?"
646                                 pass
647
648                 if 'rt' in node_record and 'Status' in node_record['rt']:
649                         if node_record['stage'] == 'ticket_waitforever':
650                                 if 'resolved' in node_record['rt']['Status']:
651                                         print "ending waitforever record for: ", node_record['nodename']
652                                         node_record['action'] = ['noop']
653                                         node_record['message'] = None
654                                         node_record['stage'] = 'monitor-end-record'
655                                         print "oldlog: %s" % node_record['log'],
656                                         print "%15s" % node_record['action']
657                                         return node_record
658                                 if 'new' in node_record['rt']['Status'] and \
659                                         'Queue' in node_record['rt'] and \
660                                         'Monitor' in node_record['rt']['Queue']:
661
662                                         print "RESETTING stage to findbad"
663                                         node_record['stage'] = 'findbad'
664                         
665                 #### COMPARE category and prev_category
666                 # if not_equal
667                 #       then assign a stage based on relative priorities
668                 # else equal
669                 #       then check category for stats.
670                 diag_record = self.diagRecordByCategory(node_record)
671                 if diag_record == None:
672                         #print "diag_record == None"
673                         return None
674
675                 #### found_RT_ticket
676                 # TODO: need to record time found, and maybe add a stage for acting on it...
677                 # NOTE: after found, if the support ticket is resolved, the block is
678                 #               not removed. How to remove the block on this?
679                 if 'found_rt_ticket' in diag_record and \
680                         diag_record['found_rt_ticket'] is not None:
681                         if diag_record['stage'] is not 'improvement':
682                                 diag_record['stage'] = 'ticket_waitforever'
683                                 
684                 current_time = time.time()
685                 # take off four days, for the delay that database caused.
686                 # TODO: generalize delays at PLC, and prevent enforcement when there
687                 #               have been no emails.
688                 # NOTE: 7*SPERDAY exists to offset the 'bad week'
689                 #delta = current_time - diag_record['time'] - 7*SPERDAY
690                 delta = current_time - diag_record['time']
691
692                 message = diag_record['message']
693                 act_record = {}
694                 act_record.update(diag_record)
695
696                 #### DIAGNOSE STAGES 
697                 if   'findbad' in diag_record['stage']:
698                         # The node is bad, and there's no previous record of it.
699                         act_record['email'] = TECH
700                         act_record['action'] = ['noop']
701                         act_record['message'] = message[0]
702                         act_record['stage'] = 'stage_actinoneweek'
703
704                 elif 'nmreset' in diag_record['stage']:
705                         act_record['email']  = ADMIN 
706                         act_record['action'] = ['reset_nodemanager']
707                         act_record['message'] = message[0]
708                         act_record['stage']  = 'nmreset'
709                         return None
710
711                 elif 'reboot_node' in diag_record['stage']:
712                         act_record['email'] = TECH
713                         act_record['action'] = ['noop']
714                         act_record['message'] = message[0]
715                         act_record['stage'] = 'stage_actinoneweek'
716                         
717                 elif 'improvement' in diag_record['stage']:
718                         # - backoff previous squeeze actions (slice suspend, nocreate)
719                         # TODO: add a backoff_squeeze section... Needs to runthrough
720                         print "backing off of %s" % nodename
721                         act_record['action'] = ['close_rt']
722                         act_record['message'] = message[0]
723                         act_record['stage'] = 'monitor-end-record'
724
725                 elif 'actinoneweek' in diag_record['stage']:
726                         if delta >= 7 * SPERDAY: 
727                                 act_record['email'] = TECH | PI
728                                 act_record['stage'] = 'stage_actintwoweeks'
729                                 act_record['message'] = message[1]
730                                 act_record['action'] = ['nocreate' ]
731                                 act_record['time'] = current_time               # reset clock for waitforever
732                         elif delta >= 3* SPERDAY and not 'second-mail-at-oneweek' in act_record:
733                                 act_record['email'] = TECH 
734                                 act_record['message'] = message[0]
735                                 act_record['action'] = ['sendmailagain-waitforoneweekaction' ]
736                                 act_record['second-mail-at-oneweek'] = True
737                         else:
738                                 act_record['message'] = None
739                                 act_record['action'] = ['waitforoneweekaction' ]
740                                 print "ignoring this record for: %s" % act_record['nodename']
741                                 return None                     # don't send if there's no action
742
743                 elif 'actintwoweeks' in diag_record['stage']:
744                         if delta >= 7 * SPERDAY:
745                                 act_record['email'] = TECH | PI | USER
746                                 act_record['stage'] = 'stage_waitforever'
747                                 act_record['message'] = message[2]
748                                 act_record['action'] = ['suspendslices']
749                                 act_record['time'] = current_time               # reset clock for waitforever
750                         elif delta >= 3* SPERDAY and not 'second-mail-at-twoweeks' in act_record:
751                                 act_record['email'] = TECH | PI
752                                 act_record['message'] = message[1]
753                                 act_record['action'] = ['sendmailagain-waitfortwoweeksaction' ]
754                                 act_record['second-mail-at-twoweeks'] = True
755                         else:
756                                 act_record['message'] = None
757                                 act_record['action'] = ['waitfortwoweeksaction']
758                                 return None                     # don't send if there's no action
759
760                 elif 'ticket_waitforever' in diag_record['stage']:
761                         act_record['email'] = TECH
762                         if 'first-found' not in act_record:
763                                 act_record['first-found'] = True
764                                 act_record['log'] += " firstfound"
765                                 act_record['action'] = ['ticket_waitforever']
766                                 act_record['message'] = message[0]
767                                 act_record['time'] = current_time
768                         else:
769                                 if delta >= 7*SPERDAY:
770                                         act_record['action'] = ['ticket_waitforever']
771                                         if 'rt' in act_record and 'Status' in act_record['rt'] and \
772                                                         act_record['rt']['Status'] == 'new':
773                                                 act_record['message'] = message[0]
774                                         else:
775                                                 act_record['message'] = None
776                                                 
777                                         act_record['time'] = current_time               # reset clock
778                                 else:
779                                         act_record['action'] = ['ticket_waitforever']
780                                         act_record['message'] = None
781                                         return None
782
783                 elif 'waitforever' in diag_record['stage']:
784                         # more than 3 days since last action
785                         # TODO: send only on weekdays.
786                         # NOTE: expects that 'time' has been reset before entering waitforever stage
787                         if delta >= 3*SPERDAY:
788                                 act_record['action'] = ['email-againwaitforever']
789                                 act_record['message'] = message[2]
790                                 act_record['time'] = current_time               # reset clock
791                         else:
792                                 act_record['action'] = ['waitforever']
793                                 act_record['message'] = None
794                                 return None                     # don't send if there's no action
795
796                 else:
797                         # There is no action to be taken, possibly b/c the stage has
798                         # already been performed, but diagnose picked it up again.
799                         # two cases, 
800                         #       1. stage is unknown, or 
801                         #       2. delta is not big enough to bump it to the next stage.
802                         # TODO: figure out which. for now assume 2.
803                         print "UNKNOWN stage for %s; nothing done" % nodename
804                         act_record['action'] = ['unknown']
805                         act_record['message'] = message[0]
806
807                         act_record['email'] = TECH
808                         act_record['action'] = ['noop']
809                         act_record['message'] = message[0]
810                         act_record['stage'] = 'stage_actinoneweek'
811                         act_record['time'] = current_time               # reset clock
812                         #print "Exiting..."
813                         #return None
814                         #sys.exit(1)
815
816                 print "%s" % act_record['log'],
817                 print "%15s" % act_record['action']
818                 return act_record
819
820         def getMaxSlices(self, loginbase):
821                 # if sickdb has a loginbase, then it will have at least one node.
822                 site_stats = None
823
824                 for nodename in self.diagnose_in[loginbase].keys():
825                         if nodename in self.findbad['nodes']:
826                                 site_stats = self.findbad['nodes'][nodename]['values']['plcsite']
827                                 break
828
829                 if site_stats == None:
830                         raise Exception, "loginbase with no nodes in findbad"
831                 else:
832                         return site_stats['max_slices']
833
834         def getNumNodes(self, loginbase):
835                 # if sickdb has a loginbase, then it will have at least one node.
836                 site_stats = None
837
838                 for nodename in self.diagnose_in[loginbase].keys():
839                         if nodename in self.findbad['nodes']:
840                                 site_stats = self.findbad['nodes'][nodename]['values']['plcsite']
841                                 break
842
843                 if site_stats == None:
844                         raise Exception, "loginbase with no nodes in findbad"
845                 else:
846                         if 'num_nodes' in site_stats:
847                                 return site_stats['num_nodes']
848                         else:
849                                 return 0
850
851         """
852         Returns number of up nodes as the total number *NOT* in act_all with a
853         stage other than 'steady-state' .
854         """
855         def getUpAtSite(self, loginbase, d_diag_site):
856                 # TODO: THIS DOESN"T WORK!!! it misses all the 'debug' state nodes
857                 #               that aren't recorded yet.
858
859                 numnodes = self.getNumNodes(loginbase)
860                 # NOTE: assume nodes we have no record of are ok. (too conservative)
861                 # TODO: make the 'up' value more representative
862                 up = numnodes
863                 for nodename in d_diag_site[loginbase]['nodes'].keys():
864
865                         rec = d_diag_site[loginbase]['nodes'][nodename]
866                         if rec['stage'] != 'monitor-end-record':
867                                 up -= 1
868                         else:
869                                 pass # the node is assumed to be up.
870
871                 #if up != numnodes:
872                 #       print "ERROR: %s total nodes up and down != %d" % (loginbase, numnodes)
873
874                 return up
875
876
class SiteAction:
        """
        Base class for an action that is run with a dict of arguments.

        run() first verifies that every name in parameter_names is present in
        the argument dict and then delegates to _run(), which subclasses
        override.  The base _run() does nothing and returns None.
        """
        def __init__(self, parameter_names=['hostname', 'ticket_id']):
                # Store a copy: the default list object is shared by every call, so
                # keeping a reference to it would let a change made through one
                # instance alter the requirements of all the others.
                self.parameter_names = list(parameter_names)

        def checkParam(self, args):
                """Raise Exception naming the first required parameter missing from args."""
                for param in self.parameter_names:
                        if param not in args:
                                raise Exception("Parameter %s not provided in args"%param)

        def run(self, args):
                """Validate args and return whatever _run(args) returns."""
                self.checkParam(args)
                return self._run(args)

        def _run(self, args):
                """Hook for subclasses; the base implementation is a no-op."""
                pass
889
class SuspendAction(SiteAction):
        """Action that passes args['hostname'] to plc.suspendSlices."""
        def _run(self, args):
                # Returns whatever plc.suspendSlices returns.
                target = args['hostname']
                return plc.suspendSlices(target)
893
class RemoveSliceCreation(SiteAction):
        """Action that passes args['hostname'] to plc.removeSliceCreation."""
        def _run(self, args):
                # Returns whatever plc.removeSliceCreation returns.
                target = args['hostname']
                return plc.removeSliceCreation(target)
897
class BackoffActions(SiteAction):
        """
        Action that calls plc.enableSlices and then plc.enableSliceCreation
        with args['hostname'].  Always returns True.
        """
        def _run(self, args):
                target = args['hostname']
                plc.enableSlices(target)
                plc.enableSliceCreation(target)
                return True
903
904 # TODO: create class for each action below, 
905 #               allow for lists of actions to be performed...
906
def close_rt_backoff(args):
        """
        Close the RT ticket named in args and re-enable slices for the host.

        Nothing is done unless args has a 'ticket_id' that is neither the empty
        string nor None.  Otherwise the ticket is closed through
        mailer.closeTicketViaRT, and plc.enableSlices and
        plc.enableSliceCreation are called with args['hostname'].

        Always returns None.
        """
        if 'ticket_id' not in args:
                return
        ticket = args['ticket_id']
        if ticket in ("", None):
                return

        mailer.closeTicketViaRT(ticket,
                                                        "Ticket CLOSED automatically by SiteAssist.")
        plc.enableSlices(args['hostname'])
        plc.enableSliceCreation(args['hostname'])
        return
914
def reboot_node(args):
        """
        Hand args['hostname'] to reboot.reboot_policy and return its result.

        The second argument is always True and the third is config.debug.
        """
        target = args['hostname']
        outcome = reboot.reboot_policy(target, True, config.debug)
        return outcome
918
def reset_nodemanager(args):
        """
        Run "/sbin/service nm restart" as root on args['hostname'] over ssh.

        The exit status of the command is not inspected; always returns None.
        """
        # The host is taken from args, like the other action helpers do.  The
        # previous code referenced an undefined name and raised NameError.
        os.system("ssh root@%s /sbin/service nm restart" % args['hostname'])
        return
922
923 class Action(Thread):
924         def __init__(self, l_action):
925                 self.l_action = l_action
926
927                 # the hostname to loginbase mapping
928                 self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")
929
930                 # Actions to take.
931                 self.diagnose_db = soltesz.if_cached_else(1, "diagnose_out", lambda : {})
932                 # Actions taken.
933                 self.act_all   = soltesz.if_cached_else(1, "act_all", lambda : {})
934
935                 # A dict of actions to specific functions. PICKLE doesnt' like lambdas.
936                 self.actions = {}
937                 self.actions['suspendslices'] = lambda args: plc.suspendSlices(args['hostname'])
938                 self.actions['nocreate'] = lambda args: plc.removeSliceCreation(args['hostname'])
939                 self.actions['close_rt'] = lambda args: close_rt_backoff(args)
940                 self.actions['rins'] = lambda args: plc.nodeBootState(args['hostname'], "rins") 
941                 self.actions['noop'] = lambda args: args
942                 self.actions['reboot_node'] = lambda args: reboot_node(args)
943                 self.actions['reset_nodemanager'] = lambda args: args # reset_nodemanager(args)
944
945                 self.actions['ticket_waitforever'] = lambda args: args
946                 self.actions['waitforever'] = lambda args: args
947                 self.actions['unknown'] = lambda args: args
948                 self.actions['waitforoneweekaction'] = lambda args: args
949                 self.actions['waitfortwoweeksaction'] = lambda args: args
950                 self.actions['sendmailagain-waitforoneweekaction'] = lambda args: args
951                 self.actions['sendmailagain-waitfortwoweeksaction'] = lambda args: args
952                 self.actions['email-againwaitforever'] = lambda args: args
953                 self.actions['email-againticket_waitforever'] = lambda args: args
954                                 
955
956                 self.sickdb = {}
957                 Thread.__init__(self)
958
959         def run(self):
960                 self.accumSites()
961                 print "Accumulated %d sick sites" % len(self.sickdb.keys())
962                 logger.debug("Accumulated %d sick sites" % len(self.sickdb.keys()))
963
964                 try:
965                         stats = self.analyseSites()
966                 except Exception, err:
967                         print "----------------"
968                         import traceback
969                         print traceback.print_exc()
970                         print err
971                         if config.policysavedb:
972                                 print "Saving Databases... act_all"
973                                 soltesz.dbDump("act_all", self.act_all)
974                         sys.exit(1)
975
976                 print_stats("sites_observed", stats)
977                 print_stats("sites_diagnosed", stats)
978                 print_stats("nodes_diagnosed", stats)
979                 print_stats("sites_emailed", stats)
980                 print_stats("nodes_actedon", stats)
981                 print string.join(stats['allsites'], ",")
982
983                 if config.policysavedb:
984                         print "Saving Databases... act_all"
985                         #soltesz.dbDump("policy.eventlog", self.eventlog)
986                         # TODO: remove 'diagnose_out', 
987                         #       or at least the entries that were acted on.
988                         soltesz.dbDump("act_all", self.act_all)
989
990         def accumSites(self):
991                 """
992                 Take all nodes, from l_action, look them up in the diagnose_db database, 
993                 and insert them into sickdb[] as:
994
995                 This way only the given l_action nodes will be acted on regardless
996                 of how many from diagnose_db are available.
997
998                         sickdb[loginbase][nodename] = diag_record
999                 """
1000                 # TODO: what if l_action == None ?
1001                 for nodename in self.l_action:
1002
1003                         loginbase = self.plcdb_hn2lb[nodename]
1004
1005                         if loginbase in self.diagnose_db and \
1006                                 nodename in self.diagnose_db[loginbase]['nodes']:
1007
1008                                 diag_record = self.diagnose_db[loginbase]['nodes'][nodename]
1009
1010                                 if loginbase not in self.sickdb:
1011                                         self.sickdb[loginbase] = {'nodes' : {}}
1012
1013                                 # NOTE: don't copy all node records, since not all will be in l_action
1014                                 self.sickdb[loginbase]['nodes'][nodename] = diag_record
1015                                 # NOTE: but, we want to get the loginbase config settings, 
1016                                 #               this is the easiest way.
1017                                 self.sickdb[loginbase]['config'] = self.diagnose_db[loginbase]['config']
1018                         #else:
1019                                 #print "%s not in diagnose_db!!" % loginbase
1020                 return
1021
1022         def __emailSite(self, loginbase, roles, message, args):
1023                 """
1024                 loginbase is the unique site abbreviation, prepended to slice names.
1025                 roles contains TECH, PI, USER roles, and derive email aliases.
1026                 record contains {'message': [<subj>,<body>], 'args': {...}} 
1027                 """
1028                 ticket_id = 0
1029                 args.update({'loginbase':loginbase})
1030
1031                 if not config.mail and not config.debug and config.bcc:
1032                         roles = ADMIN
1033                 if config.mail and config.debug:
1034                         roles = ADMIN
1035
1036                 # build targets
1037                 contacts = []
1038                 if ADMIN & roles:
1039                         contacts += [config.email]
1040                 if TECH & roles:
1041                         contacts += [TECHEMAIL % loginbase]
1042                 if PI & roles:
1043                         contacts += [PIEMAIL % loginbase]
1044                 if USER & roles:
1045                         slices = plc.slices(loginbase)
1046                         if len(slices) >= 1:
1047                                 for slice in slices:
1048                                         contacts += [SLICEMAIL % slice]
1049                                 print "SLIC: %20s : %d slices" % (loginbase, len(slices))
1050                         else:
1051                                 print "SLIC: %20s : 0 slices" % loginbase
1052
1053                 try:
1054                         subject = message[0] % args
1055                         body = message[1] % args
1056                         if ADMIN & roles:
1057                                 # send only to admin
1058                                 if 'ticket_id' in args:
1059                                         subj = "Re: [PL #%s] %s" % (args['ticket_id'], subject)
1060                                 else:
1061                                         subj = "Re: [PL noticket] %s" % subject
1062                                 mailer.email(subj, body, contacts)
1063                                 ticket_id = args['ticket_id']
1064                         else:
1065                                 ticket_id = mailer.emailViaRT(subject, body, contacts, args['ticket_id'])
1066                 except Exception, err:
1067                         print "exception on message:"
1068                         import traceback
1069                         print traceback.print_exc()
1070                         print message
1071
1072                 return ticket_id
1073
1074
1075         def _format_diaginfo(self, diag_node):
1076                 info = diag_node['info']
1077                 if diag_node['stage'] == 'monitor-end-record':
1078                         hlist = "    %s went from '%s' to '%s'\n" % (info[0], info[1], info[2]) 
1079                 else:
1080                         hlist = "    %s %s - %s\n" % (info[0], info[2], info[1]) #(node,ver,daysdn)
1081                 return hlist
1082
1083
1084         def get_email_args(self, act_recordlist, loginbase=None):
1085
1086                 email_args = {}
1087                 email_args['hostname_list'] = ""
1088
1089                 for act_record in act_recordlist:
1090                         email_args['hostname_list'] += act_record['msg_format']
1091                         email_args['hostname'] = act_record['nodename']
1092                         if  'plcnode' in act_record and \
1093                                 'pcu_ids' in act_record['plcnode'] and \
1094                                 len(act_record['plcnode']['pcu_ids']) > 0:
1095                                 print "setting 'pcu_id' for email_args %s"%email_args['hostname']
1096                                 email_args['pcu_id'] = act_record['plcnode']['pcu_ids'][0]
1097                         else:
1098                                 email_args['pcu_id'] = "-1"
1099                                         
1100                         if 'ticket_id' in act_record:
1101                                 if act_record['ticket_id'] == 0 or act_record['ticket_id'] == '0':
1102                                         print "Enter the ticket_id for %s @ %s" % (loginbase, act_record['nodename'])
1103                                         sys.stdout.flush()
1104                                         line = sys.stdin.readline()
1105                                         try:
1106                                                 ticket_id = int(line)
1107                                         except:
1108                                                 print "could not get ticket_id from stdin..."
1109                                                 os._exit(1)
1110                                 else:
1111                                         ticket_id = act_record['ticket_id']
1112                                         
1113                                 email_args['ticket_id'] = ticket_id
1114
1115                 return email_args
1116
1117         def get_unique_issues(self, act_recordlist):
1118                 # NOTE: only send one email per site, per problem...
1119                 unique_issues = {}
1120                 for act_record in act_recordlist:
1121                         act_key = act_record['action'][0]
1122                         if act_key not in unique_issues:
1123                                 unique_issues[act_key] = []
1124                                 
1125                         unique_issues[act_key] += [act_record]
1126                         
1127                 return unique_issues
1128                         
1129
1130         def __actOnSite(self, loginbase, site_record):
1131                 i_nodes_actedon = 0
1132                 i_nodes_emailed = 0
1133
1134                 act_recordlist = []
1135
1136                 for nodename in site_record['nodes'].keys():
1137                         diag_record = site_record['nodes'][nodename]
1138                         act_record  = self.__actOnNode(diag_record)
1139                         #print "nodename: %s %s" % (nodename, act_record)
1140                         if act_record is not None:
1141                                 act_recordlist += [act_record]
1142
1143                 unique_issues = self.get_unique_issues(act_recordlist)
1144
1145                 for issue in unique_issues.keys():
1146                         print "\tworking on issue: %s" % issue
1147                         issue_record_list = unique_issues[issue]
1148                         email_args = self.get_email_args(issue_record_list, loginbase)
1149
1150                         # for each record.
1151                         for act_record in issue_record_list:
1152                                 # if there's a pcu record and email config is set
1153                                 if 'email_pcu' in act_record:
1154                                         if act_record['message'] != None and act_record['email_pcu'] and site_record['config']['email']:
1155                                                 # and 'reboot_node' in act_record['stage']:
1156
1157                                                 email_args['hostname'] = act_record['nodename']
1158                                                 ticket_id = self.__emailSite(loginbase, 
1159                                                                                         act_record['email'], 
1160                                                                                         emailTxt.mailtxt.pcudown[0],
1161                                                                                         email_args)
1162                                                 if ticket_id == 0:
1163                                                         # error.
1164                                                         print "got a ticket_id == 0!!!! %s" % act_record['nodename']
1165                                                         os._exit(1)
1166                                                         pass
1167                                                 email_args['ticket_id'] = ticket_id
1168
1169                         
1170                         act_record = issue_record_list[0]
1171                         # send message before squeezing
1172                         print "\t\tconfig.email: %s and %s" % (act_record['message'] != None, 
1173                                                                                                 site_record['config']['email'])
1174                         if act_record['message'] != None and site_record['config']['email']:
1175                                 ticket_id = self.__emailSite(loginbase, act_record['email'], 
1176                                                                                          act_record['message'], email_args)
1177
1178                                 if ticket_id == 0:
1179                                         # error.
1180                                         print "ticket_id == 0 for %s %s" % (loginbase, act_record['nodename'])
1181                                         os._exit(1)
1182                                         pass
1183
1184                                 # Add ticket_id to ALL nodenames
1185                                 for act_record in issue_record_list:
1186                                         nodename = act_record['nodename']
1187                                         # update node record with RT ticket_id
1188                                         if nodename in self.act_all:
1189                                                 self.act_all[nodename][0]['ticket_id'] = "%s" % ticket_id
1190                                                 # if the ticket was previously resolved, reset it to new.
1191                                                 if 'rt' in act_record and \
1192                                                         'Status' in act_record['rt'] and \
1193                                                         act_record['rt']['Status'] == 'resolved':
1194                                                         mailer.setTicketStatus(ticket_id, "new")
1195                                                 status = mailer.getTicketStatus(ticket_id)
1196                                                 self.act_all[nodename][0]['rt'] = status
1197                                         if config.mail: i_nodes_emailed += 1
1198
1199                         print "\t\tconfig.squeeze: %s and %s" % (config.squeeze,
1200                                                                                                         site_record['config']['squeeze'])
1201                         if config.squeeze and site_record['config']['squeeze']:
1202                                 for act_key in act_record['action']:
1203                                         self.actions[act_key](email_args)
1204                                 i_nodes_actedon += 1
1205                 
1206                 if config.policysavedb:
1207                         print "Saving Databases... act_all, diagnose_out"
1208                         soltesz.dbDump("act_all", self.act_all)
1209                         # remove site record from diagnose_out, it's in act_all as done.
1210                         del self.diagnose_db[loginbase]
1211                         soltesz.dbDump("diagnose_out", self.diagnose_db)
1212
1213                 print "sleeping for 1 sec"
1214                 time.sleep(1)
1215                 #print "Hit enter to continue..."
1216                 #sys.stdout.flush()
1217                 #line = sys.stdin.readline()
1218
1219                 return (i_nodes_actedon, i_nodes_emailed)
1220
1221         def __actOnNode(self, diag_record):
1222                 nodename = diag_record['nodename']
1223                 message = diag_record['message']
1224
1225                 act_record = {}
1226                 act_record.update(diag_record)
1227                 act_record['nodename'] = nodename
1228                 act_record['msg_format'] = self._format_diaginfo(diag_record)
1229                 print "act_record['stage'] == %s " % act_record['stage']
1230
1231                 # avoid end records, and nmreset records                                        
1232                 # reboot_node_failed, is set below, so don't reboot repeatedly.
1233
1234                 if 'monitor-end-record' not in act_record['stage'] and \
1235                    'nmreset' not in act_record['stage'] and \
1236                    'reboot_node_failed' not in act_record:
1237
1238                         if "DOWN" in act_record['log'] and \
1239                                         'pcu_ids' in act_record['plcnode'] and \
1240                                         len(act_record['plcnode']['pcu_ids']) > 0:
1241
1242                                 print "%s" % act_record['log'],
1243                                 print "%15s" % (['reboot_node'],)
1244                                 # Set node to re-install
1245                                 plc.nodeBootState(act_record['nodename'], "rins")       
1246                                 try:
1247                                         ret = reboot_node({'hostname': act_record['nodename']})
1248                                 except Exception, exc:
1249                                         print "exception on reboot_node:"
1250                                         import traceback
1251                                         print traceback.print_exc()
1252                                         ret = False
1253
1254                                 if ret: # and ( 'reboot_node_failed' not in act_record or act_record['reboot_node_failed'] == False):
1255                                         # Reboot Succeeded
1256                                         print "reboot succeeded for %s" % act_record['nodename']
1257                                         act_record2 = {}
1258                                         act_record2.update(act_record)
1259                                         act_record2['action'] = ['reboot_node']
1260                                         act_record2['stage'] = "reboot_node"
1261                                         act_record2['reboot_node_failed'] = False
1262                                         act_record2['email_pcu'] = False
1263
1264                                         if nodename not in self.act_all: 
1265                                                 self.act_all[nodename] = []
1266                                         print "inserting 'reboot_node' record into act_all"
1267                                         self.act_all[nodename].insert(0,act_record2)
1268
1269                                         # return None to avoid further action
1270                                         print "Taking no further action"
1271                                         return None
1272                                 else:
1273                                         print "reboot failed for %s" % act_record['nodename']
1274                                         # set email_pcu to also send pcu notice for this record.
1275                                         act_record['reboot_node_failed'] = True
1276                                         act_record['email_pcu'] = True
1277
1278                         print "%s" % act_record['log'],
1279                         print "%15s" % act_record['action']
1280
1281                 if act_record['stage'] is not 'monitor-end-record' and \
1282                    act_record['stage'] is not 'nmreset':
1283                         if nodename not in self.act_all: 
1284                                 self.act_all[nodename] = []
1285
1286                         self.act_all[nodename].insert(0,act_record)
1287                 else:
1288                         print "Not recording %s in act_all" % nodename
1289
1290                 return act_record
1291
1292         def analyseSites(self):
1293                 i_sites_observed = 0
1294                 i_sites_diagnosed = 0
1295                 i_nodes_diagnosed = 0
1296                 i_nodes_actedon = 0
1297                 i_sites_emailed = 0
1298                 l_allsites = []
1299
1300                 sorted_sites = self.sickdb.keys()
1301                 sorted_sites.sort()
1302                 for loginbase in sorted_sites:
1303                         site_record = self.sickdb[loginbase]
1304                         print "sites: %s" % loginbase
1305                         
1306                         i_nodes_diagnosed += len(site_record.keys())
1307                         i_sites_diagnosed += 1
1308
1309                         (na,ne) = self.__actOnSite(loginbase, site_record)
1310
1311                         i_sites_observed += 1
1312                         i_nodes_actedon += na
1313                         i_sites_emailed += ne
1314
1315                         l_allsites += [loginbase]
1316
1317                 return {'sites_observed': i_sites_observed, 
1318                                 'sites_diagnosed': i_sites_diagnosed, 
1319                                 'nodes_diagnosed': i_nodes_diagnosed, 
1320                                 'sites_emailed': i_sites_emailed, 
1321                                 'nodes_actedon': i_nodes_actedon, 
1322                                 'allsites':l_allsites}
1323
1324         def print_stats(self, key, stats):
1325                 print "%20s : %d" % (key, stats[key])
1326
1327
1328
1329         #"""
1330         #Prints, logs, and emails status of up nodes, down nodes, and buckets.
1331         #"""
1332         #def status(self):
1333         #       sub = "Monitor Summary"
1334         #       msg = "\nThe following nodes were acted upon:  \n\n"
1335         #       for (node, (type, date)) in self.emailed.items():
1336         #               # Print only things acted on today.
1337         #               if (time.gmtime(time.time())[2] == time.gmtime(date)[2]):
1338         #                       msg +="%s\t(%s)\t%s\n" %(node, type, time.ctime(date))
1339         #       msg +="\n\nThe following sites have been 'squeezed':\n\n"
1340         #       for (loginbase, (date, type)) in self.squeezed.items():
1341         #               # Print only things acted on today.
1342         #               if (time.gmtime(time.time())[2] == time.gmtime(date)[2]):
1343         #                       msg +="%s\t(%s)\t%s\n" %(loginbase, type, time.ctime(date))
1344         #       mailer.email(sub, msg, [SUMTO])
1345         #       logger.info(msg)
1346         #       return 
1347
1348         #"""
1349         #Store/Load state of emails.  When, where, what.
1350         #"""
1351         #def emailedStore(self, action):
1352         #       try:
1353         #               if action == "LOAD":
1354         #                       f = open(DAT, "r+")
1355         #                       logger.info("POLICY:  Found and reading " + DAT)
1356         #                       self.emailed.update(pickle.load(f))
1357         #               if action == "WRITE":
1358         #                       f = open(DAT, "w")
1359         #                       #logger.debug("Writing " + DAT)
1360         #                       pickle.dump(self.emailed, f)
1361         #               f.close()
1362         #       except Exception, err:
1363         #               logger.info("POLICY:  Problem with DAT, %s" %err)
1364
1365
1366 #class Policy(Thread):
1367
def main():
        """Print a notice that policy.py is meant to be imported, not run."""
        notice = "policy.py is a module, not a script for running directly."
        print(notice)
1370
if __name__ == '__main__':
        # Script entry point: hand over to main() and turn Ctrl-C into a
        # logged exit with status 0 instead of a traceback.
        import os
        import plc

        try:
                main()
        except KeyboardInterrupt:
                print("Killed.  Exitting.")
                logger.info('Monitor Killed')
                # os._exit() ends the process immediately, without running the
                # interpreter's normal cleanup handlers.
                os._exit(0)