mass commit. updates for the new db schema in findbad, findbadpcu, nodequery,
[monitor.git] / policy.py
1 #
2 # Copyright (c) 2004  The Trustees of Princeton University (Trustees).
3 #
4 # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
5 #
6 # $Id: policy.py,v 1.17 2007/08/29 17:26:50 soltesz Exp $
7 #
8 # Policy Engine.
9
10 #from monitor import *
11 from threading import *
12 import time
13 import logging
14 import mailer
15 import emailTxt
16 import pickle
17 import Queue
18 import plc
19 import sys
20 import os
21 import reboot
22 import database
23 import string
24 from unified_model import cmpCategoryVal
25 import config
26
27 DAT="./monitor.dat"
28
29 logger = logging.getLogger("monitor")
30
31 # Time to enforce policy
32 POLSLEEP = 7200
33
34 # Where to email the summary
35 SUMTO = "soltesz@cs.princeton.edu"
36 TECHEMAIL="tech-%s@sites.planet-lab.org"
37 PIEMAIL="pi-%s@sites.planet-lab.org"
38 SLICEMAIL="%s@slices.planet-lab.org"
39 PLCEMAIL="support@planet-lab.org"
40
41 # Time constants and thresholds (all values are in seconds)
42 SPERMIN = 60
43 SPERHOUR = 60*60
44 SPERDAY = 86400
45 PITHRESH = 7 * SPERDAY
46 SLICETHRESH = 7 * SPERDAY
47 # Seconds (5 days' worth) to wait before attempting rins again
48 RINSTHRESH = 5 * SPERDAY
49
50 # Seconds (30 days' worth) before calling the node dead.
51 DEADTHRESH = 30 * SPERDAY
52 # Minimum number of nodes up before squeezing
53 MINUP = 2
54
55 TECH=1
56 PI=2
57 USER=4
58 ADMIN=8
59
60 # IF:
61 #  no SSH, down.
62 #  bad disk, down
63 #  DNS, kinda down (sick)
64 #  clock, kinda down (sick)
65 #  Full disk, going to be down
66
67 # Actions:
68 #  Email
69 #  suspend slice creation
70 #  kill slices
def array_to_priority_map(array):
        """Return a dict that maps each entry of `array` to its position.

        The position doubles as the entry's priority (earlier entries get
        smaller numbers).  If an entry occurs more than once, the position of
        its last occurrence is kept.  The result is intended for later use by
        a cmpMap() function.
        """
        priorities = {}
        for position, entry in enumerate(array):
                priorities[entry] = position
        return priorities
81
def getdebug():
        """Return the current value of the `debug` attribute of the config module."""
        debug_setting = config.debug
        return debug_setting
84
def print_stats(key, stats):
        """Print the counter stored under `key` in `stats`, right-aligned by name.

        Nothing is printed when `key` is not present in `stats`.
        """
        if key not in stats:
                return
        print("%20s : %d" % (key, stats[key]))
87
88
class Merge(Thread):
        """
        Thread that combines the latest findbad observations with the actions
        previously recorded in act_all, and puts the merged node records on
        the toRT queue (followed by a None sentinel).
        """

        def __init__(self, l_merge, toRT):
                """
                l_merge -- collection of hostnames to consider; findbad nodes
                           that are not in it are ignored.
                toRT    -- queue-like object; only its put() method is used.
                """
                self.toRT = toRT
                self.merge_list = l_merge
                # the hostname to loginbase mapping
                self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")

                # Previous actions taken on nodes.
                self.act_all = database.if_cached_else(1, "act_all", lambda : {})
                self.findbad = database.if_cached_else(1, "findbad", lambda : {})

                # Loaded from the same "act_all" name as above; entries are
                # deleted from it as nodes are matched against findbad.
                self.cache_all = database.if_cached_else(1, "act_all", lambda : {})
                self.sickdb = {}
                self.mergedb = {}
                Thread.__init__(self)

        def run(self):
                """Thread entry point: run the three merge steps in order."""
                # populate sickdb
                self.accumSickSites()
                # read data from findbad and act_all
                self.mergeActionsAndBadDB()
                # pass node_records to RT
                self.sendToRT()

        def accumSickSites(self):
                """
                Take all nodes reported by findbad that are also in merge_list,
                and insert a fresh record for each into sickdb[] as:

                        sickdb[loginbase][nodename] = fb_record

                Nodes whose findbad values carry no usable 'category' are
                printed for inspection and skipped.
                """
                count = 0
                # look at all problems reported by findbad
                for nodename in self.findbad['nodes'].keys():
                        if nodename not in self.merge_list:
                                continue                # skip this node, since it's not wanted

                        loginbase = self.plcdb_hn2lb[nodename]
                        values = self.findbad['nodes'][nodename]['values']

                        try:
                                category = values['category']
                        except (KeyError, TypeError):
                                # Missing or malformed findbad values: dump what we
                                # have and move on.  Only lookup failures are
                                # caught, so interrupts and exits still propagate.
                                print(values)
                                print(nodename)
                                print(self.findbad['nodes'][nodename])
                                continue

                        now = time.time()
                        fb_record = {}
                        fb_record['nodename'] = nodename
                        fb_record['category'] = category
                        fb_record['state'] = values['state']
                        fb_record['comonstats'] = values['comonstats']
                        fb_record['plcnode'] = values['plcnode']
                        fb_record['kernel'] = self.getKernel(values['kernel'])
                        fb_record['stage'] = "findbad"
                        fb_record['message'] = None
                        fb_record['bootcd'] = values['bootcd']
                        fb_record['args'] = None
                        fb_record['info'] = None
                        fb_record['time'] = now
                        fb_record['date_created'] = now

                        if loginbase not in self.sickdb:
                                self.sickdb[loginbase] = {}
                        self.sickdb[loginbase][nodename] = fb_record
                        # only nodes that actually produced a record are counted
                        count += 1

                print("Found %d nodes" % count)

        def getKernel(self, unamestr):
                """
                Return the third whitespace-separated field of unamestr, or ""
                when there are fewer than three fields.
                """
                fields = unamestr.split()
                if len(fields) > 2:
                        return fields[2]
                return ""

        def mergeActionsAndBadDB(self):
                """
                - Look at the sick node_records as reported in findbad,
                - Then look at the node_records in act_all.

                There are four cases:
                1) Problem in findbad, no problem in act_all
                        this is ok, b/c it just means it's a new problem
                2) Problem in findbad, problem in act_all
                        -Did the problem get better or worse?
                                -If Same, or Worse, then continue looking for open tickets.
                                -If Better, or No problem, then "back-off" penalties.
                                        This judgement may need to wait until 'Diagnose()'

                3) No problem in findbad, problem in act_all
                        Then the node is operational again according to Findbad()

                4) No problem in findbad, no problem in act_all
                        There won't be a record in either db, so there's no code.

                The result is stored as mergedb[loginbase][nodename] = record.
                """
                # fields that are always refreshed from the findbad record
                fb_fields = ('comonstats', 'category', 'state',
                             'kernel', 'bootcd', 'plcnode')

                # look at all problems reported by findbad
                for loginbase in sorted(self.sickdb.keys()):
                        d_fb_nodes = self.sickdb[loginbase]
                        for nodename in sorted(d_fb_nodes.keys()):
                                fb_record = d_fb_nodes[nodename]
                                if loginbase not in self.mergedb:
                                        self.mergedb[loginbase] = {}

                                # Take the info from act_all only when the node
                                # has at least one previous record there;
                                # otherwise take it from the findbad record.
                                in_act_all = nodename in self.act_all
                                merged = {}
                                self.mergedb[loginbase][nodename] = merged

                                if not in_act_all or len(self.act_all[nodename]) == 0:
                                        # 1) ok, b/c it's a new problem. set ticket_id to null
                                        merged.update(fb_record)
                                        merged['ticket_id'] = ""
                                        merged['prev_category'] = "NORECORD"
                                else:
                                        # 2) known problem: start from the most recent
                                        #    action record, refresh the observations.
                                        act_record = self.act_all[nodename][0]
                                        # NOTE: this updates the act_all record in place.
                                        act_record['prev_category'] = act_record['category']

                                        merged.update(act_record)
                                        for field in fb_fields:
                                                merged[field] = fb_record[field]
                                        ticket = get_ticket_id(merged)
                                        merged['rt'] = mailer.getTicketStatus(ticket)

                                if in_act_all:
                                        # delete the entry from cache_all to keep it out of case 3)
                                        del self.cache_all[nodename]

                # 3) nodes that remain in cache_all were not identified by findbad.
                #        Do we keep them or not?
                #   NOTE: i think that since the categories are performed before this
                #               step now, and by a monitor-controlled agent.

                # TODO: This does not work correctly.  Do we need this?
                #for hn in self.cache_all.keys():
                #       y = self.act_all[hn][0]
                #       if 'monitor' in y['bucket']:
                #               loginbase = self.plcdb_hn2lb[hn]
                #               if loginbase not in self.sickdb:
                #                       self.sickdb[loginbase] = {}
                #               self.sickdb[loginbase][hn] = y
                #       else:
                #               del self.cache_all[hn]

                print("len of cache_all: %d" % len(self.cache_all.keys()))
                return

        def sendToRT(self):
                """
                Put every merged node record on the toRT queue, site by site in
                sorted loginbase order, then put None to signal the end.
                """
                # look at all problems reported by merge
                for loginbase in sorted(self.mergedb.keys()):
                        d_merge_nodes = self.mergedb[loginbase]
                        for nodename in d_merge_nodes.keys():
                                self.toRT.put(d_merge_nodes[nodename])

                # send signal to stop reading
                self.toRT.put(None)
                return
274
275 class Diagnose(Thread):
276         def __init__(self, fromRT):
277                 self.fromRT = fromRT
278                 self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")
279                 self.findbad = database.if_cached_else(1, "findbad", lambda : {})
280
281                 self.diagnose_in = {}
282                 self.diagnose_out = {}
283                 Thread.__init__(self)
284
285
286         def run(self):
287                 self.accumSickSites()
288
289                 print "Accumulated %d sick sites" % len(self.diagnose_in.keys())
290                 logger.debug("Accumulated %d sick sites" % len(self.diagnose_in.keys()))
291
292                 try:
293                         stats = self.diagnoseAll()
294                 except Exception, err:
295                         print "----------------"
296                         import traceback
297                         print traceback.print_exc()
298                         print err
299                         #if config.policysavedb:
300                         sys.exit(1)
301
302                 print_stats("sites_observed", stats)
303                 print_stats("sites_diagnosed", stats)
304                 print_stats("nodes_diagnosed", stats)
305
306                 if config.policysavedb:
307                         print "Saving Databases... diagnose_out"
308                         database.dbDump("diagnose_out", self.diagnose_out)
309
310         def accumSickSites(self):
311                 """
312                 Take all nodes, from l_diagnose, look them up in the diagnose_out database, 
313                 and insert them into diagnose_in[] as:
314
315                         diagnose_in[loginbase] = [diag_node1, diag_node2, ...]
316                 """
317                 while 1:
318                         node_record = self.fromRT.get(block = True)
319                         if node_record == None:
320                                 break;
321
322                         nodename = node_record['nodename']
323                         loginbase = self.plcdb_hn2lb[nodename]
324
325                         if loginbase not in self.diagnose_in:
326                                 self.diagnose_in[loginbase] = {}
327
328                         self.diagnose_in[loginbase][nodename] = node_record
329
330                 return
331
332         def diagnoseAll(self):
333                 i_sites_observed = 0
334                 i_sites_diagnosed = 0
335                 i_nodes_diagnosed = 0
336                 i_nodes_actedon = 0
337                 i_sites_emailed = 0
338                 l_allsites = []
339
340                 sorted_sites = self.diagnose_in.keys()
341                 sorted_sites.sort()
342                 self.diagnose_out= {}
343                 for loginbase in sorted_sites:
344                         l_allsites += [loginbase]
345
346                         d_diag_nodes = self.diagnose_in[loginbase]
347                         d_act_records = self.__diagnoseSite(loginbase, d_diag_nodes)
348                         # store records in diagnose_out, for saving later.
349                         self.diagnose_out.update(d_act_records)
350                         
351                         if len(d_act_records[loginbase]['nodes'].keys()) > 0:
352                                 i_nodes_diagnosed += (len(d_act_records[loginbase]['nodes'].keys()))
353                                 i_sites_diagnosed += 1
354                         i_sites_observed += 1
355
356                 return {'sites_observed': i_sites_observed, 
357                                 'sites_diagnosed': i_sites_diagnosed, 
358                                 'nodes_diagnosed': i_nodes_diagnosed, 
359                                 'allsites':l_allsites}
360
361                 pass
362                 
363         def getDaysDown(cls, diag_record):
364                 daysdown = -1
365                 last_contact = diag_record['plcnode']['last_contact']
366                 date_created = diag_record['plcnode']['date_created']
367
368                 if diag_record['comonstats']['uptime'] != "null" and diag_record['comonstats']['uptime'] != "-1":
369                         daysdown = - int(float(diag_record['comonstats']['uptime'])) // (60*60*24)
370                 elif last_contact is None:
371                         if date_created is not None:
372                                 now = time.time()
373                                 diff = now - date_created
374                                 daysdown = diff // (60*60*24)
375                         else:
376                                 daysdown = -1
377                 else:
378                         now = time.time()
379                         diff = now - last_contact
380                         daysdown = diff // (60*60*24)
381                 return daysdown
382         getDaysDown = classmethod(getDaysDown)
383
384         def getStrDaysDown(cls, diag_record):
385                 daysdown = "unknown"
386                 last_contact = diag_record['plcnode']['last_contact']
387                 date_created = diag_record['plcnode']['date_created']
388
389                 if      diag_record['comonstats']['uptime'] != "null" and \
390                         diag_record['comonstats']['uptime'] != "-1":
391                         daysdown = int(float(diag_record['comonstats']['uptime'])) // (60*60*24)
392                         daysdown = "%d days up" % daysdown
393
394                 elif last_contact is None:
395                         if date_created is not None:
396                                 now = time.time()
397                                 diff = now - date_created
398                                 daysdown = diff // (60*60*24)
399                                 daysdown = "Never contacted PLC, created %s days ago" % daysdown
400                         else:
401                                 daysdown = "Never contacted PLC"
402                 else:
403                         now = time.time()
404                         diff = now - last_contact
405                         daysdown = diff // (60*60*24)
406                         daysdown = "%s days down" % daysdown
407                 return daysdown
408         getStrDaysDown = classmethod(getStrDaysDown)
409         #def getStrDaysDown(cls, diag_record):
410         #       daysdown = cls.getDaysDown(diag_record)
411         #       if daysdown > -1:
412         #               return "%d days down"%daysdown
413         #       elif daysdown == -1:
414         #               return "Has never contacted PLC"
415         #       else:
416         #               return "%d days up"% -daysdown
417         #getStrDaysDown = classmethod(getStrDaysDown)
418
419         def __getCDVersion(self, diag_record, nodename):
420                 cdversion = ""
421                 #print "Getting kernel for: %s" % diag_record['nodename']
422                 cdversion = diag_record['kernel']
423                 return cdversion
424
425         def __diagnoseSite(self, loginbase, d_diag_nodes):
426                 """
427                 d_diag_nodes are diagnose_in entries.
428                 """
429                 d_diag_site = {loginbase : { 'config' : 
430                                                                                                 {'squeeze': False,
431                                                                                                  'email': False
432                                                                                                 }, 
433                                                                         'nodes': {}
434                                                                         }
435                                            }
436                 sorted_nodes = d_diag_nodes.keys()
437                 sorted_nodes.sort()
438                 for nodename in sorted_nodes:
439                         node_record = d_diag_nodes[nodename]
440                         diag_record = self.__diagnoseNode(loginbase, node_record)
441
442                         if diag_record != None:
443                                 d_diag_site[loginbase]['nodes'][nodename] = diag_record
444
445                                 # NOTE: improvement means, we need to act/squeeze and email.
446                                 #print "DIAG_RECORD", diag_record
447                                 if 'monitor-end-record' in diag_record['stage'] or \
448                                    'nmreset' in diag_record['stage']:
449                                 #       print "resetting loginbase!" 
450                                         d_diag_site[loginbase]['config']['squeeze'] = True
451                                         d_diag_site[loginbase]['config']['email'] = True
452                                 #else:
453                                 #       print "NO IMPROVEMENT!!!!"
454                         else:
455                                 pass # there is nothing to do for this node.
456
457                 # NOTE: these settings can be overridden by command line arguments,
458                 #       or the state of a record, i.e. if already in RT's Support Queue.
459                 nodes_up = self.getUpAtSite(loginbase, d_diag_site)
460                 if nodes_up < MINUP:
461                         d_diag_site[loginbase]['config']['squeeze'] = True
462
463                 max_slices = self.getMaxSlices(loginbase)
464                 num_nodes = self.getNumNodes(loginbase)
465                 # NOTE: when max_slices == 0, this is either a new site (the old way)
466                 #       or an old disabled site from previous monitor (before site['enabled'])
467                 if nodes_up < num_nodes and max_slices != 0:
468                         d_diag_site[loginbase]['config']['email'] = True
469
470                 if len(d_diag_site[loginbase]['nodes'].keys()) > 0:
471                         print "SITE: %20s : %d nodes up, at most" % (loginbase, nodes_up)
472
473                 return d_diag_site
474
475         def diagRecordByCategory(self, node_record):
476                 nodename = node_record['nodename']
477                 category = node_record['category']
478                 state    = node_record['state']
479                 loginbase = self.plcdb_hn2lb[nodename]
480                 diag_record = None
481
482                 if  "ERROR" in category:        # i.e. "DOWN"
483                         diag_record = {}
484                         diag_record.update(node_record)
485                         daysdown = self.getDaysDown(diag_record) 
486                         if daysdown < 7:
487                                 format = "DIAG: %20s : %-40s Down only %s days  NOTHING DONE"
488                                 print format % (loginbase, nodename, daysdown)
489                                 return None
490
491                         s_daysdown = self.getStrDaysDown(diag_record)
492                         diag_record['message'] = emailTxt.mailtxt.newdown
493                         diag_record['args'] = {'nodename': nodename}
494                         diag_record['info'] = (nodename, s_daysdown, "")
495
496                         if 'reboot_node_failed' in node_record:
497                                 # there was a previous attempt to use the PCU.
498                                 if node_record['reboot_node_failed'] == False:
499                                         # then the last attempt apparently, succeeded.
500                                         # But, the category is still 'ERROR'.  Therefore, the
501                                         # PCU-to-Node mapping is broken.
502                                         #print "Setting message for ERROR node to PCU2NodeMapping: %s" % nodename
503                                         diag_record['message'] = emailTxt.mailtxt.pcutonodemapping
504                                         diag_record['email_pcu'] = True
505
506                         if 'ticket_id' in diag_record:
507                                 if diag_record['ticket_id'] == "":
508                                         if 'found_rt_ticket' in diag_record:
509                                                 ticket_id = diag_record['found_rt_ticket']
510                                         else:
511                                                 ticket_id = "None"
512                                 else:
513                                         ticket_id = diag_record['ticket_id']
514                         else:
515                                 ticket_id = "None"
516
517                         diag_record['log'] = "DOWN: %20s : %-40s == %20s %s" % \
518                                         (loginbase, nodename, diag_record['info'][1:], ticket_id)
519
520                 elif "OLDBOOTCD" in category:
521                         # V2 boot cds as determined by findbad
522                         s_daysdown = self.getStrDaysDown(node_record)
523                         s_cdversion = self.__getCDVersion(node_record, nodename)
524                         diag_record = {}
525                         diag_record.update(node_record)
526                         #if "2.4" in diag_record['kernel'] or "v2" in diag_record['bootcd']:
527                         diag_record['message'] = emailTxt.mailtxt.newbootcd
528                         diag_record['args'] = {'nodename': nodename}
529                         diag_record['info'] = (nodename, s_daysdown, s_cdversion)
530                         if diag_record['ticket_id'] == "":
531                                 diag_record['log'] = "BTCD: %20s : %-40s == %20s %20s %s" % \
532                                                                         (loginbase, nodename, diag_record['kernel'], 
533                                                                          diag_record['bootcd'], diag_record['found_rt_ticket'])
534                         else:
535                                 diag_record['log'] = "BTCD: %20s : %-40s == %20s %20s %s" % \
536                                                                         (loginbase, nodename, diag_record['kernel'], 
537                                                                          diag_record['bootcd'], diag_record['ticket_id'])
538
539                 elif "PROD" in category:
540                         if "DEBUG" in state:
541                                 # Not sure what to do with these yet.  Probably need to
542                                 # reboot, and email.
543                                 print "DEBG: %20s : %-40s  NOTHING DONE" % (loginbase, nodename)
544                                 return None
545                         elif "BOOT" in state:
546                                 # no action needed.
547                                 # TODO: remove penalties, if any are applied.
548                                 now = time.time()
549                                 last_contact = node_record['plcnode']['last_contact']
550                                 if last_contact == None:
551                                         time_diff = 0
552                                 else:
553                                         time_diff = now - last_contact;
554
555                                 if 'improvement' in node_record['stage']:
556                                         # then we need to pass this on to 'action'
557                                         diag_record = {}
558                                         diag_record.update(node_record)
559                                         diag_record['message'] = emailTxt.mailtxt.newthankyou
560                                         diag_record['args'] = {'nodename': nodename}
561                                         diag_record['info'] = (nodename, node_record['prev_category'], 
562                                                                                                          node_record['category'])
563                                         if 'email_pcu' in diag_record:
564                                                 if diag_record['email_pcu']:
565                                                         # previously, the pcu failed to reboot, so send
566                                                         # email. Now, reset these values to try the reboot
567                                                         # again.
568                                                         diag_record['email_pcu'] = False
569                                                         del diag_record['reboot_node_failed']
570
571                                         if diag_record['ticket_id'] == "":
572                                                 diag_record['log'] = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
573                                                                         (loginbase, nodename, diag_record['stage'], 
574                                                                          state, category, diag_record['found_rt_ticket'])
575                                         else:
576                                                 diag_record['log'] = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
577                                                                         (loginbase, nodename, diag_record['stage'], 
578                                                                          state, category, diag_record['ticket_id'])
579                                         return diag_record
580                                 #elif time_diff >= 6*SPERHOUR:
581                                 #       # heartbeat is older than 30 min.
582                                 #       # then reset NM.
583                                 #       #print "Possible NM problem!! %s - %s = %s" % (now, last_contact, time_diff)
584                                 #       diag_record = {}
585                                 #       diag_record.update(node_record)
586                                 #       diag_record['message'] = emailTxt.mailtxt.NMReset
587                                 #       diag_record['args'] = {'nodename': nodename}
588                                 #       diag_record['stage'] = "nmreset"
589                                 #       diag_record['info'] = (nodename, 
590                                 #                                                       node_record['prev_category'], 
591                                 #                                                       node_record['category'])
592                                 #       if diag_record['ticket_id'] == "":
593                                 #               diag_record['log'] = "NM  : %20s : %-40s == %20s %20s %s %s" % \
594                                 #                                       (loginbase, nodename, diag_record['stage'], 
595                                 #                                        state, category, diag_record['found_rt_ticket'])
596                                 #       else:
597                                 #               diag_record['log'] = "NM  : %20s : %-40s == %20s" % \
598                                 #                                       (loginbase, nodename, diag_record['stage'])
599 #
600 #                                       return diag_record
601                                 else:
602                                         return None
603                         else:
604                                 # unknown
605                                 pass
606                 elif "ALPHA"    in category:
607                         pass
608                 elif "clock_drift" in category:
609                         pass
610                 elif "dns"    in category:
611                         pass
612                 elif "filerw"    in category:
613                         pass
614                 else:
615                         print "Unknown category!!!! %s" % category
616                         sys.exit(1)
617
618                 return diag_record
619
620         def __diagnoseNode(self, loginbase, node_record):
621                 # TODO: change the format of the hostname in this 
622                 #               record to something more natural.
623                 nodename                = node_record['nodename']
624                 category                = node_record['category']
625                 prev_category   = node_record['prev_category']
626                 state                   = node_record['state']
627                 #if 'prev_category' in node_record:
628                 #       prev_category = node_record['prev_category']
629                 #else:
630                 #       prev_category = "ERROR"
631                 if node_record['prev_category'] != "NORECORD":
632                 
633                         val = cmpCategoryVal(category, prev_category)
634                         print "%s went from %s -> %s" % (nodename, prev_category, category)
635                         if val == 1:
636                                 # improved
637                                 if node_record['ticket_id'] == "" or node_record['ticket_id'] == None:
638                                         print "closing record with no ticket: ", node_record['nodename']
639                                         node_record['action'] = ['close_rt']
640                                         node_record['message'] = None
641                                         node_record['stage'] = 'monitor-end-record'
642                                         return node_record
643                                 else:
644                                         node_record['stage'] = 'improvement'
645
646                                 #if 'monitor-end-record' in node_record['stage']:
647                                 #       # just ignore it if it's already ended.
648                                 #       # otherwise, the status should be worse, and we won't get
649                                 #       # here.
650                                 #       print "monitor-end-record: ignoring ", node_record['nodename']
651                                 #       return None
652 #
653 #                                       #return None
654                         elif val == -1:
655                                 # current category is worse than previous, carry on
656                                 pass
657                         else:
658                                 #values are equal, carry on.
659                                 #print "why are we here?"
660                                 pass
661
662                 if 'rt' in node_record and 'Status' in node_record['rt']:
663                         if node_record['stage'] == 'ticket_waitforever':
664                                 if 'resolved' in node_record['rt']['Status']:
665                                         print "ending waitforever record for: ", node_record['nodename']
666                                         node_record['action'] = ['noop']
667                                         node_record['message'] = None
668                                         node_record['stage'] = 'monitor-end-record'
669                                         print "oldlog: %s" % node_record['log'],
670                                         print "%15s" % node_record['action']
671                                         return node_record
672                                 if 'new' in node_record['rt']['Status'] and \
673                                         'Queue' in node_record['rt'] and \
674                                         'Monitor' in node_record['rt']['Queue']:
675
676                                         print "RESETTING stage to findbad"
677                                         node_record['stage'] = 'findbad'
678                         
679                 #### COMPARE category and prev_category
680                 # if not_equal
681                 #       then assign a stage based on relative priorities
682                 # else equal
683                 #       then check category for stats.
684                 diag_record = self.diagRecordByCategory(node_record)
685                 if diag_record == None:
686                         #print "diag_record == None"
687                         return None
688
689                 #### found_RT_ticket
690                 # TODO: need to record time found, and maybe add a stage for acting on it...
691                 # NOTE: after found, if the support ticket is resolved, the block is
692                 #               not removed. How to remove the block on this?
693                 if 'found_rt_ticket' in diag_record and \
694                         diag_record['found_rt_ticket'] is not None:
695                         if diag_record['stage'] is not 'improvement':
696                                 diag_record['stage'] = 'ticket_waitforever'
697                                 
698                 current_time = time.time()
699                 # take off four days, for the delay that database caused.
700                 # TODO: generalize delays at PLC, and prevent enforcement when there
701                 #               have been no emails.
702                 # NOTE: 7*SPERDAY exists to offset the 'bad week'
703                 #delta = current_time - diag_record['time'] - 7*SPERDAY
704                 delta = current_time - diag_record['time']
705
706                 message = diag_record['message']
707                 act_record = {}
708                 act_record.update(diag_record)
709
710                 #### DIAGNOSE STAGES 
711                 if   'findbad' in diag_record['stage']:
712                         # The node is bad, and there's no previous record of it.
713                         act_record['email'] = TECH
714                         act_record['action'] = ['noop']
715                         act_record['message'] = message[0]
716                         act_record['stage'] = 'stage_actinoneweek'
717
718                 elif 'nmreset' in diag_record['stage']:
719                         act_record['email']  = ADMIN 
720                         act_record['action'] = ['reset_nodemanager']
721                         act_record['message'] = message[0]
722                         act_record['stage']  = 'nmreset'
723                         return None
724
725                 elif 'reboot_node' in diag_record['stage']:
726                         act_record['email'] = TECH
727                         act_record['action'] = ['noop']
728                         act_record['message'] = message[0]
729                         act_record['stage'] = 'stage_actinoneweek'
730                         
731                 elif 'improvement' in diag_record['stage']:
732                         # - backoff previous squeeze actions (slice suspend, nocreate)
733                         # TODO: add a backoff_squeeze section... Needs to runthrough
734                         print "backing off of %s" % nodename
735                         act_record['action'] = ['close_rt']
736                         act_record['message'] = message[0]
737                         act_record['stage'] = 'monitor-end-record'
738
739                 elif 'actinoneweek' in diag_record['stage']:
740                         if delta >= 7 * SPERDAY: 
741                                 act_record['email'] = TECH | PI
742                                 act_record['stage'] = 'stage_actintwoweeks'
743                                 act_record['message'] = message[1]
744                                 act_record['action'] = ['nocreate' ]
745                                 act_record['time'] = current_time               # reset clock for waitforever
746                         elif delta >= 3* SPERDAY and not 'second-mail-at-oneweek' in act_record:
747                                 act_record['email'] = TECH 
748                                 act_record['message'] = message[0]
749                                 act_record['action'] = ['sendmailagain-waitforoneweekaction' ]
750                                 act_record['second-mail-at-oneweek'] = True
751                         else:
752                                 act_record['message'] = None
753                                 act_record['action'] = ['waitforoneweekaction' ]
754                                 print "ignoring this record for: %s" % act_record['nodename']
755                                 return None                     # don't send if there's no action
756
757                 elif 'actintwoweeks' in diag_record['stage']:
758                         if delta >= 7 * SPERDAY:
759                                 act_record['email'] = TECH | PI | USER
760                                 act_record['stage'] = 'stage_waitforever'
761                                 act_record['message'] = message[2]
762                                 act_record['action'] = ['suspendslices']
763                                 act_record['time'] = current_time               # reset clock for waitforever
764                         elif delta >= 3* SPERDAY and not 'second-mail-at-twoweeks' in act_record:
765                                 act_record['email'] = TECH | PI
766                                 act_record['message'] = message[1]
767                                 act_record['action'] = ['sendmailagain-waitfortwoweeksaction' ]
768                                 act_record['second-mail-at-twoweeks'] = True
769                         else:
770                                 act_record['message'] = None
771                                 act_record['action'] = ['waitfortwoweeksaction']
772                                 return None                     # don't send if there's no action
773
774                 elif 'ticket_waitforever' in diag_record['stage']:
775                         act_record['email'] = TECH
776                         if 'first-found' not in act_record:
777                                 act_record['first-found'] = True
778                                 act_record['log'] += " firstfound"
779                                 act_record['action'] = ['ticket_waitforever']
780                                 act_record['message'] = message[0]
781                                 act_record['time'] = current_time
782                         else:
783                                 if delta >= 7*SPERDAY:
784                                         act_record['action'] = ['ticket_waitforever']
785                                         if 'rt' in act_record and 'Status' in act_record['rt'] and \
786                                                         act_record['rt']['Status'] == 'new':
787                                                 act_record['message'] = message[0]
788                                         else:
789                                                 act_record['message'] = None
790                                                 
791                                         act_record['time'] = current_time               # reset clock
792                                 else:
793                                         act_record['action'] = ['ticket_waitforever']
794                                         act_record['message'] = None
795                                         return None
796
797                 elif 'waitforever' in diag_record['stage']:
798                         # more than 3 days since last action
799                         # TODO: send only on weekdays.
800                         # NOTE: expects that 'time' has been reset before entering waitforever stage
801                         if delta >= 3*SPERDAY:
802                                 act_record['action'] = ['email-againwaitforever']
803                                 act_record['message'] = message[2]
804                                 act_record['time'] = current_time               # reset clock
805                         else:
806                                 act_record['action'] = ['waitforever']
807                                 act_record['message'] = None
808                                 return None                     # don't send if there's no action
809
810                 else:
811                         # There is no action to be taken, possibly b/c the stage has
812                         # already been performed, but diagnose picked it up again.
813                         # two cases, 
814                         #       1. stage is unknown, or 
815                         #       2. delta is not big enough to bump it to the next stage.
816                         # TODO: figure out which. for now assume 2.
817                         print "UNKNOWN stage for %s; nothing done" % nodename
818                         act_record['action'] = ['unknown']
819                         act_record['message'] = message[0]
820
821                         act_record['email'] = TECH
822                         act_record['action'] = ['noop']
823                         act_record['message'] = message[0]
824                         act_record['stage'] = 'stage_actinoneweek'
825                         act_record['time'] = current_time               # reset clock
826                         #print "Exiting..."
827                         #return None
828                         #sys.exit(1)
829
830                 print "%s" % act_record['log'],
831                 print "%15s" % act_record['action']
832                 return act_record
833
834         def getMaxSlices(self, loginbase):
835                 # if sickdb has a loginbase, then it will have at least one node.
836                 site_stats = None
837
838                 for nodename in self.diagnose_in[loginbase].keys():
839                         if nodename in self.findbad['nodes']:
840                                 site_stats = self.findbad['nodes'][nodename]['values']['plcsite']
841                                 break
842
843                 if site_stats == None:
844                         raise Exception, "loginbase with no nodes in findbad"
845                 else:
846                         return site_stats['max_slices']
847
848         def getNumNodes(self, loginbase):
849                 # if sickdb has a loginbase, then it will have at least one node.
850                 site_stats = None
851
852                 for nodename in self.diagnose_in[loginbase].keys():
853                         if nodename in self.findbad['nodes']:
854                                 site_stats = self.findbad['nodes'][nodename]['values']['plcsite']
855                                 break
856
857                 if site_stats == None:
858                         raise Exception, "loginbase with no nodes in findbad"
859                 else:
860                         if 'num_nodes' in site_stats:
861                                 return site_stats['num_nodes']
862                         else:
863                                 return 0
864
        """
        Returns the number of nodes assumed to be up at a site: the site's total
        node count minus every node in d_diag_site whose stage is anything other
        than 'monitor-end-record'.
        """
869         def getUpAtSite(self, loginbase, d_diag_site):
870                 # TODO: THIS DOESN"T WORK!!! it misses all the 'debug' state nodes
871                 #               that aren't recorded yet.
872
873                 numnodes = self.getNumNodes(loginbase)
874                 # NOTE: assume nodes we have no record of are ok. (too conservative)
875                 # TODO: make the 'up' value more representative
876                 up = numnodes
877                 for nodename in d_diag_site[loginbase]['nodes'].keys():
878
879                         rec = d_diag_site[loginbase]['nodes'][nodename]
880                         if rec['stage'] != 'monitor-end-record':
881                                 up -= 1
882                         else:
883                                 pass # the node is assumed to be up.
884
885                 #if up != numnodes:
886                 #       print "ERROR: %s total nodes up and down != %d" % (loginbase, numnodes)
887
888                 return up
889
890
class SiteAction:
        """
        Base class for an action that is run with a dict of arguments.

        run() first checks that every name in parameter_names is a key of the
        argument dict, then delegates to _run(), which subclasses override.
        """

        def __init__(self, parameter_names=None):
                """
                parameter_names lists the keys that must be present in the args
                passed to run().  When omitted it defaults to a fresh
                ['hostname', 'ticket_id'] list, so instances never share (and
                cannot accidentally mutate) one default list.
                """
                if parameter_names is None:
                        parameter_names = ['hostname', 'ticket_id']
                self.parameter_names = parameter_names

        def checkParam(self, args):
                """Raise Exception naming the first required key missing from args."""
                for param in self.parameter_names:
                        if param not in args:
                                raise Exception("Parameter %s not provided in args"%param)

        def run(self, args):
                """Validate args with checkParam() and return the result of _run(args)."""
                self.checkParam(args)
                return self._run(args)

        def _run(self, args):
                """Hook for subclasses; the base implementation does nothing."""
                pass
903
class SuspendAction(SiteAction):
        """Site action whose _run() calls plc.suspendSlices() with args['hostname']."""

        def _run(self, args):
                hostname = args['hostname']
                return plc.suspendSlices(hostname)
907
class RemoveSliceCreation(SiteAction):
        """Site action whose _run() calls plc.removeSliceCreation() with args['hostname']."""

        def _run(self, args):
                hostname = args['hostname']
                return plc.removeSliceCreation(hostname)
911
class BackoffActions(SiteAction):
        """
        Site action whose _run() calls plc.enableSlices() and then
        plc.enableSliceCreation() with args['hostname'], and always returns True.
        """

        def _run(self, args):
                # Same call order as before: slices first, then slice creation.
                for enable in (plc.enableSlices, plc.enableSliceCreation):
                        enable(args['hostname'])
                return True
917
918 # TODO: create class for each action below, 
919 #               allow for lists of actions to be performed...
920
921
922
def reset_nodemanager(args):
        """
        Restart the 'nm' service on a node by running
        'ssh root@<host> /sbin/service nm restart' through os.system().

        The host is read from args['hostname'], the key used by the other
        actions in this file, falling back to args['nodename'].  Previously the
        body referenced an undefined name and raised NameError on every call.

        Returns None; the exit status of the ssh command is not reported.
        """
        # NOTE(review): presumably callers pass the same args dict as the other
        # actions ('hostname'); the 'nodename' fallback is a guess -- confirm.
        if 'hostname' in args:
                hostname = args['hostname']
        else:
                hostname = args['nodename']
        # NOTE: the host name is interpolated into a shell command line, so it
        # must come from a trusted source.
        os.system("ssh root@%s /sbin/service nm restart" % hostname)
        return
926
927 class Action(Thread):
928         def __init__(self, l_action):
929                 self.l_action = l_action
930
931                 # the hostname to loginbase mapping
932                 self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")
933
934                 # Actions to take.
935                 self.diagnose_db = database.if_cached_else(1, "diagnose_out", lambda : {})
936                 # Actions taken.
937                 self.act_all   = database.if_cached_else(1, "act_all", lambda : {})
938
939                 # A dict of actions to specific functions. PICKLE doesnt' like lambdas.
940                 self.actions = {}
941                 self.actions['suspendslices'] = lambda args: plc.suspendSlices(args['hostname'])
942                 self.actions['nocreate'] = lambda args: plc.removeSliceCreation(args['hostname'])
943                 self.actions['close_rt'] = lambda args: close_rt_backoff(args)
944                 self.actions['rins'] = lambda args: plc.nodeBootState(args['hostname'], "rins") 
945                 self.actions['noop'] = lambda args: args
946                 self.actions['reboot_node'] = lambda args: reboot_node(args)
947                 self.actions['reset_nodemanager'] = lambda args: args # reset_nodemanager(args)
948
949                 self.actions['ticket_waitforever'] = lambda args: args
950                 self.actions['waitforever'] = lambda args: args
951                 self.actions['unknown'] = lambda args: args
952                 self.actions['waitforoneweekaction'] = lambda args: args
953                 self.actions['waitfortwoweeksaction'] = lambda args: args
954                 self.actions['sendmailagain-waitforoneweekaction'] = lambda args: args
955                 self.actions['sendmailagain-waitfortwoweeksaction'] = lambda args: args
956                 self.actions['email-againwaitforever'] = lambda args: args
957                 self.actions['email-againticket_waitforever'] = lambda args: args
958                                 
959
960                 self.sickdb = {}
961                 Thread.__init__(self)
962
963         def run(self):
964                 self.accumSites()
965                 print "Accumulated %d sick sites" % len(self.sickdb.keys())
966                 logger.debug("Accumulated %d sick sites" % len(self.sickdb.keys()))
967
968                 try:
969                         stats = self.analyseSites()
970                 except Exception, err:
971                         print "----------------"
972                         import traceback
973                         print traceback.print_exc()
974                         print err
975                         if config.policysavedb:
976                                 print "Saving Databases... act_all"
977                                 database.dbDump("act_all", self.act_all)
978                         sys.exit(1)
979
980                 print_stats("sites_observed", stats)
981                 print_stats("sites_diagnosed", stats)
982                 print_stats("nodes_diagnosed", stats)
983                 print_stats("sites_emailed", stats)
984                 print_stats("nodes_actedon", stats)
985                 print string.join(stats['allsites'], ",")
986
987                 if config.policysavedb:
988                         print "Saving Databases... act_all"
989                         #database.dbDump("policy.eventlog", self.eventlog)
990                         # TODO: remove 'diagnose_out', 
991                         #       or at least the entries that were acted on.
992                         database.dbDump("act_all", self.act_all)
993
994         def accumSites(self):
995                 """
996                 Take all nodes, from l_action, look them up in the diagnose_db database, 
997                 and insert them into sickdb[] as:
998
999                 This way only the given l_action nodes will be acted on regardless
1000                 of how many from diagnose_db are available.
1001
1002                         sickdb[loginbase][nodename] = diag_record
1003                 """
1004                 # TODO: what if l_action == None ?
1005                 for nodename in self.l_action:
1006
1007                         loginbase = self.plcdb_hn2lb[nodename]
1008
1009                         if loginbase in self.diagnose_db and \
1010                                 nodename in self.diagnose_db[loginbase]['nodes']:
1011
1012                                 diag_record = self.diagnose_db[loginbase]['nodes'][nodename]
1013
1014                                 if loginbase not in self.sickdb:
1015                                         self.sickdb[loginbase] = {'nodes' : {}}
1016
1017                                 # NOTE: don't copy all node records, since not all will be in l_action
1018                                 self.sickdb[loginbase]['nodes'][nodename] = diag_record
1019                                 # NOTE: but, we want to get the loginbase config settings, 
1020                                 #               this is the easiest way.
1021                                 self.sickdb[loginbase]['config'] = self.diagnose_db[loginbase]['config']
1022                         #else:
1023                                 #print "%s not in diagnose_db!!" % loginbase
1024                 return
1025
1026         def __emailSite(self, loginbase, roles, message, args):
1027                 """
1028                 loginbase is the unique site abbreviation, prepended to slice names.
1029                 roles contains TECH, PI, USER roles, and derive email aliases.
1030                 record contains {'message': [<subj>,<body>], 'args': {...}} 
1031                 """
1032                 ticket_id = 0
1033                 args.update({'loginbase':loginbase})
1034
1035                 if not config.mail and not config.debug and config.bcc:
1036                         roles = ADMIN
1037                 if config.mail and config.debug:
1038                         roles = ADMIN
1039
1040                 # build targets
1041                 contacts = []
1042                 if ADMIN & roles:
1043                         contacts += [config.email]
1044                 if TECH & roles:
1045                         contacts += [TECHEMAIL % loginbase]
1046                 if PI & roles:
1047                         contacts += [PIEMAIL % loginbase]
1048                 if USER & roles:
1049                         slices = plc.slices(loginbase)
1050                         if len(slices) >= 1:
1051                                 for slice in slices:
1052                                         contacts += [SLICEMAIL % slice]
1053                                 print "SLIC: %20s : %d slices" % (loginbase, len(slices))
1054                         else:
1055                                 print "SLIC: %20s : 0 slices" % loginbase
1056
1057                 try:
1058                         subject = message[0] % args
1059                         body = message[1] % args
1060                         if ADMIN & roles:
1061                                 # send only to admin
1062                                 if 'ticket_id' in args:
1063                                         subj = "Re: [PL #%s] %s" % (args['ticket_id'], subject)
1064                                 else:
1065                                         subj = "Re: [PL noticket] %s" % subject
1066                                 mailer.email(subj, body, contacts)
1067                                 ticket_id = args['ticket_id']
1068                         else:
1069                                 ticket_id = mailer.emailViaRT(subject, body, contacts, args['ticket_id'])
1070                 except Exception, err:
1071                         print "exception on message:"
1072                         import traceback
1073                         print traceback.print_exc()
1074                         print message
1075
1076                 return ticket_id
1077
1078
1079         def _format_diaginfo(self, diag_node):
1080                 info = diag_node['info']
1081                 if diag_node['stage'] == 'monitor-end-record':
1082                         hlist = "    %s went from '%s' to '%s'\n" % (info[0], info[1], info[2]) 
1083                 else:
1084                         hlist = "    %s %s - %s\n" % (info[0], info[2], info[1]) #(node,ver,daysdn)
1085                 return hlist
1086
1087
1088         def get_email_args(self, act_recordlist, loginbase=None):
1089
1090                 email_args = {}
1091                 email_args['hostname_list'] = ""
1092
1093                 for act_record in act_recordlist:
1094                         email_args['hostname_list'] += act_record['msg_format']
1095                         email_args['hostname'] = act_record['nodename']
1096                         if  'plcnode' in act_record and \
1097                                 'pcu_ids' in act_record['plcnode'] and \
1098                                 len(act_record['plcnode']['pcu_ids']) > 0:
1099                                 print "setting 'pcu_id' for email_args %s"%email_args['hostname']
1100                                 email_args['pcu_id'] = act_record['plcnode']['pcu_ids'][0]
1101                         else:
1102                                 email_args['pcu_id'] = "-1"
1103                                         
1104                         if 'ticket_id' in act_record:
1105                                 if act_record['ticket_id'] == 0 or act_record['ticket_id'] == '0':
1106                                         print "Enter the ticket_id for %s @ %s" % (loginbase, act_record['nodename'])
1107                                         sys.stdout.flush()
1108                                         line = sys.stdin.readline()
1109                                         try:
1110                                                 ticket_id = int(line)
1111                                         except:
1112                                                 print "could not get ticket_id from stdin..."
1113                                                 os._exit(1)
1114                                 else:
1115                                         ticket_id = act_record['ticket_id']
1116                                         
1117                                 email_args['ticket_id'] = ticket_id
1118
1119                 return email_args
1120
1121         def get_unique_issues(self, act_recordlist):
1122                 # NOTE: only send one email per site, per problem...
1123                 unique_issues = {}
1124                 for act_record in act_recordlist:
1125                         act_key = act_record['action'][0]
1126                         if act_key not in unique_issues:
1127                                 unique_issues[act_key] = []
1128                                 
1129                         unique_issues[act_key] += [act_record]
1130                         
1131                 return unique_issues
1132                         
1133
1134         def __actOnSite(self, loginbase, site_record):
1135                 i_nodes_actedon = 0
1136                 i_nodes_emailed = 0
1137
1138                 act_recordlist = []
1139
1140                 for nodename in site_record['nodes'].keys():
1141                         diag_record = site_record['nodes'][nodename]
1142                         act_record  = self.__actOnNode(diag_record)
1143                         #print "nodename: %s %s" % (nodename, act_record)
1144                         if act_record is not None:
1145                                 act_recordlist += [act_record]
1146
1147                 unique_issues = self.get_unique_issues(act_recordlist)
1148
1149                 for issue in unique_issues.keys():
1150                         print "\tworking on issue: %s" % issue
1151                         issue_record_list = unique_issues[issue]
1152                         email_args = self.get_email_args(issue_record_list, loginbase)
1153
1154                         # for each record.
1155                         for act_record in issue_record_list:
1156                                 # if there's a pcu record and email config is set
1157                                 if 'email_pcu' in act_record:
1158                                         if act_record['message'] != None and act_record['email_pcu'] and site_record['config']['email']:
1159                                                 # and 'reboot_node' in act_record['stage']:
1160
1161                                                 email_args['hostname'] = act_record['nodename']
1162                                                 ticket_id = self.__emailSite(loginbase, 
1163                                                                                         act_record['email'], 
1164                                                                                         emailTxt.mailtxt.pcudown[0],
1165                                                                                         email_args)
1166                                                 if ticket_id == 0:
1167                                                         # error.
1168                                                         print "got a ticket_id == 0!!!! %s" % act_record['nodename']
1169                                                         os._exit(1)
1170                                                         pass
1171                                                 email_args['ticket_id'] = ticket_id
1172
1173                         
1174                         act_record = issue_record_list[0]
1175                         # send message before squeezing
1176                         print "\t\tconfig.email: %s and %s" % (act_record['message'] != None, 
1177                                                                                                 site_record['config']['email'])
1178                         if act_record['message'] != None and site_record['config']['email']:
1179                                 ticket_id = self.__emailSite(loginbase, act_record['email'], 
1180                                                                                          act_record['message'], email_args)
1181
1182                                 if ticket_id == 0:
1183                                         # error.
1184                                         print "ticket_id == 0 for %s %s" % (loginbase, act_record['nodename'])
1185                                         os._exit(1)
1186                                         pass
1187
1188                                 # Add ticket_id to ALL nodenames
1189                                 for act_record in issue_record_list:
1190                                         nodename = act_record['nodename']
1191                                         # update node record with RT ticket_id
1192                                         if nodename in self.act_all:
1193                                                 self.act_all[nodename][0]['ticket_id'] = "%s" % ticket_id
1194                                                 # if the ticket was previously resolved, reset it to new.
1195                                                 if 'rt' in act_record and \
1196                                                         'Status' in act_record['rt'] and \
1197                                                         act_record['rt']['Status'] == 'resolved':
1198                                                         mailer.setTicketStatus(ticket_id, "new")
1199                                                 status = mailer.getTicketStatus(ticket_id)
1200                                                 self.act_all[nodename][0]['rt'] = status
1201                                         if config.mail: i_nodes_emailed += 1
1202
1203                         print "\t\tconfig.squeeze: %s and %s" % (config.squeeze,
1204                                                                                                         site_record['config']['squeeze'])
1205                         if config.squeeze and site_record['config']['squeeze']:
1206                                 for act_key in act_record['action']:
1207                                         self.actions[act_key](email_args)
1208                                 i_nodes_actedon += 1
1209                 
1210                 if config.policysavedb:
1211                         print "Saving Databases... act_all, diagnose_out"
1212                         database.dbDump("act_all", self.act_all)
1213                         # remove site record from diagnose_out, it's in act_all as done.
1214                         del self.diagnose_db[loginbase]
1215                         database.dbDump("diagnose_out", self.diagnose_db)
1216
1217                 print "sleeping for 1 sec"
1218                 time.sleep(1)
1219                 #print "Hit enter to continue..."
1220                 #sys.stdout.flush()
1221                 #line = sys.stdin.readline()
1222
1223                 return (i_nodes_actedon, i_nodes_emailed)
1224
1225         def __actOnNode(self, diag_record):
1226                 nodename = diag_record['nodename']
1227                 message = diag_record['message']
1228
1229                 act_record = {}
1230                 act_record.update(diag_record)
1231                 act_record['nodename'] = nodename
1232                 act_record['msg_format'] = self._format_diaginfo(diag_record)
1233                 print "act_record['stage'] == %s " % act_record['stage']
1234
1235                 # avoid end records, and nmreset records                                        
1236                 # reboot_node_failed, is set below, so don't reboot repeatedly.
1237
1238                 if 'monitor-end-record' not in act_record['stage'] and \
1239                    'nmreset' not in act_record['stage'] and \
1240                    'reboot_node_failed' not in act_record:
1241
1242                         if "DOWN" in act_record['log'] and \
1243                                         'pcu_ids' in act_record['plcnode'] and \
1244                                         len(act_record['plcnode']['pcu_ids']) > 0:
1245
1246                                 print "%s" % act_record['log'],
1247                                 print "%15s" % (['reboot_node'],)
1248                                 # Set node to re-install
1249                                 plc.nodeBootState(act_record['nodename'], "rins")       
1250                                 try:
1251                                         ret = reboot_node({'hostname': act_record['nodename']})
1252                                 except Exception, exc:
1253                                         print "exception on reboot_node:"
1254                                         import traceback
1255                                         print traceback.print_exc()
1256                                         ret = False
1257
1258                                 if ret: # and ( 'reboot_node_failed' not in act_record or act_record['reboot_node_failed'] == False):
1259                                         # Reboot Succeeded
1260                                         print "reboot succeeded for %s" % act_record['nodename']
1261                                         act_record2 = {}
1262                                         act_record2.update(act_record)
1263                                         act_record2['action'] = ['reboot_node']
1264                                         act_record2['stage'] = "reboot_node"
1265                                         act_record2['reboot_node_failed'] = False
1266                                         act_record2['email_pcu'] = False
1267
1268                                         if nodename not in self.act_all: 
1269                                                 self.act_all[nodename] = []
1270                                         print "inserting 'reboot_node' record into act_all"
1271                                         self.act_all[nodename].insert(0,act_record2)
1272
1273                                         # return None to avoid further action
1274                                         print "Taking no further action"
1275                                         return None
1276                                 else:
1277                                         print "reboot failed for %s" % act_record['nodename']
1278                                         # set email_pcu to also send pcu notice for this record.
1279                                         act_record['reboot_node_failed'] = True
1280                                         act_record['email_pcu'] = True
1281
1282                         print "%s" % act_record['log'],
1283                         print "%15s" % act_record['action']
1284
1285                 if act_record['stage'] is not 'monitor-end-record' and \
1286                    act_record['stage'] is not 'nmreset':
1287                         if nodename not in self.act_all: 
1288                                 self.act_all[nodename] = []
1289
1290                         self.act_all[nodename].insert(0,act_record)
1291                 else:
1292                         print "Not recording %s in act_all" % nodename
1293
1294                 return act_record
1295
1296         def analyseSites(self):
1297                 i_sites_observed = 0
1298                 i_sites_diagnosed = 0
1299                 i_nodes_diagnosed = 0
1300                 i_nodes_actedon = 0
1301                 i_sites_emailed = 0
1302                 l_allsites = []
1303
1304                 sorted_sites = self.sickdb.keys()
1305                 sorted_sites.sort()
1306                 for loginbase in sorted_sites:
1307                         site_record = self.sickdb[loginbase]
1308                         print "sites: %s" % loginbase
1309                         
1310                         i_nodes_diagnosed += len(site_record.keys())
1311                         i_sites_diagnosed += 1
1312
1313                         (na,ne) = self.__actOnSite(loginbase, site_record)
1314
1315                         i_sites_observed += 1
1316                         i_nodes_actedon += na
1317                         i_sites_emailed += ne
1318
1319                         l_allsites += [loginbase]
1320
1321                 return {'sites_observed': i_sites_observed, 
1322                                 'sites_diagnosed': i_sites_diagnosed, 
1323                                 'nodes_diagnosed': i_nodes_diagnosed, 
1324                                 'sites_emailed': i_sites_emailed, 
1325                                 'nodes_actedon': i_nodes_actedon, 
1326                                 'allsites':l_allsites}
1327
1328         def print_stats(self, key, stats):
1329                 print "%20s : %d" % (key, stats[key])
1330
1331
1332
1333         #"""
1334         #Prints, logs, and emails status of up nodes, down nodes, and buckets.
1335         #"""
1336         #def status(self):
1337         #       sub = "Monitor Summary"
1338         #       msg = "\nThe following nodes were acted upon:  \n\n"
1339         #       for (node, (type, date)) in self.emailed.items():
1340         #               # Print only things acted on today.
1341         #               if (time.gmtime(time.time())[2] == time.gmtime(date)[2]):
1342         #                       msg +="%s\t(%s)\t%s\n" %(node, type, time.ctime(date))
1343         #       msg +="\n\nThe following sites have been 'squeezed':\n\n"
1344         #       for (loginbase, (date, type)) in self.squeezed.items():
1345         #               # Print only things acted on today.
1346         #               if (time.gmtime(time.time())[2] == time.gmtime(date)[2]):
1347         #                       msg +="%s\t(%s)\t%s\n" %(loginbase, type, time.ctime(date))
1348         #       mailer.email(sub, msg, [SUMTO])
1349         #       logger.info(msg)
1350         #       return 
1351
1352         #"""
1353         #Store/Load state of emails.  When, where, what.
1354         #"""
1355         #def emailedStore(self, action):
1356         #       try:
1357         #               if action == "LOAD":
1358         #                       f = open(DAT, "r+")
1359         #                       logger.info("POLICY:  Found and reading " + DAT)
1360         #                       self.emailed.update(pickle.load(f))
1361         #               if action == "WRITE":
1362         #                       f = open(DAT, "w")
1363         #                       #logger.debug("Writing " + DAT)
1364         #                       pickle.dump(self.emailed, f)
1365         #               f.close()
1366         #       except Exception, err:
1367         #               logger.info("POLICY:  Problem with DAT, %s" %err)
1368
1369
1370 #class Policy(Thread):
1371
def main():
        """Print a notice that policy.py is meant to be imported, not executed."""
        notice = "policy.py is a module, not a script for running directly."
        print(notice)
1374
if __name__ == '__main__':
        # Entry point when the file is executed directly: run main() and turn
        # an interrupt from the keyboard into an immediate, logged exit.
        import os
        import plc

        try:
                main()
        except KeyboardInterrupt:
                # Report on stdout and in the monitor log, then leave at once;
                # os._exit() ends the process without running cleanup handlers.
                print("Killed.  Exitting.")
                logger.info('Monitor Killed')
                os._exit(0)