policy.py

   1 #
   2 # Copyright (c) 2004  The Trustees of Princeton University (Trustees).
   3 #
   4 # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
   5 #
   6 # $Id: policy.py,v 1.17 2007/08/29 17:26:50 soltesz Exp $
   7 #
   8 # Policy Engine.
   9
  10 #from monitor import *
  11 from threading import *
  12 import time
  13 import logging
  14 import mailer
  15 import emailTxt
  16 import pickle
  17 import Queue
  18 import plc
  19 import sys
  20 import os
  21 import reboot
  22 import soltesz
  23 import string
  24 from www.printbadnodes import cmpCategoryVal
  25 from config import config
  26 print "policy"
  27 config = config()
  28
  29 DAT="./monitor.dat"
  30
  31 logger = logging.getLogger("monitor")
  32
  33 # Time to enforce policy
  34 POLSLEEP = 7200
  35
  36 # Where to email the summary
  37 SUMTO = "soltesz@cs.princeton.edu"
  38 TECHEMAIL="tech-%s@sites.planet-lab.org"
  39 PIEMAIL="pi-%s@sites.planet-lab.org"
  40 SLICEMAIL="%s@slices.planet-lab.org"
  41 PLCEMAIL="support@planet-lab.org"
  42
  43 #Thresholds (DAYS)
  44 SPERMIN = 60
  45 SPERHOUR = 60*60
  46 SPERDAY = 86400
  47 PITHRESH = 7 * SPERDAY
  48 SLICETHRESH = 7 * SPERDAY
  49 # Days before attempting rins again
  50 RINSTHRESH = 5 * SPERDAY
  51
  52 # Days before calling the node dead.
  53 DEADTHRESH = 30 * SPERDAY
  54 # Minimum number of nodes up before squeezing
  55 MINUP = 2
  56
  57 TECH=1
  58 PI=2
  59 USER=4
  60 ADMIN=8
  61
  62 # IF:
  63 #  no SSH, down.
  64 #  bad disk, down
  65 #  DNS, kinda down (sick)
  66 #  clock, kinda down (sick)
  67 #  Full disk, going to be down
  68
  69 # Actions:
  70 #  Email
  71 #  suspend slice creation
  72 #  kill slices
  73 def array_to_priority_map(array):
  74         """ Create a mapping where each entry of array is given a priority equal
  75         to its position in the array.  This is useful for subsequent use in the
  76         cmpMap() function."""
  77         map = {}
  78         count = 0
  79         for i in array:
  80                 map[i] = count
  81                 count += 1
  82         return map
  83
  84 def getdebug():
  85         return config.debug
  86
  87 def print_stats(key, stats):
  88         if key in stats: print "%20s : %d" % (key, stats[key])
  89
  90 class Merge(Thread):
  91         def __init__(self, l_merge, toRT):
  92                 self.toRT = toRT
  93                 self.merge_list = l_merge
  94                 # the hostname to loginbase mapping
  95                 self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")
  96
  97                 # Previous actions taken on nodes.
  98                 self.act_all = soltesz.if_cached_else(1, "act_all", lambda : {})
  99                 self.findbad = soltesz.if_cached_else(1, "findbad", lambda : {})
 100
 101                 self.cache_all = soltesz.if_cached_else(1, "act_all", lambda : {})
 102                 self.sickdb = {}
 103                 self.mergedb = {}
 104                 Thread.__init__(self)
 105
 106         def run(self):
 107                 # populate sickdb
 108                 self.accumSickSites()
 109                 # read data from findbad and act_all
 110                 self.mergeActionsAndBadDB()
 111                 # pass node_records to RT
 112                 self.sendToRT()
 113
 114         def accumSickSites(self):
 115                 """
 116                 Take all nodes, from l_diagnose, look them up in the act_all database,
 117                 and insert them into sickdb[] as:
 118
 119                         sickdb[loginbase][nodename] = fb_record
 120                 """
 121                 # look at all problems reported by findbad
 122                 l_nodes = self.findbad['nodes'].keys()
 123                 count = 0
 124                 for nodename in l_nodes:
 125                         if nodename not in self.merge_list:
 126                                 continue                # skip this node, since it's not wanted
 127
 128                         count += 1
 129                         loginbase = self.plcdb_hn2lb[nodename]
 130                         values = self.findbad['nodes'][nodename]['values']
 131
 132                         fb_record = {}
 133                         fb_record['nodename'] = nodename
 134                         try:
 135                                 fb_record['category'] = values['category']
 136                         except:
 137                                 print values
 138                                 print nodename
 139                                 print self.findbad['nodes'][nodename]
 140                                 count -= 1
 141                                 continue
 142                         fb_record['state'] = values['state']
 143                         fb_record['comonstats'] = values['comonstats']
 144                         fb_record['plcnode'] = values['plcnode']
 145                         fb_record['kernel'] = self.getKernel(values['kernel'])
 146                         fb_record['stage'] = "findbad"
 147                         fb_record['message'] = None
 148                         fb_record['bootcd'] = values['bootcd']
 149                         fb_record['args'] = None
 150                         fb_record['info'] = None
 151                         fb_record['time'] = time.time()
 152                         fb_record['date_created'] = time.time()
 153
 154                         if loginbase not in self.sickdb:
 155                                 self.sickdb[loginbase] = {}
 156
 157                         self.sickdb[loginbase][nodename] = fb_record
 158
 159                 print "Found %d nodes" % count
 160
 161         def getKernel(self, unamestr):
 162                 s = unamestr.split()
 163                 if len(s) > 2:
 164                         return s[2]
 165                 else:
 166                         return ""
 167
 168         def mergeActionsAndBadDB(self):
 169                 """
 170                 - Look at the sick node_records as reported in findbad,
 171                 - Then look at the node_records in act_all.
 172
 173                 There are four cases:
 174                 1) Problem in findbad, no problem in act_all
 175                         this ok, b/c it just means it's a new problem
 176                 2) Problem in findbad, problem in act_all
 177                         -Did the problem get better or worse?
 178                                 -If Same, or Worse, then continue looking for open tickets.
 179                                 -If Better, or No problem, then "back-off" penalties.
 180                                         This judgement may need to wait until 'Diagnose()'
 181
 182                 3) No problem in findbad, problem in act_all
 183                         The the node is operational again according to Findbad()
 184
 185                 4) No problem in findbad, no problem in act_all
 186                         There won't be a record in either db, so there's no code.
 187                 """
 188
 189                 sorted_sites = self.sickdb.keys()
 190                 sorted_sites.sort()
 191                 # look at all problems reported by findbad
 192                 for loginbase in sorted_sites:
 193                         d_fb_nodes = self.sickdb[loginbase]
 194                         sorted_nodes = d_fb_nodes.keys()
 195                         sorted_nodes.sort()
 196                         for nodename in sorted_nodes:
 197                                 fb_record = self.sickdb[loginbase][nodename]
 198                                 x = fb_record
 199                                 if loginbase not in self.mergedb:
 200                                         self.mergedb[loginbase] = {}
 201
 202                                 # We must compare findbad state with act_all state
 203                                 if nodename not in self.act_all:
 204                                         # 1) ok, b/c it's a new problem. set ticket_id to null
 205                                         self.mergedb[loginbase][nodename] = {}
 206                                         self.mergedb[loginbase][nodename].update(x)
 207                                         self.mergedb[loginbase][nodename]['ticket_id'] = ""
 208                                         self.mergedb[loginbase][nodename]['prev_category'] = "NORECORD"
 209                                 else:
 210                                         if len(self.act_all[nodename]) == 0:
 211                                                 print "len(act_all[%s]) == 0, skipping %s %s" % (nodename, loginbase, nodename)
 212                                                 continue
 213
 214                                         y = self.act_all[nodename][0]
 215
 216                                         ## skip if end-stage
 217                                         #if 'stage' in y and "monitor-end-record" in y['stage']:
 218                                         #       # 1) ok, b/c it's a new problem. set ticket_id to null
 219                                         ##      self.mergedb[loginbase][nodename] = {}
 220                                         #       self.mergedb[loginbase][nodename].update(x)
 221                                         #       self.mergedb[loginbase][nodename]['ticket_id'] = ""
 222                                         #       self.mergedb[loginbase][nodename]['prev_category'] = None
 223                                         #       continue
 224
 225                                         ## for legacy actions
 226                                         #if 'bucket' in y and y['bucket'][0] == 'dbg':
 227                                         #       # Only bootcd debugs made it to the act_all db.
 228                                         #       y['prev_category'] = "OLDBOOTCD"
 229                                         #elif 'bucket' in y and y['bucket'][0] == 'down':
 230                                         #       y['prev_category'] = "ERROR"
 231                                         #elif 'bucket' not in y:
 232                                         #       # for all other actions, just carry over the
 233                                         #       # previous category
 234                                         #       y['prev_category'] = y['category']
 235                                         #else:
 236                                         #       print "UNKNOWN state for record: %s" % y
 237                                         #       sys.exit(1)
 238
 239                                         # determine through translation, if the buckets match
 240                                         #if 'category' in y and x['category'] == y['category']:
 241                                         #       b_match = True
 242                                         #elif x['category'] == "OLDBOOTCD" and y['bucket'][0] == 'dbg':
 243                                         #       b_match = True
 244                                         #elif x['category'] == "ERROR" and y['bucket'][0] == 'down':
 245                                         #       b_match = True
 246                                         #else:
 247                                         #       b_match = False
 248
 249                                         #if b_match:
 250                                         #       # 2b) ok, b/c they agree that there's still a problem..
 251                                         #       # 2b) Comon & Monitor still agree; RT ticket?
 252                                         #else:
 253                                         #       # 2a) mismatch, need a policy for how to resolve
 254                                         #       #     resolution will be handled in __diagnoseNode()
 255                                         #       #         for now just record the two categories.
 256                                         #       #if x['category'] == "PROD" and x['state'] == "BOOT" and \
 257                                         #       # ( y['bucket'][0] == 'down' or  y['bucket'][0] == 'dbg'):
 258                                         #       print "FINDBAD and MONITOR have a mismatch: %s vs %s" % \
 259                                         #                               (x['category'], y['bucket'])
 260
 261                                         y['prev_category'] = y['category']
 262                                         self.mergedb[loginbase][nodename] = {}
 263                                         self.mergedb[loginbase][nodename].update(y)
 264                                         self.mergedb[loginbase][nodename]['comonstats'] = x['comonstats']
 265                                         self.mergedb[loginbase][nodename]['category']   = x['category']
 266                                         self.mergedb[loginbase][nodename]['state'] = x['state']
 267                                         self.mergedb[loginbase][nodename]['kernel']=x['kernel']
 268                                         self.mergedb[loginbase][nodename]['bootcd']=x['bootcd']
 269                                         self.mergedb[loginbase][nodename]['plcnode']=x['plcnode']
 270                                         # delete the entry from cache_all to keep it out of case 3)
 271                                         del self.cache_all[nodename]
 272
 273                 # 3) nodes that remin in cache_all were not identified by findbad.
 274                 #        Do we keep them or not?
 275                 #   NOTE: i think that since the categories are performed before this
 276                 #               step now, and by a monitor-controlled agent.
 277
 278                 # TODO: This does not work correctly.  Do we need this?
 279                 #for hn in self.cache_all.keys():
 280                 #       y = self.act_all[hn][0]
 281                 #       if 'monitor' in y['bucket']:
 282                 #               loginbase = self.plcdb_hn2lb[hn]
 283                 #               if loginbase not in self.sickdb:
 284                 #                       self.sickdb[loginbase] = {}
 285                 #               self.sickdb[loginbase][hn] = y
 286                 #       else:
 287                 #               del self.cache_all[hn]
 288
 289                 print "len of cache_all: %d" % len(self.cache_all.keys())
 290                 return
 291
 292         def sendToRT(self):
 293                 sorted_sites = self.mergedb.keys()
 294                 sorted_sites.sort()
 295                 # look at all problems reported by merge
 296                 for loginbase in sorted_sites:
 297                         d_merge_nodes = self.mergedb[loginbase]
 298                         for nodename in d_merge_nodes.keys():
 299                                 record = self.mergedb[loginbase][nodename]
 300                                 self.toRT.put(record)
 301
 302                 # send signal to stop reading
 303                 self.toRT.put(None)
 304                 return
 305
 306 class Diagnose(Thread):
 307         def __init__(self, fromRT):
 308                 self.fromRT = fromRT
 309                 self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")
 310                 self.findbad = soltesz.if_cached_else(1, "findbad", lambda : {})
 311
 312                 self.diagnose_in = {}
 313                 self.diagnose_out = {}
 314                 Thread.__init__(self)
 315
 316
 317         def run(self):
 318                 self.accumSickSites()
 319
 320                 print "Accumulated %d sick sites" % len(self.diagnose_in.keys())
 321                 logger.debug("Accumulated %d sick sites" % len(self.diagnose_in.keys()))
 322
 323                 try:
 324                         stats = self.diagnoseAll()
 325                 except Exception, err:
 326                         print "----------------"
 327                         import traceback
 328                         print traceback.print_exc()
 329                         print err
 330                         #if config.policysavedb:
 331                         sys.exit(1)
 332
 333                 print_stats("sites_observed", stats)
 334                 print_stats("sites_diagnosed", stats)
 335                 print_stats("nodes_diagnosed", stats)
 336
 337                 if config.policysavedb:
 338                         print "Saving Databases... diagnose_out"
 339                         soltesz.dbDump("diagnose_out", self.diagnose_out)
 340
 341         def accumSickSites(self):
 342                 """
 343                 Take all nodes, from l_diagnose, look them up in the diagnose_out database,
 344                 and insert them into diagnose_in[] as:
 345
 346                         diagnose_in[loginbase] = [diag_node1, diag_node2, ...]
 347                 """
 348                 while 1:
 349                         node_record = self.fromRT.get(block = True)
 350                         if node_record == None:
 351                                 break;
 352
 353                         nodename = node_record['nodename']
 354                         loginbase = self.plcdb_hn2lb[nodename]
 355
 356                         if loginbase not in self.diagnose_in:
 357                                 self.diagnose_in[loginbase] = {}
 358
 359                         self.diagnose_in[loginbase][nodename] = node_record
 360
 361                 return
 362
 363         def diagnoseAll(self):
 364                 i_sites_observed = 0
 365                 i_sites_diagnosed = 0
 366                 i_nodes_diagnosed = 0
 367                 i_nodes_actedon = 0
 368                 i_sites_emailed = 0
 369                 l_allsites = []
 370
 371                 sorted_sites = self.diagnose_in.keys()
 372                 sorted_sites.sort()
 373                 self.diagnose_out= {}
 374                 for loginbase in sorted_sites:
 375                         l_allsites += [loginbase]
 376
 377                         d_diag_nodes = self.diagnose_in[loginbase]
 378                         d_act_records = self.__diagnoseSite(loginbase, d_diag_nodes)
 379                         # store records in diagnose_out, for saving later.
 380                         self.diagnose_out.update(d_act_records)
 381
 382                         if len(d_act_records[loginbase]['nodes'].keys()) > 0:
 383                                 i_nodes_diagnosed += (len(d_act_records[loginbase]['nodes'].keys()))
 384                                 i_sites_diagnosed += 1
 385                         i_sites_observed += 1
 386
 387                 return {'sites_observed': i_sites_observed,
 388                                 'sites_diagnosed': i_sites_diagnosed,
 389                                 'nodes_diagnosed': i_nodes_diagnosed,
 390                                 'allsites':l_allsites}
 391
 392                 pass
 393
 394         def __getDaysDown(self, diag_record, nodename):
 395                 daysdown = -1
 396                 if diag_record['comonstats']['sshstatus'] != "null":
 397                         daysdown = int(diag_record['comonstats']['sshstatus']) // (60*60*24)
 398                 elif diag_record['comonstats']['lastcotop'] != "null":
 399                         daysdown = int(diag_record['comonstats']['lastcotop']) // (60*60*24)
 400                 else:
 401                         now = time.time()
 402                         last_contact = diag_record['plcnode']['last_contact']
 403                         if last_contact == None:
 404                                 # the node has never been up, so give it a break
 405                                 daysdown = -1
 406                         else:
 407                                 diff = now - last_contact
 408                                 daysdown = diff // (60*60*24)
 409                 return daysdown
 410
 411         def __getStrDaysDown(self, diag_record, nodename):
 412                 daysdown = self.__getDaysDown(diag_record, nodename)
 413                 if daysdown > 0:
 414                         return "(%d days down)"%daysdown
 415                 else:
 416                         return "Unknown number of days"
 417
 418         def __getCDVersion(self, diag_record, nodename):
 419                 cdversion = ""
 420                 #print "Getting kernel for: %s" % diag_record['nodename']
 421                 cdversion = diag_record['kernel']
 422                 return cdversion
 423
 424         def __diagnoseSite(self, loginbase, d_diag_nodes):
 425                 """
 426                 d_diag_nodes are diagnose_in entries.
 427                 """
 428                 d_diag_site = {loginbase : { 'config' :
 429                                                                                                 {'squeeze': False,
 430                                                                                                  'email': False
 431                                                                                                 },
 432                                                                         'nodes': {}
 433                                                                         }
 434                                            }
 435                 sorted_nodes = d_diag_nodes.keys()
 436                 sorted_nodes.sort()
 437                 for nodename in sorted_nodes:
 438                         node_record = d_diag_nodes[nodename]
 439                         diag_record = self.__diagnoseNode(loginbase, node_record)
 440
 441                         if diag_record != None:
 442                                 d_diag_site[loginbase]['nodes'][nodename] = diag_record
 443
 444                                 # NOTE: improvement means, we need to act/squeeze and email.
 445                                 #print "DIAG_RECORD", diag_record
 446                                 if 'monitor-end-record' in diag_record['stage'] or \
 447                                    'nmreset' in diag_record['stage']:
 448                                 #       print "resetting loginbase!"
 449                                         d_diag_site[loginbase]['config']['squeeze'] = True
 450                                         d_diag_site[loginbase]['config']['email'] = True
 451                                 #else:
 452                                 #       print "NO IMPROVEMENT!!!!"
 453                         else:
 454                                 pass # there is nothing to do for this node.
 455
 456                 # NOTE: these settings can be overridden by command line arguments,
 457                 #       or the state of a record, i.e. if already in RT's Support Queue.
 458                 nodes_up = self.getUpAtSite(loginbase, d_diag_site)
 459                 if nodes_up < MINUP:
 460                         d_diag_site[loginbase]['config']['squeeze'] = True
 461
 462                 max_slices = self.getMaxSlices(loginbase)
 463                 num_nodes = self.getNumNodes(loginbase)
 464                 # NOTE: when max_slices == 0, this is either a new site (the old way)
 465                 #       or an old disabled site from previous monitor (before site['enabled'])
 466                 if nodes_up < num_nodes and max_slices != 0:
 467                         d_diag_site[loginbase]['config']['email'] = True
 468
 469                 if len(d_diag_site[loginbase]['nodes'].keys()) > 0:
 470                         print "SITE: %20s : %d nodes up, at most" % (loginbase, nodes_up)
 471
 472                 return d_diag_site
 473
 474         def diagRecordByCategory(self, node_record):
 475                 nodename = node_record['nodename']
 476                 category = node_record['category']
 477                 state    = node_record['state']
 478                 loginbase = self.plcdb_hn2lb[nodename]
 479                 diag_record = None
 480
 481                 if  "ERROR" in category:        # i.e. "DOWN"
 482                         diag_record = {}
 483                         diag_record.update(node_record)
 484                         daysdown = self.__getDaysDown(diag_record, nodename)
 485                         if daysdown < 7:
 486                                 format = "DIAG: %20s : %-40s Down only %s days  NOTHING DONE"
 487                                 print format % (loginbase, nodename, daysdown)
 488                                 return None
 489
 490                         s_daysdown = self.__getStrDaysDown(diag_record, nodename)
 491                         diag_record['message'] = emailTxt.mailtxt.newdown
 492                         diag_record['args'] = {'nodename': nodename}
 493                         diag_record['info'] = (nodename, s_daysdown, "")
 494
 495                         if 'reboot_node_failed' in node_record:
 496                                 # there was a previous attempt to use the PCU.
 497                                 if node_record['reboot_node_failed'] == False:
 498                                         # then the last attempt apparently, succeeded.
 499                                         # But, the category is still 'ERROR'.  Therefore, the
 500                                         # PCU-to-Node mapping is broken.
 501                                         #print "Setting message for ERROR node to PCU2NodeMapping: %s" % nodename
 502                                         diag_record['message'] = emailTxt.mailtxt.pcutonodemapping
 503                                         diag_record['email_pcu'] = True
 504
 505                         if diag_record['ticket_id'] == "":
 506                                 diag_record['log'] = "DOWN: %20s : %-40s == %20s %s" % \
 507                                         (loginbase, nodename, diag_record['info'][1:], diag_record['found_rt_ticket'])
 508                         else:
 509                                 diag_record['log'] = "DOWN: %20s : %-40s == %20s %s" % \
 510                                         (loginbase, nodename, diag_record['info'][1:], diag_record['ticket_id'])
 511
 512                 elif "OLDBOOTCD" in category:
 513                         # V2 boot cds as determined by findbad
 514                         s_daysdown = self.__getStrDaysDown(node_record, nodename)
 515                         s_cdversion = self.__getCDVersion(node_record, nodename)
 516                         diag_record = {}
 517                         diag_record.update(node_record)
 518                         #if "2.4" in diag_record['kernel'] or "v2" in diag_record['bootcd']:
 519                         diag_record['message'] = emailTxt.mailtxt.newbootcd
 520                         diag_record['args'] = {'nodename': nodename}
 521                         diag_record['info'] = (nodename, s_daysdown, s_cdversion)
 522                         if diag_record['ticket_id'] == "":
 523                                 diag_record['log'] = "BTCD: %20s : %-40s == %20s %20s %s" % \
 524                                                                         (loginbase, nodename, diag_record['kernel'],
 525                                                                          diag_record['bootcd'], diag_record['found_rt_ticket'])
 526                         else:
 527                                 diag_record['log'] = "BTCD: %20s : %-40s == %20s %20s %s" % \
 528                                                                         (loginbase, nodename, diag_record['kernel'],
 529                                                                          diag_record['bootcd'], diag_record['ticket_id'])
 530
 531                 elif "PROD" in category:
 532                         if "DEBUG" in state:
 533                                 # Not sure what to do with these yet.  Probably need to
 534                                 # reboot, and email.
 535                                 print "DEBG: %20s : %-40s  NOTHING DONE" % (loginbase, nodename)
 536                                 return None
 537                         elif "BOOT" in state:
 538                                 # no action needed.
 539                                 # TODO: remove penalties, if any are applied.
 540                                 now = time.time()
 541                                 last_contact = node_record['plcnode']['last_contact']
 542                                 if last_contact == None:
 543                                         time_diff = 0
 544                                 else:
 545                                         time_diff = now - last_contact;
 546
 547                                 if 'improvement' in node_record['stage']:
 548                                         # then we need to pass this on to 'action'
 549                                         diag_record = {}
 550                                         diag_record.update(node_record)
 551                                         diag_record['message'] = emailTxt.mailtxt.newthankyou
 552                                         diag_record['args'] = {'nodename': nodename}
 553                                         diag_record['info'] = (nodename, node_record['prev_category'],
 554                                                                                                          node_record['category'])
 555                                         if diag_record['ticket_id'] == "":
 556                                                 diag_record['log'] = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
 557                                                                         (loginbase, nodename, diag_record['stage'],
 558                                                                          state, category, diag_record['found_rt_ticket'])
 559                                         else:
 560                                                 diag_record['log'] = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
 561                                                                         (loginbase, nodename, diag_record['stage'],
 562                                                                          state, category, diag_record['ticket_id'])
 563                                         return diag_record
 564                                 elif time_diff >= 6*SPERHOUR:
 565                                         # heartbeat is older than 30 min.
 566                                         # then reset NM.
 567                                         #print "Possible NM problem!! %s - %s = %s" % (now, last_contact, time_diff)
 568                                         diag_record = {}
 569                                         diag_record.update(node_record)
 570                                         diag_record['message'] = emailTxt.mailtxt.NMReset
 571                                         diag_record['args'] = {'nodename': nodename}
 572                                         diag_record['stage'] = "nmreset"
 573                                         diag_record['info'] = (nodename,
 574                                                                                         node_record['prev_category'],
 575                                                                                         node_record['category'])
 576                                         if diag_record['ticket_id'] == "":
 577                                                 diag_record['log'] = "NM  : %20s : %-40s == %20s %20s %s %s" % \
 578                                                                         (loginbase, nodename, diag_record['stage'],
 579                                                                          state, category, diag_record['found_rt_ticket'])
 580                                         else:
 581                                                 diag_record['log'] = "NM  : %20s : %-40s == %20s" % \
 582                                                                         (loginbase, nodename, diag_record['stage'])
 583
 584                                         return diag_record
 585                                 else:
 586                                         return None
 587                         else:
 588                                 # unknown
 589                                 pass
 590                 elif "ALPHA"    in category:
 591                         pass
 592                 elif "clock_drift" in category:
 593                         pass
 594                 elif "dns"    in category:
 595                         pass
 596                 elif "filerw"    in category:
 597                         pass
 598                 else:
 599                         print "Unknown category!!!! %s" % category
 600                         sys.exit(1)
 601
 602                 return diag_record
 603
 604         def __diagnoseNode(self, loginbase, node_record):
 605                 # TODO: change the format of the hostname in this
 606                 #               record to something more natural.
 607                 nodename                = node_record['nodename']
 608                 category                = node_record['category']
 609                 prev_category   = node_record['prev_category']
 610                 state                   = node_record['state']
 611                 #if 'prev_category' in node_record:
 612                 #       prev_category = node_record['prev_category']
 613                 #else:
 614                 #       prev_category = "ERROR"
 615                 if node_record['prev_category'] != "NORECORD":
 616
 617                         val = cmpCategoryVal(category, prev_category)
 618                         print "%s went from %s -> %s" % (nodename, prev_category, category)
 619                         if val == 1:
 620                                 # improved
 621                                 if node_record['ticket_id'] == "" or node_record['ticket_id'] == None:
 622                                         print "closing record with no ticket: ", node_record['nodename']
 623                                         node_record['action'] = ['close_rt']
 624                                         node_record['message'] = None
 625                                         node_record['stage'] = 'monitor-end-record'
 626                                         return node_record
 627                                 else:
 628                                         node_record['stage'] = 'improvement'
 629
 630                                 #if 'monitor-end-record' in node_record['stage']:
 631                                 #       # just ignore it if it's already ended.
 632                                 #       # otherwise, the status should be worse, and we won't get
 633                                 #       # here.
 634                                 #       print "monitor-end-record: ignoring ", node_record['nodename']
 635                                 #       return None
 636 #
 637 #                                       #return None
 638                         elif val == -1:
 639                                 # current category is worse than previous, carry on
 640                                 pass
 641                         else:
 642                                 #values are equal, carry on.
 643                                 #print "why are we here?"
 644                                 pass
 645
 646                 #### COMPARE category and prev_category
 647                 # if not_equal
 648                 #       then assign a stage based on relative priorities
 649                 # else equal
 650                 #       then check category for stats.
 651                 diag_record = self.diagRecordByCategory(node_record)
 652                 if diag_record == None:
 653                         #print "diag_record == None"
 654                         return None
 655
 656                 #### found_RT_ticket
 657                 # TODO: need to record time found, and maybe add a stage for acting on it...
 658                 if 'found_rt_ticket' in diag_record and \
 659                         diag_record['found_rt_ticket'] is not None:
 660                         if diag_record['stage'] is not 'improvement':
 661                                 diag_record['stage'] = 'ticket_waitforever'
 662
 663                 current_time = time.time()
 664                 # take off four days, for the delay that database caused.
 665                 # TODO: generalize delays at PLC, and prevent enforcement when there
 666                 #               have been no emails.
 667                 # NOTE: 7*SPERDAY exists to offset the 'bad week'
 668                 #delta = current_time - diag_record['time'] - 7*SPERDAY
 669                 delta = current_time - diag_record['time']
 670
 671                 message = diag_record['message']
 672                 act_record = {}
 673                 act_record.update(diag_record)
 674
 675                 #### DIAGNOSE STAGES
 676                 if   'findbad' in diag_record['stage']:
 677                         # The node is bad, and there's no previous record of it.
 678                         act_record['email'] = TECH
 679                         act_record['action'] = ['noop']
 680                         act_record['message'] = message[0]
 681                         act_record['stage'] = 'stage_actinoneweek'
 682
 683                 elif 'nmreset' in diag_record['stage']:
 684                         act_record['email']  = ADMIN
 685                         act_record['action'] = ['reset_nodemanager']
 686                         act_record['message'] = message[0]
 687                         act_record['stage']  = 'nmreset'
 688                         return None
 689
 690                 elif 'reboot_node' in diag_record['stage']:
 691                         act_record['email'] = TECH
 692                         act_record['action'] = ['noop']
 693                         act_record['message'] = message[0]
 694                         act_record['stage'] = 'stage_actinoneweek'
 695
 696                 elif 'improvement' in diag_record['stage']:
 697                         # - backoff previous squeeze actions (slice suspend, nocreate)
 698                         # TODO: add a backoff_squeeze section... Needs to runthrough
 699                         act_record['action'] = ['close_rt']
 700                         act_record['message'] = message[0]
 701                         act_record['stage'] = 'monitor-end-record'
 702
 703                 elif 'actinoneweek' in diag_record['stage']:
 704                         if delta >= 7 * SPERDAY:
 705                                 act_record['email'] = TECH | PI
 706                                 act_record['stage'] = 'stage_actintwoweeks'
 707                                 act_record['message'] = message[1]
 708                                 act_record['action'] = ['nocreate' ]
 709                                 act_record['time'] = current_time               # reset clock for waitforever
 710                         elif delta >= 3* SPERDAY and not 'second-mail-at-oneweek' in act_record:
 711                                 act_record['email'] = TECH
 712                                 act_record['message'] = message[0]
 713                                 act_record['action'] = ['sendmailagain-waitforoneweekaction' ]
 714                                 act_record['second-mail-at-oneweek'] = True
 715                         else:
 716                                 act_record['message'] = None
 717                                 act_record['action'] = ['waitforoneweekaction' ]
 718                                 print "ignoring this record for: %s" % act_record['nodename']
 719                                 return None                     # don't send if there's no action
 720
 721                 elif 'actintwoweeks' in diag_record['stage']:
 722                         if delta >= 7 * SPERDAY:
 723                                 act_record['email'] = TECH | PI | USER
 724                                 act_record['stage'] = 'stage_waitforever'
 725                                 act_record['message'] = message[2]
 726                                 act_record['action'] = ['suspendslices']
 727                                 act_record['time'] = current_time               # reset clock for waitforever
 728                         elif delta >= 3* SPERDAY and not 'second-mail-at-twoweeks' in act_record:
 729                                 act_record['email'] = TECH | PI
 730                                 act_record['message'] = message[1]
 731                                 act_record['action'] = ['sendmailagain-waitfortwoweeksaction' ]
 732                                 act_record['second-mail-at-twoweeks'] = True
 733                         else:
 734                                 act_record['message'] = None
 735                                 act_record['action'] = ['waitfortwoweeksaction']
 736                                 return None                     # don't send if there's no action
 737
 738                 elif 'ticket_waitforever' in diag_record['stage']:
 739                         act_record['email'] = TECH
 740                         if 'first-found' not in act_record:
 741                                 act_record['first-found'] = True
 742                                 act_record['log'] += " firstfound"
 743                                 act_record['action'] = ['ticket_waitforever']
 744                                 act_record['message'] = None
 745                                 act_record['time'] = current_time
 746                         else:
 747                                 if delta >= 7*SPERDAY:
 748                                         act_record['action'] = ['ticket_waitforever']
 749                                         act_record['message'] = None
 750                                         act_record['time'] = current_time               # reset clock
 751                                 else:
 752                                         act_record['action'] = ['ticket_waitforever']
 753                                         act_record['message'] = None
 754                                         return None
 755
 756                 elif 'waitforever' in diag_record['stage']:
 757                         # more than 3 days since last action
 758                         # TODO: send only on weekdays.
 759                         # NOTE: expects that 'time' has been reset before entering waitforever stage
 760                         if delta >= 3*SPERDAY:
 761                                 act_record['action'] = ['email-againwaitforever']
 762                                 act_record['message'] = message[2]
 763                                 act_record['time'] = current_time               # reset clock
 764                         else:
 765                                 act_record['action'] = ['waitforever']
 766                                 act_record['message'] = None
 767                                 return None                     # don't send if there's no action
 768
 769                 else:
 770                         # There is no action to be taken, possibly b/c the stage has
 771                         # already been performed, but diagnose picked it up again.
 772                         # two cases,
 773                         #       1. stage is unknown, or
 774                         #       2. delta is not big enough to bump it to the next stage.
 775                         # TODO: figure out which. for now assume 2.
 776                         print "UNKNOWN stage for %s; nothing done" % nodename
 777                         act_record['action'] = ['unknown']
 778                         act_record['message'] = message[0]
 779                         #print "Exiting..."
 780                         return None
 781                         #sys.exit(1)
 782
 783                 print "%s" % act_record['log'],
 784                 print "%15s" % act_record['action']
 785                 return act_record
 786
 787         def getMaxSlices(self, loginbase):
 788                 # if sickdb has a loginbase, then it will have at least one node.
 789                 site_stats = None
 790
 791                 for nodename in self.diagnose_in[loginbase].keys():
 792                         if nodename in self.findbad['nodes']:
 793                                 site_stats = self.findbad['nodes'][nodename]['values']['plcsite']
 794                                 break
 795
 796                 if site_stats == None:
 797                         raise Exception, "loginbase with no nodes in findbad"
 798                 else:
 799                         return site_stats['max_slices']
 800
 801         def getNumNodes(self, loginbase):
 802                 # if sickdb has a loginbase, then it will have at least one node.
 803                 site_stats = None
 804
 805                 for nodename in self.diagnose_in[loginbase].keys():
 806                         if nodename in self.findbad['nodes']:
 807                                 site_stats = self.findbad['nodes'][nodename]['values']['plcsite']
 808                                 break
 809
 810                 if site_stats == None:
 811                         raise Exception, "loginbase with no nodes in findbad"
 812                 else:
 813                         return site_stats['num_nodes']
 814
 815         """
 816         Returns number of up nodes as the total number *NOT* in act_all with a
 817         stage other than 'steady-state' .
 818         """
 819         def getUpAtSite(self, loginbase, d_diag_site):
 820                 # TODO: THIS DOESN"T WORK!!! it misses all the 'debug' state nodes
 821                 #               that aren't recorded yet.
 822
 823                 numnodes = self.getNumNodes(loginbase)
 824                 # NOTE: assume nodes we have no record of are ok. (too conservative)
 825                 # TODO: make the 'up' value more representative
 826                 up = numnodes
 827                 for nodename in d_diag_site[loginbase]['nodes'].keys():
 828
 829                         rec = d_diag_site[loginbase]['nodes'][nodename]
 830                         if rec['stage'] != 'monitor-end-record':
 831                                 up -= 1
 832                         else:
 833                                 pass # the node is assumed to be up.
 834
 835                 #if up != numnodes:
 836                 #       print "ERROR: %s total nodes up and down != %d" % (loginbase, numnodes)
 837
 838                 return up
 839
 840
 841 class SiteAction:
 842         def __init__(self, parameter_names=['hostname', 'ticket_id']):
 843                 self.parameter_names = parameter_names
 844         def checkParam(self, args):
 845                 for param in self.parameter_names:
 846                         if param not in args:
 847                                 raise Exception("Parameter %s not provided in args"%param)
 848         def run(self, args):
 849                 self.checkParam(args)
 850                 return self._run(args)
 851         def _run(self, args):
 852                 pass
 853
 854 class SuspendAction(SiteAction):
 855         def _run(self, args):
 856                 return plc.suspendSlices(args['hostname'])
 857
 858 class RemoveSliceCreation(SiteAction):
 859         def _run(self, args):
 860                 return plc.removeSliceCreation(args['hostname'])
 861
 862 class BackoffActions(SiteAction):
 863         def _run(self, args):
 864                 plc.enableSlices(args['hostname'])
 865                 plc.enableSliceCreation(args['hostname'])
 866                 return True
 867
 868 # TODO: create class for each action below,
 869 #               allow for lists of actions to be performed...
 870
 871 def close_rt_backoff(args):
 872         if 'ticket_id' in args and (args['ticket_id'] != "" and args['ticket_id'] != None):
 873                 mailer.closeTicketViaRT(args['ticket_id'],
 874                                                                 "Ticket CLOSED automatically by SiteAssist.")
 875                 plc.enableSlices(args['hostname'])
 876                 plc.enableSliceCreation(args['hostname'])
 877         return
 878
 879 def reboot_node(args):
 880         host = args['hostname']
 881         return reboot.reboot_new(host, True, config.debug)
 882
 883 def reset_nodemanager(args):
 884         os.system("ssh root@%s /sbin/service nm restart" % nodename)
 885         return
 886
 887 class Action(Thread):
 888         def __init__(self, l_action):
 889                 self.l_action = l_action
 890
 891                 # the hostname to loginbase mapping
 892                 self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")
 893
 894                 # Actions to take.
 895                 self.diagnose_db = soltesz.if_cached_else(1, "diagnose_out", lambda : {})
 896                 # Actions taken.
 897                 self.act_all   = soltesz.if_cached_else(1, "act_all", lambda : {})
 898
 899                 # A dict of actions to specific functions. PICKLE doesnt' like lambdas.
 900                 self.actions = {}
 901                 self.actions['suspendslices'] = lambda args: plc.suspendSlices(args['hostname'])
 902                 self.actions['nocreate'] = lambda args: plc.removeSliceCreation(args['hostname'])
 903                 self.actions['close_rt'] = lambda args: close_rt_backoff(args)
 904                 self.actions['rins'] = lambda args: plc.nodeBootState(args['hostname'], "rins")
 905                 self.actions['noop'] = lambda args: args
 906                 self.actions['reboot_node'] = lambda args: reboot_node(args)
 907                 self.actions['reset_nodemanager'] = lambda args: args # reset_nodemanager(args)
 908
 909                 self.actions['ticket_waitforever'] = lambda args: args
 910                 self.actions['waitforever'] = lambda args: args
 911                 self.actions['unknown'] = lambda args: args
 912                 self.actions['waitforoneweekaction'] = lambda args: args
 913                 self.actions['waitfortwoweeksaction'] = lambda args: args
 914                 self.actions['sendmailagain-waitforoneweekaction'] = lambda args: args
 915                 self.actions['sendmailagain-waitfortwoweeksaction'] = lambda args: args
 916                 self.actions['email-againwaitforever'] = lambda args: args
 917                 self.actions['email-againticket_waitforever'] = lambda args: args
 918
 919
 920                 self.sickdb = {}
 921                 Thread.__init__(self)
 922
 923         def run(self):
 924                 self.accumSites()
 925                 print "Accumulated %d sick sites" % len(self.sickdb.keys())
 926                 logger.debug("Accumulated %d sick sites" % len(self.sickdb.keys()))
 927
 928                 try:
 929                         stats = self.analyseSites()
 930                 except Exception, err:
 931                         print "----------------"
 932                         import traceback
 933                         print traceback.print_exc()
 934                         print err
 935                         if config.policysavedb:
 936                                 print "Saving Databases... act_all"
 937                                 soltesz.dbDump("act_all", self.act_all)
 938                         sys.exit(1)
 939
 940                 print_stats("sites_observed", stats)
 941                 print_stats("sites_diagnosed", stats)
 942                 print_stats("nodes_diagnosed", stats)
 943                 print_stats("sites_emailed", stats)
 944                 print_stats("nodes_actedon", stats)
 945                 print string.join(stats['allsites'], ",")
 946
 947                 if config.policysavedb:
 948                         print "Saving Databases... act_all"
 949                         #soltesz.dbDump("policy.eventlog", self.eventlog)
 950                         # TODO: remove 'diagnose_out',
 951                         #       or at least the entries that were acted on.
 952                         soltesz.dbDump("act_all", self.act_all)
 953
 954         def accumSites(self):
 955                 """
 956                 Take all nodes, from l_action, look them up in the diagnose_db database,
 957                 and insert them into sickdb[] as:
 958
 959                 This way only the given l_action nodes will be acted on regardless
 960                 of how many from diagnose_db are available.
 961
 962                         sickdb[loginbase][nodename] = diag_record
 963                 """
 964                 # TODO: what if l_action == None ?
 965                 for nodename in self.l_action:
 966
 967                         loginbase = self.plcdb_hn2lb[nodename]
 968
 969                         if loginbase in self.diagnose_db and \
 970                                 nodename in self.diagnose_db[loginbase]['nodes']:
 971
 972                                 diag_record = self.diagnose_db[loginbase]['nodes'][nodename]
 973
 974                                 if loginbase not in self.sickdb:
 975                                         self.sickdb[loginbase] = {'nodes' : {}}
 976
 977                                 # NOTE: don't copy all node records, since not all will be in l_action
 978                                 self.sickdb[loginbase]['nodes'][nodename] = diag_record
 979                                 # NOTE: but, we want to get the loginbase config settings,
 980                                 #               this is the easiest way.
 981                                 self.sickdb[loginbase]['config'] = self.diagnose_db[loginbase]['config']
 982                         #else:
 983                                 #print "%s not in diagnose_db!!" % loginbase
 984                 return
 985
 986         def __emailSite(self, loginbase, roles, message, args):
 987                 """
 988                 loginbase is the unique site abbreviation, prepended to slice names.
 989                 roles contains TECH, PI, USER roles, and derive email aliases.
 990                 record contains {'message': [<subj>,<body>], 'args': {...}}
 991                 """
 992                 ticket_id = 0
 993                 args.update({'loginbase':loginbase})
 994
 995                 if not config.mail and not config.debug and config.bcc:
 996                         roles = ADMIN
 997                 if config.mail and config.debug:
 998                         roles = ADMIN
 999
1000                 # build targets
1001                 contacts = []
1002                 if ADMIN & roles:
1003                         contacts += [config.email]
1004                 if TECH & roles:
1005                         contacts += [TECHEMAIL % loginbase]
1006                 if PI & roles:
1007                         contacts += [PIEMAIL % loginbase]
1008                 if USER & roles:
1009                         slices = plc.slices(loginbase)
1010                         if len(slices) >= 1:
1011                                 for slice in slices:
1012                                         contacts += [SLICEMAIL % slice]
1013                                 print "SLIC: %20s : %d slices" % (loginbase, len(slices))
1014                         else:
1015                                 print "SLIC: %20s : 0 slices" % loginbase
1016
1017                 try:
1018                         subject = message[0] % args
1019                         body = message[1] % args
1020                         if ADMIN & roles:
1021                                 # send only to admin
1022                                 if 'ticket_id' in args:
1023                                         subj = "Re: [PL #%s] %s" % (args['ticket_id'], subject)
1024                                 else:
1025                                         subj = "Re: [PL noticket] %s" % subject
1026                                 mailer.email(subj, body, contacts)
1027                                 ticket_id = args['ticket_id']
1028                         else:
1029                                 ticket_id = mailer.emailViaRT(subject, body, contacts, args['ticket_id'])
1030                 except Exception, err:
1031                         print "exception on message:"
1032                         import traceback
1033                         print traceback.print_exc()
1034                         print message
1035
1036                 return ticket_id
1037
1038
1039         def _format_diaginfo(self, diag_node):
1040                 info = diag_node['info']
1041                 if diag_node['stage'] == 'monitor-end-record':
1042                         hlist = "    %s went from '%s' to '%s'\n" % (info[0], info[1], info[2])
1043                 else:
1044                         hlist = "    %s %s - %s\n" % (info[0], info[2], info[1]) #(node,ver,daysdn)
1045                 return hlist
1046
1047
1048         def get_email_args(self, act_recordlist):
1049
1050                 email_args = {}
1051                 email_args['hostname_list'] = ""
1052
1053                 for act_record in act_recordlist:
1054                         email_args['hostname_list'] += act_record['msg_format']
1055                         email_args['hostname'] = act_record['nodename']
1056                         if  'plcnode' in act_record and \
1057                                 'pcu_ids' in act_record['plcnode'] and \
1058                                 len(act_record['plcnode']['pcu_ids']) > 0:
1059                                 print "setting 'pcu_id' for email_args %s"%email_args['hostname']
1060                                 email_args['pcu_id'] = act_record['plcnode']['pcu_ids'][0]
1061                         else:
1062                                 email_args['pcu_id'] = "-1"
1063
1064                         if 'ticket_id' in act_record:
1065                                 email_args['ticket_id'] = act_record['ticket_id']
1066
1067                 return email_args
1068
1069         def get_unique_issues(self, act_recordlist):
1070                 # NOTE: only send one email per site, per problem...
1071                 unique_issues = {}
1072                 for act_record in act_recordlist:
1073                         act_key = act_record['action'][0]
1074                         if act_key not in unique_issues:
1075                                 unique_issues[act_key] = []
1076
1077                         unique_issues[act_key] += [act_record]
1078
1079                 return unique_issues
1080
1081
1082         def __actOnSite(self, loginbase, site_record):
1083                 i_nodes_actedon = 0
1084                 i_nodes_emailed = 0
1085
1086                 act_recordlist = []
1087
1088                 for nodename in site_record['nodes'].keys():
1089                         diag_record = site_record['nodes'][nodename]
1090                         act_record  = self.__actOnNode(diag_record)
1091                         #print "nodename: %s %s" % (nodename, act_record)
1092                         if act_record is not None:
1093                                 act_recordlist += [act_record]
1094
1095                 unique_issues = self.get_unique_issues(act_recordlist)
1096
1097                 for issue in unique_issues.keys():
1098                         print "\tworking on issue: %s" % issue
1099                         issue_record_list = unique_issues[issue]
1100                         email_args = self.get_email_args(issue_record_list)
1101
1102                         # for each record.
1103                         for act_record in issue_record_list:
1104                                 # if there's a pcu record and email config is set
1105                                 if 'email_pcu' in act_record:
1106                                         if act_record['email_pcu'] and \
1107                                                 site_record['config']['email']:
1108
1109                                                 email_args['hostname'] = act_record['nodename']
1110                                                 ticket_id = self.__emailSite(loginbase,
1111                                                                                         act_record['email'],
1112                                                                                         emailTxt.mailtxt.pcudown[0],
1113                                                                                         email_args)
1114                                                 email_args['ticket_id'] = ticket_id
1115
1116
1117                         act_record = issue_record_list[0]
1118                         # send message before squeezing
1119                         print "\t\tconfig.email: %s and %s" % (act_record['message'] != None,
1120                                                                                                 site_record['config']['email'])
1121                         if act_record['message'] != None and site_record['config']['email']:
1122                                 ticket_id = self.__emailSite(loginbase, act_record['email'],
1123                                                                                          act_record['message'], email_args)
1124
1125                                 # Add ticket_id to ALL nodenames
1126                                 for act_record in issue_record_list:
1127                                         nodename = act_record['nodename']
1128                                         # update node record with RT ticket_id
1129                                         if nodename in self.act_all:
1130                                                 self.act_all[nodename][0]['ticket_id'] = "%s" % ticket_id
1131                                         if config.mail: i_nodes_emailed += 1
1132
1133                         print "\t\tconfig.squeeze: %s and %s" % (config.squeeze,
1134                                                                                                         site_record['config']['squeeze'])
1135                         if config.squeeze and site_record['config']['squeeze']:
1136                                 for act_key in act_record['action']:
1137                                         self.actions[act_key](email_args)
1138                                 i_nodes_actedon += 1
1139
1140                 if config.policysavedb:
1141                         print "Saving Databases... act_all, diagnose_out"
1142                         soltesz.dbDump("act_all", self.act_all)
1143                         # remove site record from diagnose_out, it's in act_all as done.
1144                         del self.diagnose_db[loginbase]
1145                         soltesz.dbDump("diagnose_out", self.diagnose_db)
1146
1147                 #print "sleeping for 1 sec"
1148                 #time.sleep(1)
1149                 print "Hit enter to continue..."
1150                 sys.stdout.flush()
1151                 line = sys.stdin.readline()
1152
1153                 return (i_nodes_actedon, i_nodes_emailed)
1154
1155         def __actOnNode(self, diag_record):
1156                 nodename = diag_record['nodename']
1157                 message = diag_record['message']
1158
1159                 act_record = {}
1160                 act_record.update(diag_record)
1161                 act_record['nodename'] = nodename
1162                 act_record['msg_format'] = self._format_diaginfo(diag_record)
1163                 print "act_record['stage'] == %s " % act_record['stage']
1164
1165                 # avoid end records, and nmreset records
1166                 # reboot_node_failed, is set below, so don't reboot repeatedly.
1167
1168                 if 'monitor-end-record' not in act_record['stage'] and \
1169                    'nmreset' not in act_record['stage'] and \
1170                    'reboot_node_failed' not in act_record:
1171
1172                         if "DOWN" in act_record['log'] and \
1173                                         'pcu_ids' in act_record['plcnode'] and \
1174                                         len(act_record['plcnode']['pcu_ids']) > 0:
1175
1176                                 print "%s" % act_record['log'],
1177                                 print "%15s" % (['reboot_node'],)
1178                                 # Set node to re-install
1179                                 plc.nodeBootState(act_record['nodename'], "rins")
1180                                 try:
1181                                         ret = reboot_node({'hostname': act_record['nodename']})
1182                                 except Exception, exc:
1183                                         print "exception on reboot_node:"
1184                                         import traceback
1185                                         print traceback.print_exc()
1186                                         ret = False
1187
1188                                 if ret: # and ( 'reboot_node_failed' not in act_record or act_record['reboot_node_failed'] == False):
1189                                         # Reboot Succeeded
1190                                         print "reboot succeeded for %s" % act_record['nodename']
1191                                         act_record2 = {}
1192                                         act_record2.update(act_record)
1193                                         act_record2['action'] = ['reboot_node']
1194                                         act_record2['stage'] = "reboot_node"
1195                                         act_record2['reboot_node_failed'] = False
1196                                         act_record2['email_pcu'] = False
1197
1198                                         if nodename not in self.act_all:
1199                                                 self.act_all[nodename] = []
1200                                         print "inserting 'reboot_node' record into act_all"
1201                                         self.act_all[nodename].insert(0,act_record2)
1202
1203                                         # return None to avoid further action
1204                                         print "Taking no further action"
1205                                         return None
1206                                 else:
1207                                         print "reboot failed for %s" % act_record['nodename']
1208                                         # set email_pcu to also send pcu notice for this record.
1209                                         act_record['reboot_node_failed'] = True
1210                                         act_record['email_pcu'] = True
1211
1212                         print "%s" % act_record['log'],
1213                         print "%15s" % act_record['action']
1214
1215                 if act_record['stage'] is not 'monitor-end-record' and \
1216                    act_record['stage'] is not 'nmreset':
1217                         if nodename not in self.act_all:
1218                                 self.act_all[nodename] = []
1219
1220                         self.act_all[nodename].insert(0,act_record)
1221                 else:
1222                         print "Not recording %s in act_all" % nodename
1223
1224                 return act_record
1225
1226         def analyseSites(self):
1227                 i_sites_observed = 0
1228                 i_sites_diagnosed = 0
1229                 i_nodes_diagnosed = 0
1230                 i_nodes_actedon = 0
1231                 i_sites_emailed = 0
1232                 l_allsites = []
1233
1234                 sorted_sites = self.sickdb.keys()
1235                 sorted_sites.sort()
1236                 for loginbase in sorted_sites:
1237                         site_record = self.sickdb[loginbase]
1238                         print "sites: %s" % loginbase
1239
1240                         i_nodes_diagnosed += len(site_record.keys())
1241                         i_sites_diagnosed += 1
1242
1243                         (na,ne) = self.__actOnSite(loginbase, site_record)
1244
1245                         i_sites_observed += 1
1246                         i_nodes_actedon += na
1247                         i_sites_emailed += ne
1248
1249                         l_allsites += [loginbase]
1250
1251                 return {'sites_observed': i_sites_observed,
1252                                 'sites_diagnosed': i_sites_diagnosed,
1253                                 'nodes_diagnosed': i_nodes_diagnosed,
1254                                 'sites_emailed': i_sites_emailed,
1255                                 'nodes_actedon': i_nodes_actedon,
1256                                 'allsites':l_allsites}
1257
1258         def print_stats(self, key, stats):
1259                 print "%20s : %d" % (key, stats[key])
1260
1261
1262
1263         #"""
1264         #Prints, logs, and emails status of up nodes, down nodes, and buckets.
1265         #"""
1266         #def status(self):
1267         #       sub = "Monitor Summary"
1268         #       msg = "\nThe following nodes were acted upon:  \n\n"
1269         #       for (node, (type, date)) in self.emailed.items():
1270         #               # Print only things acted on today.
1271         #               if (time.gmtime(time.time())[2] == time.gmtime(date)[2]):
1272         #                       msg +="%s\t(%s)\t%s\n" %(node, type, time.ctime(date))
1273         #       msg +="\n\nThe following sites have been 'squeezed':\n\n"
1274         #       for (loginbase, (date, type)) in self.squeezed.items():
1275         #               # Print only things acted on today.
1276         #               if (time.gmtime(time.time())[2] == time.gmtime(date)[2]):
1277         #                       msg +="%s\t(%s)\t%s\n" %(loginbase, type, time.ctime(date))
1278         #       mailer.email(sub, msg, [SUMTO])
1279         #       logger.info(msg)
1280         #       return
1281
1282         #"""
1283         #Store/Load state of emails.  When, where, what.
1284         #"""
1285         #def emailedStore(self, action):
1286         #       try:
1287         #               if action == "LOAD":
1288         #                       f = open(DAT, "r+")
1289         #                       logger.info("POLICY:  Found and reading " + DAT)
1290         #                       self.emailed.update(pickle.load(f))
1291         #               if action == "WRITE":
1292         #                       f = open(DAT, "w")
1293         #                       #logger.debug("Writing " + DAT)
1294         #                       pickle.dump(self.emailed, f)
1295         #               f.close()
1296         #       except Exception, err:
1297         #               logger.info("POLICY:  Problem with DAT, %s" %err)
1298
1299
1300 #class Policy(Thread):
1301
1302 def main():
1303         print "policy.py is a module, not a script for running directly."
1304
1305 if __name__ == '__main__':
1306         import os
1307         import plc
1308         try:
1309                 main()
1310         except KeyboardInterrupt:
1311                 print "Killed.  Exitting."
1312                 logger.info('Monitor Killed')
1313                 os._exit(0)