policy.py

   1 #
   2 # Copyright (c) 2004  The Trustees of Princeton University (Trustees).
   3 #
   4 # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
   5 #
   6 # $Id: policy.py,v 1.17 2007/08/29 17:26:50 soltesz Exp $
   7 #
   8 # Policy Engine.
   9
  10 #from monitor import *
  11 from threading import *
  12 import time
  13 import logging
  14 import mailer
  15 import emailTxt
  16 import pickle
  17 import Queue
  18 import plc
  19 import sys
  20 import os
  21 import reboot
  22 import soltesz
  23 import string
  24 from www.printbadnodes import cmpCategoryVal
  25 from config import config
  26 print "policy"
  27 config = config()
  28
# Data file name; it is not referenced in the part of this file visible here.
DAT="./monitor.dat"

# Shared logger; everything in this module logs under the "monitor" name.
logger = logging.getLogger("monitor")

# Time to enforce policy
# NOTE(review): presumably seconds (7200 s = 2 hours) -- confirm against the caller.
POLSLEEP = 7200

# Where to email the summary
SUMTO = "soltesz@cs.princeton.edu"
# Address templates with one %s placeholder each.
# NOTE(review): presumably filled with a site loginbase (TECH/PI) or a slice
# name (SLICE) -- confirm where they are formatted.
TECHEMAIL="tech-%s@sites.planet-lab.org"
PIEMAIL="pi-%s@sites.planet-lab.org"
SLICEMAIL="%s@slices.planet-lab.org"
PLCEMAIL="support@planet-lab.org"

# Thresholds: written as a number of days, stored in seconds (days * SPERDAY)
SPERMIN = 60
SPERHOUR = 60*60
SPERDAY = 86400
PITHRESH = 7 * SPERDAY
SLICETHRESH = 7 * SPERDAY
# Days before attempting rins again
RINSTHRESH = 5 * SPERDAY

# Days before calling the node dead.
DEADTHRESH = 30 * SPERDAY
# Minimum number of nodes up before squeezing
MINUP = 2

# Contact roles.  The values are distinct powers of two.
# NOTE(review): presumably meant to be combined as bit flags -- confirm.
TECH=1
PI=2
USER=4
ADMIN=8
  61
  62 # IF:
  63 #  no SSH, down.
  64 #  bad disk, down
  65 #  DNS, kinda down (sick)
  66 #  clock, kinda down (sick)
  67 #  Full disk, going to be down
  68
  69 # Actions:
  70 #  Email
  71 #  suspend slice creation
  72 #  kill slices
  73 def array_to_priority_map(array):
  74         """ Create a mapping where each entry of array is given a priority equal
  75         to its position in the array.  This is useful for subsequent use in the
  76         cmpMap() function."""
  77         map = {}
  78         count = 0
  79         for i in array:
  80                 map[i] = count
  81                 count += 1
  82         return map
  83
  84 def getdebug():
  85         return config.debug
  86
  87 def print_stats(key, stats):
  88         if key in stats: print "%20s : %d" % (key, stats[key])
  89
  90 class Merge(Thread):
  91         def __init__(self, l_merge, toRT):
  92                 self.toRT = toRT
  93                 self.merge_list = l_merge
  94                 # the hostname to loginbase mapping
  95                 self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")
  96
  97                 # Previous actions taken on nodes.
  98                 self.act_all = soltesz.if_cached_else(1, "act_all", lambda : {})
  99                 self.findbad = soltesz.if_cached_else(1, "findbad", lambda : {})
 100
 101                 self.cache_all = soltesz.if_cached_else(1, "act_all", lambda : {})
 102                 self.sickdb = {}
 103                 self.mergedb = {}
 104                 Thread.__init__(self)
 105
 106         def run(self):
 107                 # populate sickdb
 108                 self.accumSickSites()
 109                 # read data from findbad and act_all
 110                 self.mergeActionsAndBadDB()
 111                 # pass node_records to RT
 112                 self.sendToRT()
 113
 114         def accumSickSites(self):
 115                 """
 116                 Take all nodes, from l_diagnose, look them up in the act_all database,
 117                 and insert them into sickdb[] as:
 118
 119                         sickdb[loginbase][nodename] = fb_record
 120                 """
 121                 # look at all problems reported by findbad
 122                 l_nodes = self.findbad['nodes'].keys()
 123                 count = 0
 124                 for nodename in l_nodes:
 125                         if nodename not in self.merge_list:
 126                                 continue                # skip this node, since it's not wanted
 127
 128                         count += 1
 129                         loginbase = self.plcdb_hn2lb[nodename]
 130                         values = self.findbad['nodes'][nodename]['values']
 131
 132                         fb_record = {}
 133                         fb_record['nodename'] = nodename
 134                         try:
 135                                 fb_record['category'] = values['category']
 136                         except:
 137                                 print values
 138                                 print nodename
 139                                 print self.findbad['nodes'][nodename]
 140                                 count -= 1
 141                                 continue
 142                         fb_record['state'] = values['state']
 143                         fb_record['comonstats'] = values['comonstats']
 144                         fb_record['plcnode'] = values['plcnode']
 145                         fb_record['kernel'] = self.getKernel(values['kernel'])
 146                         fb_record['stage'] = "findbad"
 147                         fb_record['message'] = None
 148                         fb_record['bootcd'] = values['bootcd']
 149                         fb_record['args'] = None
 150                         fb_record['info'] = None
 151                         fb_record['time'] = time.time()
 152                         fb_record['date_created'] = time.time()
 153
 154                         if loginbase not in self.sickdb:
 155                                 self.sickdb[loginbase] = {}
 156
 157                         self.sickdb[loginbase][nodename] = fb_record
 158
 159                 print "Found %d nodes" % count
 160
 161         def getKernel(self, unamestr):
 162                 s = unamestr.split()
 163                 if len(s) > 2:
 164                         return s[2]
 165                 else:
 166                         return ""
 167
 168         def mergeActionsAndBadDB(self):
 169                 """
 170                 - Look at the sick node_records as reported in findbad,
 171                 - Then look at the node_records in act_all.
 172
 173                 There are four cases:
 174                 1) Problem in findbad, no problem in act_all
 175                         this ok, b/c it just means it's a new problem
 176                 2) Problem in findbad, problem in act_all
 177                         -Did the problem get better or worse?
 178                                 -If Same, or Worse, then continue looking for open tickets.
 179                                 -If Better, or No problem, then "back-off" penalties.
 180                                         This judgement may need to wait until 'Diagnose()'
 181
 182                 3) No problem in findbad, problem in act_all
 183                         The the node is operational again according to Findbad()
 184
 185                 4) No problem in findbad, no problem in act_all
 186                         There won't be a record in either db, so there's no code.
 187                 """
 188
 189                 sorted_sites = self.sickdb.keys()
 190                 sorted_sites.sort()
 191                 # look at all problems reported by findbad
 192                 for loginbase in sorted_sites:
 193                         d_fb_nodes = self.sickdb[loginbase]
 194                         sorted_nodes = d_fb_nodes.keys()
 195                         sorted_nodes.sort()
 196                         for nodename in sorted_nodes:
 197                                 fb_record = self.sickdb[loginbase][nodename]
 198                                 x = fb_record
 199                                 if loginbase not in self.mergedb:
 200                                         self.mergedb[loginbase] = {}
 201
 202                                 # We must compare findbad state with act_all state
 203                                 if nodename not in self.act_all:
 204                                         # 1) ok, b/c it's a new problem. set ticket_id to null
 205                                         self.mergedb[loginbase][nodename] = {}
 206                                         self.mergedb[loginbase][nodename].update(x)
 207                                         self.mergedb[loginbase][nodename]['ticket_id'] = ""
 208                                         self.mergedb[loginbase][nodename]['prev_category'] = "NORECORD"
 209                                 else:
 210                                         if len(self.act_all[nodename]) == 0:
 211                                                 print "len(act_all[%s]) == 0, skipping %s %s" % (nodename, loginbase, nodename)
 212                                                 continue
 213
 214                                         y = self.act_all[nodename][0]
 215
 216                                         ## skip if end-stage
 217                                         #if 'stage' in y and "monitor-end-record" in y['stage']:
 218                                         #       # 1) ok, b/c it's a new problem. set ticket_id to null
 219                                         ##      self.mergedb[loginbase][nodename] = {}
 220                                         #       self.mergedb[loginbase][nodename].update(x)
 221                                         #       self.mergedb[loginbase][nodename]['ticket_id'] = ""
 222                                         #       self.mergedb[loginbase][nodename]['prev_category'] = None
 223                                         #       continue
 224
 225                                         ## for legacy actions
 226                                         #if 'bucket' in y and y['bucket'][0] == 'dbg':
 227                                         #       # Only bootcd debugs made it to the act_all db.
 228                                         #       y['prev_category'] = "OLDBOOTCD"
 229                                         #elif 'bucket' in y and y['bucket'][0] == 'down':
 230                                         #       y['prev_category'] = "ERROR"
 231                                         #elif 'bucket' not in y:
 232                                         #       # for all other actions, just carry over the
 233                                         #       # previous category
 234                                         #       y['prev_category'] = y['category']
 235                                         #else:
 236                                         #       print "UNKNOWN state for record: %s" % y
 237                                         #       sys.exit(1)
 238
 239                                         # determine through translation, if the buckets match
 240                                         #if 'category' in y and x['category'] == y['category']:
 241                                         #       b_match = True
 242                                         #elif x['category'] == "OLDBOOTCD" and y['bucket'][0] == 'dbg':
 243                                         #       b_match = True
 244                                         #elif x['category'] == "ERROR" and y['bucket'][0] == 'down':
 245                                         #       b_match = True
 246                                         #else:
 247                                         #       b_match = False
 248
 249                                         #if b_match:
 250                                         #       # 2b) ok, b/c they agree that there's still a problem..
 251                                         #       # 2b) Comon & Monitor still agree; RT ticket?
 252                                         #else:
 253                                         #       # 2a) mismatch, need a policy for how to resolve
 254                                         #       #     resolution will be handled in __diagnoseNode()
 255                                         #       #         for now just record the two categories.
 256                                         #       #if x['category'] == "PROD" and x['state'] == "BOOT" and \
 257                                         #       # ( y['bucket'][0] == 'down' or  y['bucket'][0] == 'dbg'):
 258                                         #       print "FINDBAD and MONITOR have a mismatch: %s vs %s" % \
 259                                         #                               (x['category'], y['bucket'])
 260
 261                                         y['prev_category'] = y['category']
 262                                         self.mergedb[loginbase][nodename] = {}
 263                                         self.mergedb[loginbase][nodename].update(y)
 264                                         self.mergedb[loginbase][nodename]['comonstats'] = x['comonstats']
 265                                         self.mergedb[loginbase][nodename]['category']   = x['category']
 266                                         self.mergedb[loginbase][nodename]['state'] = x['state']
 267                                         self.mergedb[loginbase][nodename]['kernel']=x['kernel']
 268                                         self.mergedb[loginbase][nodename]['bootcd']=x['bootcd']
 269                                         self.mergedb[loginbase][nodename]['plcnode']=x['plcnode']
 270                                         # delete the entry from cache_all to keep it out of case 3)
 271                                         del self.cache_all[nodename]
 272
 273                 # 3) nodes that remin in cache_all were not identified by findbad.
 274                 #        Do we keep them or not?
 275                 #   NOTE: i think that since the categories are performed before this
 276                 #               step now, and by a monitor-controlled agent.
 277
 278                 # TODO: This does not work correctly.  Do we need this?
 279                 #for hn in self.cache_all.keys():
 280                 #       y = self.act_all[hn][0]
 281                 #       if 'monitor' in y['bucket']:
 282                 #               loginbase = self.plcdb_hn2lb[hn]
 283                 #               if loginbase not in self.sickdb:
 284                 #                       self.sickdb[loginbase] = {}
 285                 #               self.sickdb[loginbase][hn] = y
 286                 #       else:
 287                 #               del self.cache_all[hn]
 288
 289                 print "len of cache_all: %d" % len(self.cache_all.keys())
 290                 return
 291
 292         def sendToRT(self):
 293                 sorted_sites = self.mergedb.keys()
 294                 sorted_sites.sort()
 295                 # look at all problems reported by merge
 296                 for loginbase in sorted_sites:
 297                         d_merge_nodes = self.mergedb[loginbase]
 298                         for nodename in d_merge_nodes.keys():
 299                                 record = self.mergedb[loginbase][nodename]
 300                                 self.toRT.put(record)
 301
 302                 # send signal to stop reading
 303                 self.toRT.put(None)
 304                 return
 305
 306 class Diagnose(Thread):
 307         def __init__(self, fromRT):
 308                 self.fromRT = fromRT
 309                 self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")
 310                 self.findbad = soltesz.if_cached_else(1, "findbad", lambda : {})
 311
 312                 self.diagnose_in = {}
 313                 self.diagnose_out = {}
 314                 Thread.__init__(self)
 315
 316
 317         def run(self):
 318                 self.accumSickSites()
 319
 320                 print "Accumulated %d sick sites" % len(self.diagnose_in.keys())
 321                 logger.debug("Accumulated %d sick sites" % len(self.diagnose_in.keys()))
 322
 323                 try:
 324                         stats = self.diagnoseAll()
 325                 except Exception, err:
 326                         print "----------------"
 327                         import traceback
 328                         print traceback.print_exc()
 329                         print err
 330                         #if config.policysavedb:
 331                         sys.exit(1)
 332
 333                 print_stats("sites_observed", stats)
 334                 print_stats("sites_diagnosed", stats)
 335                 print_stats("nodes_diagnosed", stats)
 336
 337                 if config.policysavedb:
 338                         print "Saving Databases... diagnose_out"
 339                         soltesz.dbDump("diagnose_out", self.diagnose_out)
 340
 341         def accumSickSites(self):
 342                 """
 343                 Take all nodes, from l_diagnose, look them up in the diagnose_out database,
 344                 and insert them into diagnose_in[] as:
 345
 346                         diagnose_in[loginbase] = [diag_node1, diag_node2, ...]
 347                 """
 348                 while 1:
 349                         node_record = self.fromRT.get(block = True)
 350                         if node_record == None:
 351                                 break;
 352
 353                         nodename = node_record['nodename']
 354                         loginbase = self.plcdb_hn2lb[nodename]
 355
 356                         if loginbase not in self.diagnose_in:
 357                                 self.diagnose_in[loginbase] = {}
 358
 359                         self.diagnose_in[loginbase][nodename] = node_record
 360
 361                 return
 362
 363         def diagnoseAll(self):
 364                 i_sites_observed = 0
 365                 i_sites_diagnosed = 0
 366                 i_nodes_diagnosed = 0
 367                 i_nodes_actedon = 0
 368                 i_sites_emailed = 0
 369                 l_allsites = []
 370
 371                 sorted_sites = self.diagnose_in.keys()
 372                 sorted_sites.sort()
 373                 self.diagnose_out= {}
 374                 for loginbase in sorted_sites:
 375                         l_allsites += [loginbase]
 376
 377                         d_diag_nodes = self.diagnose_in[loginbase]
 378                         d_act_records = self.__diagnoseSite(loginbase, d_diag_nodes)
 379                         # store records in diagnose_out, for saving later.
 380                         self.diagnose_out.update(d_act_records)
 381
 382                         if len(d_act_records[loginbase]['nodes'].keys()) > 0:
 383                                 i_nodes_diagnosed += (len(d_act_records[loginbase]['nodes'].keys()))
 384                                 i_sites_diagnosed += 1
 385                         i_sites_observed += 1
 386
 387                 return {'sites_observed': i_sites_observed,
 388                                 'sites_diagnosed': i_sites_diagnosed,
 389                                 'nodes_diagnosed': i_nodes_diagnosed,
 390                                 'allsites':l_allsites}
 391
 392                 pass
 393
 394         def __getDaysDown(self, diag_record, nodename):
 395                 daysdown = -1
 396                 if diag_record['comonstats']['sshstatus'] != "null":
 397                         daysdown = int(diag_record['comonstats']['sshstatus']) // (60*60*24)
 398                 elif diag_record['comonstats']['lastcotop'] != "null":
 399                         daysdown = int(diag_record['comonstats']['lastcotop']) // (60*60*24)
 400                 else:
 401                         now = time.time()
 402                         last_contact = diag_record['plcnode']['last_contact']
 403                         if last_contact == None:
 404                                 # the node has never been up, so give it a break
 405                                 daysdown = -1
 406                         else:
 407                                 diff = now - last_contact
 408                                 daysdown = diff // (60*60*24)
 409                 return daysdown
 410
 411         def __getStrDaysDown(self, diag_record, nodename):
 412                 daysdown = self.__getDaysDown(diag_record, nodename)
 413                 if daysdown > 0:
 414                         return "(%d days down)"%daysdown
 415                 else:
 416                         return "Unknown number of days"
 417
 418         def __getCDVersion(self, diag_record, nodename):
 419                 cdversion = ""
 420                 #print "Getting kernel for: %s" % diag_record['nodename']
 421                 cdversion = diag_record['kernel']
 422                 return cdversion
 423
 424         def __diagnoseSite(self, loginbase, d_diag_nodes):
 425                 """
 426                 d_diag_nodes are diagnose_in entries.
 427                 """
 428                 d_diag_site = {loginbase : { 'config' :
 429                                                                                                 {'squeeze': False,
 430                                                                                                  'email': False
 431                                                                                                 },
 432                                                                         'nodes': {}
 433                                                                         }
 434                                            }
 435                 sorted_nodes = d_diag_nodes.keys()
 436                 sorted_nodes.sort()
 437                 for nodename in sorted_nodes:
 438                         node_record = d_diag_nodes[nodename]
 439                         diag_record = self.__diagnoseNode(loginbase, node_record)
 440
 441                         if diag_record != None:
 442                                 d_diag_site[loginbase]['nodes'][nodename] = diag_record
 443
 444                                 # NOTE: improvement means, we need to act/squeeze and email.
 445                                 #print "DIAG_RECORD", diag_record
 446                                 if 'monitor-end-record' in diag_record['stage'] or \
 447                                    'nmreset' in diag_record['stage']:
 448                                 #       print "resetting loginbase!"
 449                                         d_diag_site[loginbase]['config']['squeeze'] = True
 450                                         d_diag_site[loginbase]['config']['email'] = True
 451                                 #else:
 452                                 #       print "NO IMPROVEMENT!!!!"
 453                         else:
 454                                 pass # there is nothing to do for this node.
 455
 456                 # NOTE: these settings can be overridden by command line arguments,
 457                 #       or the state of a record, i.e. if already in RT's Support Queue.
 458                 nodes_up = self.getUpAtSite(loginbase, d_diag_site)
 459                 if nodes_up < MINUP:
 460                         d_diag_site[loginbase]['config']['squeeze'] = True
 461
 462                 max_slices = self.getMaxSlices(loginbase)
 463                 num_nodes = self.getNumNodes(loginbase)
 464                 # NOTE: when max_slices == 0, this is either a new site (the old way)
 465                 #       or an old disabled site from previous monitor (before site['enabled'])
 466                 if nodes_up < num_nodes and max_slices != 0:
 467                         d_diag_site[loginbase]['config']['email'] = True
 468
 469                 if len(d_diag_site[loginbase]['nodes'].keys()) > 0:
 470                         print "SITE: %20s : %d nodes up, at most" % (loginbase, nodes_up)
 471
 472                 return d_diag_site
 473
 474         def diagRecordByCategory(self, node_record):
 475                 nodename = node_record['nodename']
 476                 category = node_record['category']
 477                 state    = node_record['state']
 478                 loginbase = self.plcdb_hn2lb[nodename]
 479                 diag_record = None
 480
 481                 if  "ERROR" in category:        # i.e. "DOWN"
 482                         diag_record = {}
 483                         diag_record.update(node_record)
 484                         daysdown = self.__getDaysDown(diag_record, nodename)
 485                         if daysdown < 7:
 486                                 format = "DIAG: %20s : %-40s Down only %s days  NOTHING DONE"
 487                                 print format % (loginbase, nodename, daysdown)
 488                                 return None
 489
 490                         s_daysdown = self.__getStrDaysDown(diag_record, nodename)
 491                         diag_record['message'] = emailTxt.mailtxt.newdown
 492                         diag_record['args'] = {'nodename': nodename}
 493                         diag_record['info'] = (nodename, s_daysdown, "")
 494
 495                         if 'reboot_node_failed' in node_record:
 496                                 # there was a previous attempt to use the PCU.
 497                                 if node_record['reboot_node_failed'] == False:
 498                                         # then the last attempt apparently, succeeded.
 499                                         # But, the category is still 'ERROR'.  Therefore, the
 500                                         # PCU-to-Node mapping is broken.
 501                                         #print "Setting message for ERROR node to PCU2NodeMapping: %s" % nodename
 502                                         diag_record['message'] = emailTxt.mailtxt.pcutonodemapping
 503                                         diag_record['email_pcu'] = True
 504
 505                         if diag_record['ticket_id'] == "":
 506                                 diag_record['log'] = "DOWN: %20s : %-40s == %20s %s" % \
 507                                         (loginbase, nodename, diag_record['info'][1:], diag_record['found_rt_ticket'])
 508                         else:
 509                                 diag_record['log'] = "DOWN: %20s : %-40s == %20s %s" % \
 510                                         (loginbase, nodename, diag_record['info'][1:], diag_record['ticket_id'])
 511
 512                 elif "OLDBOOTCD" in category:
 513                         # V2 boot cds as determined by findbad
 514                         s_daysdown = self.__getStrDaysDown(node_record, nodename)
 515                         s_cdversion = self.__getCDVersion(node_record, nodename)
 516                         diag_record = {}
 517                         diag_record.update(node_record)
 518                         #if "2.4" in diag_record['kernel'] or "v2" in diag_record['bootcd']:
 519                         diag_record['message'] = emailTxt.mailtxt.newbootcd
 520                         diag_record['args'] = {'nodename': nodename}
 521                         diag_record['info'] = (nodename, s_daysdown, s_cdversion)
 522                         if diag_record['ticket_id'] == "":
 523                                 diag_record['log'] = "BTCD: %20s : %-40s == %20s %20s %s" % \
 524                                                                         (loginbase, nodename, diag_record['kernel'],
 525                                                                          diag_record['bootcd'], diag_record['found_rt_ticket'])
 526                         else:
 527                                 diag_record['log'] = "BTCD: %20s : %-40s == %20s %20s %s" % \
 528                                                                         (loginbase, nodename, diag_record['kernel'],
 529                                                                          diag_record['bootcd'], diag_record['ticket_id'])
 530
 531                 elif "PROD" in category:
 532                         if "DEBUG" in state:
 533                                 # Not sure what to do with these yet.  Probably need to
 534                                 # reboot, and email.
 535                                 print "DEBG: %20s : %-40s  NOTHING DONE" % (loginbase, nodename)
 536                                 return None
 537                         elif "BOOT" in state:
 538                                 # no action needed.
 539                                 # TODO: remove penalties, if any are applied.
 540                                 now = time.time()
 541                                 last_contact = node_record['plcnode']['last_contact']
 542                                 if last_contact == None:
 543                                         time_diff = 0
 544                                 else:
 545                                         time_diff = now - last_contact;
 546
 547                                 if 'improvement' in node_record['stage']:
 548                                         # then we need to pass this on to 'action'
 549                                         diag_record = {}
 550                                         diag_record.update(node_record)
 551                                         diag_record['message'] = emailTxt.mailtxt.newthankyou
 552                                         diag_record['args'] = {'nodename': nodename}
 553                                         diag_record['info'] = (nodename, node_record['prev_category'],
 554                                                                                                          node_record['category'])
 555                                         if 'email_pcu' in diag_record:
 556                                                 if diag_record['email_pcu']:
 557                                                         # previously, the pcu failed to reboot, so send
 558                                                         # email. Now, reset these values to try the reboot
 559                                                         # again.
 560                                                         diag_record['email_pcu'] = False
 561                                                         del diag_record['reboot_node_failed']
 562
 563                                         if diag_record['ticket_id'] == "":
 564                                                 diag_record['log'] = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
 565                                                                         (loginbase, nodename, diag_record['stage'],
 566                                                                          state, category, diag_record['found_rt_ticket'])
 567                                         else:
 568                                                 diag_record['log'] = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
 569                                                                         (loginbase, nodename, diag_record['stage'],
 570                                                                          state, category, diag_record['ticket_id'])
 571                                         return diag_record
 572                                 elif time_diff >= 6*SPERHOUR:
 573                                         # heartbeat is older than 30 min.
 574                                         # then reset NM.
 575                                         #print "Possible NM problem!! %s - %s = %s" % (now, last_contact, time_diff)
 576                                         diag_record = {}
 577                                         diag_record.update(node_record)
 578                                         diag_record['message'] = emailTxt.mailtxt.NMReset
 579                                         diag_record['args'] = {'nodename': nodename}
 580                                         diag_record['stage'] = "nmreset"
 581                                         diag_record['info'] = (nodename,
 582                                                                                         node_record['prev_category'],
 583                                                                                         node_record['category'])
 584                                         if diag_record['ticket_id'] == "":
 585                                                 diag_record['log'] = "NM  : %20s : %-40s == %20s %20s %s %s" % \
 586                                                                         (loginbase, nodename, diag_record['stage'],
 587                                                                          state, category, diag_record['found_rt_ticket'])
 588                                         else:
 589                                                 diag_record['log'] = "NM  : %20s : %-40s == %20s" % \
 590                                                                         (loginbase, nodename, diag_record['stage'])
 591
 592                                         return diag_record
 593                                 else:
 594                                         return None
 595                         else:
 596                                 # unknown
 597                                 pass
 598                 elif "ALPHA"    in category:
 599                         pass
 600                 elif "clock_drift" in category:
 601                         pass
 602                 elif "dns"    in category:
 603                         pass
 604                 elif "filerw"    in category:
 605                         pass
 606                 else:
 607                         print "Unknown category!!!! %s" % category
 608                         sys.exit(1)
 609
 610                 return diag_record
 611
 612         def __diagnoseNode(self, loginbase, node_record):
 613                 # TODO: change the format of the hostname in this
 614                 #               record to something more natural.
 615                 nodename                = node_record['nodename']
 616                 category                = node_record['category']
 617                 prev_category   = node_record['prev_category']
 618                 state                   = node_record['state']
 619                 #if 'prev_category' in node_record:
 620                 #       prev_category = node_record['prev_category']
 621                 #else:
 622                 #       prev_category = "ERROR"
 623                 if node_record['prev_category'] != "NORECORD":
 624
 625                         val = cmpCategoryVal(category, prev_category)
 626                         print "%s went from %s -> %s" % (nodename, prev_category, category)
 627                         if val == 1:
 628                                 # improved
 629                                 if node_record['ticket_id'] == "" or node_record['ticket_id'] == None:
 630                                         print "closing record with no ticket: ", node_record['nodename']
 631                                         node_record['action'] = ['close_rt']
 632                                         node_record['message'] = None
 633                                         node_record['stage'] = 'monitor-end-record'
 634                                         return node_record
 635                                 else:
 636                                         node_record['stage'] = 'improvement'
 637
 638                                 #if 'monitor-end-record' in node_record['stage']:
 639                                 #       # just ignore it if it's already ended.
 640                                 #       # otherwise, the status should be worse, and we won't get
 641                                 #       # here.
 642                                 #       print "monitor-end-record: ignoring ", node_record['nodename']
 643                                 #       return None
 644 #
 645 #                                       #return None
 646                         elif val == -1:
 647                                 # current category is worse than previous, carry on
 648                                 pass
 649                         else:
 650                                 #values are equal, carry on.
 651                                 #print "why are we here?"
 652                                 pass
 653
 654                 #### COMPARE category and prev_category
 655                 # if not_equal
 656                 #       then assign a stage based on relative priorities
 657                 # else equal
 658                 #       then check category for stats.
 659                 diag_record = self.diagRecordByCategory(node_record)
 660                 if diag_record == None:
 661                         #print "diag_record == None"
 662                         return None
 663
 664                 #### found_RT_ticket
 665                 # TODO: need to record time found, and maybe add a stage for acting on it...
 666                 if 'found_rt_ticket' in diag_record and \
 667                         diag_record['found_rt_ticket'] is not None:
 668                         if diag_record['stage'] is not 'improvement':
 669                                 diag_record['stage'] = 'ticket_waitforever'
 670
 671                 current_time = time.time()
 672                 # take off four days, for the delay that database caused.
 673                 # TODO: generalize delays at PLC, and prevent enforcement when there
 674                 #               have been no emails.
 675                 # NOTE: 7*SPERDAY exists to offset the 'bad week'
 676                 #delta = current_time - diag_record['time'] - 7*SPERDAY
 677                 delta = current_time - diag_record['time']
 678
 679                 message = diag_record['message']
 680                 act_record = {}
 681                 act_record.update(diag_record)
 682
 683                 #### DIAGNOSE STAGES
 684                 if   'findbad' in diag_record['stage']:
 685                         # The node is bad, and there's no previous record of it.
 686                         act_record['email'] = TECH
 687                         act_record['action'] = ['noop']
 688                         act_record['message'] = message[0]
 689                         act_record['stage'] = 'stage_actinoneweek'
 690
 691                 elif 'nmreset' in diag_record['stage']:
 692                         act_record['email']  = ADMIN
 693                         act_record['action'] = ['reset_nodemanager']
 694                         act_record['message'] = message[0]
 695                         act_record['stage']  = 'nmreset'
 696                         return None
 697
 698                 elif 'reboot_node' in diag_record['stage']:
 699                         act_record['email'] = TECH
 700                         act_record['action'] = ['noop']
 701                         act_record['message'] = message[0]
 702                         act_record['stage'] = 'stage_actinoneweek'
 703
 704                 elif 'improvement' in diag_record['stage']:
 705                         # - backoff previous squeeze actions (slice suspend, nocreate)
 706                         # TODO: add a backoff_squeeze section... Needs to runthrough
 707                         act_record['action'] = ['close_rt']
 708                         act_record['message'] = message[0]
 709                         act_record['stage'] = 'monitor-end-record'
 710
 711                 elif 'actinoneweek' in diag_record['stage']:
 712                         if delta >= 7 * SPERDAY:
 713                                 act_record['email'] = TECH | PI
 714                                 act_record['stage'] = 'stage_actintwoweeks'
 715                                 act_record['message'] = message[1]
 716                                 act_record['action'] = ['nocreate' ]
 717                                 act_record['time'] = current_time               # reset clock for waitforever
 718                         elif delta >= 3* SPERDAY and not 'second-mail-at-oneweek' in act_record:
 719                                 act_record['email'] = TECH
 720                                 act_record['message'] = message[0]
 721                                 act_record['action'] = ['sendmailagain-waitforoneweekaction' ]
 722                                 act_record['second-mail-at-oneweek'] = True
 723                         else:
 724                                 act_record['message'] = None
 725                                 act_record['action'] = ['waitforoneweekaction' ]
 726                                 print "ignoring this record for: %s" % act_record['nodename']
 727                                 return None                     # don't send if there's no action
 728
 729                 elif 'actintwoweeks' in diag_record['stage']:
 730                         if delta >= 7 * SPERDAY:
 731                                 act_record['email'] = TECH | PI | USER
 732                                 act_record['stage'] = 'stage_waitforever'
 733                                 act_record['message'] = message[2]
 734                                 act_record['action'] = ['suspendslices']
 735                                 act_record['time'] = current_time               # reset clock for waitforever
 736                         elif delta >= 3* SPERDAY and not 'second-mail-at-twoweeks' in act_record:
 737                                 act_record['email'] = TECH | PI
 738                                 act_record['message'] = message[1]
 739                                 act_record['action'] = ['sendmailagain-waitfortwoweeksaction' ]
 740                                 act_record['second-mail-at-twoweeks'] = True
 741                         else:
 742                                 act_record['message'] = None
 743                                 act_record['action'] = ['waitfortwoweeksaction']
 744                                 return None                     # don't send if there's no action
 745
 746                 elif 'ticket_waitforever' in diag_record['stage']:
 747                         act_record['email'] = TECH
 748                         if 'first-found' not in act_record:
 749                                 act_record['first-found'] = True
 750                                 act_record['log'] += " firstfound"
 751                                 act_record['action'] = ['ticket_waitforever']
 752                                 act_record['message'] = None
 753                                 act_record['time'] = current_time
 754                         else:
 755                                 if delta >= 7*SPERDAY:
 756                                         act_record['action'] = ['ticket_waitforever']
 757                                         act_record['message'] = None
 758                                         act_record['time'] = current_time               # reset clock
 759                                 else:
 760                                         act_record['action'] = ['ticket_waitforever']
 761                                         act_record['message'] = None
 762                                         return None
 763
 764                 elif 'waitforever' in diag_record['stage']:
 765                         # more than 3 days since last action
 766                         # TODO: send only on weekdays.
 767                         # NOTE: expects that 'time' has been reset before entering waitforever stage
 768                         if delta >= 3*SPERDAY:
 769                                 act_record['action'] = ['email-againwaitforever']
 770                                 act_record['message'] = message[2]
 771                                 act_record['time'] = current_time               # reset clock
 772                         else:
 773                                 act_record['action'] = ['waitforever']
 774                                 act_record['message'] = None
 775                                 return None                     # don't send if there's no action
 776
 777                 else:
 778                         # There is no action to be taken, possibly b/c the stage has
 779                         # already been performed, but diagnose picked it up again.
 780                         # two cases,
 781                         #       1. stage is unknown, or
 782                         #       2. delta is not big enough to bump it to the next stage.
 783                         # TODO: figure out which. for now assume 2.
 784                         print "UNKNOWN stage for %s; nothing done" % nodename
 785                         act_record['action'] = ['unknown']
 786                         act_record['message'] = message[0]
 787
 788                         act_record['email'] = TECH
 789                         act_record['action'] = ['noop']
 790                         act_record['message'] = message[0]
 791                         act_record['stage'] = 'stage_actinoneweek'
 792                         act_record['time'] = current_time               # reset clock
 793                         #print "Exiting..."
 794                         #return None
 795                         #sys.exit(1)
 796
 797                 print "%s" % act_record['log'],
 798                 print "%15s" % act_record['action']
 799                 return act_record
 800
 801         def getMaxSlices(self, loginbase):
 802                 # if sickdb has a loginbase, then it will have at least one node.
 803                 site_stats = None
 804
 805                 for nodename in self.diagnose_in[loginbase].keys():
 806                         if nodename in self.findbad['nodes']:
 807                                 site_stats = self.findbad['nodes'][nodename]['values']['plcsite']
 808                                 break
 809
 810                 if site_stats == None:
 811                         raise Exception, "loginbase with no nodes in findbad"
 812                 else:
 813                         return site_stats['max_slices']
 814
 815         def getNumNodes(self, loginbase):
 816                 # if sickdb has a loginbase, then it will have at least one node.
 817                 site_stats = None
 818
 819                 for nodename in self.diagnose_in[loginbase].keys():
 820                         if nodename in self.findbad['nodes']:
 821                                 site_stats = self.findbad['nodes'][nodename]['values']['plcsite']
 822                                 break
 823
 824                 if site_stats == None:
 825                         raise Exception, "loginbase with no nodes in findbad"
 826                 else:
 827                         return site_stats['num_nodes']
 828
        """
        Returns the number of up nodes at a site: the site's total node count
        minus every node in d_diag_site whose stage is not 'monitor-end-record'.
        """
 833         def getUpAtSite(self, loginbase, d_diag_site):
 834                 # TODO: THIS DOESN"T WORK!!! it misses all the 'debug' state nodes
 835                 #               that aren't recorded yet.
 836
 837                 numnodes = self.getNumNodes(loginbase)
 838                 # NOTE: assume nodes we have no record of are ok. (too conservative)
 839                 # TODO: make the 'up' value more representative
 840                 up = numnodes
 841                 for nodename in d_diag_site[loginbase]['nodes'].keys():
 842
 843                         rec = d_diag_site[loginbase]['nodes'][nodename]
 844                         if rec['stage'] != 'monitor-end-record':
 845                                 up -= 1
 846                         else:
 847                                 pass # the node is assumed to be up.
 848
 849                 #if up != numnodes:
 850                 #       print "ERROR: %s total nodes up and down != %d" % (loginbase, numnodes)
 851
 852                 return up
 853
 854
 855 class SiteAction:
 856         def __init__(self, parameter_names=['hostname', 'ticket_id']):
 857                 self.parameter_names = parameter_names
 858         def checkParam(self, args):
 859                 for param in self.parameter_names:
 860                         if param not in args:
 861                                 raise Exception("Parameter %s not provided in args"%param)
 862         def run(self, args):
 863                 self.checkParam(args)
 864                 return self._run(args)
 865         def _run(self, args):
 866                 pass
 867
 868 class SuspendAction(SiteAction):
 869         def _run(self, args):
 870                 return plc.suspendSlices(args['hostname'])
 871
 872 class RemoveSliceCreation(SiteAction):
 873         def _run(self, args):
 874                 return plc.removeSliceCreation(args['hostname'])
 875
 876 class BackoffActions(SiteAction):
 877         def _run(self, args):
 878                 plc.enableSlices(args['hostname'])
 879                 plc.enableSliceCreation(args['hostname'])
 880                 return True
 881
 882 # TODO: create class for each action below,
 883 #               allow for lists of actions to be performed...
 884
 885 def close_rt_backoff(args):
 886         if 'ticket_id' in args and (args['ticket_id'] != "" and args['ticket_id'] != None):
 887                 mailer.closeTicketViaRT(args['ticket_id'],
 888                                                                 "Ticket CLOSED automatically by SiteAssist.")
 889                 plc.enableSlices(args['hostname'])
 890                 plc.enableSliceCreation(args['hostname'])
 891         return
 892
 893 def reboot_node(args):
 894         host = args['hostname']
 895         return reboot.reboot_policy(host, True, config.debug)
 896
 897 def reset_nodemanager(args):
 898         os.system("ssh root@%s /sbin/service nm restart" % nodename)
 899         return
 900
 901 class Action(Thread):
 902         def __init__(self, l_action):
 903                 self.l_action = l_action
 904
 905                 # the hostname to loginbase mapping
 906                 self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")
 907
 908                 # Actions to take.
 909                 self.diagnose_db = soltesz.if_cached_else(1, "diagnose_out", lambda : {})
 910                 # Actions taken.
 911                 self.act_all   = soltesz.if_cached_else(1, "act_all", lambda : {})
 912
 913                 # A dict of actions to specific functions. PICKLE doesnt' like lambdas.
 914                 self.actions = {}
 915                 self.actions['suspendslices'] = lambda args: plc.suspendSlices(args['hostname'])
 916                 self.actions['nocreate'] = lambda args: plc.removeSliceCreation(args['hostname'])
 917                 self.actions['close_rt'] = lambda args: close_rt_backoff(args)
 918                 self.actions['rins'] = lambda args: plc.nodeBootState(args['hostname'], "rins")
 919                 self.actions['noop'] = lambda args: args
 920                 self.actions['reboot_node'] = lambda args: reboot_node(args)
 921                 self.actions['reset_nodemanager'] = lambda args: args # reset_nodemanager(args)
 922
 923                 self.actions['ticket_waitforever'] = lambda args: args
 924                 self.actions['waitforever'] = lambda args: args
 925                 self.actions['unknown'] = lambda args: args
 926                 self.actions['waitforoneweekaction'] = lambda args: args
 927                 self.actions['waitfortwoweeksaction'] = lambda args: args
 928                 self.actions['sendmailagain-waitforoneweekaction'] = lambda args: args
 929                 self.actions['sendmailagain-waitfortwoweeksaction'] = lambda args: args
 930                 self.actions['email-againwaitforever'] = lambda args: args
 931                 self.actions['email-againticket_waitforever'] = lambda args: args
 932
 933
 934                 self.sickdb = {}
 935                 Thread.__init__(self)
 936
 937         def run(self):
 938                 self.accumSites()
 939                 print "Accumulated %d sick sites" % len(self.sickdb.keys())
 940                 logger.debug("Accumulated %d sick sites" % len(self.sickdb.keys()))
 941
 942                 try:
 943                         stats = self.analyseSites()
 944                 except Exception, err:
 945                         print "----------------"
 946                         import traceback
 947                         print traceback.print_exc()
 948                         print err
 949                         if config.policysavedb:
 950                                 print "Saving Databases... act_all"
 951                                 soltesz.dbDump("act_all", self.act_all)
 952                         sys.exit(1)
 953
 954                 print_stats("sites_observed", stats)
 955                 print_stats("sites_diagnosed", stats)
 956                 print_stats("nodes_diagnosed", stats)
 957                 print_stats("sites_emailed", stats)
 958                 print_stats("nodes_actedon", stats)
 959                 print string.join(stats['allsites'], ",")
 960
 961                 if config.policysavedb:
 962                         print "Saving Databases... act_all"
 963                         #soltesz.dbDump("policy.eventlog", self.eventlog)
 964                         # TODO: remove 'diagnose_out',
 965                         #       or at least the entries that were acted on.
 966                         soltesz.dbDump("act_all", self.act_all)
 967
 968         def accumSites(self):
 969                 """
 970                 Take all nodes, from l_action, look them up in the diagnose_db database,
 971                 and insert them into sickdb[] as:
 972
 973                 This way only the given l_action nodes will be acted on regardless
 974                 of how many from diagnose_db are available.
 975
 976                         sickdb[loginbase][nodename] = diag_record
 977                 """
 978                 # TODO: what if l_action == None ?
 979                 for nodename in self.l_action:
 980
 981                         loginbase = self.plcdb_hn2lb[nodename]
 982
 983                         if loginbase in self.diagnose_db and \
 984                                 nodename in self.diagnose_db[loginbase]['nodes']:
 985
 986                                 diag_record = self.diagnose_db[loginbase]['nodes'][nodename]
 987
 988                                 if loginbase not in self.sickdb:
 989                                         self.sickdb[loginbase] = {'nodes' : {}}
 990
 991                                 # NOTE: don't copy all node records, since not all will be in l_action
 992                                 self.sickdb[loginbase]['nodes'][nodename] = diag_record
 993                                 # NOTE: but, we want to get the loginbase config settings,
 994                                 #               this is the easiest way.
 995                                 self.sickdb[loginbase]['config'] = self.diagnose_db[loginbase]['config']
 996                         #else:
 997                                 #print "%s not in diagnose_db!!" % loginbase
 998                 return
 999
1000         def __emailSite(self, loginbase, roles, message, args):
1001                 """
1002                 loginbase is the unique site abbreviation, prepended to slice names.
1003                 roles contains TECH, PI, USER roles, and derive email aliases.
1004                 record contains {'message': [<subj>,<body>], 'args': {...}}
1005                 """
1006                 ticket_id = 0
1007                 args.update({'loginbase':loginbase})
1008
1009                 if not config.mail and not config.debug and config.bcc:
1010                         roles = ADMIN
1011                 if config.mail and config.debug:
1012                         roles = ADMIN
1013
1014                 # build targets
1015                 contacts = []
1016                 if ADMIN & roles:
1017                         contacts += [config.email]
1018                 if TECH & roles:
1019                         contacts += [TECHEMAIL % loginbase]
1020                 if PI & roles:
1021                         contacts += [PIEMAIL % loginbase]
1022                 if USER & roles:
1023                         slices = plc.slices(loginbase)
1024                         if len(slices) >= 1:
1025                                 for slice in slices:
1026                                         contacts += [SLICEMAIL % slice]
1027                                 print "SLIC: %20s : %d slices" % (loginbase, len(slices))
1028                         else:
1029                                 print "SLIC: %20s : 0 slices" % loginbase
1030
1031                 try:
1032                         subject = message[0] % args
1033                         body = message[1] % args
1034                         if ADMIN & roles:
1035                                 # send only to admin
1036                                 if 'ticket_id' in args:
1037                                         subj = "Re: [PL #%s] %s" % (args['ticket_id'], subject)
1038                                 else:
1039                                         subj = "Re: [PL noticket] %s" % subject
1040                                 mailer.email(subj, body, contacts)
1041                                 ticket_id = args['ticket_id']
1042                         else:
1043                                 ticket_id = mailer.emailViaRT(subject, body, contacts, args['ticket_id'])
1044                 except Exception, err:
1045                         print "exception on message:"
1046                         import traceback
1047                         print traceback.print_exc()
1048                         print message
1049
1050                 return ticket_id
1051
1052
1053         def _format_diaginfo(self, diag_node):
1054                 info = diag_node['info']
1055                 if diag_node['stage'] == 'monitor-end-record':
1056                         hlist = "    %s went from '%s' to '%s'\n" % (info[0], info[1], info[2])
1057                 else:
1058                         hlist = "    %s %s - %s\n" % (info[0], info[2], info[1]) #(node,ver,daysdn)
1059                 return hlist
1060
1061
1062         def get_email_args(self, act_recordlist, loginbase=None):
1063
1064                 email_args = {}
1065                 email_args['hostname_list'] = ""
1066
1067                 for act_record in act_recordlist:
1068                         email_args['hostname_list'] += act_record['msg_format']
1069                         email_args['hostname'] = act_record['nodename']
1070                         if  'plcnode' in act_record and \
1071                                 'pcu_ids' in act_record['plcnode'] and \
1072                                 len(act_record['plcnode']['pcu_ids']) > 0:
1073                                 print "setting 'pcu_id' for email_args %s"%email_args['hostname']
1074                                 email_args['pcu_id'] = act_record['plcnode']['pcu_ids'][0]
1075                         else:
1076                                 email_args['pcu_id'] = "-1"
1077
1078                         if 'ticket_id' in act_record:
1079                                 if act_record['ticket_id'] == 0 or act_record['ticket_id'] == '0':
1080                                         print "Enter the ticket_id for %s @ %s" % (loginbase, act_record['nodename'])
1081                                         sys.stdout.flush()
1082                                         line = sys.stdin.readline()
1083                                         try:
1084                                                 ticket_id = int(line)
1085                                         except:
1086                                                 print "could not get ticket_id from stdin..."
1087                                                 os._exit(1)
1088                                 else:
1089                                         ticket_id = act_record['ticket_id']
1090
1091                                 email_args['ticket_id'] = ticket_id
1092
1093                 return email_args
1094
1095         def get_unique_issues(self, act_recordlist):
1096                 # NOTE: only send one email per site, per problem...
1097                 unique_issues = {}
1098                 for act_record in act_recordlist:
1099                         act_key = act_record['action'][0]
1100                         if act_key not in unique_issues:
1101                                 unique_issues[act_key] = []
1102
1103                         unique_issues[act_key] += [act_record]
1104
1105                 return unique_issues
1106
1107
1108         def __actOnSite(self, loginbase, site_record):
1109                 i_nodes_actedon = 0
1110                 i_nodes_emailed = 0
1111
1112                 act_recordlist = []
1113
1114                 for nodename in site_record['nodes'].keys():
1115                         diag_record = site_record['nodes'][nodename]
1116                         act_record  = self.__actOnNode(diag_record)
1117                         #print "nodename: %s %s" % (nodename, act_record)
1118                         if act_record is not None:
1119                                 act_recordlist += [act_record]
1120
1121                 unique_issues = self.get_unique_issues(act_recordlist)
1122
1123                 for issue in unique_issues.keys():
1124                         print "\tworking on issue: %s" % issue
1125                         issue_record_list = unique_issues[issue]
1126                         email_args = self.get_email_args(issue_record_list, loginbase)
1127
1128                         # for each record.
1129                         for act_record in issue_record_list:
1130                                 # if there's a pcu record and email config is set
1131                                 if 'email_pcu' in act_record:
1132                                         if act_record['message'] != None and act_record['email_pcu'] and site_record['config']['email']:
1133                                                 # and 'reboot_node' in act_record['stage']:
1134
1135                                                 email_args['hostname'] = act_record['nodename']
1136                                                 ticket_id = self.__emailSite(loginbase,
1137                                                                                         act_record['email'],
1138                                                                                         emailTxt.mailtxt.pcudown[0],
1139                                                                                         email_args)
1140                                                 if ticket_id == 0:
1141                                                         # error.
1142                                                         print "got a ticket_id == 0!!!! %s" % act_record['nodename']
1143                                                         os._exit(1)
1144                                                         pass
1145                                                 email_args['ticket_id'] = ticket_id
1146
1147
1148                         act_record = issue_record_list[0]
1149                         # send message before squeezing
1150                         print "\t\tconfig.email: %s and %s" % (act_record['message'] != None,
1151                                                                                                 site_record['config']['email'])
1152                         if act_record['message'] != None and site_record['config']['email']:
1153                                 ticket_id = self.__emailSite(loginbase, act_record['email'],
1154                                                                                          act_record['message'], email_args)
1155
1156                                 if ticket_id == 0:
1157                                         # error.
1158                                         print "ticket_id == 0 for %s %s" % (loginbase, act_record['nodename'])
1159                                         os._exit(1)
1160                                         pass
1161
1162                                 # Add ticket_id to ALL nodenames
1163                                 for act_record in issue_record_list:
1164                                         nodename = act_record['nodename']
1165                                         # update node record with RT ticket_id
1166                                         if nodename in self.act_all:
1167                                                 self.act_all[nodename][0]['ticket_id'] = "%s" % ticket_id
1168                                         if config.mail: i_nodes_emailed += 1
1169
1170                         print "\t\tconfig.squeeze: %s and %s" % (config.squeeze,
1171                                                                                                         site_record['config']['squeeze'])
1172                         if config.squeeze and site_record['config']['squeeze']:
1173                                 for act_key in act_record['action']:
1174                                         self.actions[act_key](email_args)
1175                                 i_nodes_actedon += 1
1176
1177                 if config.policysavedb:
1178                         print "Saving Databases... act_all, diagnose_out"
1179                         soltesz.dbDump("act_all", self.act_all)
1180                         # remove site record from diagnose_out, it's in act_all as done.
1181                         del self.diagnose_db[loginbase]
1182                         soltesz.dbDump("diagnose_out", self.diagnose_db)
1183
1184                 print "sleeping for 1 sec"
1185                 time.sleep(1)
1186                 #print "Hit enter to continue..."
1187                 #sys.stdout.flush()
1188                 #line = sys.stdin.readline()
1189
1190                 return (i_nodes_actedon, i_nodes_emailed)
1191
1192         def __actOnNode(self, diag_record):
1193                 nodename = diag_record['nodename']
1194                 message = diag_record['message']
1195
1196                 act_record = {}
1197                 act_record.update(diag_record)
1198                 act_record['nodename'] = nodename
1199                 act_record['msg_format'] = self._format_diaginfo(diag_record)
1200                 print "act_record['stage'] == %s " % act_record['stage']
1201
1202                 # avoid end records, and nmreset records
1203                 # reboot_node_failed, is set below, so don't reboot repeatedly.
1204
1205                 if 'monitor-end-record' not in act_record['stage'] and \
1206                    'nmreset' not in act_record['stage'] and \
1207                    'reboot_node_failed' not in act_record:
1208
1209                         if "DOWN" in act_record['log'] and \
1210                                         'pcu_ids' in act_record['plcnode'] and \
1211                                         len(act_record['plcnode']['pcu_ids']) > 0:
1212
1213                                 print "%s" % act_record['log'],
1214                                 print "%15s" % (['reboot_node'],)
1215                                 # Set node to re-install
1216                                 plc.nodeBootState(act_record['nodename'], "rins")
1217                                 try:
1218                                         ret = reboot_node({'hostname': act_record['nodename']})
1219                                 except Exception, exc:
1220                                         print "exception on reboot_node:"
1221                                         import traceback
1222                                         print traceback.print_exc()
1223                                         ret = False
1224
1225                                 if ret: # and ( 'reboot_node_failed' not in act_record or act_record['reboot_node_failed'] == False):
1226                                         # Reboot Succeeded
1227                                         print "reboot succeeded for %s" % act_record['nodename']
1228                                         act_record2 = {}
1229                                         act_record2.update(act_record)
1230                                         act_record2['action'] = ['reboot_node']
1231                                         act_record2['stage'] = "reboot_node"
1232                                         act_record2['reboot_node_failed'] = False
1233                                         act_record2['email_pcu'] = False
1234
1235                                         if nodename not in self.act_all:
1236                                                 self.act_all[nodename] = []
1237                                         print "inserting 'reboot_node' record into act_all"
1238                                         self.act_all[nodename].insert(0,act_record2)
1239
1240                                         # return None to avoid further action
1241                                         print "Taking no further action"
1242                                         return None
1243                                 else:
1244                                         print "reboot failed for %s" % act_record['nodename']
1245                                         # set email_pcu to also send pcu notice for this record.
1246                                         act_record['reboot_node_failed'] = True
1247                                         act_record['email_pcu'] = True
1248
1249                         print "%s" % act_record['log'],
1250                         print "%15s" % act_record['action']
1251
1252                 if act_record['stage'] is not 'monitor-end-record' and \
1253                    act_record['stage'] is not 'nmreset':
1254                         if nodename not in self.act_all:
1255                                 self.act_all[nodename] = []
1256
1257                         self.act_all[nodename].insert(0,act_record)
1258                 else:
1259                         print "Not recording %s in act_all" % nodename
1260
1261                 return act_record
1262
1263         def analyseSites(self):
1264                 i_sites_observed = 0
1265                 i_sites_diagnosed = 0
1266                 i_nodes_diagnosed = 0
1267                 i_nodes_actedon = 0
1268                 i_sites_emailed = 0
1269                 l_allsites = []
1270
1271                 sorted_sites = self.sickdb.keys()
1272                 sorted_sites.sort()
1273                 for loginbase in sorted_sites:
1274                         site_record = self.sickdb[loginbase]
1275                         print "sites: %s" % loginbase
1276
1277                         i_nodes_diagnosed += len(site_record.keys())
1278                         i_sites_diagnosed += 1
1279
1280                         (na,ne) = self.__actOnSite(loginbase, site_record)
1281
1282                         i_sites_observed += 1
1283                         i_nodes_actedon += na
1284                         i_sites_emailed += ne
1285
1286                         l_allsites += [loginbase]
1287
1288                 return {'sites_observed': i_sites_observed,
1289                                 'sites_diagnosed': i_sites_diagnosed,
1290                                 'nodes_diagnosed': i_nodes_diagnosed,
1291                                 'sites_emailed': i_sites_emailed,
1292                                 'nodes_actedon': i_nodes_actedon,
1293                                 'allsites':l_allsites}
1294
1295         def print_stats(self, key, stats):
1296                 print "%20s : %d" % (key, stats[key])
1297
1298
1299
1300         #"""
1301         #Prints, logs, and emails status of up nodes, down nodes, and buckets.
1302         #"""
1303         #def status(self):
1304         #       sub = "Monitor Summary"
1305         #       msg = "\nThe following nodes were acted upon:  \n\n"
1306         #       for (node, (type, date)) in self.emailed.items():
1307         #               # Print only things acted on today.
1308         #               if (time.gmtime(time.time())[2] == time.gmtime(date)[2]):
1309         #                       msg +="%s\t(%s)\t%s\n" %(node, type, time.ctime(date))
1310         #       msg +="\n\nThe following sites have been 'squeezed':\n\n"
1311         #       for (loginbase, (date, type)) in self.squeezed.items():
1312         #               # Print only things acted on today.
1313         #               if (time.gmtime(time.time())[2] == time.gmtime(date)[2]):
1314         #                       msg +="%s\t(%s)\t%s\n" %(loginbase, type, time.ctime(date))
1315         #       mailer.email(sub, msg, [SUMTO])
1316         #       logger.info(msg)
1317         #       return
1318
1319         #"""
1320         #Store/Load state of emails.  When, where, what.
1321         #"""
1322         #def emailedStore(self, action):
1323         #       try:
1324         #               if action == "LOAD":
1325         #                       f = open(DAT, "r+")
1326         #                       logger.info("POLICY:  Found and reading " + DAT)
1327         #                       self.emailed.update(pickle.load(f))
1328         #               if action == "WRITE":
1329         #                       f = open(DAT, "w")
1330         #                       #logger.debug("Writing " + DAT)
1331         #                       pickle.dump(self.emailed, f)
1332         #               f.close()
1333         #       except Exception, err:
1334         #               logger.info("POLICY:  Problem with DAT, %s" %err)
1335
1336
1337 #class Policy(Thread):
1338
def main():
        """Print a notice that this file is a module and is not run directly."""
        notice = "policy.py is a module, not a script for running directly."
        print(notice)
1341
# Script entry point: run main() and exit immediately on Ctrl-C.
# os, sys and logging are already imported at the top of the file, so they are
# not re-imported here.
if __name__ == '__main__':
        try:
                main()
        except KeyboardInterrupt:
                print("Killed.  Exitting.")
                logger.info('Monitor Killed')
                # os._exit() ends the process without flushing stdio buffers or
                # running cleanup handlers, so push pending output out first.
                sys.stdout.flush()
                logging.shutdown()
                os._exit(0)