policy.py

   1 #
   2 # Copyright (c) 2004  The Trustees of Princeton University (Trustees).
   3 #
   4 # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
   5 #
   6 # $Id: policy.py,v 1.17 2007/08/29 17:26:50 soltesz Exp $
   7 #
   8 # Policy Engine.
   9
  10 #from monitor import *
  11 from threading import *
  12 import time
  13 import logging
  14 import mailer
  15 import emailTxt
  16 import pickle
  17 import Queue
  18 import plc
  19 import sys
  20 import os
  21 import reboot
  22 import database
  23 import string
  24 from unified_model import cmpCategoryVal
  25 import config
  26
# Path of the monitor data file (its use is not visible in this part of the file).
DAT="./monitor.dat"

# Shared "monitor" logger used by the policy threads.
logger = logging.getLogger("monitor")

# Time to enforce policy
# NOTE(review): presumably seconds (7200 s = 2 hours) -- confirm against the caller.
POLSLEEP = 7200

# Where to email the summary
SUMTO = "soltesz@cs.princeton.edu"
# Address templates with one %s placeholder.
# NOTE(review): the placeholder is presumably a site loginbase (or a slice
# name for SLICEMAIL) -- confirm against the code that formats them.
TECHEMAIL="tech-%s@sites.planet-lab.org"
PIEMAIL="pi-%s@sites.planet-lab.org"
SLICEMAIL="%s@slices.planet-lab.org"
PLCEMAIL="support@planet-lab.org"

# Thresholds.  All values are in SECONDS, written as a number of days
# multiplied by SPERDAY.
SPERMIN = 60
SPERHOUR = 60*60
SPERDAY = 86400
PITHRESH = 7 * SPERDAY
SLICETHRESH = 7 * SPERDAY
# Time (5 days, in seconds) before attempting "rins" again
# NOTE(review): "rins" presumably means reinstall -- confirm.
RINSTHRESH = 5 * SPERDAY

# Time (30 days, in seconds) before calling the node dead.
DEADTHRESH = 30 * SPERDAY
# Minimum number of nodes up before squeezing: a site with fewer than
# MINUP nodes up gets its 'squeeze' flag set in Diagnose.__diagnoseSite().
MINUP = 2

# Recipient classes.
# NOTE(review): the power-of-two values suggest these are combined as bit
# flags -- confirm where they are used.
TECH=1
PI=2
USER=4
ADMIN=8
  59
  60 # IF:
  61 #  no SSH, down.
  62 #  bad disk, down
  63 #  DNS, kinda down (sick)
  64 #  clock, kinda down (sick)
  65 #  Full disk, going to be down
  66
  67 # Actions:
  68 #  Email
  69 #  suspend slice creation
  70 #  kill slices
  71 def array_to_priority_map(array):
  72         """ Create a mapping where each entry of array is given a priority equal
  73         to its position in the array.  This is useful for subsequent use in the
  74         cmpMap() function."""
  75         map = {}
  76         count = 0
  77         for i in array:
  78                 map[i] = count
  79                 count += 1
  80         return map
  81
  82 def getdebug():
  83         return config.debug
  84
  85 def print_stats(key, stats):
  86         if key in stats: print "%20s : %d" % (key, stats[key])
  87
  88
  89 class Merge(Thread):
  90         def __init__(self, l_merge, toRT):
  91                 self.toRT = toRT
  92                 self.merge_list = l_merge
  93                 # the hostname to loginbase mapping
  94                 self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")
  95
  96                 # Previous actions taken on nodes.
  97                 self.act_all = database.if_cached_else(1, "act_all", lambda : {})
  98                 self.findbad = database.if_cached_else(1, "findbad", lambda : {})
  99
 100                 self.cache_all = database.if_cached_else(1, "act_all", lambda : {})
 101                 self.sickdb = {}
 102                 self.mergedb = {}
 103                 Thread.__init__(self)
 104
 105         def run(self):
 106                 # populate sickdb
 107                 self.accumSickSites()
 108                 # read data from findbad and act_all
 109                 self.mergeActionsAndBadDB()
 110                 # pass node_records to RT
 111                 self.sendToRT()
 112
 113         def accumSickSites(self):
 114                 """
 115                 Take all nodes, from l_diagnose, look them up in the act_all database,
 116                 and insert them into sickdb[] as:
 117
 118                         sickdb[loginbase][nodename] = fb_record
 119                 """
 120                 # look at all problems reported by findbad
 121                 l_nodes = self.findbad['nodes'].keys()
 122                 count = 0
 123                 for nodename in l_nodes:
 124                         if nodename not in self.merge_list:
 125                                 continue                # skip this node, since it's not wanted
 126
 127                         count += 1
 128                         loginbase = self.plcdb_hn2lb[nodename]
 129                         values = self.findbad['nodes'][nodename]['values']
 130
 131                         fb_record = {}
 132                         fb_record['nodename'] = nodename
 133                         try:
 134                                 fb_record['category'] = values['category']
 135                         except:
 136                                 print values
 137                                 print nodename
 138                                 print self.findbad['nodes'][nodename]
 139                                 count -= 1
 140                                 continue
 141                         fb_record['state'] = values['state']
 142                         fb_record['comonstats'] = values['comonstats']
 143                         fb_record['plcnode'] = values['plcnode']
 144                         fb_record['kernel'] = self.getKernel(values['kernel'])
 145                         fb_record['stage'] = "findbad"
 146                         fb_record['message'] = None
 147                         fb_record['bootcd'] = values['bootcd']
 148                         fb_record['args'] = None
 149                         fb_record['info'] = None
 150                         fb_record['time'] = time.time()
 151                         fb_record['date_created'] = time.time()
 152
 153                         if loginbase not in self.sickdb:
 154                                 self.sickdb[loginbase] = {}
 155
 156                         self.sickdb[loginbase][nodename] = fb_record
 157
 158                 print "Found %d nodes" % count
 159
 160         def getKernel(self, unamestr):
 161                 s = unamestr.split()
 162                 if len(s) > 2:
 163                         return s[2]
 164                 else:
 165                         return ""
 166
 167         def mergeActionsAndBadDB(self):
 168                 """
 169                 - Look at the sick node_records as reported in findbad,
 170                 - Then look at the node_records in act_all.
 171
 172                 There are four cases:
 173                 1) Problem in findbad, no problem in act_all
 174                         this ok, b/c it just means it's a new problem
 175                 2) Problem in findbad, problem in act_all
 176                         -Did the problem get better or worse?
 177                                 -If Same, or Worse, then continue looking for open tickets.
 178                                 -If Better, or No problem, then "back-off" penalties.
 179                                         This judgement may need to wait until 'Diagnose()'
 180
 181                 3) No problem in findbad, problem in act_all
 182                         The the node is operational again according to Findbad()
 183
 184                 4) No problem in findbad, no problem in act_all
 185                         There won't be a record in either db, so there's no code.
 186                 """
 187
 188                 sorted_sites = self.sickdb.keys()
 189                 sorted_sites.sort()
 190                 # look at all problems reported by findbad
 191                 for loginbase in sorted_sites:
 192                         d_fb_nodes = self.sickdb[loginbase]
 193                         sorted_nodes = d_fb_nodes.keys()
 194                         sorted_nodes.sort()
 195                         for nodename in sorted_nodes:
 196                                 fb_record = self.sickdb[loginbase][nodename]
 197                                 x = fb_record
 198                                 if loginbase not in self.mergedb:
 199                                         self.mergedb[loginbase] = {}
 200
 201                                 # take the info either from act_all or fb-record.
 202                                 # if node not in act_all
 203                                 #       then take it from fbrecord, obviously.
 204                                 # else node in act_all
 205                                 #   if act_all == 0 length (no previous records)
 206                                 #               then take it from fbrecord.
 207                                 #   else
 208                                 #           take it from act_all.
 209                                 #
 210
 211                                 # We must compare findbad state with act_all state
 212                                 if nodename not in self.act_all:
 213                                         # 1) ok, b/c it's a new problem. set ticket_id to null
 214                                         self.mergedb[loginbase][nodename] = {}
 215                                         self.mergedb[loginbase][nodename].update(x)
 216                                         self.mergedb[loginbase][nodename]['ticket_id'] = ""
 217                                         self.mergedb[loginbase][nodename]['prev_category'] = "NORECORD"
 218                                 else:
 219                                         if len(self.act_all[nodename]) == 0:
 220                                                 self.mergedb[loginbase][nodename] = {}
 221                                                 self.mergedb[loginbase][nodename].update(x)
 222                                                 self.mergedb[loginbase][nodename]['ticket_id'] = ""
 223                                                 self.mergedb[loginbase][nodename]['prev_category'] = "NORECORD"
 224                                         else:
 225                                                 y = self.act_all[nodename][0]
 226                                                 y['prev_category'] = y['category']
 227
 228                                                 self.mergedb[loginbase][nodename] = {}
 229                                                 self.mergedb[loginbase][nodename].update(y)
 230                                                 self.mergedb[loginbase][nodename]['comonstats'] = x['comonstats']
 231                                                 self.mergedb[loginbase][nodename]['category']   = x['category']
 232                                                 self.mergedb[loginbase][nodename]['state'] = x['state']
 233                                                 self.mergedb[loginbase][nodename]['kernel']=x['kernel']
 234                                                 self.mergedb[loginbase][nodename]['bootcd']=x['bootcd']
 235                                                 self.mergedb[loginbase][nodename]['plcnode']=x['plcnode']
 236                                                 ticket = get_ticket_id(self.mergedb[loginbase][nodename])
 237                                                 self.mergedb[loginbase][nodename]['rt'] = mailer.getTicketStatus(ticket)
 238
 239                                         # delete the entry from cache_all to keep it out of case 3)
 240                                         del self.cache_all[nodename]
 241
 242                 # 3) nodes that remin in cache_all were not identified by findbad.
 243                 #        Do we keep them or not?
 244                 #   NOTE: i think that since the categories are performed before this
 245                 #               step now, and by a monitor-controlled agent.
 246
 247                 # TODO: This does not work correctly.  Do we need this?
 248                 #for hn in self.cache_all.keys():
 249                 #       y = self.act_all[hn][0]
 250                 #       if 'monitor' in y['bucket']:
 251                 #               loginbase = self.plcdb_hn2lb[hn]
 252                 #               if loginbase not in self.sickdb:
 253                 #                       self.sickdb[loginbase] = {}
 254                 #               self.sickdb[loginbase][hn] = y
 255                 #       else:
 256                 #               del self.cache_all[hn]
 257
 258                 print "len of cache_all: %d" % len(self.cache_all.keys())
 259                 return
 260
 261         def sendToRT(self):
 262                 sorted_sites = self.mergedb.keys()
 263                 sorted_sites.sort()
 264                 # look at all problems reported by merge
 265                 for loginbase in sorted_sites:
 266                         d_merge_nodes = self.mergedb[loginbase]
 267                         for nodename in d_merge_nodes.keys():
 268                                 record = self.mergedb[loginbase][nodename]
 269                                 self.toRT.put(record)
 270
 271                 # send signal to stop reading
 272                 self.toRT.put(None)
 273                 return
 274
 275 class Diagnose(Thread):
 276         def __init__(self, fromRT):
 277                 self.fromRT = fromRT
 278                 self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")
 279                 self.findbad = database.if_cached_else(1, "findbad", lambda : {})
 280
 281                 self.diagnose_in = {}
 282                 self.diagnose_out = {}
 283                 Thread.__init__(self)
 284
 285
 286         def run(self):
 287                 self.accumSickSites()
 288
 289                 print "Accumulated %d sick sites" % len(self.diagnose_in.keys())
 290                 logger.debug("Accumulated %d sick sites" % len(self.diagnose_in.keys()))
 291
 292                 try:
 293                         stats = self.diagnoseAll()
 294                 except Exception, err:
 295                         print "----------------"
 296                         import traceback
 297                         print traceback.print_exc()
 298                         from nodecommon import email_exception
 299                         email_exception()
 300                         print err
 301                         #if config.policysavedb:
 302                         sys.exit(1)
 303
 304                 print_stats("sites_observed", stats)
 305                 print_stats("sites_diagnosed", stats)
 306                 print_stats("nodes_diagnosed", stats)
 307
 308                 if config.policysavedb:
 309                         print "Saving Databases... diagnose_out"
 310                         database.dbDump("diagnose_out", self.diagnose_out)
 311
 312         def accumSickSites(self):
 313                 """
 314                 Take all nodes, from l_diagnose, look them up in the diagnose_out database,
 315                 and insert them into diagnose_in[] as:
 316
 317                         diagnose_in[loginbase] = [diag_node1, diag_node2, ...]
 318                 """
 319                 while 1:
 320                         node_record = self.fromRT.get(block = True)
 321                         if node_record == None:
 322                                 break;
 323
 324                         nodename = node_record['nodename']
 325                         loginbase = self.plcdb_hn2lb[nodename]
 326
 327                         if loginbase not in self.diagnose_in:
 328                                 self.diagnose_in[loginbase] = {}
 329
 330                         self.diagnose_in[loginbase][nodename] = node_record
 331
 332                 return
 333
 334         def diagnoseAll(self):
 335                 i_sites_observed = 0
 336                 i_sites_diagnosed = 0
 337                 i_nodes_diagnosed = 0
 338                 i_nodes_actedon = 0
 339                 i_sites_emailed = 0
 340                 l_allsites = []
 341
 342                 sorted_sites = self.diagnose_in.keys()
 343                 sorted_sites.sort()
 344                 self.diagnose_out= {}
 345                 for loginbase in sorted_sites:
 346                         l_allsites += [loginbase]
 347
 348                         d_diag_nodes = self.diagnose_in[loginbase]
 349                         d_act_records = self.__diagnoseSite(loginbase, d_diag_nodes)
 350                         # store records in diagnose_out, for saving later.
 351                         self.diagnose_out.update(d_act_records)
 352
 353                         if len(d_act_records[loginbase]['nodes'].keys()) > 0:
 354                                 i_nodes_diagnosed += (len(d_act_records[loginbase]['nodes'].keys()))
 355                                 i_sites_diagnosed += 1
 356                         i_sites_observed += 1
 357
 358                 return {'sites_observed': i_sites_observed,
 359                                 'sites_diagnosed': i_sites_diagnosed,
 360                                 'nodes_diagnosed': i_nodes_diagnosed,
 361                                 'allsites':l_allsites}
 362
 363                 pass
 364
 365         def getDaysDown(cls, diag_record):
 366                 daysdown = -1
 367                 last_contact = diag_record['plcnode']['last_contact']
 368                 date_created = diag_record['plcnode']['date_created']
 369
 370                 if diag_record['comonstats']['uptime'] != "null" and diag_record['comonstats']['uptime'] != "-1":
 371                         daysdown = - int(float(diag_record['comonstats']['uptime'])) // (60*60*24)
 372                 elif last_contact is None:
 373                         if date_created is not None:
 374                                 now = time.time()
 375                                 diff = now - date_created
 376                                 daysdown = diff // (60*60*24)
 377                         else:
 378                                 daysdown = -1
 379                 else:
 380                         now = time.time()
 381                         diff = now - last_contact
 382                         daysdown = diff // (60*60*24)
 383                 return daysdown
 384         getDaysDown = classmethod(getDaysDown)
 385
 386         def getStrDaysDown(cls, diag_record):
 387                 daysdown = "unknown"
 388                 last_contact = diag_record['plcnode']['last_contact']
 389                 date_created = diag_record['plcnode']['date_created']
 390
 391                 if      diag_record['comonstats']['uptime'] != "null" and \
 392                         diag_record['comonstats']['uptime'] != "-1":
 393                         daysdown = int(float(diag_record['comonstats']['uptime'])) // (60*60*24)
 394                         daysdown = "%d days up" % daysdown
 395
 396                 elif last_contact is None:
 397                         if date_created is not None:
 398                                 now = time.time()
 399                                 diff = now - date_created
 400                                 daysdown = diff // (60*60*24)
 401                                 daysdown = "Never contacted PLC, created %s days ago" % daysdown
 402                         else:
 403                                 daysdown = "Never contacted PLC"
 404                 else:
 405                         now = time.time()
 406                         diff = now - last_contact
 407                         daysdown = diff // (60*60*24)
 408                         daysdown = "%s days down" % daysdown
 409                 return daysdown
 410         getStrDaysDown = classmethod(getStrDaysDown)
 411         #def getStrDaysDown(cls, diag_record):
 412         #       daysdown = cls.getDaysDown(diag_record)
 413         #       if daysdown > -1:
 414         #               return "%d days down"%daysdown
 415         #       elif daysdown == -1:
 416         #               return "Has never contacted PLC"
 417         #       else:
 418         #               return "%d days up"% -daysdown
 419         #getStrDaysDown = classmethod(getStrDaysDown)
 420
 421         def __getCDVersion(self, diag_record, nodename):
 422                 cdversion = ""
 423                 #print "Getting kernel for: %s" % diag_record['nodename']
 424                 cdversion = diag_record['kernel']
 425                 return cdversion
 426
 427         def __diagnoseSite(self, loginbase, d_diag_nodes):
 428                 """
 429                 d_diag_nodes are diagnose_in entries.
 430                 """
 431                 d_diag_site = {loginbase : { 'config' :
 432                                                                                                 {'squeeze': False,
 433                                                                                                  'email': False
 434                                                                                                 },
 435                                                                         'nodes': {}
 436                                                                         }
 437                                            }
 438                 sorted_nodes = d_diag_nodes.keys()
 439                 sorted_nodes.sort()
 440                 for nodename in sorted_nodes:
 441                         node_record = d_diag_nodes[nodename]
 442                         diag_record = self.__diagnoseNode(loginbase, node_record)
 443
 444                         if diag_record != None:
 445                                 d_diag_site[loginbase]['nodes'][nodename] = diag_record
 446
 447                                 # NOTE: improvement means, we need to act/squeeze and email.
 448                                 #print "DIAG_RECORD", diag_record
 449                                 if 'monitor-end-record' in diag_record['stage'] or \
 450                                    'nmreset' in diag_record['stage']:
 451                                 #       print "resetting loginbase!"
 452                                         d_diag_site[loginbase]['config']['squeeze'] = True
 453                                         d_diag_site[loginbase]['config']['email'] = True
 454                                 #else:
 455                                 #       print "NO IMPROVEMENT!!!!"
 456                         else:
 457                                 pass # there is nothing to do for this node.
 458
 459                 # NOTE: these settings can be overridden by command line arguments,
 460                 #       or the state of a record, i.e. if already in RT's Support Queue.
 461                 nodes_up = self.getUpAtSite(loginbase, d_diag_site)
 462                 if nodes_up < MINUP:
 463                         d_diag_site[loginbase]['config']['squeeze'] = True
 464
 465                 max_slices = self.getMaxSlices(loginbase)
 466                 num_nodes = self.getNumNodes(loginbase)
 467                 # NOTE: when max_slices == 0, this is either a new site (the old way)
 468                 #       or an old disabled site from previous monitor (before site['enabled'])
 469                 if nodes_up < num_nodes and max_slices != 0:
 470                         d_diag_site[loginbase]['config']['email'] = True
 471
 472                 if len(d_diag_site[loginbase]['nodes'].keys()) > 0:
 473                         print "SITE: %20s : %d nodes up, at most" % (loginbase, nodes_up)
 474
 475                 return d_diag_site
 476
 477         def diagRecordByCategory(self, node_record):
 478                 nodename = node_record['nodename']
 479                 category = node_record['category']
 480                 state    = node_record['state']
 481                 loginbase = self.plcdb_hn2lb[nodename]
 482                 diag_record = None
 483
 484                 if  "ERROR" in category:        # i.e. "DOWN"
 485                         diag_record = {}
 486                         diag_record.update(node_record)
 487                         daysdown = self.getDaysDown(diag_record)
 488                         if daysdown < 7:
 489                                 format = "DIAG: %20s : %-40s Down only %s days  NOTHING DONE"
 490                                 print format % (loginbase, nodename, daysdown)
 491                                 return None
 492
 493                         s_daysdown = self.getStrDaysDown(diag_record)
 494                         diag_record['message'] = emailTxt.mailtxt.newdown
 495                         diag_record['args'] = {'nodename': nodename}
 496                         diag_record['info'] = (nodename, s_daysdown, "")
 497
 498                         if 'reboot_node_failed' in node_record:
 499                                 # there was a previous attempt to use the PCU.
 500                                 if node_record['reboot_node_failed'] == False:
 501                                         # then the last attempt apparently, succeeded.
 502                                         # But, the category is still 'ERROR'.  Therefore, the
 503                                         # PCU-to-Node mapping is broken.
 504                                         #print "Setting message for ERROR node to PCU2NodeMapping: %s" % nodename
 505                                         diag_record['message'] = emailTxt.mailtxt.pcutonodemapping
 506                                         diag_record['email_pcu'] = True
 507
 508                         if 'ticket_id' in diag_record:
 509                                 if diag_record['ticket_id'] == "":
 510                                         if 'found_rt_ticket' in diag_record:
 511                                                 ticket_id = diag_record['found_rt_ticket']
 512                                         else:
 513                                                 ticket_id = "None"
 514                                 else:
 515                                         ticket_id = diag_record['ticket_id']
 516                         else:
 517                                 ticket_id = "None"
 518
 519                         diag_record['log'] = "DOWN: %20s : %-40s == %20s %s" % \
 520                                         (loginbase, nodename, diag_record['info'][1:], ticket_id)
 521
 522                 elif "OLDBOOTCD" in category:
 523                         # V2 boot cds as determined by findbad
 524                         s_daysdown = self.getStrDaysDown(node_record)
 525                         s_cdversion = self.__getCDVersion(node_record, nodename)
 526                         diag_record = {}
 527                         diag_record.update(node_record)
 528                         #if "2.4" in diag_record['kernel'] or "v2" in diag_record['bootcd']:
 529                         diag_record['message'] = emailTxt.mailtxt.newbootcd
 530                         diag_record['args'] = {'nodename': nodename}
 531                         diag_record['info'] = (nodename, s_daysdown, s_cdversion)
 532                         if diag_record['ticket_id'] == "":
 533                                 diag_record['log'] = "BTCD: %20s : %-40s == %20s %20s %s" % \
 534                                                                         (loginbase, nodename, diag_record['kernel'],
 535                                                                          diag_record['bootcd'], diag_record['found_rt_ticket'])
 536                         else:
 537                                 diag_record['log'] = "BTCD: %20s : %-40s == %20s %20s %s" % \
 538                                                                         (loginbase, nodename, diag_record['kernel'],
 539                                                                          diag_record['bootcd'], diag_record['ticket_id'])
 540
 541                 elif "PROD" in category:
 542                         if "DEBUG" in state:
 543                                 # Not sure what to do with these yet.  Probably need to
 544                                 # reboot, and email.
 545                                 print "DEBG: %20s : %-40s  NOTHING DONE" % (loginbase, nodename)
 546                                 return None
 547                         elif "BOOT" in state:
 548                                 # no action needed.
 549                                 # TODO: remove penalties, if any are applied.
 550                                 now = time.time()
 551                                 last_contact = node_record['plcnode']['last_contact']
 552                                 if last_contact == None:
 553                                         time_diff = 0
 554                                 else:
 555                                         time_diff = now - last_contact;
 556
 557                                 if 'improvement' in node_record['stage']:
 558                                         # then we need to pass this on to 'action'
 559                                         diag_record = {}
 560                                         diag_record.update(node_record)
 561                                         diag_record['message'] = emailTxt.mailtxt.newthankyou
 562                                         diag_record['args'] = {'nodename': nodename}
 563                                         diag_record['info'] = (nodename, node_record['prev_category'],
 564                                                                                                          node_record['category'])
 565                                         if 'email_pcu' in diag_record:
 566                                                 if diag_record['email_pcu']:
 567                                                         # previously, the pcu failed to reboot, so send
 568                                                         # email. Now, reset these values to try the reboot
 569                                                         # again.
 570                                                         diag_record['email_pcu'] = False
 571                                                         del diag_record['reboot_node_failed']
 572
 573                                         if diag_record['ticket_id'] == "":
 574                                                 diag_record['log'] = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
 575                                                                         (loginbase, nodename, diag_record['stage'],
 576                                                                          state, category, diag_record['found_rt_ticket'])
 577                                         else:
 578                                                 diag_record['log'] = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
 579                                                                         (loginbase, nodename, diag_record['stage'],
 580                                                                          state, category, diag_record['ticket_id'])
 581                                         return diag_record
 582                                 #elif time_diff >= 6*SPERHOUR:
 583                                 #       # heartbeat is older than 30 min.
 584                                 #       # then reset NM.
 585                                 #       #print "Possible NM problem!! %s - %s = %s" % (now, last_contact, time_diff)
 586                                 #       diag_record = {}
 587                                 #       diag_record.update(node_record)
 588                                 #       diag_record['message'] = emailTxt.mailtxt.NMReset
 589                                 #       diag_record['args'] = {'nodename': nodename}
 590                                 #       diag_record['stage'] = "nmreset"
 591                                 #       diag_record['info'] = (nodename,
 592                                 #                                                       node_record['prev_category'],
 593                                 #                                                       node_record['category'])
 594                                 #       if diag_record['ticket_id'] == "":
 595                                 #               diag_record['log'] = "NM  : %20s : %-40s == %20s %20s %s %s" % \
 596                                 #                                       (loginbase, nodename, diag_record['stage'],
 597                                 #                                        state, category, diag_record['found_rt_ticket'])
 598                                 #       else:
 599                                 #               diag_record['log'] = "NM  : %20s : %-40s == %20s" % \
 600                                 #                                       (loginbase, nodename, diag_record['stage'])
 601 #
 602 #                                       return diag_record
 603                                 else:
 604                                         return None
 605                         else:
 606                                 # unknown
 607                                 pass
 608                 elif "ALPHA"    in category:
 609                         pass
 610                 elif "clock_drift" in category:
 611                         pass
 612                 elif "dns"    in category:
 613                         pass
 614                 elif "filerw"    in category:
 615                         pass
 616                 else:
 617                         print "Unknown category!!!! %s" % category
 618                         sys.exit(1)
 619
 620                 return diag_record
 621
 622         def __diagnoseNode(self, loginbase, node_record):
 623                 # TODO: change the format of the hostname in this
 624                 #               record to something more natural.
 625                 nodename                = node_record['nodename']
 626                 category                = node_record['category']
 627                 prev_category   = node_record['prev_category']
 628                 state                   = node_record['state']
 629                 #if 'prev_category' in node_record:
 630                 #       prev_category = node_record['prev_category']
 631                 #else:
 632                 #       prev_category = "ERROR"
 633                 if node_record['prev_category'] != "NORECORD":
 634
 635                         val = cmpCategoryVal(category, prev_category)
 636                         print "%s went from %s -> %s" % (nodename, prev_category, category)
 637                         if val == 1:
 638                                 # improved
 639                                 if node_record['ticket_id'] == "" or node_record['ticket_id'] == None:
 640                                         print "closing record with no ticket: ", node_record['nodename']
 641                                         node_record['action'] = ['close_rt']
 642                                         node_record['message'] = None
 643                                         node_record['stage'] = 'monitor-end-record'
 644                                         return node_record
 645                                 else:
 646                                         node_record['stage'] = 'improvement'
 647
 648                                 #if 'monitor-end-record' in node_record['stage']:
 649                                 #       # just ignore it if it's already ended.
 650                                 #       # otherwise, the status should be worse, and we won't get
 651                                 #       # here.
 652                                 #       print "monitor-end-record: ignoring ", node_record['nodename']
 653                                 #       return None
 654 #
 655 #                                       #return None
 656                         elif val == -1:
 657                                 # current category is worse than previous, carry on
 658                                 pass
 659                         else:
 660                                 #values are equal, carry on.
 661                                 #print "why are we here?"
 662                                 pass
 663
 664                 if 'rt' in node_record and 'Status' in node_record['rt']:
 665                         if node_record['stage'] == 'ticket_waitforever':
 666                                 if 'resolved' in node_record['rt']['Status']:
 667                                         print "ending waitforever record for: ", node_record['nodename']
 668                                         node_record['action'] = ['noop']
 669                                         node_record['message'] = None
 670                                         node_record['stage'] = 'monitor-end-record'
 671                                         print "oldlog: %s" % node_record['log'],
 672                                         print "%15s" % node_record['action']
 673                                         return node_record
 674                                 if 'new' in node_record['rt']['Status'] and \
 675                                         'Queue' in node_record['rt'] and \
 676                                         'Monitor' in node_record['rt']['Queue']:
 677
 678                                         print "RESETTING stage to findbad"
 679                                         node_record['stage'] = 'findbad'
 680
 681                 #### COMPARE category and prev_category
 682                 # if not_equal
 683                 #       then assign a stage based on relative priorities
 684                 # else equal
 685                 #       then check category for stats.
 686                 diag_record = self.diagRecordByCategory(node_record)
 687                 if diag_record == None:
 688                         #print "diag_record == None"
 689                         return None
 690
 691                 #### found_RT_ticket
 692                 # TODO: need to record time found, and maybe add a stage for acting on it...
 693                 # NOTE: after found, if the support ticket is resolved, the block is
 694                 #               not removed. How to remove the block on this?
 695                 if 'found_rt_ticket' in diag_record and \
 696                         diag_record['found_rt_ticket'] is not None:
 697                         if diag_record['stage'] is not 'improvement':
 698                                 diag_record['stage'] = 'ticket_waitforever'
 699
 700                 current_time = time.time()
 701                 # take off four days, for the delay that database caused.
 702                 # TODO: generalize delays at PLC, and prevent enforcement when there
 703                 #               have been no emails.
 704                 # NOTE: 7*SPERDAY exists to offset the 'bad week'
 705                 #delta = current_time - diag_record['time'] - 7*SPERDAY
 706                 delta = current_time - diag_record['time']
 707
 708                 message = diag_record['message']
 709                 act_record = {}
 710                 act_record.update(diag_record)
 711
 712                 #### DIAGNOSE STAGES
 713                 if   'findbad' in diag_record['stage']:
 714                         # The node is bad, and there's no previous record of it.
 715                         act_record['email'] = TECH
 716                         act_record['action'] = ['noop']
 717                         act_record['message'] = message[0]
 718                         act_record['stage'] = 'stage_actinoneweek'
 719
 720                 elif 'nmreset' in diag_record['stage']:
 721                         act_record['email']  = ADMIN
 722                         act_record['action'] = ['reset_nodemanager']
 723                         act_record['message'] = message[0]
 724                         act_record['stage']  = 'nmreset'
 725                         return None
 726
 727                 elif 'reboot_node' in diag_record['stage']:
 728                         act_record['email'] = TECH
 729                         act_record['action'] = ['noop']
 730                         act_record['message'] = message[0]
 731                         act_record['stage'] = 'stage_actinoneweek'
 732
 733                 elif 'improvement' in diag_record['stage']:
 734                         # - backoff previous squeeze actions (slice suspend, nocreate)
 735                         # TODO: add a backoff_squeeze section... Needs to runthrough
 736                         print "backing off of %s" % nodename
 737                         act_record['action'] = ['close_rt']
 738                         act_record['message'] = message[0]
 739                         act_record['stage'] = 'monitor-end-record'
 740
 741                 elif 'actinoneweek' in diag_record['stage']:
 742                         if delta >= 7 * SPERDAY:
 743                                 act_record['email'] = TECH | PI
 744                                 act_record['stage'] = 'stage_actintwoweeks'
 745                                 act_record['message'] = message[1]
 746                                 act_record['action'] = ['nocreate' ]
 747                                 act_record['time'] = current_time               # reset clock for waitforever
 748                         elif delta >= 3* SPERDAY and not 'second-mail-at-oneweek' in act_record:
 749                                 act_record['email'] = TECH
 750                                 act_record['message'] = message[0]
 751                                 act_record['action'] = ['sendmailagain-waitforoneweekaction' ]
 752                                 act_record['second-mail-at-oneweek'] = True
 753                         else:
 754                                 act_record['message'] = None
 755                                 act_record['action'] = ['waitforoneweekaction' ]
 756                                 print "ignoring this record for: %s" % act_record['nodename']
 757                                 return None                     # don't send if there's no action
 758
 759                 elif 'actintwoweeks' in diag_record['stage']:
 760                         if delta >= 7 * SPERDAY:
 761                                 act_record['email'] = TECH | PI | USER
 762                                 act_record['stage'] = 'stage_waitforever'
 763                                 act_record['message'] = message[2]
 764                                 act_record['action'] = ['suspendslices']
 765                                 act_record['time'] = current_time               # reset clock for waitforever
 766                         elif delta >= 3* SPERDAY and not 'second-mail-at-twoweeks' in act_record:
 767                                 act_record['email'] = TECH | PI
 768                                 act_record['message'] = message[1]
 769                                 act_record['action'] = ['sendmailagain-waitfortwoweeksaction' ]
 770                                 act_record['second-mail-at-twoweeks'] = True
 771                         else:
 772                                 act_record['message'] = None
 773                                 act_record['action'] = ['waitfortwoweeksaction']
 774                                 return None                     # don't send if there's no action
 775
 776                 elif 'ticket_waitforever' in diag_record['stage']:
 777                         act_record['email'] = TECH
 778                         if 'first-found' not in act_record:
 779                                 act_record['first-found'] = True
 780                                 act_record['log'] += " firstfound"
 781                                 act_record['action'] = ['ticket_waitforever']
 782                                 act_record['message'] = message[0]
 783                                 act_record['time'] = current_time
 784                         else:
 785                                 if delta >= 7*SPERDAY:
 786                                         act_record['action'] = ['ticket_waitforever']
 787                                         if 'rt' in act_record and 'Status' in act_record['rt'] and \
 788                                                         act_record['rt']['Status'] == 'new':
 789                                                 act_record['message'] = message[0]
 790                                         else:
 791                                                 act_record['message'] = None
 792
 793                                         act_record['time'] = current_time               # reset clock
 794                                 else:
 795                                         act_record['action'] = ['ticket_waitforever']
 796                                         act_record['message'] = None
 797                                         return None
 798
 799                 elif 'waitforever' in diag_record['stage']:
 800                         # more than 3 days since last action
 801                         # TODO: send only on weekdays.
 802                         # NOTE: expects that 'time' has been reset before entering waitforever stage
 803                         if delta >= 3*SPERDAY:
 804                                 act_record['action'] = ['email-againwaitforever']
 805                                 act_record['message'] = message[2]
 806                                 act_record['time'] = current_time               # reset clock
 807                         else:
 808                                 act_record['action'] = ['waitforever']
 809                                 act_record['message'] = None
 810                                 return None                     # don't send if there's no action
 811
 812                 else:
 813                         # There is no action to be taken, possibly b/c the stage has
 814                         # already been performed, but diagnose picked it up again.
 815                         # two cases,
 816                         #       1. stage is unknown, or
 817                         #       2. delta is not big enough to bump it to the next stage.
 818                         # TODO: figure out which. for now assume 2.
 819                         print "UNKNOWN stage for %s; nothing done" % nodename
 820                         act_record['action'] = ['unknown']
 821                         act_record['message'] = message[0]
 822
 823                         act_record['email'] = TECH
 824                         act_record['action'] = ['noop']
 825                         act_record['message'] = message[0]
 826                         act_record['stage'] = 'stage_actinoneweek'
 827                         act_record['time'] = current_time               # reset clock
 828                         #print "Exiting..."
 829                         #return None
 830                         #sys.exit(1)
 831
 832                 print "%s" % act_record['log'],
 833                 print "%15s" % act_record['action']
 834                 return act_record
 835
 836         def getMaxSlices(self, loginbase):
 837                 # if sickdb has a loginbase, then it will have at least one node.
 838                 site_stats = None
 839
 840                 for nodename in self.diagnose_in[loginbase].keys():
 841                         if nodename in self.findbad['nodes']:
 842                                 site_stats = self.findbad['nodes'][nodename]['values']['plcsite']
 843                                 break
 844
 845                 if site_stats == None:
 846                         raise Exception, "loginbase with no nodes in findbad"
 847                 else:
 848                         return site_stats['max_slices']
 849
 850         def getNumNodes(self, loginbase):
 851                 # if sickdb has a loginbase, then it will have at least one node.
 852                 site_stats = None
 853
 854                 for nodename in self.diagnose_in[loginbase].keys():
 855                         if nodename in self.findbad['nodes']:
 856                                 site_stats = self.findbad['nodes'][nodename]['values']['plcsite']
 857                                 break
 858
 859                 if site_stats == None:
 860                         raise Exception, "loginbase with no nodes in findbad"
 861                 else:
 862                         if 'num_nodes' in site_stats:
 863                                 return site_stats['num_nodes']
 864                         else:
 865                                 return 0
 866
 867         """
 868         Returns number of up nodes as the total number *NOT* in act_all with a
 869         stage other than 'steady-state' .
 870         """
 871         def getUpAtSite(self, loginbase, d_diag_site):
 872                 # TODO: THIS DOESN"T WORK!!! it misses all the 'debug' state nodes
 873                 #               that aren't recorded yet.
 874
 875                 numnodes = self.getNumNodes(loginbase)
 876                 # NOTE: assume nodes we have no record of are ok. (too conservative)
 877                 # TODO: make the 'up' value more representative
 878                 up = numnodes
 879                 for nodename in d_diag_site[loginbase]['nodes'].keys():
 880
 881                         rec = d_diag_site[loginbase]['nodes'][nodename]
 882                         if rec['stage'] != 'monitor-end-record':
 883                                 up -= 1
 884                         else:
 885                                 pass # the node is assumed to be up.
 886
 887                 #if up != numnodes:
 888                 #       print "ERROR: %s total nodes up and down != %d" % (loginbase, numnodes)
 889
 890                 return up
 891
 892
 893 class SiteAction:
 894         def __init__(self, parameter_names=['hostname', 'ticket_id']):
 895                 self.parameter_names = parameter_names
 896         def checkParam(self, args):
 897                 for param in self.parameter_names:
 898                         if param not in args:
 899                                 raise Exception("Parameter %s not provided in args"%param)
 900         def run(self, args):
 901                 self.checkParam(args)
 902                 return self._run(args)
 903         def _run(self, args):
 904                 pass
 905
 906 class SuspendAction(SiteAction):
 907         def _run(self, args):
 908                 return plc.suspendSlices(args['hostname'])
 909
 910 class RemoveSliceCreation(SiteAction):
 911         def _run(self, args):
 912                 return plc.removeSliceCreation(args['hostname'])
 913
 914 class BackoffActions(SiteAction):
 915         def _run(self, args):
 916                 plc.enableSlices(args['hostname'])
 917                 plc.enableSliceCreation(args['hostname'])
 918                 return True
 919
 920 # TODO: create class for each action below,
 921 #               allow for lists of actions to be performed...
 922
 923
 924
 925 def reset_nodemanager(args):
 926         os.system("ssh root@%s /sbin/service nm restart" % nodename)
 927         return
 928
 929 class Action(Thread):
 930         def __init__(self, l_action):
 931                 self.l_action = l_action
 932
 933                 # the hostname to loginbase mapping
 934                 self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")
 935
 936                 # Actions to take.
 937                 self.diagnose_db = database.if_cached_else(1, "diagnose_out", lambda : {})
 938                 # Actions taken.
 939                 self.act_all   = database.if_cached_else(1, "act_all", lambda : {})
 940
 941                 # A dict of actions to specific functions. PICKLE doesnt' like lambdas.
 942                 self.actions = {}
 943                 self.actions['suspendslices'] = lambda args: plc.suspendSlices(args['hostname'])
 944                 self.actions['nocreate'] = lambda args: plc.removeSliceCreation(args['hostname'])
 945                 self.actions['close_rt'] = lambda args: close_rt_backoff(args)
 946                 self.actions['rins'] = lambda args: plc.nodeBootState(args['hostname'], "reinstall")
 947                 self.actions['noop'] = lambda args: args
 948                 self.actions['reboot_node'] = lambda args: reboot_node(args)
 949                 self.actions['reset_nodemanager'] = lambda args: args # reset_nodemanager(args)
 950
 951                 self.actions['ticket_waitforever'] = lambda args: args
 952                 self.actions['waitforever'] = lambda args: args
 953                 self.actions['unknown'] = lambda args: args
 954                 self.actions['waitforoneweekaction'] = lambda args: args
 955                 self.actions['waitfortwoweeksaction'] = lambda args: args
 956                 self.actions['sendmailagain-waitforoneweekaction'] = lambda args: args
 957                 self.actions['sendmailagain-waitfortwoweeksaction'] = lambda args: args
 958                 self.actions['email-againwaitforever'] = lambda args: args
 959                 self.actions['email-againticket_waitforever'] = lambda args: args
 960
 961
 962                 self.sickdb = {}
 963                 Thread.__init__(self)
 964
 965         def run(self):
 966                 self.accumSites()
 967                 print "Accumulated %d sick sites" % len(self.sickdb.keys())
 968                 logger.debug("Accumulated %d sick sites" % len(self.sickdb.keys()))
 969
 970                 try:
 971                         stats = self.analyseSites()
 972                 except Exception, err:
 973                         print "----------------"
 974                         import traceback
 975                         print traceback.print_exc()
 976                         print err
 977                         if config.policysavedb:
 978                                 print "Saving Databases... act_all"
 979                                 database.dbDump("act_all", self.act_all)
 980                         sys.exit(1)
 981
 982                 print_stats("sites_observed", stats)
 983                 print_stats("sites_diagnosed", stats)
 984                 print_stats("nodes_diagnosed", stats)
 985                 print_stats("sites_emailed", stats)
 986                 print_stats("nodes_actedon", stats)
 987                 print string.join(stats['allsites'], ",")
 988
 989                 if config.policysavedb:
 990                         print "Saving Databases... act_all"
 991                         #database.dbDump("policy.eventlog", self.eventlog)
 992                         # TODO: remove 'diagnose_out',
 993                         #       or at least the entries that were acted on.
 994                         database.dbDump("act_all", self.act_all)
 995
 996         def accumSites(self):
 997                 """
 998                 Take all nodes, from l_action, look them up in the diagnose_db database,
 999                 and insert them into sickdb[] as:
1000
1001                 This way only the given l_action nodes will be acted on regardless
1002                 of how many from diagnose_db are available.
1003
1004                         sickdb[loginbase][nodename] = diag_record
1005                 """
1006                 # TODO: what if l_action == None ?
1007                 for nodename in self.l_action:
1008
1009                         loginbase = self.plcdb_hn2lb[nodename]
1010
1011                         if loginbase in self.diagnose_db and \
1012                                 nodename in self.diagnose_db[loginbase]['nodes']:
1013
1014                                 diag_record = self.diagnose_db[loginbase]['nodes'][nodename]
1015
1016                                 if loginbase not in self.sickdb:
1017                                         self.sickdb[loginbase] = {'nodes' : {}}
1018
1019                                 # NOTE: don't copy all node records, since not all will be in l_action
1020                                 self.sickdb[loginbase]['nodes'][nodename] = diag_record
1021                                 # NOTE: but, we want to get the loginbase config settings,
1022                                 #               this is the easiest way.
1023                                 self.sickdb[loginbase]['config'] = self.diagnose_db[loginbase]['config']
1024                         #else:
1025                                 #print "%s not in diagnose_db!!" % loginbase
1026                 return
1027
1028         def __emailSite(self, loginbase, roles, message, args):
1029                 """
1030                 loginbase is the unique site abbreviation, prepended to slice names.
1031                 roles contains TECH, PI, USER roles, and derive email aliases.
1032                 record contains {'message': [<subj>,<body>], 'args': {...}}
1033                 """
1034                 ticket_id = 0
1035                 args.update({'loginbase':loginbase})
1036
1037                 if not config.mail and not config.debug and config.bcc:
1038                         roles = ADMIN
1039                 if config.mail and config.debug:
1040                         roles = ADMIN
1041
1042                 # build targets
1043                 contacts = []
1044                 if ADMIN & roles:
1045                         contacts += [config.email]
1046                 if TECH & roles:
1047                         contacts += [TECHEMAIL % loginbase]
1048                 if PI & roles:
1049                         contacts += [PIEMAIL % loginbase]
1050                 if USER & roles:
1051                         slices = plc.slices(loginbase)
1052                         if len(slices) >= 1:
1053                                 for slice in slices:
1054                                         contacts += [SLICEMAIL % slice]
1055                                 print "SLIC: %20s : %d slices" % (loginbase, len(slices))
1056                         else:
1057                                 print "SLIC: %20s : 0 slices" % loginbase
1058
1059                 try:
1060                         subject = message[0] % args
1061                         body = message[1] % args
1062                         if ADMIN & roles:
1063                                 # send only to admin
1064                                 if 'ticket_id' in args:
1065                                         subj = "Re: [PL #%s] %s" % (args['ticket_id'], subject)
1066                                 else:
1067                                         subj = "Re: [PL noticket] %s" % subject
1068                                 mailer.email(subj, body, contacts)
1069                                 ticket_id = args['ticket_id']
1070                         else:
1071                                 ticket_id = mailer.emailViaRT(subject, body, contacts, args['ticket_id'])
1072                 except Exception, err:
1073                         print "exception on message:"
1074                         import traceback
1075                         print traceback.print_exc()
1076                         print message
1077
1078                 return ticket_id
1079
1080
1081         def _format_diaginfo(self, diag_node):
1082                 info = diag_node['info']
1083                 if diag_node['stage'] == 'monitor-end-record':
1084                         hlist = "    %s went from '%s' to '%s'\n" % (info[0], info[1], info[2])
1085                 else:
1086                         hlist = "    %s %s - %s\n" % (info[0], info[2], info[1]) #(node,ver,daysdn)
1087                 return hlist
1088
1089
1090         def get_email_args(self, act_recordlist, loginbase=None):
1091
1092                 email_args = {}
1093                 email_args['hostname_list'] = ""
1094
1095                 for act_record in act_recordlist:
1096                         email_args['hostname_list'] += act_record['msg_format']
1097                         email_args['hostname'] = act_record['nodename']
1098                         if  'plcnode' in act_record and \
1099                                 'pcu_ids' in act_record['plcnode'] and \
1100                                 len(act_record['plcnode']['pcu_ids']) > 0:
1101                                 print "setting 'pcu_id' for email_args %s"%email_args['hostname']
1102                                 email_args['pcu_id'] = act_record['plcnode']['pcu_ids'][0]
1103                         else:
1104                                 email_args['pcu_id'] = "-1"
1105
1106                         if 'ticket_id' in act_record:
1107                                 if act_record['ticket_id'] == 0 or act_record['ticket_id'] == '0':
1108                                         print "Enter the ticket_id for %s @ %s" % (loginbase, act_record['nodename'])
1109                                         sys.stdout.flush()
1110                                         line = sys.stdin.readline()
1111                                         try:
1112                                                 ticket_id = int(line)
1113                                         except:
1114                                                 print "could not get ticket_id from stdin..."
1115                                                 os._exit(1)
1116                                 else:
1117                                         ticket_id = act_record['ticket_id']
1118
1119                                 email_args['ticket_id'] = ticket_id
1120
1121                 return email_args
1122
1123         def get_unique_issues(self, act_recordlist):
1124                 # NOTE: only send one email per site, per problem...
1125                 unique_issues = {}
1126                 for act_record in act_recordlist:
1127                         act_key = act_record['action'][0]
1128                         if act_key not in unique_issues:
1129                                 unique_issues[act_key] = []
1130
1131                         unique_issues[act_key] += [act_record]
1132
1133                 return unique_issues
1134
1135
1136         def __actOnSite(self, loginbase, site_record):
1137                 i_nodes_actedon = 0
1138                 i_nodes_emailed = 0
1139
1140                 act_recordlist = []
1141
1142                 for nodename in site_record['nodes'].keys():
1143                         diag_record = site_record['nodes'][nodename]
1144                         act_record  = self.__actOnNode(diag_record)
1145                         #print "nodename: %s %s" % (nodename, act_record)
1146                         if act_record is not None:
1147                                 act_recordlist += [act_record]
1148
1149                 unique_issues = self.get_unique_issues(act_recordlist)
1150
1151                 for issue in unique_issues.keys():
1152                         print "\tworking on issue: %s" % issue
1153                         issue_record_list = unique_issues[issue]
1154                         email_args = self.get_email_args(issue_record_list, loginbase)
1155
1156                         # for each record.
1157                         for act_record in issue_record_list:
1158                                 # if there's a pcu record and email config is set
1159                                 if 'email_pcu' in act_record:
1160                                         if act_record['message'] != None and act_record['email_pcu'] and site_record['config']['email']:
1161                                                 # and 'reboot_node' in act_record['stage']:
1162
1163                                                 email_args['hostname'] = act_record['nodename']
1164                                                 ticket_id = self.__emailSite(loginbase,
1165                                                                                         act_record['email'],
1166                                                                                         emailTxt.mailtxt.pcudown[0],
1167                                                                                         email_args)
1168                                                 if ticket_id == 0:
1169                                                         # error.
1170                                                         print "got a ticket_id == 0!!!! %s" % act_record['nodename']
1171                                                         os._exit(1)
1172                                                         pass
1173                                                 email_args['ticket_id'] = ticket_id
1174
1175
1176                         act_record = issue_record_list[0]
1177                         # send message before squeezing
1178                         print "\t\tconfig.email: %s and %s" % (act_record['message'] != None,
1179                                                                                                 site_record['config']['email'])
1180                         if act_record['message'] != None and site_record['config']['email']:
1181                                 ticket_id = self.__emailSite(loginbase, act_record['email'],
1182                                                                                          act_record['message'], email_args)
1183
1184                                 if ticket_id == 0:
1185                                         # error.
1186                                         print "ticket_id == 0 for %s %s" % (loginbase, act_record['nodename'])
1187                                         os._exit(1)
1188                                         pass
1189
1190                                 # Add ticket_id to ALL nodenames
1191                                 for act_record in issue_record_list:
1192                                         nodename = act_record['nodename']
1193                                         # update node record with RT ticket_id
1194                                         if nodename in self.act_all:
1195                                                 self.act_all[nodename][0]['ticket_id'] = "%s" % ticket_id
1196                                                 # if the ticket was previously resolved, reset it to new.
1197                                                 if 'rt' in act_record and \
1198                                                         'Status' in act_record['rt'] and \
1199                                                         act_record['rt']['Status'] == 'resolved':
1200                                                         mailer.setTicketStatus(ticket_id, "new")
1201                                                 status = mailer.getTicketStatus(ticket_id)
1202                                                 self.act_all[nodename][0]['rt'] = status
1203                                         if config.mail: i_nodes_emailed += 1
1204
1205                         print "\t\tconfig.squeeze: %s and %s" % (config.squeeze,
1206                                                                                                         site_record['config']['squeeze'])
1207                         if config.squeeze and site_record['config']['squeeze']:
1208                                 for act_key in act_record['action']:
1209                                         self.actions[act_key](email_args)
1210                                 i_nodes_actedon += 1
1211
1212                 if config.policysavedb:
1213                         print "Saving Databases... act_all, diagnose_out"
1214                         database.dbDump("act_all", self.act_all)
1215                         # remove site record from diagnose_out, it's in act_all as done.
1216                         del self.diagnose_db[loginbase]
1217                         database.dbDump("diagnose_out", self.diagnose_db)
1218
1219                 print "sleeping for 1 sec"
1220                 time.sleep(1)
1221                 #print "Hit enter to continue..."
1222                 #sys.stdout.flush()
1223                 #line = sys.stdin.readline()
1224
1225                 return (i_nodes_actedon, i_nodes_emailed)
1226
1227         def __actOnNode(self, diag_record):
1228                 nodename = diag_record['nodename']
1229                 message = diag_record['message']
1230
1231                 act_record = {}
1232                 act_record.update(diag_record)
1233                 act_record['nodename'] = nodename
1234                 act_record['msg_format'] = self._format_diaginfo(diag_record)
1235                 print "act_record['stage'] == %s " % act_record['stage']
1236
1237                 # avoid end records, and nmreset records
1238                 # reboot_node_failed, is set below, so don't reboot repeatedly.
1239
1240                 if 'monitor-end-record' not in act_record['stage'] and \
1241                    'nmreset' not in act_record['stage'] and \
1242                    'reboot_node_failed' not in act_record:
1243
1244                         if "DOWN" in act_record['log'] and \
1245                                         'pcu_ids' in act_record['plcnode'] and \
1246                                         len(act_record['plcnode']['pcu_ids']) > 0:
1247
1248                                 print "%s" % act_record['log'],
1249                                 print "%15s" % (['reboot_node'],)
1250                                 # Set node to re-install
1251                                 plc.nodeBootState(act_record['nodename'], "reinstall")
1252                                 try:
1253                                         ret = reboot_node({'hostname': act_record['nodename']})
1254                                 except Exception, exc:
1255                                         print "exception on reboot_node:"
1256                                         import traceback
1257                                         print traceback.print_exc()
1258                                         ret = False
1259
1260                                 if ret: # and ( 'reboot_node_failed' not in act_record or act_record['reboot_node_failed'] == False):
1261                                         # Reboot Succeeded
1262                                         print "reboot succeeded for %s" % act_record['nodename']
1263                                         act_record2 = {}
1264                                         act_record2.update(act_record)
1265                                         act_record2['action'] = ['reboot_node']
1266                                         act_record2['stage'] = "reboot_node"
1267                                         act_record2['reboot_node_failed'] = False
1268                                         act_record2['email_pcu'] = False
1269
1270                                         if nodename not in self.act_all:
1271                                                 self.act_all[nodename] = []
1272                                         print "inserting 'reboot_node' record into act_all"
1273                                         self.act_all[nodename].insert(0,act_record2)
1274
1275                                         # return None to avoid further action
1276                                         print "Taking no further action"
1277                                         return None
1278                                 else:
1279                                         print "reboot failed for %s" % act_record['nodename']
1280                                         # set email_pcu to also send pcu notice for this record.
1281                                         act_record['reboot_node_failed'] = True
1282                                         act_record['email_pcu'] = True
1283
1284                         print "%s" % act_record['log'],
1285                         print "%15s" % act_record['action']
1286
1287                 if act_record['stage'] is not 'monitor-end-record' and \
1288                    act_record['stage'] is not 'nmreset':
1289                         if nodename not in self.act_all:
1290                                 self.act_all[nodename] = []
1291
1292                         self.act_all[nodename].insert(0,act_record)
1293                 else:
1294                         print "Not recording %s in act_all" % nodename
1295
1296                 return act_record
1297
1298         def analyseSites(self):
1299                 i_sites_observed = 0
1300                 i_sites_diagnosed = 0
1301                 i_nodes_diagnosed = 0
1302                 i_nodes_actedon = 0
1303                 i_sites_emailed = 0
1304                 l_allsites = []
1305
1306                 sorted_sites = self.sickdb.keys()
1307                 sorted_sites.sort()
1308                 for loginbase in sorted_sites:
1309                         site_record = self.sickdb[loginbase]
1310                         print "sites: %s" % loginbase
1311
1312                         i_nodes_diagnosed += len(site_record.keys())
1313                         i_sites_diagnosed += 1
1314
1315                         (na,ne) = self.__actOnSite(loginbase, site_record)
1316
1317                         i_sites_observed += 1
1318                         i_nodes_actedon += na
1319                         i_sites_emailed += ne
1320
1321                         l_allsites += [loginbase]
1322
1323                 return {'sites_observed': i_sites_observed,
1324                                 'sites_diagnosed': i_sites_diagnosed,
1325                                 'nodes_diagnosed': i_nodes_diagnosed,
1326                                 'sites_emailed': i_sites_emailed,
1327                                 'nodes_actedon': i_nodes_actedon,
1328                                 'allsites':l_allsites}
1329
1330         def print_stats(self, key, stats):
1331                 print "%20s : %d" % (key, stats[key])
1332
1333
1334
1335         #"""
1336         #Prints, logs, and emails status of up nodes, down nodes, and buckets.
1337         #"""
1338         #def status(self):
1339         #       sub = "Monitor Summary"
1340         #       msg = "\nThe following nodes were acted upon:  \n\n"
1341         #       for (node, (type, date)) in self.emailed.items():
1342         #               # Print only things acted on today.
1343         #               if (time.gmtime(time.time())[2] == time.gmtime(date)[2]):
1344         #                       msg +="%s\t(%s)\t%s\n" %(node, type, time.ctime(date))
1345         #       msg +="\n\nThe following sites have been 'squeezed':\n\n"
1346         #       for (loginbase, (date, type)) in self.squeezed.items():
1347         #               # Print only things acted on today.
1348         #               if (time.gmtime(time.time())[2] == time.gmtime(date)[2]):
1349         #                       msg +="%s\t(%s)\t%s\n" %(loginbase, type, time.ctime(date))
1350         #       mailer.email(sub, msg, [SUMTO])
1351         #       logger.info(msg)
1352         #       return
1353
1354         #"""
1355         #Store/Load state of emails.  When, where, what.
1356         #"""
1357         #def emailedStore(self, action):
1358         #       try:
1359         #               if action == "LOAD":
1360         #                       f = open(DAT, "r+")
1361         #                       logger.info("POLICY:  Found and reading " + DAT)
1362         #                       self.emailed.update(pickle.load(f))
1363         #               if action == "WRITE":
1364         #                       f = open(DAT, "w")
1365         #                       #logger.debug("Writing " + DAT)
1366         #                       pickle.dump(self.emailed, f)
1367         #               f.close()
1368         #       except Exception, err:
1369         #               logger.info("POLICY:  Problem with DAT, %s" %err)
1370
1371
1372 #class Policy(Thread):
1373
def main():
        """Tell the user that this file is meant to be imported, not executed."""
        notice = "policy.py is a module, not a script for running directly."
        print(notice)
1376
# Script entry point: only runs when the file is executed directly.
if __name__ == "__main__":
        import plc
        import os
        try:
                main()
        except KeyboardInterrupt:
                # Ctrl-C: report it, log it, and leave immediately with status 0.
                print("Killed.  Exitting.")
                logger.info("Monitor Killed")
                os._exit(0)