policy.py

   1 #
   2 # Copyright (c) 2004  The Trustees of Princeton University (Trustees).
   3 #
   4 # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
   5 #
   6 # $Id: policy.py,v 1.17 2007/08/29 17:26:50 soltesz Exp $
   7 #
   8 # Policy Engine.
   9
  10 #from monitor import *
  11 from threading import *
  12 import time
  13 import logging
  14 import mailer
  15 import emailTxt
  16 import pickle
  17 import Queue
  18 import plc
  19 import sys
  20 import os
  21 import reboot
  22 import database
  23 import string
  24 from www.printbadnodes import cmpCategoryVal
  25 from config import config
  26 #print "policy"
  27 config = config()
  28
  29 DAT="./monitor.dat"
  30
  31 logger = logging.getLogger("monitor")
  32
  33 # Time to enforce policy
  34 POLSLEEP = 7200
  35
  36 # Where to email the summary
  37 SUMTO = "soltesz@cs.princeton.edu"
  38 TECHEMAIL="tech-%s@sites.planet-lab.org"
  39 PIEMAIL="pi-%s@sites.planet-lab.org"
  40 SLICEMAIL="%s@slices.planet-lab.org"
  41 PLCEMAIL="support@planet-lab.org"
  42
  43 #Thresholds (DAYS)
  44 SPERMIN = 60
  45 SPERHOUR = 60*60
  46 SPERDAY = 86400
  47 PITHRESH = 7 * SPERDAY
  48 SLICETHRESH = 7 * SPERDAY
  49 # Days before attempting rins again
  50 RINSTHRESH = 5 * SPERDAY
  51
  52 # Days before calling the node dead.
  53 DEADTHRESH = 30 * SPERDAY
  54 # Minimum number of nodes up before squeezing
  55 MINUP = 2
  56
  57 TECH=1
  58 PI=2
  59 USER=4
  60 ADMIN=8
  61
  62 # IF:
  63 #  no SSH, down.
  64 #  bad disk, down
  65 #  DNS, kinda down (sick)
  66 #  clock, kinda down (sick)
  67 #  Full disk, going to be down
  68
  69 # Actions:
  70 #  Email
  71 #  suspend slice creation
  72 #  kill slices
  73 def array_to_priority_map(array):
  74         """ Create a mapping where each entry of array is given a priority equal
  75         to its position in the array.  This is useful for subsequent use in the
  76         cmpMap() function."""
  77         map = {}
  78         count = 0
  79         for i in array:
  80                 map[i] = count
  81                 count += 1
  82         return map
  83
  84 def getdebug():
  85         return config.debug
  86
  87 def print_stats(key, stats):
  88         if key in stats: print "%20s : %d" % (key, stats[key])
  89
  90 def get_ticket_id(record):
  91         if 'ticket_id' in record and record['ticket_id'] is not "" and record['ticket_id'] is not None:
  92                 return record['ticket_id']
  93         elif            'found_rt_ticket' in record and \
  94                  record['found_rt_ticket'] is not "" and \
  95                  record['found_rt_ticket'] is not None:
  96                 return record['found_rt_ticket']
  97         else:
  98                 return None
  99
 100 class Merge(Thread):
 101         def __init__(self, l_merge, toRT):
 102                 self.toRT = toRT
 103                 self.merge_list = l_merge
 104                 # the hostname to loginbase mapping
 105                 self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")
 106
 107                 # Previous actions taken on nodes.
 108                 self.act_all = database.if_cached_else(1, "act_all", lambda : {})
 109                 self.findbad = database.if_cached_else(1, "findbad", lambda : {})
 110
 111                 self.cache_all = database.if_cached_else(1, "act_all", lambda : {})
 112                 self.sickdb = {}
 113                 self.mergedb = {}
 114                 Thread.__init__(self)
 115
 116         def run(self):
 117                 # populate sickdb
 118                 self.accumSickSites()
 119                 # read data from findbad and act_all
 120                 self.mergeActionsAndBadDB()
 121                 # pass node_records to RT
 122                 self.sendToRT()
 123
 124         def accumSickSites(self):
 125                 """
 126                 Take all nodes, from l_diagnose, look them up in the act_all database,
 127                 and insert them into sickdb[] as:
 128
 129                         sickdb[loginbase][nodename] = fb_record
 130                 """
 131                 # look at all problems reported by findbad
 132                 l_nodes = self.findbad['nodes'].keys()
 133                 count = 0
 134                 for nodename in l_nodes:
 135                         if nodename not in self.merge_list:
 136                                 continue                # skip this node, since it's not wanted
 137
 138                         count += 1
 139                         loginbase = self.plcdb_hn2lb[nodename]
 140                         values = self.findbad['nodes'][nodename]['values']
 141
 142                         fb_record = {}
 143                         fb_record['nodename'] = nodename
 144                         try:
 145                                 fb_record['category'] = values['category']
 146                         except:
 147                                 print values
 148                                 print nodename
 149                                 print self.findbad['nodes'][nodename]
 150                                 count -= 1
 151                                 continue
 152                         fb_record['state'] = values['state']
 153                         fb_record['comonstats'] = values['comonstats']
 154                         fb_record['plcnode'] = values['plcnode']
 155                         fb_record['kernel'] = self.getKernel(values['kernel'])
 156                         fb_record['stage'] = "findbad"
 157                         fb_record['message'] = None
 158                         fb_record['bootcd'] = values['bootcd']
 159                         fb_record['args'] = None
 160                         fb_record['info'] = None
 161                         fb_record['time'] = time.time()
 162                         fb_record['date_created'] = time.time()
 163
 164                         if loginbase not in self.sickdb:
 165                                 self.sickdb[loginbase] = {}
 166
 167                         self.sickdb[loginbase][nodename] = fb_record
 168
 169                 print "Found %d nodes" % count
 170
 171         def getKernel(self, unamestr):
 172                 s = unamestr.split()
 173                 if len(s) > 2:
 174                         return s[2]
 175                 else:
 176                         return ""
 177
 178         def mergeActionsAndBadDB(self):
 179                 """
 180                 - Look at the sick node_records as reported in findbad,
 181                 - Then look at the node_records in act_all.
 182
 183                 There are four cases:
 184                 1) Problem in findbad, no problem in act_all
 185                         this ok, b/c it just means it's a new problem
 186                 2) Problem in findbad, problem in act_all
 187                         -Did the problem get better or worse?
 188                                 -If Same, or Worse, then continue looking for open tickets.
 189                                 -If Better, or No problem, then "back-off" penalties.
 190                                         This judgement may need to wait until 'Diagnose()'
 191
 192                 3) No problem in findbad, problem in act_all
 193                         The the node is operational again according to Findbad()
 194
 195                 4) No problem in findbad, no problem in act_all
 196                         There won't be a record in either db, so there's no code.
 197                 """
 198
 199                 sorted_sites = self.sickdb.keys()
 200                 sorted_sites.sort()
 201                 # look at all problems reported by findbad
 202                 for loginbase in sorted_sites:
 203                         d_fb_nodes = self.sickdb[loginbase]
 204                         sorted_nodes = d_fb_nodes.keys()
 205                         sorted_nodes.sort()
 206                         for nodename in sorted_nodes:
 207                                 fb_record = self.sickdb[loginbase][nodename]
 208                                 x = fb_record
 209                                 if loginbase not in self.mergedb:
 210                                         self.mergedb[loginbase] = {}
 211
 212                                 # take the info either from act_all or fb-record.
 213                                 # if node not in act_all
 214                                 #       then take it from fbrecord, obviously.
 215                                 # else node in act_all
 216                                 #   if act_all == 0 length (no previous records)
 217                                 #               then take it from fbrecord.
 218                                 #   else
 219                                 #           take it from act_all.
 220                                 #
 221
 222                                 # We must compare findbad state with act_all state
 223                                 if nodename not in self.act_all:
 224                                         # 1) ok, b/c it's a new problem. set ticket_id to null
 225                                         self.mergedb[loginbase][nodename] = {}
 226                                         self.mergedb[loginbase][nodename].update(x)
 227                                         self.mergedb[loginbase][nodename]['ticket_id'] = ""
 228                                         self.mergedb[loginbase][nodename]['prev_category'] = "NORECORD"
 229                                 else:
 230                                         if len(self.act_all[nodename]) == 0:
 231                                                 self.mergedb[loginbase][nodename] = {}
 232                                                 self.mergedb[loginbase][nodename].update(x)
 233                                                 self.mergedb[loginbase][nodename]['ticket_id'] = ""
 234                                                 self.mergedb[loginbase][nodename]['prev_category'] = "NORECORD"
 235                                         else:
 236                                                 y = self.act_all[nodename][0]
 237                                                 y['prev_category'] = y['category']
 238
 239                                                 self.mergedb[loginbase][nodename] = {}
 240                                                 self.mergedb[loginbase][nodename].update(y)
 241                                                 self.mergedb[loginbase][nodename]['comonstats'] = x['comonstats']
 242                                                 self.mergedb[loginbase][nodename]['category']   = x['category']
 243                                                 self.mergedb[loginbase][nodename]['state'] = x['state']
 244                                                 self.mergedb[loginbase][nodename]['kernel']=x['kernel']
 245                                                 self.mergedb[loginbase][nodename]['bootcd']=x['bootcd']
 246                                                 self.mergedb[loginbase][nodename]['plcnode']=x['plcnode']
 247                                                 ticket = get_ticket_id(self.mergedb[loginbase][nodename])
 248                                                 self.mergedb[loginbase][nodename]['rt'] = mailer.getTicketStatus(ticket)
 249
 250                                         # delete the entry from cache_all to keep it out of case 3)
 251                                         del self.cache_all[nodename]
 252
 253                 # 3) nodes that remin in cache_all were not identified by findbad.
 254                 #        Do we keep them or not?
 255                 #   NOTE: i think that since the categories are performed before this
 256                 #               step now, and by a monitor-controlled agent.
 257
 258                 # TODO: This does not work correctly.  Do we need this?
 259                 #for hn in self.cache_all.keys():
 260                 #       y = self.act_all[hn][0]
 261                 #       if 'monitor' in y['bucket']:
 262                 #               loginbase = self.plcdb_hn2lb[hn]
 263                 #               if loginbase not in self.sickdb:
 264                 #                       self.sickdb[loginbase] = {}
 265                 #               self.sickdb[loginbase][hn] = y
 266                 #       else:
 267                 #               del self.cache_all[hn]
 268
 269                 print "len of cache_all: %d" % len(self.cache_all.keys())
 270                 return
 271
 272         def sendToRT(self):
 273                 sorted_sites = self.mergedb.keys()
 274                 sorted_sites.sort()
 275                 # look at all problems reported by merge
 276                 for loginbase in sorted_sites:
 277                         d_merge_nodes = self.mergedb[loginbase]
 278                         for nodename in d_merge_nodes.keys():
 279                                 record = self.mergedb[loginbase][nodename]
 280                                 self.toRT.put(record)
 281
 282                 # send signal to stop reading
 283                 self.toRT.put(None)
 284                 return
 285
 286 class Diagnose(Thread):
 287         def __init__(self, fromRT):
 288                 self.fromRT = fromRT
 289                 self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")
 290                 self.findbad = database.if_cached_else(1, "findbad", lambda : {})
 291
 292                 self.diagnose_in = {}
 293                 self.diagnose_out = {}
 294                 Thread.__init__(self)
 295
 296
 297         def run(self):
 298                 self.accumSickSites()
 299
 300                 print "Accumulated %d sick sites" % len(self.diagnose_in.keys())
 301                 logger.debug("Accumulated %d sick sites" % len(self.diagnose_in.keys()))
 302
 303                 try:
 304                         stats = self.diagnoseAll()
 305                 except Exception, err:
 306                         print "----------------"
 307                         import traceback
 308                         print traceback.print_exc()
 309                         print err
 310                         #if config.policysavedb:
 311                         sys.exit(1)
 312
 313                 print_stats("sites_observed", stats)
 314                 print_stats("sites_diagnosed", stats)
 315                 print_stats("nodes_diagnosed", stats)
 316
 317                 if config.policysavedb:
 318                         print "Saving Databases... diagnose_out"
 319                         database.dbDump("diagnose_out", self.diagnose_out)
 320
 321         def accumSickSites(self):
 322                 """
 323                 Take all nodes, from l_diagnose, look them up in the diagnose_out database,
 324                 and insert them into diagnose_in[] as:
 325
 326                         diagnose_in[loginbase] = [diag_node1, diag_node2, ...]
 327                 """
 328                 while 1:
 329                         node_record = self.fromRT.get(block = True)
 330                         if node_record == None:
 331                                 break;
 332
 333                         nodename = node_record['nodename']
 334                         loginbase = self.plcdb_hn2lb[nodename]
 335
 336                         if loginbase not in self.diagnose_in:
 337                                 self.diagnose_in[loginbase] = {}
 338
 339                         self.diagnose_in[loginbase][nodename] = node_record
 340
 341                 return
 342
 343         def diagnoseAll(self):
 344                 i_sites_observed = 0
 345                 i_sites_diagnosed = 0
 346                 i_nodes_diagnosed = 0
 347                 i_nodes_actedon = 0
 348                 i_sites_emailed = 0
 349                 l_allsites = []
 350
 351                 sorted_sites = self.diagnose_in.keys()
 352                 sorted_sites.sort()
 353                 self.diagnose_out= {}
 354                 for loginbase in sorted_sites:
 355                         l_allsites += [loginbase]
 356
 357                         d_diag_nodes = self.diagnose_in[loginbase]
 358                         d_act_records = self.__diagnoseSite(loginbase, d_diag_nodes)
 359                         # store records in diagnose_out, for saving later.
 360                         self.diagnose_out.update(d_act_records)
 361
 362                         if len(d_act_records[loginbase]['nodes'].keys()) > 0:
 363                                 i_nodes_diagnosed += (len(d_act_records[loginbase]['nodes'].keys()))
 364                                 i_sites_diagnosed += 1
 365                         i_sites_observed += 1
 366
 367                 return {'sites_observed': i_sites_observed,
 368                                 'sites_diagnosed': i_sites_diagnosed,
 369                                 'nodes_diagnosed': i_nodes_diagnosed,
 370                                 'allsites':l_allsites}
 371
 372                 pass
 373
 374         def getDaysDown(cls, diag_record):
 375                 daysdown = -1
 376                 last_contact = diag_record['plcnode']['last_contact']
 377                 date_created = diag_record['plcnode']['date_created']
 378
 379                 if diag_record['comonstats']['uptime'] != "null" and diag_record['comonstats']['uptime'] != "-1":
 380                         daysdown = - int(float(diag_record['comonstats']['uptime'])) // (60*60*24)
 381                 elif last_contact is None:
 382                         if date_created is not None:
 383                                 now = time.time()
 384                                 diff = now - date_created
 385                                 daysdown = diff // (60*60*24)
 386                         else:
 387                                 daysdown = -1
 388                 else:
 389                         now = time.time()
 390                         diff = now - last_contact
 391                         daysdown = diff // (60*60*24)
 392                 return daysdown
 393         getDaysDown = classmethod(getDaysDown)
 394
 395         def getStrDaysDown(cls, diag_record):
 396                 daysdown = "unknown"
 397                 last_contact = diag_record['plcnode']['last_contact']
 398                 date_created = diag_record['plcnode']['date_created']
 399
 400                 if      diag_record['comonstats']['uptime'] != "null" and \
 401                         diag_record['comonstats']['uptime'] != "-1":
 402                         daysdown = int(float(diag_record['comonstats']['uptime'])) // (60*60*24)
 403                         daysdown = "%d days up" % daysdown
 404
 405                 elif last_contact is None:
 406                         if date_created is not None:
 407                                 now = time.time()
 408                                 diff = now - date_created
 409                                 daysdown = diff // (60*60*24)
 410                                 daysdown = "Never contacted PLC, created %s days ago" % daysdown
 411                         else:
 412                                 daysdown = "Never contacted PLC"
 413                 else:
 414                         now = time.time()
 415                         diff = now - last_contact
 416                         daysdown = diff // (60*60*24)
 417                         daysdown = "%s days down" % daysdown
 418                 return daysdown
 419         getStrDaysDown = classmethod(getStrDaysDown)
 420         #def getStrDaysDown(cls, diag_record):
 421         #       daysdown = cls.getDaysDown(diag_record)
 422         #       if daysdown > -1:
 423         #               return "%d days down"%daysdown
 424         #       elif daysdown == -1:
 425         #               return "Has never contacted PLC"
 426         #       else:
 427         #               return "%d days up"% -daysdown
 428         #getStrDaysDown = classmethod(getStrDaysDown)
 429
 430         def __getCDVersion(self, diag_record, nodename):
 431                 cdversion = ""
 432                 #print "Getting kernel for: %s" % diag_record['nodename']
 433                 cdversion = diag_record['kernel']
 434                 return cdversion
 435
 436         def __diagnoseSite(self, loginbase, d_diag_nodes):
 437                 """
 438                 d_diag_nodes are diagnose_in entries.
 439                 """
 440                 d_diag_site = {loginbase : { 'config' :
 441                                                                                                 {'squeeze': False,
 442                                                                                                  'email': False
 443                                                                                                 },
 444                                                                         'nodes': {}
 445                                                                         }
 446                                            }
 447                 sorted_nodes = d_diag_nodes.keys()
 448                 sorted_nodes.sort()
 449                 for nodename in sorted_nodes:
 450                         node_record = d_diag_nodes[nodename]
 451                         diag_record = self.__diagnoseNode(loginbase, node_record)
 452
 453                         if diag_record != None:
 454                                 d_diag_site[loginbase]['nodes'][nodename] = diag_record
 455
 456                                 # NOTE: improvement means, we need to act/squeeze and email.
 457                                 #print "DIAG_RECORD", diag_record
 458                                 if 'monitor-end-record' in diag_record['stage'] or \
 459                                    'nmreset' in diag_record['stage']:
 460                                 #       print "resetting loginbase!"
 461                                         d_diag_site[loginbase]['config']['squeeze'] = True
 462                                         d_diag_site[loginbase]['config']['email'] = True
 463                                 #else:
 464                                 #       print "NO IMPROVEMENT!!!!"
 465                         else:
 466                                 pass # there is nothing to do for this node.
 467
 468                 # NOTE: these settings can be overridden by command line arguments,
 469                 #       or the state of a record, i.e. if already in RT's Support Queue.
 470                 nodes_up = self.getUpAtSite(loginbase, d_diag_site)
 471                 if nodes_up < MINUP:
 472                         d_diag_site[loginbase]['config']['squeeze'] = True
 473
 474                 max_slices = self.getMaxSlices(loginbase)
 475                 num_nodes = self.getNumNodes(loginbase)
 476                 # NOTE: when max_slices == 0, this is either a new site (the old way)
 477                 #       or an old disabled site from previous monitor (before site['enabled'])
 478                 if nodes_up < num_nodes and max_slices != 0:
 479                         d_diag_site[loginbase]['config']['email'] = True
 480
 481                 if len(d_diag_site[loginbase]['nodes'].keys()) > 0:
 482                         print "SITE: %20s : %d nodes up, at most" % (loginbase, nodes_up)
 483
 484                 return d_diag_site
 485
 486         def diagRecordByCategory(self, node_record):
 487                 nodename = node_record['nodename']
 488                 category = node_record['category']
 489                 state    = node_record['state']
 490                 loginbase = self.plcdb_hn2lb[nodename]
 491                 diag_record = None
 492
 493                 if  "ERROR" in category:        # i.e. "DOWN"
 494                         diag_record = {}
 495                         diag_record.update(node_record)
 496                         daysdown = self.getDaysDown(diag_record)
 497                         if daysdown < 7:
 498                                 format = "DIAG: %20s : %-40s Down only %s days  NOTHING DONE"
 499                                 print format % (loginbase, nodename, daysdown)
 500                                 return None
 501
 502                         s_daysdown = self.getStrDaysDown(diag_record)
 503                         diag_record['message'] = emailTxt.mailtxt.newdown
 504                         diag_record['args'] = {'nodename': nodename}
 505                         diag_record['info'] = (nodename, s_daysdown, "")
 506
 507                         if 'reboot_node_failed' in node_record:
 508                                 # there was a previous attempt to use the PCU.
 509                                 if node_record['reboot_node_failed'] == False:
 510                                         # then the last attempt apparently, succeeded.
 511                                         # But, the category is still 'ERROR'.  Therefore, the
 512                                         # PCU-to-Node mapping is broken.
 513                                         #print "Setting message for ERROR node to PCU2NodeMapping: %s" % nodename
 514                                         diag_record['message'] = emailTxt.mailtxt.pcutonodemapping
 515                                         diag_record['email_pcu'] = True
 516
 517                         if 'ticket_id' in diag_record:
 518                                 if diag_record['ticket_id'] == "":
 519                                         if 'found_rt_ticket' in diag_record:
 520                                                 ticket_id = diag_record['found_rt_ticket']
 521                                         else:
 522                                                 ticket_id = "None"
 523                                 else:
 524                                         ticket_id = diag_record['ticket_id']
 525                         else:
 526                                 ticket_id = "None"
 527
 528                         diag_record['log'] = "DOWN: %20s : %-40s == %20s %s" % \
 529                                         (loginbase, nodename, diag_record['info'][1:], ticket_id)
 530
 531                 elif "OLDBOOTCD" in category:
 532                         # V2 boot cds as determined by findbad
 533                         s_daysdown = self.getStrDaysDown(node_record)
 534                         s_cdversion = self.__getCDVersion(node_record, nodename)
 535                         diag_record = {}
 536                         diag_record.update(node_record)
 537                         #if "2.4" in diag_record['kernel'] or "v2" in diag_record['bootcd']:
 538                         diag_record['message'] = emailTxt.mailtxt.newbootcd
 539                         diag_record['args'] = {'nodename': nodename}
 540                         diag_record['info'] = (nodename, s_daysdown, s_cdversion)
 541                         if diag_record['ticket_id'] == "":
 542                                 diag_record['log'] = "BTCD: %20s : %-40s == %20s %20s %s" % \
 543                                                                         (loginbase, nodename, diag_record['kernel'],
 544                                                                          diag_record['bootcd'], diag_record['found_rt_ticket'])
 545                         else:
 546                                 diag_record['log'] = "BTCD: %20s : %-40s == %20s %20s %s" % \
 547                                                                         (loginbase, nodename, diag_record['kernel'],
 548                                                                          diag_record['bootcd'], diag_record['ticket_id'])
 549
 550                 elif "PROD" in category:
 551                         if "DEBUG" in state:
 552                                 # Not sure what to do with these yet.  Probably need to
 553                                 # reboot, and email.
 554                                 print "DEBG: %20s : %-40s  NOTHING DONE" % (loginbase, nodename)
 555                                 return None
 556                         elif "BOOT" in state:
 557                                 # no action needed.
 558                                 # TODO: remove penalties, if any are applied.
 559                                 now = time.time()
 560                                 last_contact = node_record['plcnode']['last_contact']
 561                                 if last_contact == None:
 562                                         time_diff = 0
 563                                 else:
 564                                         time_diff = now - last_contact;
 565
 566                                 if 'improvement' in node_record['stage']:
 567                                         # then we need to pass this on to 'action'
 568                                         diag_record = {}
 569                                         diag_record.update(node_record)
 570                                         diag_record['message'] = emailTxt.mailtxt.newthankyou
 571                                         diag_record['args'] = {'nodename': nodename}
 572                                         diag_record['info'] = (nodename, node_record['prev_category'],
 573                                                                                                          node_record['category'])
 574                                         if 'email_pcu' in diag_record:
 575                                                 if diag_record['email_pcu']:
 576                                                         # previously, the pcu failed to reboot, so send
 577                                                         # email. Now, reset these values to try the reboot
 578                                                         # again.
 579                                                         diag_record['email_pcu'] = False
 580                                                         del diag_record['reboot_node_failed']
 581
 582                                         if diag_record['ticket_id'] == "":
 583                                                 diag_record['log'] = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
 584                                                                         (loginbase, nodename, diag_record['stage'],
 585                                                                          state, category, diag_record['found_rt_ticket'])
 586                                         else:
 587                                                 diag_record['log'] = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
 588                                                                         (loginbase, nodename, diag_record['stage'],
 589                                                                          state, category, diag_record['ticket_id'])
 590                                         return diag_record
 591                                 #elif time_diff >= 6*SPERHOUR:
 592                                 #       # heartbeat is older than 30 min.
 593                                 #       # then reset NM.
 594                                 #       #print "Possible NM problem!! %s - %s = %s" % (now, last_contact, time_diff)
 595                                 #       diag_record = {}
 596                                 #       diag_record.update(node_record)
 597                                 #       diag_record['message'] = emailTxt.mailtxt.NMReset
 598                                 #       diag_record['args'] = {'nodename': nodename}
 599                                 #       diag_record['stage'] = "nmreset"
 600                                 #       diag_record['info'] = (nodename,
 601                                 #                                                       node_record['prev_category'],
 602                                 #                                                       node_record['category'])
 603                                 #       if diag_record['ticket_id'] == "":
 604                                 #               diag_record['log'] = "NM  : %20s : %-40s == %20s %20s %s %s" % \
 605                                 #                                       (loginbase, nodename, diag_record['stage'],
 606                                 #                                        state, category, diag_record['found_rt_ticket'])
 607                                 #       else:
 608                                 #               diag_record['log'] = "NM  : %20s : %-40s == %20s" % \
 609                                 #                                       (loginbase, nodename, diag_record['stage'])
 610 #
 611 #                                       return diag_record
 612                                 else:
 613                                         return None
 614                         else:
 615                                 # unknown
 616                                 pass
 617                 elif "ALPHA"    in category:
 618                         pass
 619                 elif "clock_drift" in category:
 620                         pass
 621                 elif "dns"    in category:
 622                         pass
 623                 elif "filerw"    in category:
 624                         pass
 625                 else:
 626                         print "Unknown category!!!! %s" % category
 627                         sys.exit(1)
 628
 629                 return diag_record
 630
 631         def __diagnoseNode(self, loginbase, node_record):
 632                 # TODO: change the format of the hostname in this
 633                 #               record to something more natural.
 634                 nodename                = node_record['nodename']
 635                 category                = node_record['category']
 636                 prev_category   = node_record['prev_category']
 637                 state                   = node_record['state']
 638                 #if 'prev_category' in node_record:
 639                 #       prev_category = node_record['prev_category']
 640                 #else:
 641                 #       prev_category = "ERROR"
 642                 if node_record['prev_category'] != "NORECORD":
 643
 644                         val = cmpCategoryVal(category, prev_category)
 645                         print "%s went from %s -> %s" % (nodename, prev_category, category)
 646                         if val == 1:
 647                                 # improved
 648                                 if node_record['ticket_id'] == "" or node_record['ticket_id'] == None:
 649                                         print "closing record with no ticket: ", node_record['nodename']
 650                                         node_record['action'] = ['close_rt']
 651                                         node_record['message'] = None
 652                                         node_record['stage'] = 'monitor-end-record'
 653                                         return node_record
 654                                 else:
 655                                         node_record['stage'] = 'improvement'
 656
 657                                 #if 'monitor-end-record' in node_record['stage']:
 658                                 #       # just ignore it if it's already ended.
 659                                 #       # otherwise, the status should be worse, and we won't get
 660                                 #       # here.
 661                                 #       print "monitor-end-record: ignoring ", node_record['nodename']
 662                                 #       return None
 663 #
 664 #                                       #return None
 665                         elif val == -1:
 666                                 # current category is worse than previous, carry on
 667                                 pass
 668                         else:
 669                                 #values are equal, carry on.
 670                                 #print "why are we here?"
 671                                 pass
 672
 673                 if 'rt' in node_record and 'Status' in node_record['rt']:
 674                         if node_record['stage'] == 'ticket_waitforever':
 675                                 if 'resolved' in node_record['rt']['Status']:
 676                                         print "ending waitforever record for: ", node_record['nodename']
 677                                         node_record['action'] = ['noop']
 678                                         node_record['message'] = None
 679                                         node_record['stage'] = 'monitor-end-record'
 680                                         print "oldlog: %s" % node_record['log'],
 681                                         print "%15s" % node_record['action']
 682                                         return node_record
 683                                 if 'new' in node_record['rt']['Status'] and \
 684                                         'Queue' in node_record['rt'] and \
 685                                         'Monitor' in node_record['rt']['Queue']:
 686
 687                                         print "RESETTING stage to findbad"
 688                                         node_record['stage'] = 'findbad'
 689
 690                 #### COMPARE category and prev_category
 691                 # if not_equal
 692                 #       then assign a stage based on relative priorities
 693                 # else equal
 694                 #       then check category for stats.
 695                 diag_record = self.diagRecordByCategory(node_record)
 696                 if diag_record == None:
 697                         #print "diag_record == None"
 698                         return None
 699
 700                 #### found_RT_ticket
 701                 # TODO: need to record time found, and maybe add a stage for acting on it...
 702                 # NOTE: after found, if the support ticket is resolved, the block is
 703                 #               not removed. How to remove the block on this?
 704                 if 'found_rt_ticket' in diag_record and \
 705                         diag_record['found_rt_ticket'] is not None:
 706                         if diag_record['stage'] is not 'improvement':
 707                                 diag_record['stage'] = 'ticket_waitforever'
 708
 709                 current_time = time.time()
 710                 # take off four days, for the delay that database caused.
 711                 # TODO: generalize delays at PLC, and prevent enforcement when there
 712                 #               have been no emails.
 713                 # NOTE: 7*SPERDAY exists to offset the 'bad week'
 714                 #delta = current_time - diag_record['time'] - 7*SPERDAY
 715                 delta = current_time - diag_record['time']
 716
 717                 message = diag_record['message']
 718                 act_record = {}
 719                 act_record.update(diag_record)
 720
 721                 #### DIAGNOSE STAGES
 722                 if   'findbad' in diag_record['stage']:
 723                         # The node is bad, and there's no previous record of it.
 724                         act_record['email'] = TECH
 725                         act_record['action'] = ['noop']
 726                         act_record['message'] = message[0]
 727                         act_record['stage'] = 'stage_actinoneweek'
 728
 729                 elif 'nmreset' in diag_record['stage']:
 730                         act_record['email']  = ADMIN
 731                         act_record['action'] = ['reset_nodemanager']
 732                         act_record['message'] = message[0]
 733                         act_record['stage']  = 'nmreset'
 734                         return None
 735
 736                 elif 'reboot_node' in diag_record['stage']:
 737                         act_record['email'] = TECH
 738                         act_record['action'] = ['noop']
 739                         act_record['message'] = message[0]
 740                         act_record['stage'] = 'stage_actinoneweek'
 741
 742                 elif 'improvement' in diag_record['stage']:
 743                         # - backoff previous squeeze actions (slice suspend, nocreate)
 744                         # TODO: add a backoff_squeeze section... Needs to runthrough
 745                         print "backing off of %s" % nodename
 746                         act_record['action'] = ['close_rt']
 747                         act_record['message'] = message[0]
 748                         act_record['stage'] = 'monitor-end-record'
 749
 750                 elif 'actinoneweek' in diag_record['stage']:
 751                         if delta >= 7 * SPERDAY:
 752                                 act_record['email'] = TECH | PI
 753                                 act_record['stage'] = 'stage_actintwoweeks'
 754                                 act_record['message'] = message[1]
 755                                 act_record['action'] = ['nocreate' ]
 756                                 act_record['time'] = current_time               # reset clock for waitforever
 757                         elif delta >= 3* SPERDAY and not 'second-mail-at-oneweek' in act_record:
 758                                 act_record['email'] = TECH
 759                                 act_record['message'] = message[0]
 760                                 act_record['action'] = ['sendmailagain-waitforoneweekaction' ]
 761                                 act_record['second-mail-at-oneweek'] = True
 762                         else:
 763                                 act_record['message'] = None
 764                                 act_record['action'] = ['waitforoneweekaction' ]
 765                                 print "ignoring this record for: %s" % act_record['nodename']
 766                                 return None                     # don't send if there's no action
 767
 768                 elif 'actintwoweeks' in diag_record['stage']:
 769                         if delta >= 7 * SPERDAY:
 770                                 act_record['email'] = TECH | PI | USER
 771                                 act_record['stage'] = 'stage_waitforever'
 772                                 act_record['message'] = message[2]
 773                                 act_record['action'] = ['suspendslices']
 774                                 act_record['time'] = current_time               # reset clock for waitforever
 775                         elif delta >= 3* SPERDAY and not 'second-mail-at-twoweeks' in act_record:
 776                                 act_record['email'] = TECH | PI
 777                                 act_record['message'] = message[1]
 778                                 act_record['action'] = ['sendmailagain-waitfortwoweeksaction' ]
 779                                 act_record['second-mail-at-twoweeks'] = True
 780                         else:
 781                                 act_record['message'] = None
 782                                 act_record['action'] = ['waitfortwoweeksaction']
 783                                 return None                     # don't send if there's no action
 784
 785                 elif 'ticket_waitforever' in diag_record['stage']:
 786                         act_record['email'] = TECH
 787                         if 'first-found' not in act_record:
 788                                 act_record['first-found'] = True
 789                                 act_record['log'] += " firstfound"
 790                                 act_record['action'] = ['ticket_waitforever']
 791                                 act_record['message'] = message[0]
 792                                 act_record['time'] = current_time
 793                         else:
 794                                 if delta >= 7*SPERDAY:
 795                                         act_record['action'] = ['ticket_waitforever']
 796                                         if 'rt' in act_record and 'Status' in act_record['rt'] and \
 797                                                         act_record['rt']['Status'] == 'new':
 798                                                 act_record['message'] = message[0]
 799                                         else:
 800                                                 act_record['message'] = None
 801
 802                                         act_record['time'] = current_time               # reset clock
 803                                 else:
 804                                         act_record['action'] = ['ticket_waitforever']
 805                                         act_record['message'] = None
 806                                         return None
 807
 808                 elif 'waitforever' in diag_record['stage']:
 809                         # more than 3 days since last action
 810                         # TODO: send only on weekdays.
 811                         # NOTE: expects that 'time' has been reset before entering waitforever stage
 812                         if delta >= 3*SPERDAY:
 813                                 act_record['action'] = ['email-againwaitforever']
 814                                 act_record['message'] = message[2]
 815                                 act_record['time'] = current_time               # reset clock
 816                         else:
 817                                 act_record['action'] = ['waitforever']
 818                                 act_record['message'] = None
 819                                 return None                     # don't send if there's no action
 820
 821                 else:
 822                         # There is no action to be taken, possibly b/c the stage has
 823                         # already been performed, but diagnose picked it up again.
 824                         # two cases,
 825                         #       1. stage is unknown, or
 826                         #       2. delta is not big enough to bump it to the next stage.
 827                         # TODO: figure out which. for now assume 2.
 828                         print "UNKNOWN stage for %s; nothing done" % nodename
 829                         act_record['action'] = ['unknown']
 830                         act_record['message'] = message[0]
 831
 832                         act_record['email'] = TECH
 833                         act_record['action'] = ['noop']
 834                         act_record['message'] = message[0]
 835                         act_record['stage'] = 'stage_actinoneweek'
 836                         act_record['time'] = current_time               # reset clock
 837                         #print "Exiting..."
 838                         #return None
 839                         #sys.exit(1)
 840
 841                 print "%s" % act_record['log'],
 842                 print "%15s" % act_record['action']
 843                 return act_record
 844
 845         def getMaxSlices(self, loginbase):
 846                 # if sickdb has a loginbase, then it will have at least one node.
 847                 site_stats = None
 848
 849                 for nodename in self.diagnose_in[loginbase].keys():
 850                         if nodename in self.findbad['nodes']:
 851                                 site_stats = self.findbad['nodes'][nodename]['values']['plcsite']
 852                                 break
 853
 854                 if site_stats == None:
 855                         raise Exception, "loginbase with no nodes in findbad"
 856                 else:
 857                         return site_stats['max_slices']
 858
 859         def getNumNodes(self, loginbase):
 860                 # if sickdb has a loginbase, then it will have at least one node.
 861                 site_stats = None
 862
 863                 for nodename in self.diagnose_in[loginbase].keys():
 864                         if nodename in self.findbad['nodes']:
 865                                 site_stats = self.findbad['nodes'][nodename]['values']['plcsite']
 866                                 break
 867
 868                 if site_stats == None:
 869                         raise Exception, "loginbase with no nodes in findbad"
 870                 else:
 871                         if 'num_nodes' in site_stats:
 872                                 return site_stats['num_nodes']
 873                         else:
 874                                 return 0
 875
 876         """
 877         Returns number of up nodes as the total number *NOT* in act_all with a
 878         stage other than 'steady-state' .
 879         """
 880         def getUpAtSite(self, loginbase, d_diag_site):
 881                 # TODO: THIS DOESN"T WORK!!! it misses all the 'debug' state nodes
 882                 #               that aren't recorded yet.
 883
 884                 numnodes = self.getNumNodes(loginbase)
 885                 # NOTE: assume nodes we have no record of are ok. (too conservative)
 886                 # TODO: make the 'up' value more representative
 887                 up = numnodes
 888                 for nodename in d_diag_site[loginbase]['nodes'].keys():
 889
 890                         rec = d_diag_site[loginbase]['nodes'][nodename]
 891                         if rec['stage'] != 'monitor-end-record':
 892                                 up -= 1
 893                         else:
 894                                 pass # the node is assumed to be up.
 895
 896                 #if up != numnodes:
 897                 #       print "ERROR: %s total nodes up and down != %d" % (loginbase, numnodes)
 898
 899                 return up
 900
 901
 902 class SiteAction:
 903         def __init__(self, parameter_names=['hostname', 'ticket_id']):
 904                 self.parameter_names = parameter_names
 905         def checkParam(self, args):
 906                 for param in self.parameter_names:
 907                         if param not in args:
 908                                 raise Exception("Parameter %s not provided in args"%param)
 909         def run(self, args):
 910                 self.checkParam(args)
 911                 return self._run(args)
 912         def _run(self, args):
 913                 pass
 914
 915 class SuspendAction(SiteAction):
 916         def _run(self, args):
 917                 return plc.suspendSlices(args['hostname'])
 918
 919 class RemoveSliceCreation(SiteAction):
 920         def _run(self, args):
 921                 return plc.removeSliceCreation(args['hostname'])
 922
 923 class BackoffActions(SiteAction):
 924         def _run(self, args):
 925                 plc.enableSlices(args['hostname'])
 926                 plc.enableSliceCreation(args['hostname'])
 927                 return True
 928
 929 # TODO: create class for each action below,
 930 #               allow for lists of actions to be performed...
 931
 932 def close_rt_backoff(args):
 933         if 'ticket_id' in args and (args['ticket_id'] != "" and args['ticket_id'] != None):
 934                 mailer.closeTicketViaRT(args['ticket_id'],
 935                                                                 "Ticket CLOSED automatically by SiteAssist.")
 936                 plc.enableSlices(args['hostname'])
 937                 plc.enableSliceCreation(args['hostname'])
 938         return
 939
 940 def reboot_node(args):
 941         host = args['hostname']
 942         return reboot.reboot_policy(host, True, config.debug)
 943
 944 def reset_nodemanager(args):
 945         os.system("ssh root@%s /sbin/service nm restart" % nodename)
 946         return
 947
 948 class Action(Thread):
 949         def __init__(self, l_action):
 950                 self.l_action = l_action
 951
 952                 # the hostname to loginbase mapping
 953                 self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")
 954
 955                 # Actions to take.
 956                 self.diagnose_db = database.if_cached_else(1, "diagnose_out", lambda : {})
 957                 # Actions taken.
 958                 self.act_all   = database.if_cached_else(1, "act_all", lambda : {})
 959
 960                 # A dict of actions to specific functions. PICKLE doesnt' like lambdas.
 961                 self.actions = {}
 962                 self.actions['suspendslices'] = lambda args: plc.suspendSlices(args['hostname'])
 963                 self.actions['nocreate'] = lambda args: plc.removeSliceCreation(args['hostname'])
 964                 self.actions['close_rt'] = lambda args: close_rt_backoff(args)
 965                 self.actions['rins'] = lambda args: plc.nodeBootState(args['hostname'], "rins")
 966                 self.actions['noop'] = lambda args: args
 967                 self.actions['reboot_node'] = lambda args: reboot_node(args)
 968                 self.actions['reset_nodemanager'] = lambda args: args # reset_nodemanager(args)
 969
 970                 self.actions['ticket_waitforever'] = lambda args: args
 971                 self.actions['waitforever'] = lambda args: args
 972                 self.actions['unknown'] = lambda args: args
 973                 self.actions['waitforoneweekaction'] = lambda args: args
 974                 self.actions['waitfortwoweeksaction'] = lambda args: args
 975                 self.actions['sendmailagain-waitforoneweekaction'] = lambda args: args
 976                 self.actions['sendmailagain-waitfortwoweeksaction'] = lambda args: args
 977                 self.actions['email-againwaitforever'] = lambda args: args
 978                 self.actions['email-againticket_waitforever'] = lambda args: args
 979
 980
 981                 self.sickdb = {}
 982                 Thread.__init__(self)
 983
 984         def run(self):
 985                 self.accumSites()
 986                 print "Accumulated %d sick sites" % len(self.sickdb.keys())
 987                 logger.debug("Accumulated %d sick sites" % len(self.sickdb.keys()))
 988
 989                 try:
 990                         stats = self.analyseSites()
 991                 except Exception, err:
 992                         print "----------------"
 993                         import traceback
 994                         print traceback.print_exc()
 995                         print err
 996                         if config.policysavedb:
 997                                 print "Saving Databases... act_all"
 998                                 database.dbDump("act_all", self.act_all)
 999                         sys.exit(1)
1000
1001                 print_stats("sites_observed", stats)
1002                 print_stats("sites_diagnosed", stats)
1003                 print_stats("nodes_diagnosed", stats)
1004                 print_stats("sites_emailed", stats)
1005                 print_stats("nodes_actedon", stats)
1006                 print string.join(stats['allsites'], ",")
1007
1008                 if config.policysavedb:
1009                         print "Saving Databases... act_all"
1010                         #database.dbDump("policy.eventlog", self.eventlog)
1011                         # TODO: remove 'diagnose_out',
1012                         #       or at least the entries that were acted on.
1013                         database.dbDump("act_all", self.act_all)
1014
1015         def accumSites(self):
1016                 """
1017                 Take all nodes, from l_action, look them up in the diagnose_db database,
1018                 and insert them into sickdb[] as:
1019
1020                 This way only the given l_action nodes will be acted on regardless
1021                 of how many from diagnose_db are available.
1022
1023                         sickdb[loginbase][nodename] = diag_record
1024                 """
1025                 # TODO: what if l_action == None ?
1026                 for nodename in self.l_action:
1027
1028                         loginbase = self.plcdb_hn2lb[nodename]
1029
1030                         if loginbase in self.diagnose_db and \
1031                                 nodename in self.diagnose_db[loginbase]['nodes']:
1032
1033                                 diag_record = self.diagnose_db[loginbase]['nodes'][nodename]
1034
1035                                 if loginbase not in self.sickdb:
1036                                         self.sickdb[loginbase] = {'nodes' : {}}
1037
1038                                 # NOTE: don't copy all node records, since not all will be in l_action
1039                                 self.sickdb[loginbase]['nodes'][nodename] = diag_record
1040                                 # NOTE: but, we want to get the loginbase config settings,
1041                                 #               this is the easiest way.
1042                                 self.sickdb[loginbase]['config'] = self.diagnose_db[loginbase]['config']
1043                         #else:
1044                                 #print "%s not in diagnose_db!!" % loginbase
1045                 return
1046
1047         def __emailSite(self, loginbase, roles, message, args):
1048                 """
1049                 loginbase is the unique site abbreviation, prepended to slice names.
1050                 roles contains TECH, PI, USER roles, and derive email aliases.
1051                 record contains {'message': [<subj>,<body>], 'args': {...}}
1052                 """
1053                 ticket_id = 0
1054                 args.update({'loginbase':loginbase})
1055
1056                 if not config.mail and not config.debug and config.bcc:
1057                         roles = ADMIN
1058                 if config.mail and config.debug:
1059                         roles = ADMIN
1060
1061                 # build targets
1062                 contacts = []
1063                 if ADMIN & roles:
1064                         contacts += [config.email]
1065                 if TECH & roles:
1066                         contacts += [TECHEMAIL % loginbase]
1067                 if PI & roles:
1068                         contacts += [PIEMAIL % loginbase]
1069                 if USER & roles:
1070                         slices = plc.slices(loginbase)
1071                         if len(slices) >= 1:
1072                                 for slice in slices:
1073                                         contacts += [SLICEMAIL % slice]
1074                                 print "SLIC: %20s : %d slices" % (loginbase, len(slices))
1075                         else:
1076                                 print "SLIC: %20s : 0 slices" % loginbase
1077
1078                 try:
1079                         subject = message[0] % args
1080                         body = message[1] % args
1081                         if ADMIN & roles:
1082                                 # send only to admin
1083                                 if 'ticket_id' in args:
1084                                         subj = "Re: [PL #%s] %s" % (args['ticket_id'], subject)
1085                                 else:
1086                                         subj = "Re: [PL noticket] %s" % subject
1087                                 mailer.email(subj, body, contacts)
1088                                 ticket_id = args['ticket_id']
1089                         else:
1090                                 ticket_id = mailer.emailViaRT(subject, body, contacts, args['ticket_id'])
1091                 except Exception, err:
1092                         print "exception on message:"
1093                         import traceback
1094                         print traceback.print_exc()
1095                         print message
1096
1097                 return ticket_id
1098
1099
1100         def _format_diaginfo(self, diag_node):
1101                 info = diag_node['info']
1102                 if diag_node['stage'] == 'monitor-end-record':
1103                         hlist = "    %s went from '%s' to '%s'\n" % (info[0], info[1], info[2])
1104                 else:
1105                         hlist = "    %s %s - %s\n" % (info[0], info[2], info[1]) #(node,ver,daysdn)
1106                 return hlist
1107
1108
1109         def get_email_args(self, act_recordlist, loginbase=None):
1110
1111                 email_args = {}
1112                 email_args['hostname_list'] = ""
1113
1114                 for act_record in act_recordlist:
1115                         email_args['hostname_list'] += act_record['msg_format']
1116                         email_args['hostname'] = act_record['nodename']
1117                         if  'plcnode' in act_record and \
1118                                 'pcu_ids' in act_record['plcnode'] and \
1119                                 len(act_record['plcnode']['pcu_ids']) > 0:
1120                                 print "setting 'pcu_id' for email_args %s"%email_args['hostname']
1121                                 email_args['pcu_id'] = act_record['plcnode']['pcu_ids'][0]
1122                         else:
1123                                 email_args['pcu_id'] = "-1"
1124
1125                         if 'ticket_id' in act_record:
1126                                 if act_record['ticket_id'] == 0 or act_record['ticket_id'] == '0':
1127                                         print "Enter the ticket_id for %s @ %s" % (loginbase, act_record['nodename'])
1128                                         sys.stdout.flush()
1129                                         line = sys.stdin.readline()
1130                                         try:
1131                                                 ticket_id = int(line)
1132                                         except:
1133                                                 print "could not get ticket_id from stdin..."
1134                                                 os._exit(1)
1135                                 else:
1136                                         ticket_id = act_record['ticket_id']
1137
1138                                 email_args['ticket_id'] = ticket_id
1139
1140                 return email_args
1141
1142         def get_unique_issues(self, act_recordlist):
1143                 # NOTE: only send one email per site, per problem...
1144                 unique_issues = {}
1145                 for act_record in act_recordlist:
1146                         act_key = act_record['action'][0]
1147                         if act_key not in unique_issues:
1148                                 unique_issues[act_key] = []
1149
1150                         unique_issues[act_key] += [act_record]
1151
1152                 return unique_issues
1153
1154
1155         def __actOnSite(self, loginbase, site_record):
1156                 i_nodes_actedon = 0
1157                 i_nodes_emailed = 0
1158
1159                 act_recordlist = []
1160
1161                 for nodename in site_record['nodes'].keys():
1162                         diag_record = site_record['nodes'][nodename]
1163                         act_record  = self.__actOnNode(diag_record)
1164                         #print "nodename: %s %s" % (nodename, act_record)
1165                         if act_record is not None:
1166                                 act_recordlist += [act_record]
1167
1168                 unique_issues = self.get_unique_issues(act_recordlist)
1169
1170                 for issue in unique_issues.keys():
1171                         print "\tworking on issue: %s" % issue
1172                         issue_record_list = unique_issues[issue]
1173                         email_args = self.get_email_args(issue_record_list, loginbase)
1174
1175                         # for each record.
1176                         for act_record in issue_record_list:
1177                                 # if there's a pcu record and email config is set
1178                                 if 'email_pcu' in act_record:
1179                                         if act_record['message'] != None and act_record['email_pcu'] and site_record['config']['email']:
1180                                                 # and 'reboot_node' in act_record['stage']:
1181
1182                                                 email_args['hostname'] = act_record['nodename']
1183                                                 ticket_id = self.__emailSite(loginbase,
1184                                                                                         act_record['email'],
1185                                                                                         emailTxt.mailtxt.pcudown[0],
1186                                                                                         email_args)
1187                                                 if ticket_id == 0:
1188                                                         # error.
1189                                                         print "got a ticket_id == 0!!!! %s" % act_record['nodename']
1190                                                         os._exit(1)
1191                                                         pass
1192                                                 email_args['ticket_id'] = ticket_id
1193
1194
1195                         act_record = issue_record_list[0]
1196                         # send message before squeezing
1197                         print "\t\tconfig.email: %s and %s" % (act_record['message'] != None,
1198                                                                                                 site_record['config']['email'])
1199                         if act_record['message'] != None and site_record['config']['email']:
1200                                 ticket_id = self.__emailSite(loginbase, act_record['email'],
1201                                                                                          act_record['message'], email_args)
1202
1203                                 if ticket_id == 0:
1204                                         # error.
1205                                         print "ticket_id == 0 for %s %s" % (loginbase, act_record['nodename'])
1206                                         os._exit(1)
1207                                         pass
1208
1209                                 # Add ticket_id to ALL nodenames
1210                                 for act_record in issue_record_list:
1211                                         nodename = act_record['nodename']
1212                                         # update node record with RT ticket_id
1213                                         if nodename in self.act_all:
1214                                                 self.act_all[nodename][0]['ticket_id'] = "%s" % ticket_id
1215                                                 # if the ticket was previously resolved, reset it to new.
1216                                                 if 'rt' in act_record and \
1217                                                         'Status' in act_record['rt'] and \
1218                                                         act_record['rt']['Status'] == 'resolved':
1219                                                         mailer.setTicketStatus(ticket_id, "new")
1220                                                 status = mailer.getTicketStatus(ticket_id)
1221                                                 self.act_all[nodename][0]['rt'] = status
1222                                         if config.mail: i_nodes_emailed += 1
1223
1224                         print "\t\tconfig.squeeze: %s and %s" % (config.squeeze,
1225                                                                                                         site_record['config']['squeeze'])
1226                         if config.squeeze and site_record['config']['squeeze']:
1227                                 for act_key in act_record['action']:
1228                                         self.actions[act_key](email_args)
1229                                 i_nodes_actedon += 1
1230
1231                 if config.policysavedb:
1232                         print "Saving Databases... act_all, diagnose_out"
1233                         database.dbDump("act_all", self.act_all)
1234                         # remove site record from diagnose_out, it's in act_all as done.
1235                         del self.diagnose_db[loginbase]
1236                         database.dbDump("diagnose_out", self.diagnose_db)
1237
1238                 print "sleeping for 1 sec"
1239                 time.sleep(1)
1240                 #print "Hit enter to continue..."
1241                 #sys.stdout.flush()
1242                 #line = sys.stdin.readline()
1243
1244                 return (i_nodes_actedon, i_nodes_emailed)
1245
1246         def __actOnNode(self, diag_record):
1247                 nodename = diag_record['nodename']
1248                 message = diag_record['message']
1249
1250                 act_record = {}
1251                 act_record.update(diag_record)
1252                 act_record['nodename'] = nodename
1253                 act_record['msg_format'] = self._format_diaginfo(diag_record)
1254                 print "act_record['stage'] == %s " % act_record['stage']
1255
1256                 # avoid end records, and nmreset records
1257                 # reboot_node_failed, is set below, so don't reboot repeatedly.
1258
1259                 if 'monitor-end-record' not in act_record['stage'] and \
1260                    'nmreset' not in act_record['stage'] and \
1261                    'reboot_node_failed' not in act_record:
1262
1263                         if "DOWN" in act_record['log'] and \
1264                                         'pcu_ids' in act_record['plcnode'] and \
1265                                         len(act_record['plcnode']['pcu_ids']) > 0:
1266
1267                                 print "%s" % act_record['log'],
1268                                 print "%15s" % (['reboot_node'],)
1269                                 # Set node to re-install
1270                                 plc.nodeBootState(act_record['nodename'], "rins")
1271                                 try:
1272                                         ret = reboot_node({'hostname': act_record['nodename']})
1273                                 except Exception, exc:
1274                                         print "exception on reboot_node:"
1275                                         import traceback
1276                                         print traceback.print_exc()
1277                                         ret = False
1278
1279                                 if ret: # and ( 'reboot_node_failed' not in act_record or act_record['reboot_node_failed'] == False):
1280                                         # Reboot Succeeded
1281                                         print "reboot succeeded for %s" % act_record['nodename']
1282                                         act_record2 = {}
1283                                         act_record2.update(act_record)
1284                                         act_record2['action'] = ['reboot_node']
1285                                         act_record2['stage'] = "reboot_node"
1286                                         act_record2['reboot_node_failed'] = False
1287                                         act_record2['email_pcu'] = False
1288
1289                                         if nodename not in self.act_all:
1290                                                 self.act_all[nodename] = []
1291                                         print "inserting 'reboot_node' record into act_all"
1292                                         self.act_all[nodename].insert(0,act_record2)
1293
1294                                         # return None to avoid further action
1295                                         print "Taking no further action"
1296                                         return None
1297                                 else:
1298                                         print "reboot failed for %s" % act_record['nodename']
1299                                         # set email_pcu to also send pcu notice for this record.
1300                                         act_record['reboot_node_failed'] = True
1301                                         act_record['email_pcu'] = True
1302
1303                         print "%s" % act_record['log'],
1304                         print "%15s" % act_record['action']
1305
1306                 if act_record['stage'] is not 'monitor-end-record' and \
1307                    act_record['stage'] is not 'nmreset':
1308                         if nodename not in self.act_all:
1309                                 self.act_all[nodename] = []
1310
1311                         self.act_all[nodename].insert(0,act_record)
1312                 else:
1313                         print "Not recording %s in act_all" % nodename
1314
1315                 return act_record
1316
1317         def analyseSites(self):
1318                 i_sites_observed = 0
1319                 i_sites_diagnosed = 0
1320                 i_nodes_diagnosed = 0
1321                 i_nodes_actedon = 0
1322                 i_sites_emailed = 0
1323                 l_allsites = []
1324
1325                 sorted_sites = self.sickdb.keys()
1326                 sorted_sites.sort()
1327                 for loginbase in sorted_sites:
1328                         site_record = self.sickdb[loginbase]
1329                         print "sites: %s" % loginbase
1330
1331                         i_nodes_diagnosed += len(site_record.keys())
1332                         i_sites_diagnosed += 1
1333
1334                         (na,ne) = self.__actOnSite(loginbase, site_record)
1335
1336                         i_sites_observed += 1
1337                         i_nodes_actedon += na
1338                         i_sites_emailed += ne
1339
1340                         l_allsites += [loginbase]
1341
1342                 return {'sites_observed': i_sites_observed,
1343                                 'sites_diagnosed': i_sites_diagnosed,
1344                                 'nodes_diagnosed': i_nodes_diagnosed,
1345                                 'sites_emailed': i_sites_emailed,
1346                                 'nodes_actedon': i_nodes_actedon,
1347                                 'allsites':l_allsites}
1348
1349         def print_stats(self, key, stats):
1350                 print "%20s : %d" % (key, stats[key])
1351
1352
1353
1354         #"""
1355         #Prints, logs, and emails status of up nodes, down nodes, and buckets.
1356         #"""
1357         #def status(self):
1358         #       sub = "Monitor Summary"
1359         #       msg = "\nThe following nodes were acted upon:  \n\n"
1360         #       for (node, (type, date)) in self.emailed.items():
1361         #               # Print only things acted on today.
1362         #               if (time.gmtime(time.time())[2] == time.gmtime(date)[2]):
1363         #                       msg +="%s\t(%s)\t%s\n" %(node, type, time.ctime(date))
1364         #       msg +="\n\nThe following sites have been 'squeezed':\n\n"
1365         #       for (loginbase, (date, type)) in self.squeezed.items():
1366         #               # Print only things acted on today.
1367         #               if (time.gmtime(time.time())[2] == time.gmtime(date)[2]):
1368         #                       msg +="%s\t(%s)\t%s\n" %(loginbase, type, time.ctime(date))
1369         #       mailer.email(sub, msg, [SUMTO])
1370         #       logger.info(msg)
1371         #       return
1372
1373         #"""
1374         #Store/Load state of emails.  When, where, what.
1375         #"""
1376         #def emailedStore(self, action):
1377         #       try:
1378         #               if action == "LOAD":
1379         #                       f = open(DAT, "r+")
1380         #                       logger.info("POLICY:  Found and reading " + DAT)
1381         #                       self.emailed.update(pickle.load(f))
1382         #               if action == "WRITE":
1383         #                       f = open(DAT, "w")
1384         #                       #logger.debug("Writing " + DAT)
1385         #                       pickle.dump(self.emailed, f)
1386         #               f.close()
1387         #       except Exception, err:
1388         #               logger.info("POLICY:  Problem with DAT, %s" %err)
1389
1390
1391 #class Policy(Thread):
1392
def main():
        """Tell the user that this file is a module, not a runnable script."""
        notice = "policy.py is a module, not a script for running directly."
        print(notice)
1395
if __name__ == '__main__':
        # Script entry point.  os and plc are already imported at the top of
        # this file, so they are not imported again here.
        try:
                main()
        except KeyboardInterrupt:
                # Ctrl-C: report, log, and leave immediately via os._exit().
                print("Killed.  Exitting.")
                logger.info('Monitor Killed')
                os._exit(0)