policy.py

   1 #
   2 # Copyright (c) 2004  The Trustees of Princeton University (Trustees).
   3 #
   4 # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
   5 #
   6 # $Id: policy.py,v 1.17 2007/08/29 17:26:50 soltesz Exp $
   7 #
   8 # Policy Engine.
   9
  10 #from monitor import *
  11 from threading import *
  12 import time
  13 import logging
  14 import mailer
  15 import emailTxt
  16 import pickle
  17 import Queue
  18 import plc
  19 import sys
  20 import os
  21 import reboot
  22 import database
  23 import string
  24 from www.printbadnodes import cmpCategoryVal
  25 import config
  26
  27 DAT="./monitor.dat"
  28
  29 logger = logging.getLogger("monitor")
  30
  31 # Time to enforce policy
  32 POLSLEEP = 7200
  33
  34 # Where to email the summary
  35 SUMTO = "soltesz@cs.princeton.edu"
  36 TECHEMAIL="tech-%s@sites.planet-lab.org"
  37 PIEMAIL="pi-%s@sites.planet-lab.org"
  38 SLICEMAIL="%s@slices.planet-lab.org"
  39 PLCEMAIL="support@planet-lab.org"
  40
  41 #Thresholds (DAYS)
  42 SPERMIN = 60
  43 SPERHOUR = 60*60
  44 SPERDAY = 86400
  45 PITHRESH = 7 * SPERDAY
  46 SLICETHRESH = 7 * SPERDAY
  47 # Days before attempting rins again
  48 RINSTHRESH = 5 * SPERDAY
  49
  50 # Days before calling the node dead.
  51 DEADTHRESH = 30 * SPERDAY
  52 # Minimum number of nodes up before squeezing
  53 MINUP = 2
  54
  55 TECH=1
  56 PI=2
  57 USER=4
  58 ADMIN=8
  59
  60 # IF:
  61 #  no SSH, down.
  62 #  bad disk, down
  63 #  DNS, kinda down (sick)
  64 #  clock, kinda down (sick)
  65 #  Full disk, going to be down
  66
  67 # Actions:
  68 #  Email
  69 #  suspend slice creation
  70 #  kill slices
  71 def array_to_priority_map(array):
  72         """ Create a mapping where each entry of array is given a priority equal
  73         to its position in the array.  This is useful for subsequent use in the
  74         cmpMap() function."""
  75         map = {}
  76         count = 0
  77         for i in array:
  78                 map[i] = count
  79                 count += 1
  80         return map
  81
  82 def getdebug():
  83         return config.debug
  84
  85 def print_stats(key, stats):
  86         if key in stats: print "%20s : %d" % (key, stats[key])
  87
  88 def get_ticket_id(record):
  89         if 'ticket_id' in record and record['ticket_id'] is not "" and record['ticket_id'] is not None:
  90                 return record['ticket_id']
  91         elif            'found_rt_ticket' in record and \
  92                  record['found_rt_ticket'] is not "" and \
  93                  record['found_rt_ticket'] is not None:
  94                 return record['found_rt_ticket']
  95         else:
  96                 return None
  97
  98 class Merge(Thread):
  99         def __init__(self, l_merge, toRT):
 100                 self.toRT = toRT
 101                 self.merge_list = l_merge
 102                 # the hostname to loginbase mapping
 103                 self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")
 104
 105                 # Previous actions taken on nodes.
 106                 self.act_all = database.if_cached_else(1, "act_all", lambda : {})
 107                 self.findbad = database.if_cached_else(1, "findbad", lambda : {})
 108
 109                 self.cache_all = database.if_cached_else(1, "act_all", lambda : {})
 110                 self.sickdb = {}
 111                 self.mergedb = {}
 112                 Thread.__init__(self)
 113
 114         def run(self):
 115                 # populate sickdb
 116                 self.accumSickSites()
 117                 # read data from findbad and act_all
 118                 self.mergeActionsAndBadDB()
 119                 # pass node_records to RT
 120                 self.sendToRT()
 121
 122         def accumSickSites(self):
 123                 """
 124                 Take all nodes, from l_diagnose, look them up in the act_all database,
 125                 and insert them into sickdb[] as:
 126
 127                         sickdb[loginbase][nodename] = fb_record
 128                 """
 129                 # look at all problems reported by findbad
 130                 l_nodes = self.findbad['nodes'].keys()
 131                 count = 0
 132                 for nodename in l_nodes:
 133                         if nodename not in self.merge_list:
 134                                 continue                # skip this node, since it's not wanted
 135
 136                         count += 1
 137                         loginbase = self.plcdb_hn2lb[nodename]
 138                         values = self.findbad['nodes'][nodename]['values']
 139
 140                         fb_record = {}
 141                         fb_record['nodename'] = nodename
 142                         try:
 143                                 fb_record['category'] = values['category']
 144                         except:
 145                                 print values
 146                                 print nodename
 147                                 print self.findbad['nodes'][nodename]
 148                                 count -= 1
 149                                 continue
 150                         fb_record['state'] = values['state']
 151                         fb_record['comonstats'] = values['comonstats']
 152                         fb_record['plcnode'] = values['plcnode']
 153                         fb_record['kernel'] = self.getKernel(values['kernel'])
 154                         fb_record['stage'] = "findbad"
 155                         fb_record['message'] = None
 156                         fb_record['bootcd'] = values['bootcd']
 157                         fb_record['args'] = None
 158                         fb_record['info'] = None
 159                         fb_record['time'] = time.time()
 160                         fb_record['date_created'] = time.time()
 161
 162                         if loginbase not in self.sickdb:
 163                                 self.sickdb[loginbase] = {}
 164
 165                         self.sickdb[loginbase][nodename] = fb_record
 166
 167                 print "Found %d nodes" % count
 168
 169         def getKernel(self, unamestr):
 170                 s = unamestr.split()
 171                 if len(s) > 2:
 172                         return s[2]
 173                 else:
 174                         return ""
 175
 176         def mergeActionsAndBadDB(self):
 177                 """
 178                 - Look at the sick node_records as reported in findbad,
 179                 - Then look at the node_records in act_all.
 180
 181                 There are four cases:
 182                 1) Problem in findbad, no problem in act_all
 183                         this ok, b/c it just means it's a new problem
 184                 2) Problem in findbad, problem in act_all
 185                         -Did the problem get better or worse?
 186                                 -If Same, or Worse, then continue looking for open tickets.
 187                                 -If Better, or No problem, then "back-off" penalties.
 188                                         This judgement may need to wait until 'Diagnose()'
 189
 190                 3) No problem in findbad, problem in act_all
 191                         The the node is operational again according to Findbad()
 192
 193                 4) No problem in findbad, no problem in act_all
 194                         There won't be a record in either db, so there's no code.
 195                 """
 196
 197                 sorted_sites = self.sickdb.keys()
 198                 sorted_sites.sort()
 199                 # look at all problems reported by findbad
 200                 for loginbase in sorted_sites:
 201                         d_fb_nodes = self.sickdb[loginbase]
 202                         sorted_nodes = d_fb_nodes.keys()
 203                         sorted_nodes.sort()
 204                         for nodename in sorted_nodes:
 205                                 fb_record = self.sickdb[loginbase][nodename]
 206                                 x = fb_record
 207                                 if loginbase not in self.mergedb:
 208                                         self.mergedb[loginbase] = {}
 209
 210                                 # take the info either from act_all or fb-record.
 211                                 # if node not in act_all
 212                                 #       then take it from fbrecord, obviously.
 213                                 # else node in act_all
 214                                 #   if act_all == 0 length (no previous records)
 215                                 #               then take it from fbrecord.
 216                                 #   else
 217                                 #           take it from act_all.
 218                                 #
 219
 220                                 # We must compare findbad state with act_all state
 221                                 if nodename not in self.act_all:
 222                                         # 1) ok, b/c it's a new problem. set ticket_id to null
 223                                         self.mergedb[loginbase][nodename] = {}
 224                                         self.mergedb[loginbase][nodename].update(x)
 225                                         self.mergedb[loginbase][nodename]['ticket_id'] = ""
 226                                         self.mergedb[loginbase][nodename]['prev_category'] = "NORECORD"
 227                                 else:
 228                                         if len(self.act_all[nodename]) == 0:
 229                                                 self.mergedb[loginbase][nodename] = {}
 230                                                 self.mergedb[loginbase][nodename].update(x)
 231                                                 self.mergedb[loginbase][nodename]['ticket_id'] = ""
 232                                                 self.mergedb[loginbase][nodename]['prev_category'] = "NORECORD"
 233                                         else:
 234                                                 y = self.act_all[nodename][0]
 235                                                 y['prev_category'] = y['category']
 236
 237                                                 self.mergedb[loginbase][nodename] = {}
 238                                                 self.mergedb[loginbase][nodename].update(y)
 239                                                 self.mergedb[loginbase][nodename]['comonstats'] = x['comonstats']
 240                                                 self.mergedb[loginbase][nodename]['category']   = x['category']
 241                                                 self.mergedb[loginbase][nodename]['state'] = x['state']
 242                                                 self.mergedb[loginbase][nodename]['kernel']=x['kernel']
 243                                                 self.mergedb[loginbase][nodename]['bootcd']=x['bootcd']
 244                                                 self.mergedb[loginbase][nodename]['plcnode']=x['plcnode']
 245                                                 ticket = get_ticket_id(self.mergedb[loginbase][nodename])
 246                                                 self.mergedb[loginbase][nodename]['rt'] = mailer.getTicketStatus(ticket)
 247
 248                                         # delete the entry from cache_all to keep it out of case 3)
 249                                         del self.cache_all[nodename]
 250
 251                 # 3) nodes that remin in cache_all were not identified by findbad.
 252                 #        Do we keep them or not?
 253                 #   NOTE: i think that since the categories are performed before this
 254                 #               step now, and by a monitor-controlled agent.
 255
 256                 # TODO: This does not work correctly.  Do we need this?
 257                 #for hn in self.cache_all.keys():
 258                 #       y = self.act_all[hn][0]
 259                 #       if 'monitor' in y['bucket']:
 260                 #               loginbase = self.plcdb_hn2lb[hn]
 261                 #               if loginbase not in self.sickdb:
 262                 #                       self.sickdb[loginbase] = {}
 263                 #               self.sickdb[loginbase][hn] = y
 264                 #       else:
 265                 #               del self.cache_all[hn]
 266
 267                 print "len of cache_all: %d" % len(self.cache_all.keys())
 268                 return
 269
 270         def sendToRT(self):
 271                 sorted_sites = self.mergedb.keys()
 272                 sorted_sites.sort()
 273                 # look at all problems reported by merge
 274                 for loginbase in sorted_sites:
 275                         d_merge_nodes = self.mergedb[loginbase]
 276                         for nodename in d_merge_nodes.keys():
 277                                 record = self.mergedb[loginbase][nodename]
 278                                 self.toRT.put(record)
 279
 280                 # send signal to stop reading
 281                 self.toRT.put(None)
 282                 return
 283
 284 class Diagnose(Thread):
 285         def __init__(self, fromRT):
 286                 self.fromRT = fromRT
 287                 self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")
 288                 self.findbad = database.if_cached_else(1, "findbad", lambda : {})
 289
 290                 self.diagnose_in = {}
 291                 self.diagnose_out = {}
 292                 Thread.__init__(self)
 293
 294
 295         def run(self):
 296                 self.accumSickSites()
 297
 298                 print "Accumulated %d sick sites" % len(self.diagnose_in.keys())
 299                 logger.debug("Accumulated %d sick sites" % len(self.diagnose_in.keys()))
 300
 301                 try:
 302                         stats = self.diagnoseAll()
 303                 except Exception, err:
 304                         print "----------------"
 305                         import traceback
 306                         print traceback.print_exc()
 307                         print err
 308                         #if config.policysavedb:
 309                         sys.exit(1)
 310
 311                 print_stats("sites_observed", stats)
 312                 print_stats("sites_diagnosed", stats)
 313                 print_stats("nodes_diagnosed", stats)
 314
 315                 if config.policysavedb:
 316                         print "Saving Databases... diagnose_out"
 317                         database.dbDump("diagnose_out", self.diagnose_out)
 318
 319         def accumSickSites(self):
 320                 """
 321                 Take all nodes, from l_diagnose, look them up in the diagnose_out database,
 322                 and insert them into diagnose_in[] as:
 323
 324                         diagnose_in[loginbase] = [diag_node1, diag_node2, ...]
 325                 """
 326                 while 1:
 327                         node_record = self.fromRT.get(block = True)
 328                         if node_record == None:
 329                                 break;
 330
 331                         nodename = node_record['nodename']
 332                         loginbase = self.plcdb_hn2lb[nodename]
 333
 334                         if loginbase not in self.diagnose_in:
 335                                 self.diagnose_in[loginbase] = {}
 336
 337                         self.diagnose_in[loginbase][nodename] = node_record
 338
 339                 return
 340
 341         def diagnoseAll(self):
 342                 i_sites_observed = 0
 343                 i_sites_diagnosed = 0
 344                 i_nodes_diagnosed = 0
 345                 i_nodes_actedon = 0
 346                 i_sites_emailed = 0
 347                 l_allsites = []
 348
 349                 sorted_sites = self.diagnose_in.keys()
 350                 sorted_sites.sort()
 351                 self.diagnose_out= {}
 352                 for loginbase in sorted_sites:
 353                         l_allsites += [loginbase]
 354
 355                         d_diag_nodes = self.diagnose_in[loginbase]
 356                         d_act_records = self.__diagnoseSite(loginbase, d_diag_nodes)
 357                         # store records in diagnose_out, for saving later.
 358                         self.diagnose_out.update(d_act_records)
 359
 360                         if len(d_act_records[loginbase]['nodes'].keys()) > 0:
 361                                 i_nodes_diagnosed += (len(d_act_records[loginbase]['nodes'].keys()))
 362                                 i_sites_diagnosed += 1
 363                         i_sites_observed += 1
 364
 365                 return {'sites_observed': i_sites_observed,
 366                                 'sites_diagnosed': i_sites_diagnosed,
 367                                 'nodes_diagnosed': i_nodes_diagnosed,
 368                                 'allsites':l_allsites}
 369
 370                 pass
 371
 372         def getDaysDown(cls, diag_record):
 373                 daysdown = -1
 374                 last_contact = diag_record['plcnode']['last_contact']
 375                 date_created = diag_record['plcnode']['date_created']
 376
 377                 if diag_record['comonstats']['uptime'] != "null" and diag_record['comonstats']['uptime'] != "-1":
 378                         daysdown = - int(float(diag_record['comonstats']['uptime'])) // (60*60*24)
 379                 elif last_contact is None:
 380                         if date_created is not None:
 381                                 now = time.time()
 382                                 diff = now - date_created
 383                                 daysdown = diff // (60*60*24)
 384                         else:
 385                                 daysdown = -1
 386                 else:
 387                         now = time.time()
 388                         diff = now - last_contact
 389                         daysdown = diff // (60*60*24)
 390                 return daysdown
 391         getDaysDown = classmethod(getDaysDown)
 392
 393         def getStrDaysDown(cls, diag_record):
 394                 daysdown = "unknown"
 395                 last_contact = diag_record['plcnode']['last_contact']
 396                 date_created = diag_record['plcnode']['date_created']
 397
 398                 if      diag_record['comonstats']['uptime'] != "null" and \
 399                         diag_record['comonstats']['uptime'] != "-1":
 400                         daysdown = int(float(diag_record['comonstats']['uptime'])) // (60*60*24)
 401                         daysdown = "%d days up" % daysdown
 402
 403                 elif last_contact is None:
 404                         if date_created is not None:
 405                                 now = time.time()
 406                                 diff = now - date_created
 407                                 daysdown = diff // (60*60*24)
 408                                 daysdown = "Never contacted PLC, created %s days ago" % daysdown
 409                         else:
 410                                 daysdown = "Never contacted PLC"
 411                 else:
 412                         now = time.time()
 413                         diff = now - last_contact
 414                         daysdown = diff // (60*60*24)
 415                         daysdown = "%s days down" % daysdown
 416                 return daysdown
 417         getStrDaysDown = classmethod(getStrDaysDown)
 418         #def getStrDaysDown(cls, diag_record):
 419         #       daysdown = cls.getDaysDown(diag_record)
 420         #       if daysdown > -1:
 421         #               return "%d days down"%daysdown
 422         #       elif daysdown == -1:
 423         #               return "Has never contacted PLC"
 424         #       else:
 425         #               return "%d days up"% -daysdown
 426         #getStrDaysDown = classmethod(getStrDaysDown)
 427
 428         def __getCDVersion(self, diag_record, nodename):
 429                 cdversion = ""
 430                 #print "Getting kernel for: %s" % diag_record['nodename']
 431                 cdversion = diag_record['kernel']
 432                 return cdversion
 433
 434         def __diagnoseSite(self, loginbase, d_diag_nodes):
 435                 """
 436                 d_diag_nodes are diagnose_in entries.
 437                 """
 438                 d_diag_site = {loginbase : { 'config' :
 439                                                                                                 {'squeeze': False,
 440                                                                                                  'email': False
 441                                                                                                 },
 442                                                                         'nodes': {}
 443                                                                         }
 444                                            }
 445                 sorted_nodes = d_diag_nodes.keys()
 446                 sorted_nodes.sort()
 447                 for nodename in sorted_nodes:
 448                         node_record = d_diag_nodes[nodename]
 449                         diag_record = self.__diagnoseNode(loginbase, node_record)
 450
 451                         if diag_record != None:
 452                                 d_diag_site[loginbase]['nodes'][nodename] = diag_record
 453
 454                                 # NOTE: improvement means, we need to act/squeeze and email.
 455                                 #print "DIAG_RECORD", diag_record
 456                                 if 'monitor-end-record' in diag_record['stage'] or \
 457                                    'nmreset' in diag_record['stage']:
 458                                 #       print "resetting loginbase!"
 459                                         d_diag_site[loginbase]['config']['squeeze'] = True
 460                                         d_diag_site[loginbase]['config']['email'] = True
 461                                 #else:
 462                                 #       print "NO IMPROVEMENT!!!!"
 463                         else:
 464                                 pass # there is nothing to do for this node.
 465
 466                 # NOTE: these settings can be overridden by command line arguments,
 467                 #       or the state of a record, i.e. if already in RT's Support Queue.
 468                 nodes_up = self.getUpAtSite(loginbase, d_diag_site)
 469                 if nodes_up < MINUP:
 470                         d_diag_site[loginbase]['config']['squeeze'] = True
 471
 472                 max_slices = self.getMaxSlices(loginbase)
 473                 num_nodes = self.getNumNodes(loginbase)
 474                 # NOTE: when max_slices == 0, this is either a new site (the old way)
 475                 #       or an old disabled site from previous monitor (before site['enabled'])
 476                 if nodes_up < num_nodes and max_slices != 0:
 477                         d_diag_site[loginbase]['config']['email'] = True
 478
 479                 if len(d_diag_site[loginbase]['nodes'].keys()) > 0:
 480                         print "SITE: %20s : %d nodes up, at most" % (loginbase, nodes_up)
 481
 482                 return d_diag_site
 483
 484         def diagRecordByCategory(self, node_record):
 485                 nodename = node_record['nodename']
 486                 category = node_record['category']
 487                 state    = node_record['state']
 488                 loginbase = self.plcdb_hn2lb[nodename]
 489                 diag_record = None
 490
 491                 if  "ERROR" in category:        # i.e. "DOWN"
 492                         diag_record = {}
 493                         diag_record.update(node_record)
 494                         daysdown = self.getDaysDown(diag_record)
 495                         if daysdown < 7:
 496                                 format = "DIAG: %20s : %-40s Down only %s days  NOTHING DONE"
 497                                 print format % (loginbase, nodename, daysdown)
 498                                 return None
 499
 500                         s_daysdown = self.getStrDaysDown(diag_record)
 501                         diag_record['message'] = emailTxt.mailtxt.newdown
 502                         diag_record['args'] = {'nodename': nodename}
 503                         diag_record['info'] = (nodename, s_daysdown, "")
 504
 505                         if 'reboot_node_failed' in node_record:
 506                                 # there was a previous attempt to use the PCU.
 507                                 if node_record['reboot_node_failed'] == False:
 508                                         # then the last attempt apparently, succeeded.
 509                                         # But, the category is still 'ERROR'.  Therefore, the
 510                                         # PCU-to-Node mapping is broken.
 511                                         #print "Setting message for ERROR node to PCU2NodeMapping: %s" % nodename
 512                                         diag_record['message'] = emailTxt.mailtxt.pcutonodemapping
 513                                         diag_record['email_pcu'] = True
 514
 515                         if 'ticket_id' in diag_record:
 516                                 if diag_record['ticket_id'] == "":
 517                                         if 'found_rt_ticket' in diag_record:
 518                                                 ticket_id = diag_record['found_rt_ticket']
 519                                         else:
 520                                                 ticket_id = "None"
 521                                 else:
 522                                         ticket_id = diag_record['ticket_id']
 523                         else:
 524                                 ticket_id = "None"
 525
 526                         diag_record['log'] = "DOWN: %20s : %-40s == %20s %s" % \
 527                                         (loginbase, nodename, diag_record['info'][1:], ticket_id)
 528
 529                 elif "OLDBOOTCD" in category:
 530                         # V2 boot cds as determined by findbad
 531                         s_daysdown = self.getStrDaysDown(node_record)
 532                         s_cdversion = self.__getCDVersion(node_record, nodename)
 533                         diag_record = {}
 534                         diag_record.update(node_record)
 535                         #if "2.4" in diag_record['kernel'] or "v2" in diag_record['bootcd']:
 536                         diag_record['message'] = emailTxt.mailtxt.newbootcd
 537                         diag_record['args'] = {'nodename': nodename}
 538                         diag_record['info'] = (nodename, s_daysdown, s_cdversion)
 539                         if diag_record['ticket_id'] == "":
 540                                 diag_record['log'] = "BTCD: %20s : %-40s == %20s %20s %s" % \
 541                                                                         (loginbase, nodename, diag_record['kernel'],
 542                                                                          diag_record['bootcd'], diag_record['found_rt_ticket'])
 543                         else:
 544                                 diag_record['log'] = "BTCD: %20s : %-40s == %20s %20s %s" % \
 545                                                                         (loginbase, nodename, diag_record['kernel'],
 546                                                                          diag_record['bootcd'], diag_record['ticket_id'])
 547
 548                 elif "PROD" in category:
 549                         if "DEBUG" in state:
 550                                 # Not sure what to do with these yet.  Probably need to
 551                                 # reboot, and email.
 552                                 print "DEBG: %20s : %-40s  NOTHING DONE" % (loginbase, nodename)
 553                                 return None
 554                         elif "BOOT" in state:
 555                                 # no action needed.
 556                                 # TODO: remove penalties, if any are applied.
 557                                 now = time.time()
 558                                 last_contact = node_record['plcnode']['last_contact']
 559                                 if last_contact == None:
 560                                         time_diff = 0
 561                                 else:
 562                                         time_diff = now - last_contact;
 563
 564                                 if 'improvement' in node_record['stage']:
 565                                         # then we need to pass this on to 'action'
 566                                         diag_record = {}
 567                                         diag_record.update(node_record)
 568                                         diag_record['message'] = emailTxt.mailtxt.newthankyou
 569                                         diag_record['args'] = {'nodename': nodename}
 570                                         diag_record['info'] = (nodename, node_record['prev_category'],
 571                                                                                                          node_record['category'])
 572                                         if 'email_pcu' in diag_record:
 573                                                 if diag_record['email_pcu']:
 574                                                         # previously, the pcu failed to reboot, so send
 575                                                         # email. Now, reset these values to try the reboot
 576                                                         # again.
 577                                                         diag_record['email_pcu'] = False
 578                                                         del diag_record['reboot_node_failed']
 579
 580                                         if diag_record['ticket_id'] == "":
 581                                                 diag_record['log'] = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
 582                                                                         (loginbase, nodename, diag_record['stage'],
 583                                                                          state, category, diag_record['found_rt_ticket'])
 584                                         else:
 585                                                 diag_record['log'] = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
 586                                                                         (loginbase, nodename, diag_record['stage'],
 587                                                                          state, category, diag_record['ticket_id'])
 588                                         return diag_record
 589                                 #elif time_diff >= 6*SPERHOUR:
 590                                 #       # heartbeat is older than 30 min.
 591                                 #       # then reset NM.
 592                                 #       #print "Possible NM problem!! %s - %s = %s" % (now, last_contact, time_diff)
 593                                 #       diag_record = {}
 594                                 #       diag_record.update(node_record)
 595                                 #       diag_record['message'] = emailTxt.mailtxt.NMReset
 596                                 #       diag_record['args'] = {'nodename': nodename}
 597                                 #       diag_record['stage'] = "nmreset"
 598                                 #       diag_record['info'] = (nodename,
 599                                 #                                                       node_record['prev_category'],
 600                                 #                                                       node_record['category'])
 601                                 #       if diag_record['ticket_id'] == "":
 602                                 #               diag_record['log'] = "NM  : %20s : %-40s == %20s %20s %s %s" % \
 603                                 #                                       (loginbase, nodename, diag_record['stage'],
 604                                 #                                        state, category, diag_record['found_rt_ticket'])
 605                                 #       else:
 606                                 #               diag_record['log'] = "NM  : %20s : %-40s == %20s" % \
 607                                 #                                       (loginbase, nodename, diag_record['stage'])
 608 #
 609 #                                       return diag_record
 610                                 else:
 611                                         return None
 612                         else:
 613                                 # unknown
 614                                 pass
 615                 elif "ALPHA"    in category:
 616                         pass
 617                 elif "clock_drift" in category:
 618                         pass
 619                 elif "dns"    in category:
 620                         pass
 621                 elif "filerw"    in category:
 622                         pass
 623                 else:
 624                         print "Unknown category!!!! %s" % category
 625                         sys.exit(1)
 626
 627                 return diag_record
 628
 629         def __diagnoseNode(self, loginbase, node_record):
 630                 # TODO: change the format of the hostname in this
 631                 #               record to something more natural.
 632                 nodename                = node_record['nodename']
 633                 category                = node_record['category']
 634                 prev_category   = node_record['prev_category']
 635                 state                   = node_record['state']
 636                 #if 'prev_category' in node_record:
 637                 #       prev_category = node_record['prev_category']
 638                 #else:
 639                 #       prev_category = "ERROR"
 640                 if node_record['prev_category'] != "NORECORD":
 641
 642                         val = cmpCategoryVal(category, prev_category)
 643                         print "%s went from %s -> %s" % (nodename, prev_category, category)
 644                         if val == 1:
 645                                 # improved
 646                                 if node_record['ticket_id'] == "" or node_record['ticket_id'] == None:
 647                                         print "closing record with no ticket: ", node_record['nodename']
 648                                         node_record['action'] = ['close_rt']
 649                                         node_record['message'] = None
 650                                         node_record['stage'] = 'monitor-end-record'
 651                                         return node_record
 652                                 else:
 653                                         node_record['stage'] = 'improvement'
 654
 655                                 #if 'monitor-end-record' in node_record['stage']:
 656                                 #       # just ignore it if it's already ended.
 657                                 #       # otherwise, the status should be worse, and we won't get
 658                                 #       # here.
 659                                 #       print "monitor-end-record: ignoring ", node_record['nodename']
 660                                 #       return None
 661 #
 662 #                                       #return None
 663                         elif val == -1:
 664                                 # current category is worse than previous, carry on
 665                                 pass
 666                         else:
 667                                 #values are equal, carry on.
 668                                 #print "why are we here?"
 669                                 pass
 670
 671                 if 'rt' in node_record and 'Status' in node_record['rt']:
 672                         if node_record['stage'] == 'ticket_waitforever':
 673                                 if 'resolved' in node_record['rt']['Status']:
 674                                         print "ending waitforever record for: ", node_record['nodename']
 675                                         node_record['action'] = ['noop']
 676                                         node_record['message'] = None
 677                                         node_record['stage'] = 'monitor-end-record'
 678                                         print "oldlog: %s" % node_record['log'],
 679                                         print "%15s" % node_record['action']
 680                                         return node_record
 681                                 if 'new' in node_record['rt']['Status'] and \
 682                                         'Queue' in node_record['rt'] and \
 683                                         'Monitor' in node_record['rt']['Queue']:
 684
 685                                         print "RESETTING stage to findbad"
 686                                         node_record['stage'] = 'findbad'
 687
 688                 #### COMPARE category and prev_category
 689                 # if not_equal
 690                 #       then assign a stage based on relative priorities
 691                 # else equal
 692                 #       then check category for stats.
 693                 diag_record = self.diagRecordByCategory(node_record)
 694                 if diag_record == None:
 695                         #print "diag_record == None"
 696                         return None
 697
 698                 #### found_RT_ticket
 699                 # TODO: need to record time found, and maybe add a stage for acting on it...
 700                 # NOTE: after found, if the support ticket is resolved, the block is
 701                 #               not removed. How to remove the block on this?
 702                 if 'found_rt_ticket' in diag_record and \
 703                         diag_record['found_rt_ticket'] is not None:
 704                         if diag_record['stage'] is not 'improvement':
 705                                 diag_record['stage'] = 'ticket_waitforever'
 706
 707                 current_time = time.time()
 708                 # take off four days, for the delay that database caused.
 709                 # TODO: generalize delays at PLC, and prevent enforcement when there
 710                 #               have been no emails.
 711                 # NOTE: 7*SPERDAY exists to offset the 'bad week'
 712                 #delta = current_time - diag_record['time'] - 7*SPERDAY
 713                 delta = current_time - diag_record['time']
 714
 715                 message = diag_record['message']
 716                 act_record = {}
 717                 act_record.update(diag_record)
 718
 719                 #### DIAGNOSE STAGES
 720                 if   'findbad' in diag_record['stage']:
 721                         # The node is bad, and there's no previous record of it.
 722                         act_record['email'] = TECH
 723                         act_record['action'] = ['noop']
 724                         act_record['message'] = message[0]
 725                         act_record['stage'] = 'stage_actinoneweek'
 726
 727                 elif 'nmreset' in diag_record['stage']:
 728                         act_record['email']  = ADMIN
 729                         act_record['action'] = ['reset_nodemanager']
 730                         act_record['message'] = message[0]
 731                         act_record['stage']  = 'nmreset'
 732                         return None
 733
 734                 elif 'reboot_node' in diag_record['stage']:
 735                         act_record['email'] = TECH
 736                         act_record['action'] = ['noop']
 737                         act_record['message'] = message[0]
 738                         act_record['stage'] = 'stage_actinoneweek'
 739
 740                 elif 'improvement' in diag_record['stage']:
 741                         # - backoff previous squeeze actions (slice suspend, nocreate)
 742                         # TODO: add a backoff_squeeze section... Needs to runthrough
 743                         print "backing off of %s" % nodename
 744                         act_record['action'] = ['close_rt']
 745                         act_record['message'] = message[0]
 746                         act_record['stage'] = 'monitor-end-record'
 747
 748                 elif 'actinoneweek' in diag_record['stage']:
 749                         if delta >= 7 * SPERDAY:
 750                                 act_record['email'] = TECH | PI
 751                                 act_record['stage'] = 'stage_actintwoweeks'
 752                                 act_record['message'] = message[1]
 753                                 act_record['action'] = ['nocreate' ]
 754                                 act_record['time'] = current_time               # reset clock for waitforever
 755                         elif delta >= 3* SPERDAY and not 'second-mail-at-oneweek' in act_record:
 756                                 act_record['email'] = TECH
 757                                 act_record['message'] = message[0]
 758                                 act_record['action'] = ['sendmailagain-waitforoneweekaction' ]
 759                                 act_record['second-mail-at-oneweek'] = True
 760                         else:
 761                                 act_record['message'] = None
 762                                 act_record['action'] = ['waitforoneweekaction' ]
 763                                 print "ignoring this record for: %s" % act_record['nodename']
 764                                 return None                     # don't send if there's no action
 765
 766                 elif 'actintwoweeks' in diag_record['stage']:
 767                         if delta >= 7 * SPERDAY:
 768                                 act_record['email'] = TECH | PI | USER
 769                                 act_record['stage'] = 'stage_waitforever'
 770                                 act_record['message'] = message[2]
 771                                 act_record['action'] = ['suspendslices']
 772                                 act_record['time'] = current_time               # reset clock for waitforever
 773                         elif delta >= 3* SPERDAY and not 'second-mail-at-twoweeks' in act_record:
 774                                 act_record['email'] = TECH | PI
 775                                 act_record['message'] = message[1]
 776                                 act_record['action'] = ['sendmailagain-waitfortwoweeksaction' ]
 777                                 act_record['second-mail-at-twoweeks'] = True
 778                         else:
 779                                 act_record['message'] = None
 780                                 act_record['action'] = ['waitfortwoweeksaction']
 781                                 return None                     # don't send if there's no action
 782
 783                 elif 'ticket_waitforever' in diag_record['stage']:
 784                         act_record['email'] = TECH
 785                         if 'first-found' not in act_record:
 786                                 act_record['first-found'] = True
 787                                 act_record['log'] += " firstfound"
 788                                 act_record['action'] = ['ticket_waitforever']
 789                                 act_record['message'] = message[0]
 790                                 act_record['time'] = current_time
 791                         else:
 792                                 if delta >= 7*SPERDAY:
 793                                         act_record['action'] = ['ticket_waitforever']
 794                                         if 'rt' in act_record and 'Status' in act_record['rt'] and \
 795                                                         act_record['rt']['Status'] == 'new':
 796                                                 act_record['message'] = message[0]
 797                                         else:
 798                                                 act_record['message'] = None
 799
 800                                         act_record['time'] = current_time               # reset clock
 801                                 else:
 802                                         act_record['action'] = ['ticket_waitforever']
 803                                         act_record['message'] = None
 804                                         return None
 805
 806                 elif 'waitforever' in diag_record['stage']:
 807                         # more than 3 days since last action
 808                         # TODO: send only on weekdays.
 809                         # NOTE: expects that 'time' has been reset before entering waitforever stage
 810                         if delta >= 3*SPERDAY:
 811                                 act_record['action'] = ['email-againwaitforever']
 812                                 act_record['message'] = message[2]
 813                                 act_record['time'] = current_time               # reset clock
 814                         else:
 815                                 act_record['action'] = ['waitforever']
 816                                 act_record['message'] = None
 817                                 return None                     # don't send if there's no action
 818
 819                 else:
 820                         # There is no action to be taken, possibly b/c the stage has
 821                         # already been performed, but diagnose picked it up again.
 822                         # two cases,
 823                         #       1. stage is unknown, or
 824                         #       2. delta is not big enough to bump it to the next stage.
 825                         # TODO: figure out which. for now assume 2.
 826                         print "UNKNOWN stage for %s; nothing done" % nodename
 827                         act_record['action'] = ['unknown']
 828                         act_record['message'] = message[0]
 829
 830                         act_record['email'] = TECH
 831                         act_record['action'] = ['noop']
 832                         act_record['message'] = message[0]
 833                         act_record['stage'] = 'stage_actinoneweek'
 834                         act_record['time'] = current_time               # reset clock
 835                         #print "Exiting..."
 836                         #return None
 837                         #sys.exit(1)
 838
 839                 print "%s" % act_record['log'],
 840                 print "%15s" % act_record['action']
 841                 return act_record
 842
 843         def getMaxSlices(self, loginbase):
 844                 # if sickdb has a loginbase, then it will have at least one node.
 845                 site_stats = None
 846
 847                 for nodename in self.diagnose_in[loginbase].keys():
 848                         if nodename in self.findbad['nodes']:
 849                                 site_stats = self.findbad['nodes'][nodename]['values']['plcsite']
 850                                 break
 851
 852                 if site_stats == None:
 853                         raise Exception, "loginbase with no nodes in findbad"
 854                 else:
 855                         return site_stats['max_slices']
 856
 857         def getNumNodes(self, loginbase):
 858                 # if sickdb has a loginbase, then it will have at least one node.
 859                 site_stats = None
 860
 861                 for nodename in self.diagnose_in[loginbase].keys():
 862                         if nodename in self.findbad['nodes']:
 863                                 site_stats = self.findbad['nodes'][nodename]['values']['plcsite']
 864                                 break
 865
 866                 if site_stats == None:
 867                         raise Exception, "loginbase with no nodes in findbad"
 868                 else:
 869                         if 'num_nodes' in site_stats:
 870                                 return site_stats['num_nodes']
 871                         else:
 872                                 return 0
 873
 874         """
 875         Returns number of up nodes as the total number *NOT* in act_all with a
 876         stage other than 'steady-state' .
 877         """
 878         def getUpAtSite(self, loginbase, d_diag_site):
 879                 # TODO: THIS DOESN"T WORK!!! it misses all the 'debug' state nodes
 880                 #               that aren't recorded yet.
 881
 882                 numnodes = self.getNumNodes(loginbase)
 883                 # NOTE: assume nodes we have no record of are ok. (too conservative)
 884                 # TODO: make the 'up' value more representative
 885                 up = numnodes
 886                 for nodename in d_diag_site[loginbase]['nodes'].keys():
 887
 888                         rec = d_diag_site[loginbase]['nodes'][nodename]
 889                         if rec['stage'] != 'monitor-end-record':
 890                                 up -= 1
 891                         else:
 892                                 pass # the node is assumed to be up.
 893
 894                 #if up != numnodes:
 895                 #       print "ERROR: %s total nodes up and down != %d" % (loginbase, numnodes)
 896
 897                 return up
 898
 899
 900 class SiteAction:
 901         def __init__(self, parameter_names=['hostname', 'ticket_id']):
 902                 self.parameter_names = parameter_names
 903         def checkParam(self, args):
 904                 for param in self.parameter_names:
 905                         if param not in args:
 906                                 raise Exception("Parameter %s not provided in args"%param)
 907         def run(self, args):
 908                 self.checkParam(args)
 909                 return self._run(args)
 910         def _run(self, args):
 911                 pass
 912
 913 class SuspendAction(SiteAction):
 914         def _run(self, args):
 915                 return plc.suspendSlices(args['hostname'])
 916
 917 class RemoveSliceCreation(SiteAction):
 918         def _run(self, args):
 919                 return plc.removeSliceCreation(args['hostname'])
 920
 921 class BackoffActions(SiteAction):
 922         def _run(self, args):
 923                 plc.enableSlices(args['hostname'])
 924                 plc.enableSliceCreation(args['hostname'])
 925                 return True
 926
 927 # TODO: create class for each action below,
 928 #               allow for lists of actions to be performed...
 929
 930 def close_rt_backoff(args):
 931         if 'ticket_id' in args and (args['ticket_id'] != "" and args['ticket_id'] != None):
 932                 mailer.closeTicketViaRT(args['ticket_id'],
 933                                                                 "Ticket CLOSED automatically by SiteAssist.")
 934                 plc.enableSlices(args['hostname'])
 935                 plc.enableSliceCreation(args['hostname'])
 936         return
 937
 938 def reboot_node(args):
 939         host = args['hostname']
 940         return reboot.reboot_policy(host, True, config.debug)
 941
 942 def reset_nodemanager(args):
 943         os.system("ssh root@%s /sbin/service nm restart" % nodename)
 944         return
 945
 946 class Action(Thread):
 947         def __init__(self, l_action):
 948                 self.l_action = l_action
 949
 950                 # the hostname to loginbase mapping
 951                 self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")
 952
 953                 # Actions to take.
 954                 self.diagnose_db = database.if_cached_else(1, "diagnose_out", lambda : {})
 955                 # Actions taken.
 956                 self.act_all   = database.if_cached_else(1, "act_all", lambda : {})
 957
 958                 # A dict of actions to specific functions. PICKLE doesnt' like lambdas.
 959                 self.actions = {}
 960                 self.actions['suspendslices'] = lambda args: plc.suspendSlices(args['hostname'])
 961                 self.actions['nocreate'] = lambda args: plc.removeSliceCreation(args['hostname'])
 962                 self.actions['close_rt'] = lambda args: close_rt_backoff(args)
 963                 self.actions['rins'] = lambda args: plc.nodeBootState(args['hostname'], "rins")
 964                 self.actions['noop'] = lambda args: args
 965                 self.actions['reboot_node'] = lambda args: reboot_node(args)
 966                 self.actions['reset_nodemanager'] = lambda args: args # reset_nodemanager(args)
 967
 968                 self.actions['ticket_waitforever'] = lambda args: args
 969                 self.actions['waitforever'] = lambda args: args
 970                 self.actions['unknown'] = lambda args: args
 971                 self.actions['waitforoneweekaction'] = lambda args: args
 972                 self.actions['waitfortwoweeksaction'] = lambda args: args
 973                 self.actions['sendmailagain-waitforoneweekaction'] = lambda args: args
 974                 self.actions['sendmailagain-waitfortwoweeksaction'] = lambda args: args
 975                 self.actions['email-againwaitforever'] = lambda args: args
 976                 self.actions['email-againticket_waitforever'] = lambda args: args
 977
 978
 979                 self.sickdb = {}
 980                 Thread.__init__(self)
 981
 982         def run(self):
 983                 self.accumSites()
 984                 print "Accumulated %d sick sites" % len(self.sickdb.keys())
 985                 logger.debug("Accumulated %d sick sites" % len(self.sickdb.keys()))
 986
 987                 try:
 988                         stats = self.analyseSites()
 989                 except Exception, err:
 990                         print "----------------"
 991                         import traceback
 992                         print traceback.print_exc()
 993                         print err
 994                         if config.policysavedb:
 995                                 print "Saving Databases... act_all"
 996                                 database.dbDump("act_all", self.act_all)
 997                         sys.exit(1)
 998
 999                 print_stats("sites_observed", stats)
1000                 print_stats("sites_diagnosed", stats)
1001                 print_stats("nodes_diagnosed", stats)
1002                 print_stats("sites_emailed", stats)
1003                 print_stats("nodes_actedon", stats)
1004                 print string.join(stats['allsites'], ",")
1005
1006                 if config.policysavedb:
1007                         print "Saving Databases... act_all"
1008                         #database.dbDump("policy.eventlog", self.eventlog)
1009                         # TODO: remove 'diagnose_out',
1010                         #       or at least the entries that were acted on.
1011                         database.dbDump("act_all", self.act_all)
1012
1013         def accumSites(self):
1014                 """
1015                 Take all nodes, from l_action, look them up in the diagnose_db database,
1016                 and insert them into sickdb[] as:
1017
1018                 This way only the given l_action nodes will be acted on regardless
1019                 of how many from diagnose_db are available.
1020
1021                         sickdb[loginbase][nodename] = diag_record
1022                 """
1023                 # TODO: what if l_action == None ?
1024                 for nodename in self.l_action:
1025
1026                         loginbase = self.plcdb_hn2lb[nodename]
1027
1028                         if loginbase in self.diagnose_db and \
1029                                 nodename in self.diagnose_db[loginbase]['nodes']:
1030
1031                                 diag_record = self.diagnose_db[loginbase]['nodes'][nodename]
1032
1033                                 if loginbase not in self.sickdb:
1034                                         self.sickdb[loginbase] = {'nodes' : {}}
1035
1036                                 # NOTE: don't copy all node records, since not all will be in l_action
1037                                 self.sickdb[loginbase]['nodes'][nodename] = diag_record
1038                                 # NOTE: but, we want to get the loginbase config settings,
1039                                 #               this is the easiest way.
1040                                 self.sickdb[loginbase]['config'] = self.diagnose_db[loginbase]['config']
1041                         #else:
1042                                 #print "%s not in diagnose_db!!" % loginbase
1043                 return
1044
1045         def __emailSite(self, loginbase, roles, message, args):
1046                 """
1047                 loginbase is the unique site abbreviation, prepended to slice names.
1048                 roles contains TECH, PI, USER roles, and derive email aliases.
1049                 record contains {'message': [<subj>,<body>], 'args': {...}}
1050                 """
1051                 ticket_id = 0
1052                 args.update({'loginbase':loginbase})
1053
1054                 if not config.mail and not config.debug and config.bcc:
1055                         roles = ADMIN
1056                 if config.mail and config.debug:
1057                         roles = ADMIN
1058
1059                 # build targets
1060                 contacts = []
1061                 if ADMIN & roles:
1062                         contacts += [config.email]
1063                 if TECH & roles:
1064                         contacts += [TECHEMAIL % loginbase]
1065                 if PI & roles:
1066                         contacts += [PIEMAIL % loginbase]
1067                 if USER & roles:
1068                         slices = plc.slices(loginbase)
1069                         if len(slices) >= 1:
1070                                 for slice in slices:
1071                                         contacts += [SLICEMAIL % slice]
1072                                 print "SLIC: %20s : %d slices" % (loginbase, len(slices))
1073                         else:
1074                                 print "SLIC: %20s : 0 slices" % loginbase
1075
1076                 try:
1077                         subject = message[0] % args
1078                         body = message[1] % args
1079                         if ADMIN & roles:
1080                                 # send only to admin
1081                                 if 'ticket_id' in args:
1082                                         subj = "Re: [PL #%s] %s" % (args['ticket_id'], subject)
1083                                 else:
1084                                         subj = "Re: [PL noticket] %s" % subject
1085                                 mailer.email(subj, body, contacts)
1086                                 ticket_id = args['ticket_id']
1087                         else:
1088                                 ticket_id = mailer.emailViaRT(subject, body, contacts, args['ticket_id'])
1089                 except Exception, err:
1090                         print "exception on message:"
1091                         import traceback
1092                         print traceback.print_exc()
1093                         print message
1094
1095                 return ticket_id
1096
1097
1098         def _format_diaginfo(self, diag_node):
1099                 info = diag_node['info']
1100                 if diag_node['stage'] == 'monitor-end-record':
1101                         hlist = "    %s went from '%s' to '%s'\n" % (info[0], info[1], info[2])
1102                 else:
1103                         hlist = "    %s %s - %s\n" % (info[0], info[2], info[1]) #(node,ver,daysdn)
1104                 return hlist
1105
1106
1107         def get_email_args(self, act_recordlist, loginbase=None):
1108
1109                 email_args = {}
1110                 email_args['hostname_list'] = ""
1111
1112                 for act_record in act_recordlist:
1113                         email_args['hostname_list'] += act_record['msg_format']
1114                         email_args['hostname'] = act_record['nodename']
1115                         if  'plcnode' in act_record and \
1116                                 'pcu_ids' in act_record['plcnode'] and \
1117                                 len(act_record['plcnode']['pcu_ids']) > 0:
1118                                 print "setting 'pcu_id' for email_args %s"%email_args['hostname']
1119                                 email_args['pcu_id'] = act_record['plcnode']['pcu_ids'][0]
1120                         else:
1121                                 email_args['pcu_id'] = "-1"
1122
1123                         if 'ticket_id' in act_record:
1124                                 if act_record['ticket_id'] == 0 or act_record['ticket_id'] == '0':
1125                                         print "Enter the ticket_id for %s @ %s" % (loginbase, act_record['nodename'])
1126                                         sys.stdout.flush()
1127                                         line = sys.stdin.readline()
1128                                         try:
1129                                                 ticket_id = int(line)
1130                                         except:
1131                                                 print "could not get ticket_id from stdin..."
1132                                                 os._exit(1)
1133                                 else:
1134                                         ticket_id = act_record['ticket_id']
1135
1136                                 email_args['ticket_id'] = ticket_id
1137
1138                 return email_args
1139
1140         def get_unique_issues(self, act_recordlist):
1141                 # NOTE: only send one email per site, per problem...
1142                 unique_issues = {}
1143                 for act_record in act_recordlist:
1144                         act_key = act_record['action'][0]
1145                         if act_key not in unique_issues:
1146                                 unique_issues[act_key] = []
1147
1148                         unique_issues[act_key] += [act_record]
1149
1150                 return unique_issues
1151
1152
1153         def __actOnSite(self, loginbase, site_record):
1154                 i_nodes_actedon = 0
1155                 i_nodes_emailed = 0
1156
1157                 act_recordlist = []
1158
1159                 for nodename in site_record['nodes'].keys():
1160                         diag_record = site_record['nodes'][nodename]
1161                         act_record  = self.__actOnNode(diag_record)
1162                         #print "nodename: %s %s" % (nodename, act_record)
1163                         if act_record is not None:
1164                                 act_recordlist += [act_record]
1165
1166                 unique_issues = self.get_unique_issues(act_recordlist)
1167
1168                 for issue in unique_issues.keys():
1169                         print "\tworking on issue: %s" % issue
1170                         issue_record_list = unique_issues[issue]
1171                         email_args = self.get_email_args(issue_record_list, loginbase)
1172
1173                         # for each record.
1174                         for act_record in issue_record_list:
1175                                 # if there's a pcu record and email config is set
1176                                 if 'email_pcu' in act_record:
1177                                         if act_record['message'] != None and act_record['email_pcu'] and site_record['config']['email']:
1178                                                 # and 'reboot_node' in act_record['stage']:
1179
1180                                                 email_args['hostname'] = act_record['nodename']
1181                                                 ticket_id = self.__emailSite(loginbase,
1182                                                                                         act_record['email'],
1183                                                                                         emailTxt.mailtxt.pcudown[0],
1184                                                                                         email_args)
1185                                                 if ticket_id == 0:
1186                                                         # error.
1187                                                         print "got a ticket_id == 0!!!! %s" % act_record['nodename']
1188                                                         os._exit(1)
1189                                                         pass
1190                                                 email_args['ticket_id'] = ticket_id
1191
1192
1193                         act_record = issue_record_list[0]
1194                         # send message before squeezing
1195                         print "\t\tconfig.email: %s and %s" % (act_record['message'] != None,
1196                                                                                                 site_record['config']['email'])
1197                         if act_record['message'] != None and site_record['config']['email']:
1198                                 ticket_id = self.__emailSite(loginbase, act_record['email'],
1199                                                                                          act_record['message'], email_args)
1200
1201                                 if ticket_id == 0:
1202                                         # error.
1203                                         print "ticket_id == 0 for %s %s" % (loginbase, act_record['nodename'])
1204                                         os._exit(1)
1205                                         pass
1206
1207                                 # Add ticket_id to ALL nodenames
1208                                 for act_record in issue_record_list:
1209                                         nodename = act_record['nodename']
1210                                         # update node record with RT ticket_id
1211                                         if nodename in self.act_all:
1212                                                 self.act_all[nodename][0]['ticket_id'] = "%s" % ticket_id
1213                                                 # if the ticket was previously resolved, reset it to new.
1214                                                 if 'rt' in act_record and \
1215                                                         'Status' in act_record['rt'] and \
1216                                                         act_record['rt']['Status'] == 'resolved':
1217                                                         mailer.setTicketStatus(ticket_id, "new")
1218                                                 status = mailer.getTicketStatus(ticket_id)
1219                                                 self.act_all[nodename][0]['rt'] = status
1220                                         if config.mail: i_nodes_emailed += 1
1221
1222                         print "\t\tconfig.squeeze: %s and %s" % (config.squeeze,
1223                                                                                                         site_record['config']['squeeze'])
1224                         if config.squeeze and site_record['config']['squeeze']:
1225                                 for act_key in act_record['action']:
1226                                         self.actions[act_key](email_args)
1227                                 i_nodes_actedon += 1
1228
1229                 if config.policysavedb:
1230                         print "Saving Databases... act_all, diagnose_out"
1231                         database.dbDump("act_all", self.act_all)
1232                         # remove site record from diagnose_out, it's in act_all as done.
1233                         del self.diagnose_db[loginbase]
1234                         database.dbDump("diagnose_out", self.diagnose_db)
1235
1236                 print "sleeping for 1 sec"
1237                 time.sleep(1)
1238                 #print "Hit enter to continue..."
1239                 #sys.stdout.flush()
1240                 #line = sys.stdin.readline()
1241
1242                 return (i_nodes_actedon, i_nodes_emailed)
1243
1244         def __actOnNode(self, diag_record):
1245                 nodename = diag_record['nodename']
1246                 message = diag_record['message']
1247
1248                 act_record = {}
1249                 act_record.update(diag_record)
1250                 act_record['nodename'] = nodename
1251                 act_record['msg_format'] = self._format_diaginfo(diag_record)
1252                 print "act_record['stage'] == %s " % act_record['stage']
1253
1254                 # avoid end records, and nmreset records
1255                 # reboot_node_failed, is set below, so don't reboot repeatedly.
1256
1257                 if 'monitor-end-record' not in act_record['stage'] and \
1258                    'nmreset' not in act_record['stage'] and \
1259                    'reboot_node_failed' not in act_record:
1260
1261                         if "DOWN" in act_record['log'] and \
1262                                         'pcu_ids' in act_record['plcnode'] and \
1263                                         len(act_record['plcnode']['pcu_ids']) > 0:
1264
1265                                 print "%s" % act_record['log'],
1266                                 print "%15s" % (['reboot_node'],)
1267                                 # Set node to re-install
1268                                 plc.nodeBootState(act_record['nodename'], "rins")
1269                                 try:
1270                                         ret = reboot_node({'hostname': act_record['nodename']})
1271                                 except Exception, exc:
1272                                         print "exception on reboot_node:"
1273                                         import traceback
1274                                         print traceback.print_exc()
1275                                         ret = False
1276
1277                                 if ret: # and ( 'reboot_node_failed' not in act_record or act_record['reboot_node_failed'] == False):
1278                                         # Reboot Succeeded
1279                                         print "reboot succeeded for %s" % act_record['nodename']
1280                                         act_record2 = {}
1281                                         act_record2.update(act_record)
1282                                         act_record2['action'] = ['reboot_node']
1283                                         act_record2['stage'] = "reboot_node"
1284                                         act_record2['reboot_node_failed'] = False
1285                                         act_record2['email_pcu'] = False
1286
1287                                         if nodename not in self.act_all:
1288                                                 self.act_all[nodename] = []
1289                                         print "inserting 'reboot_node' record into act_all"
1290                                         self.act_all[nodename].insert(0,act_record2)
1291
1292                                         # return None to avoid further action
1293                                         print "Taking no further action"
1294                                         return None
1295                                 else:
1296                                         print "reboot failed for %s" % act_record['nodename']
1297                                         # set email_pcu to also send pcu notice for this record.
1298                                         act_record['reboot_node_failed'] = True
1299                                         act_record['email_pcu'] = True
1300
1301                         print "%s" % act_record['log'],
1302                         print "%15s" % act_record['action']
1303
1304                 if act_record['stage'] is not 'monitor-end-record' and \
1305                    act_record['stage'] is not 'nmreset':
1306                         if nodename not in self.act_all:
1307                                 self.act_all[nodename] = []
1308
1309                         self.act_all[nodename].insert(0,act_record)
1310                 else:
1311                         print "Not recording %s in act_all" % nodename
1312
1313                 return act_record
1314
1315         def analyseSites(self):
1316                 i_sites_observed = 0
1317                 i_sites_diagnosed = 0
1318                 i_nodes_diagnosed = 0
1319                 i_nodes_actedon = 0
1320                 i_sites_emailed = 0
1321                 l_allsites = []
1322
1323                 sorted_sites = self.sickdb.keys()
1324                 sorted_sites.sort()
1325                 for loginbase in sorted_sites:
1326                         site_record = self.sickdb[loginbase]
1327                         print "sites: %s" % loginbase
1328
1329                         i_nodes_diagnosed += len(site_record.keys())
1330                         i_sites_diagnosed += 1
1331
1332                         (na,ne) = self.__actOnSite(loginbase, site_record)
1333
1334                         i_sites_observed += 1
1335                         i_nodes_actedon += na
1336                         i_sites_emailed += ne
1337
1338                         l_allsites += [loginbase]
1339
1340                 return {'sites_observed': i_sites_observed,
1341                                 'sites_diagnosed': i_sites_diagnosed,
1342                                 'nodes_diagnosed': i_nodes_diagnosed,
1343                                 'sites_emailed': i_sites_emailed,
1344                                 'nodes_actedon': i_nodes_actedon,
1345                                 'allsites':l_allsites}
1346
1347         def print_stats(self, key, stats):
1348                 print "%20s : %d" % (key, stats[key])
1349
1350
1351
1352         #"""
1353         #Prints, logs, and emails status of up nodes, down nodes, and buckets.
1354         #"""
1355         #def status(self):
1356         #       sub = "Monitor Summary"
1357         #       msg = "\nThe following nodes were acted upon:  \n\n"
1358         #       for (node, (type, date)) in self.emailed.items():
1359         #               # Print only things acted on today.
1360         #               if (time.gmtime(time.time())[2] == time.gmtime(date)[2]):
1361         #                       msg +="%s\t(%s)\t%s\n" %(node, type, time.ctime(date))
1362         #       msg +="\n\nThe following sites have been 'squeezed':\n\n"
1363         #       for (loginbase, (date, type)) in self.squeezed.items():
1364         #               # Print only things acted on today.
1365         #               if (time.gmtime(time.time())[2] == time.gmtime(date)[2]):
1366         #                       msg +="%s\t(%s)\t%s\n" %(loginbase, type, time.ctime(date))
1367         #       mailer.email(sub, msg, [SUMTO])
1368         #       logger.info(msg)
1369         #       return
1370
1371         #"""
1372         #Store/Load state of emails.  When, where, what.
1373         #"""
1374         #def emailedStore(self, action):
1375         #       try:
1376         #               if action == "LOAD":
1377         #                       f = open(DAT, "r+")
1378         #                       logger.info("POLICY:  Found and reading " + DAT)
1379         #                       self.emailed.update(pickle.load(f))
1380         #               if action == "WRITE":
1381         #                       f = open(DAT, "w")
1382         #                       #logger.debug("Writing " + DAT)
1383         #                       pickle.dump(self.emailed, f)
1384         #               f.close()
1385         #       except Exception, err:
1386         #               logger.info("POLICY:  Problem with DAT, %s" %err)
1387
1388
1389 #class Policy(Thread):
1390
1391 def main():
1392         print "policy.py is a module, not a script for running directly."
1393
if __name__ == '__main__':
        # Script entry point: runs main(), which only prints a notice.
        # os and plc are imported again here, inside the script guard.
        import os
        import plc
        try:
                main()
        except KeyboardInterrupt:
                # Interrupted by the user: report it, log it, and leave at once
                # with exit status 0.  os._exit() ends the process without the
                # interpreter's normal shutdown handling.
                print "Killed.  Exitting."
                logger.info('Monitor Killed')
                os._exit(0)