policy.py

   1 #
   2 # Copyright (c) 2004  The Trustees of Princeton University (Trustees).
   3 #
   4 # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
   5 #
   6 # $Id: policy.py,v 1.17 2007/08/29 17:26:50 soltesz Exp $
   7 #
   8 # Policy Engine.
   9
  10 #from monitor import *
  11 from threading import *
  12 import time
  13 import logging
  14 import mailer
  15 import emailTxt
  16 import pickle
  17 import Queue
  18 import plc
  19 import sys
  20 import os
  21 import reboot
  22 import database
  23 import string
  24 from unified_model import cmpCategoryVal
  25 import config
  26
  27 DAT="./monitor.dat"
  28
  29 logger = logging.getLogger("monitor")
  30
  31 # Time to enforce policy
  32 POLSLEEP = 7200
  33
  34 # Where to email the summary
  35 SUMTO = "soltesz@cs.princeton.edu"
  36 TECHEMAIL="tech-%s@sites.planet-lab.org"
  37 PIEMAIL="pi-%s@sites.planet-lab.org"
  38 SLICEMAIL="%s@slices.planet-lab.org"
  39 PLCEMAIL="support@planet-lab.org"
  40
  41 #Thresholds (DAYS)
  42 SPERMIN = 60
  43 SPERHOUR = 60*60
  44 SPERDAY = 86400
  45 PITHRESH = 7 * SPERDAY
  46 SLICETHRESH = 7 * SPERDAY
  47 # Days before attempting rins again
  48 RINSTHRESH = 5 * SPERDAY
  49
  50 # Days before calling the node dead.
  51 DEADTHRESH = 30 * SPERDAY
  52 # Minimum number of nodes up before squeezing
  53 MINUP = 2
  54
  55 TECH=1
  56 PI=2
  57 USER=4
  58 ADMIN=8
  59
  60 # IF:
  61 #  no SSH, down.
  62 #  bad disk, down
  63 #  DNS, kinda down (sick)
  64 #  clock, kinda down (sick)
  65 #  Full disk, going to be down
  66
  67 # Actions:
  68 #  Email
  69 #  suspend slice creation
  70 #  kill slices
  71 def array_to_priority_map(array):
  72         """ Create a mapping where each entry of array is given a priority equal
  73         to its position in the array.  This is useful for subsequent use in the
  74         cmpMap() function."""
  75         map = {}
  76         count = 0
  77         for i in array:
  78                 map[i] = count
  79                 count += 1
  80         return map
  81
  82 def getdebug():
  83         return config.debug
  84
  85 def print_stats(key, stats):
  86         if key in stats: print "%20s : %d" % (key, stats[key])
  87
  88
  89 class Merge(Thread):
  90         def __init__(self, l_merge, toRT):
  91                 self.toRT = toRT
  92                 self.merge_list = l_merge
  93                 # the hostname to loginbase mapping
  94                 self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")
  95
  96                 # Previous actions taken on nodes.
  97                 self.act_all = database.if_cached_else(1, "act_all", lambda : {})
  98                 self.findbad = database.if_cached_else(1, "findbad", lambda : {})
  99
 100                 self.cache_all = database.if_cached_else(1, "act_all", lambda : {})
 101                 self.sickdb = {}
 102                 self.mergedb = {}
 103                 Thread.__init__(self)
 104
 105         def run(self):
 106                 # populate sickdb
 107                 self.accumSickSites()
 108                 # read data from findbad and act_all
 109                 self.mergeActionsAndBadDB()
 110                 # pass node_records to RT
 111                 self.sendToRT()
 112
 113         def accumSickSites(self):
 114                 """
 115                 Take all nodes, from l_diagnose, look them up in the act_all database,
 116                 and insert them into sickdb[] as:
 117
 118                         sickdb[loginbase][nodename] = fb_record
 119                 """
 120                 # look at all problems reported by findbad
 121                 l_nodes = self.findbad['nodes'].keys()
 122                 count = 0
 123                 for nodename in l_nodes:
 124                         if nodename not in self.merge_list:
 125                                 continue                # skip this node, since it's not wanted
 126
 127                         count += 1
 128                         loginbase = self.plcdb_hn2lb[nodename]
 129                         values = self.findbad['nodes'][nodename]['values']
 130
 131                         fb_record = {}
 132                         fb_record['nodename'] = nodename
 133                         try:
 134                                 fb_record['category'] = values['category']
 135                         except:
 136                                 print values
 137                                 print nodename
 138                                 print self.findbad['nodes'][nodename]
 139                                 count -= 1
 140                                 continue
 141                         fb_record['state'] = values['state']
 142                         fb_record['comonstats'] = values['comonstats']
 143                         fb_record['plcnode'] = values['plcnode']
 144                         fb_record['kernel'] = self.getKernel(values['kernel'])
 145                         fb_record['stage'] = "findbad"
 146                         fb_record['message'] = None
 147                         fb_record['bootcd'] = values['bootcd']
 148                         fb_record['args'] = None
 149                         fb_record['info'] = None
 150                         fb_record['time'] = time.time()
 151                         fb_record['date_created'] = time.time()
 152
 153                         if loginbase not in self.sickdb:
 154                                 self.sickdb[loginbase] = {}
 155
 156                         self.sickdb[loginbase][nodename] = fb_record
 157
 158                 print "Found %d nodes" % count
 159
 160         def getKernel(self, unamestr):
 161                 s = unamestr.split()
 162                 if len(s) > 2:
 163                         return s[2]
 164                 else:
 165                         return ""
 166
 167         def mergeActionsAndBadDB(self):
 168                 """
 169                 - Look at the sick node_records as reported in findbad,
 170                 - Then look at the node_records in act_all.
 171
 172                 There are four cases:
 173                 1) Problem in findbad, no problem in act_all
 174                         this ok, b/c it just means it's a new problem
 175                 2) Problem in findbad, problem in act_all
 176                         -Did the problem get better or worse?
 177                                 -If Same, or Worse, then continue looking for open tickets.
 178                                 -If Better, or No problem, then "back-off" penalties.
 179                                         This judgement may need to wait until 'Diagnose()'
 180
 181                 3) No problem in findbad, problem in act_all
 182                         The the node is operational again according to Findbad()
 183
 184                 4) No problem in findbad, no problem in act_all
 185                         There won't be a record in either db, so there's no code.
 186                 """
 187
 188                 sorted_sites = self.sickdb.keys()
 189                 sorted_sites.sort()
 190                 # look at all problems reported by findbad
 191                 for loginbase in sorted_sites:
 192                         d_fb_nodes = self.sickdb[loginbase]
 193                         sorted_nodes = d_fb_nodes.keys()
 194                         sorted_nodes.sort()
 195                         for nodename in sorted_nodes:
 196                                 fb_record = self.sickdb[loginbase][nodename]
 197                                 x = fb_record
 198                                 if loginbase not in self.mergedb:
 199                                         self.mergedb[loginbase] = {}
 200
 201                                 # take the info either from act_all or fb-record.
 202                                 # if node not in act_all
 203                                 #       then take it from fbrecord, obviously.
 204                                 # else node in act_all
 205                                 #   if act_all == 0 length (no previous records)
 206                                 #               then take it from fbrecord.
 207                                 #   else
 208                                 #           take it from act_all.
 209                                 #
 210
 211                                 # We must compare findbad state with act_all state
 212                                 if nodename not in self.act_all:
 213                                         # 1) ok, b/c it's a new problem. set ticket_id to null
 214                                         self.mergedb[loginbase][nodename] = {}
 215                                         self.mergedb[loginbase][nodename].update(x)
 216                                         self.mergedb[loginbase][nodename]['ticket_id'] = ""
 217                                         self.mergedb[loginbase][nodename]['prev_category'] = "NORECORD"
 218                                 else:
 219                                         if len(self.act_all[nodename]) == 0:
 220                                                 self.mergedb[loginbase][nodename] = {}
 221                                                 self.mergedb[loginbase][nodename].update(x)
 222                                                 self.mergedb[loginbase][nodename]['ticket_id'] = ""
 223                                                 self.mergedb[loginbase][nodename]['prev_category'] = "NORECORD"
 224                                         else:
 225                                                 y = self.act_all[nodename][0]
 226                                                 y['prev_category'] = y['category']
 227
 228                                                 self.mergedb[loginbase][nodename] = {}
 229                                                 self.mergedb[loginbase][nodename].update(y)
 230                                                 self.mergedb[loginbase][nodename]['comonstats'] = x['comonstats']
 231                                                 self.mergedb[loginbase][nodename]['category']   = x['category']
 232                                                 self.mergedb[loginbase][nodename]['state'] = x['state']
 233                                                 self.mergedb[loginbase][nodename]['kernel']=x['kernel']
 234                                                 self.mergedb[loginbase][nodename]['bootcd']=x['bootcd']
 235                                                 self.mergedb[loginbase][nodename]['plcnode']=x['plcnode']
 236                                                 ticket = get_ticket_id(self.mergedb[loginbase][nodename])
 237                                                 self.mergedb[loginbase][nodename]['rt'] = mailer.getTicketStatus(ticket)
 238
 239                                         # delete the entry from cache_all to keep it out of case 3)
 240                                         del self.cache_all[nodename]
 241
 242                 # 3) nodes that remin in cache_all were not identified by findbad.
 243                 #        Do we keep them or not?
 244                 #   NOTE: i think that since the categories are performed before this
 245                 #               step now, and by a monitor-controlled agent.
 246
 247                 # TODO: This does not work correctly.  Do we need this?
 248                 #for hn in self.cache_all.keys():
 249                 #       y = self.act_all[hn][0]
 250                 #       if 'monitor' in y['bucket']:
 251                 #               loginbase = self.plcdb_hn2lb[hn]
 252                 #               if loginbase not in self.sickdb:
 253                 #                       self.sickdb[loginbase] = {}
 254                 #               self.sickdb[loginbase][hn] = y
 255                 #       else:
 256                 #               del self.cache_all[hn]
 257
 258                 print "len of cache_all: %d" % len(self.cache_all.keys())
 259                 return
 260
 261         def sendToRT(self):
 262                 sorted_sites = self.mergedb.keys()
 263                 sorted_sites.sort()
 264                 # look at all problems reported by merge
 265                 for loginbase in sorted_sites:
 266                         d_merge_nodes = self.mergedb[loginbase]
 267                         for nodename in d_merge_nodes.keys():
 268                                 record = self.mergedb[loginbase][nodename]
 269                                 self.toRT.put(record)
 270
 271                 # send signal to stop reading
 272                 self.toRT.put(None)
 273                 return
 274
 275 class Diagnose(Thread):
 276         def __init__(self, fromRT):
 277                 self.fromRT = fromRT
 278                 self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")
 279                 self.findbad = database.if_cached_else(1, "findbad", lambda : {})
 280
 281                 self.diagnose_in = {}
 282                 self.diagnose_out = {}
 283                 Thread.__init__(self)
 284
 285
 286         def run(self):
 287                 self.accumSickSites()
 288
 289                 print "Accumulated %d sick sites" % len(self.diagnose_in.keys())
 290                 logger.debug("Accumulated %d sick sites" % len(self.diagnose_in.keys()))
 291
 292                 try:
 293                         stats = self.diagnoseAll()
 294                 except Exception, err:
 295                         print "----------------"
 296                         import traceback
 297                         print traceback.print_exc()
 298                         print err
 299                         #if config.policysavedb:
 300                         sys.exit(1)
 301
 302                 print_stats("sites_observed", stats)
 303                 print_stats("sites_diagnosed", stats)
 304                 print_stats("nodes_diagnosed", stats)
 305
 306                 if config.policysavedb:
 307                         print "Saving Databases... diagnose_out"
 308                         database.dbDump("diagnose_out", self.diagnose_out)
 309
 310         def accumSickSites(self):
 311                 """
 312                 Take all nodes, from l_diagnose, look them up in the diagnose_out database,
 313                 and insert them into diagnose_in[] as:
 314
 315                         diagnose_in[loginbase] = [diag_node1, diag_node2, ...]
 316                 """
 317                 while 1:
 318                         node_record = self.fromRT.get(block = True)
 319                         if node_record == None:
 320                                 break;
 321
 322                         nodename = node_record['nodename']
 323                         loginbase = self.plcdb_hn2lb[nodename]
 324
 325                         if loginbase not in self.diagnose_in:
 326                                 self.diagnose_in[loginbase] = {}
 327
 328                         self.diagnose_in[loginbase][nodename] = node_record
 329
 330                 return
 331
 332         def diagnoseAll(self):
 333                 i_sites_observed = 0
 334                 i_sites_diagnosed = 0
 335                 i_nodes_diagnosed = 0
 336                 i_nodes_actedon = 0
 337                 i_sites_emailed = 0
 338                 l_allsites = []
 339
 340                 sorted_sites = self.diagnose_in.keys()
 341                 sorted_sites.sort()
 342                 self.diagnose_out= {}
 343                 for loginbase in sorted_sites:
 344                         l_allsites += [loginbase]
 345
 346                         d_diag_nodes = self.diagnose_in[loginbase]
 347                         d_act_records = self.__diagnoseSite(loginbase, d_diag_nodes)
 348                         # store records in diagnose_out, for saving later.
 349                         self.diagnose_out.update(d_act_records)
 350
 351                         if len(d_act_records[loginbase]['nodes'].keys()) > 0:
 352                                 i_nodes_diagnosed += (len(d_act_records[loginbase]['nodes'].keys()))
 353                                 i_sites_diagnosed += 1
 354                         i_sites_observed += 1
 355
 356                 return {'sites_observed': i_sites_observed,
 357                                 'sites_diagnosed': i_sites_diagnosed,
 358                                 'nodes_diagnosed': i_nodes_diagnosed,
 359                                 'allsites':l_allsites}
 360
 361                 pass
 362
 363         def getDaysDown(cls, diag_record):
 364                 daysdown = -1
 365                 last_contact = diag_record['plcnode']['last_contact']
 366                 date_created = diag_record['plcnode']['date_created']
 367
 368                 if diag_record['comonstats']['uptime'] != "null" and diag_record['comonstats']['uptime'] != "-1":
 369                         daysdown = - int(float(diag_record['comonstats']['uptime'])) // (60*60*24)
 370                 elif last_contact is None:
 371                         if date_created is not None:
 372                                 now = time.time()
 373                                 diff = now - date_created
 374                                 daysdown = diff // (60*60*24)
 375                         else:
 376                                 daysdown = -1
 377                 else:
 378                         now = time.time()
 379                         diff = now - last_contact
 380                         daysdown = diff // (60*60*24)
 381                 return daysdown
 382         getDaysDown = classmethod(getDaysDown)
 383
 384         def getStrDaysDown(cls, diag_record):
 385                 daysdown = "unknown"
 386                 last_contact = diag_record['plcnode']['last_contact']
 387                 date_created = diag_record['plcnode']['date_created']
 388
 389                 if      diag_record['comonstats']['uptime'] != "null" and \
 390                         diag_record['comonstats']['uptime'] != "-1":
 391                         daysdown = int(float(diag_record['comonstats']['uptime'])) // (60*60*24)
 392                         daysdown = "%d days up" % daysdown
 393
 394                 elif last_contact is None:
 395                         if date_created is not None:
 396                                 now = time.time()
 397                                 diff = now - date_created
 398                                 daysdown = diff // (60*60*24)
 399                                 daysdown = "Never contacted PLC, created %s days ago" % daysdown
 400                         else:
 401                                 daysdown = "Never contacted PLC"
 402                 else:
 403                         now = time.time()
 404                         diff = now - last_contact
 405                         daysdown = diff // (60*60*24)
 406                         daysdown = "%s days down" % daysdown
 407                 return daysdown
 408         getStrDaysDown = classmethod(getStrDaysDown)
 409         #def getStrDaysDown(cls, diag_record):
 410         #       daysdown = cls.getDaysDown(diag_record)
 411         #       if daysdown > -1:
 412         #               return "%d days down"%daysdown
 413         #       elif daysdown == -1:
 414         #               return "Has never contacted PLC"
 415         #       else:
 416         #               return "%d days up"% -daysdown
 417         #getStrDaysDown = classmethod(getStrDaysDown)
 418
 419         def __getCDVersion(self, diag_record, nodename):
 420                 cdversion = ""
 421                 #print "Getting kernel for: %s" % diag_record['nodename']
 422                 cdversion = diag_record['kernel']
 423                 return cdversion
 424
 425         def __diagnoseSite(self, loginbase, d_diag_nodes):
 426                 """
 427                 d_diag_nodes are diagnose_in entries.
 428                 """
 429                 d_diag_site = {loginbase : { 'config' :
 430                                                                                                 {'squeeze': False,
 431                                                                                                  'email': False
 432                                                                                                 },
 433                                                                         'nodes': {}
 434                                                                         }
 435                                            }
 436                 sorted_nodes = d_diag_nodes.keys()
 437                 sorted_nodes.sort()
 438                 for nodename in sorted_nodes:
 439                         node_record = d_diag_nodes[nodename]
 440                         diag_record = self.__diagnoseNode(loginbase, node_record)
 441
 442                         if diag_record != None:
 443                                 d_diag_site[loginbase]['nodes'][nodename] = diag_record
 444
 445                                 # NOTE: improvement means, we need to act/squeeze and email.
 446                                 #print "DIAG_RECORD", diag_record
 447                                 if 'monitor-end-record' in diag_record['stage'] or \
 448                                    'nmreset' in diag_record['stage']:
 449                                 #       print "resetting loginbase!"
 450                                         d_diag_site[loginbase]['config']['squeeze'] = True
 451                                         d_diag_site[loginbase]['config']['email'] = True
 452                                 #else:
 453                                 #       print "NO IMPROVEMENT!!!!"
 454                         else:
 455                                 pass # there is nothing to do for this node.
 456
 457                 # NOTE: these settings can be overridden by command line arguments,
 458                 #       or the state of a record, i.e. if already in RT's Support Queue.
 459                 nodes_up = self.getUpAtSite(loginbase, d_diag_site)
 460                 if nodes_up < MINUP:
 461                         d_diag_site[loginbase]['config']['squeeze'] = True
 462
 463                 max_slices = self.getMaxSlices(loginbase)
 464                 num_nodes = self.getNumNodes(loginbase)
 465                 # NOTE: when max_slices == 0, this is either a new site (the old way)
 466                 #       or an old disabled site from previous monitor (before site['enabled'])
 467                 if nodes_up < num_nodes and max_slices != 0:
 468                         d_diag_site[loginbase]['config']['email'] = True
 469
 470                 if len(d_diag_site[loginbase]['nodes'].keys()) > 0:
 471                         print "SITE: %20s : %d nodes up, at most" % (loginbase, nodes_up)
 472
 473                 return d_diag_site
 474
 475         def diagRecordByCategory(self, node_record):
 476                 nodename = node_record['nodename']
 477                 category = node_record['category']
 478                 state    = node_record['state']
 479                 loginbase = self.plcdb_hn2lb[nodename]
 480                 diag_record = None
 481
 482                 if  "ERROR" in category:        # i.e. "DOWN"
 483                         diag_record = {}
 484                         diag_record.update(node_record)
 485                         daysdown = self.getDaysDown(diag_record)
 486                         if daysdown < 7:
 487                                 format = "DIAG: %20s : %-40s Down only %s days  NOTHING DONE"
 488                                 print format % (loginbase, nodename, daysdown)
 489                                 return None
 490
 491                         s_daysdown = self.getStrDaysDown(diag_record)
 492                         diag_record['message'] = emailTxt.mailtxt.newdown
 493                         diag_record['args'] = {'nodename': nodename}
 494                         diag_record['info'] = (nodename, s_daysdown, "")
 495
 496                         if 'reboot_node_failed' in node_record:
 497                                 # there was a previous attempt to use the PCU.
 498                                 if node_record['reboot_node_failed'] == False:
 499                                         # then the last attempt apparently, succeeded.
 500                                         # But, the category is still 'ERROR'.  Therefore, the
 501                                         # PCU-to-Node mapping is broken.
 502                                         #print "Setting message for ERROR node to PCU2NodeMapping: %s" % nodename
 503                                         diag_record['message'] = emailTxt.mailtxt.pcutonodemapping
 504                                         diag_record['email_pcu'] = True
 505
 506                         if 'ticket_id' in diag_record:
 507                                 if diag_record['ticket_id'] == "":
 508                                         if 'found_rt_ticket' in diag_record:
 509                                                 ticket_id = diag_record['found_rt_ticket']
 510                                         else:
 511                                                 ticket_id = "None"
 512                                 else:
 513                                         ticket_id = diag_record['ticket_id']
 514                         else:
 515                                 ticket_id = "None"
 516
 517                         diag_record['log'] = "DOWN: %20s : %-40s == %20s %s" % \
 518                                         (loginbase, nodename, diag_record['info'][1:], ticket_id)
 519
 520                 elif "OLDBOOTCD" in category:
 521                         # V2 boot cds as determined by findbad
 522                         s_daysdown = self.getStrDaysDown(node_record)
 523                         s_cdversion = self.__getCDVersion(node_record, nodename)
 524                         diag_record = {}
 525                         diag_record.update(node_record)
 526                         #if "2.4" in diag_record['kernel'] or "v2" in diag_record['bootcd']:
 527                         diag_record['message'] = emailTxt.mailtxt.newbootcd
 528                         diag_record['args'] = {'nodename': nodename}
 529                         diag_record['info'] = (nodename, s_daysdown, s_cdversion)
 530                         if diag_record['ticket_id'] == "":
 531                                 diag_record['log'] = "BTCD: %20s : %-40s == %20s %20s %s" % \
 532                                                                         (loginbase, nodename, diag_record['kernel'],
 533                                                                          diag_record['bootcd'], diag_record['found_rt_ticket'])
 534                         else:
 535                                 diag_record['log'] = "BTCD: %20s : %-40s == %20s %20s %s" % \
 536                                                                         (loginbase, nodename, diag_record['kernel'],
 537                                                                          diag_record['bootcd'], diag_record['ticket_id'])
 538
 539                 elif "PROD" in category:
 540                         if "DEBUG" in state:
 541                                 # Not sure what to do with these yet.  Probably need to
 542                                 # reboot, and email.
 543                                 print "DEBG: %20s : %-40s  NOTHING DONE" % (loginbase, nodename)
 544                                 return None
 545                         elif "BOOT" in state:
 546                                 # no action needed.
 547                                 # TODO: remove penalties, if any are applied.
 548                                 now = time.time()
 549                                 last_contact = node_record['plcnode']['last_contact']
 550                                 if last_contact == None:
 551                                         time_diff = 0
 552                                 else:
 553                                         time_diff = now - last_contact;
 554
 555                                 if 'improvement' in node_record['stage']:
 556                                         # then we need to pass this on to 'action'
 557                                         diag_record = {}
 558                                         diag_record.update(node_record)
 559                                         diag_record['message'] = emailTxt.mailtxt.newthankyou
 560                                         diag_record['args'] = {'nodename': nodename}
 561                                         diag_record['info'] = (nodename, node_record['prev_category'],
 562                                                                                                          node_record['category'])
 563                                         if 'email_pcu' in diag_record:
 564                                                 if diag_record['email_pcu']:
 565                                                         # previously, the pcu failed to reboot, so send
 566                                                         # email. Now, reset these values to try the reboot
 567                                                         # again.
 568                                                         diag_record['email_pcu'] = False
 569                                                         del diag_record['reboot_node_failed']
 570
 571                                         if diag_record['ticket_id'] == "":
 572                                                 diag_record['log'] = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
 573                                                                         (loginbase, nodename, diag_record['stage'],
 574                                                                          state, category, diag_record['found_rt_ticket'])
 575                                         else:
 576                                                 diag_record['log'] = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
 577                                                                         (loginbase, nodename, diag_record['stage'],
 578                                                                          state, category, diag_record['ticket_id'])
 579                                         return diag_record
 580                                 #elif time_diff >= 6*SPERHOUR:
 581                                 #       # heartbeat is older than 30 min.
 582                                 #       # then reset NM.
 583                                 #       #print "Possible NM problem!! %s - %s = %s" % (now, last_contact, time_diff)
 584                                 #       diag_record = {}
 585                                 #       diag_record.update(node_record)
 586                                 #       diag_record['message'] = emailTxt.mailtxt.NMReset
 587                                 #       diag_record['args'] = {'nodename': nodename}
 588                                 #       diag_record['stage'] = "nmreset"
 589                                 #       diag_record['info'] = (nodename,
 590                                 #                                                       node_record['prev_category'],
 591                                 #                                                       node_record['category'])
 592                                 #       if diag_record['ticket_id'] == "":
 593                                 #               diag_record['log'] = "NM  : %20s : %-40s == %20s %20s %s %s" % \
 594                                 #                                       (loginbase, nodename, diag_record['stage'],
 595                                 #                                        state, category, diag_record['found_rt_ticket'])
 596                                 #       else:
 597                                 #               diag_record['log'] = "NM  : %20s : %-40s == %20s" % \
 598                                 #                                       (loginbase, nodename, diag_record['stage'])
 599 #
 600 #                                       return diag_record
 601                                 else:
 602                                         return None
 603                         else:
 604                                 # unknown
 605                                 pass
 606                 elif "ALPHA"    in category:
 607                         pass
 608                 elif "clock_drift" in category:
 609                         pass
 610                 elif "dns"    in category:
 611                         pass
 612                 elif "filerw"    in category:
 613                         pass
 614                 else:
 615                         print "Unknown category!!!! %s" % category
 616                         sys.exit(1)
 617
 618                 return diag_record
 619
 620         def __diagnoseNode(self, loginbase, node_record):
 621                 # TODO: change the format of the hostname in this
 622                 #               record to something more natural.
 623                 nodename                = node_record['nodename']
 624                 category                = node_record['category']
 625                 prev_category   = node_record['prev_category']
 626                 state                   = node_record['state']
 627                 #if 'prev_category' in node_record:
 628                 #       prev_category = node_record['prev_category']
 629                 #else:
 630                 #       prev_category = "ERROR"
 631                 if node_record['prev_category'] != "NORECORD":
 632
 633                         val = cmpCategoryVal(category, prev_category)
 634                         print "%s went from %s -> %s" % (nodename, prev_category, category)
 635                         if val == 1:
 636                                 # improved
 637                                 if node_record['ticket_id'] == "" or node_record['ticket_id'] == None:
 638                                         print "closing record with no ticket: ", node_record['nodename']
 639                                         node_record['action'] = ['close_rt']
 640                                         node_record['message'] = None
 641                                         node_record['stage'] = 'monitor-end-record'
 642                                         return node_record
 643                                 else:
 644                                         node_record['stage'] = 'improvement'
 645
 646                                 #if 'monitor-end-record' in node_record['stage']:
 647                                 #       # just ignore it if it's already ended.
 648                                 #       # otherwise, the status should be worse, and we won't get
 649                                 #       # here.
 650                                 #       print "monitor-end-record: ignoring ", node_record['nodename']
 651                                 #       return None
 652 #
 653 #                                       #return None
 654                         elif val == -1:
 655                                 # current category is worse than previous, carry on
 656                                 pass
 657                         else:
 658                                 #values are equal, carry on.
 659                                 #print "why are we here?"
 660                                 pass
 661
 662                 if 'rt' in node_record and 'Status' in node_record['rt']:
 663                         if node_record['stage'] == 'ticket_waitforever':
 664                                 if 'resolved' in node_record['rt']['Status']:
 665                                         print "ending waitforever record for: ", node_record['nodename']
 666                                         node_record['action'] = ['noop']
 667                                         node_record['message'] = None
 668                                         node_record['stage'] = 'monitor-end-record'
 669                                         print "oldlog: %s" % node_record['log'],
 670                                         print "%15s" % node_record['action']
 671                                         return node_record
 672                                 if 'new' in node_record['rt']['Status'] and \
 673                                         'Queue' in node_record['rt'] and \
 674                                         'Monitor' in node_record['rt']['Queue']:
 675
 676                                         print "RESETTING stage to findbad"
 677                                         node_record['stage'] = 'findbad'
 678
 679                 #### COMPARE category and prev_category
 680                 # if not_equal
 681                 #       then assign a stage based on relative priorities
 682                 # else equal
 683                 #       then check category for stats.
 684                 diag_record = self.diagRecordByCategory(node_record)
 685                 if diag_record == None:
 686                         #print "diag_record == None"
 687                         return None
 688
 689                 #### found_RT_ticket
 690                 # TODO: need to record time found, and maybe add a stage for acting on it...
 691                 # NOTE: after found, if the support ticket is resolved, the block is
 692                 #               not removed. How to remove the block on this?
 693                 if 'found_rt_ticket' in diag_record and \
 694                         diag_record['found_rt_ticket'] is not None:
 695                         if diag_record['stage'] is not 'improvement':
 696                                 diag_record['stage'] = 'ticket_waitforever'
 697
 698                 current_time = time.time()
 699                 # take off four days, for the delay that database caused.
 700                 # TODO: generalize delays at PLC, and prevent enforcement when there
 701                 #               have been no emails.
 702                 # NOTE: 7*SPERDAY exists to offset the 'bad week'
 703                 #delta = current_time - diag_record['time'] - 7*SPERDAY
 704                 delta = current_time - diag_record['time']
 705
 706                 message = diag_record['message']
 707                 act_record = {}
 708                 act_record.update(diag_record)
 709
 710                 #### DIAGNOSE STAGES
 711                 if   'findbad' in diag_record['stage']:
 712                         # The node is bad, and there's no previous record of it.
 713                         act_record['email'] = TECH
 714                         act_record['action'] = ['noop']
 715                         act_record['message'] = message[0]
 716                         act_record['stage'] = 'stage_actinoneweek'
 717
 718                 elif 'nmreset' in diag_record['stage']:
 719                         act_record['email']  = ADMIN
 720                         act_record['action'] = ['reset_nodemanager']
 721                         act_record['message'] = message[0]
 722                         act_record['stage']  = 'nmreset'
 723                         return None
 724
 725                 elif 'reboot_node' in diag_record['stage']:
 726                         act_record['email'] = TECH
 727                         act_record['action'] = ['noop']
 728                         act_record['message'] = message[0]
 729                         act_record['stage'] = 'stage_actinoneweek'
 730
 731                 elif 'improvement' in diag_record['stage']:
 732                         # - backoff previous squeeze actions (slice suspend, nocreate)
 733                         # TODO: add a backoff_squeeze section... Needs to runthrough
 734                         print "backing off of %s" % nodename
 735                         act_record['action'] = ['close_rt']
 736                         act_record['message'] = message[0]
 737                         act_record['stage'] = 'monitor-end-record'
 738
 739                 elif 'actinoneweek' in diag_record['stage']:
 740                         if delta >= 7 * SPERDAY:
 741                                 act_record['email'] = TECH | PI
 742                                 act_record['stage'] = 'stage_actintwoweeks'
 743                                 act_record['message'] = message[1]
 744                                 act_record['action'] = ['nocreate' ]
 745                                 act_record['time'] = current_time               # reset clock for waitforever
 746                         elif delta >= 3* SPERDAY and not 'second-mail-at-oneweek' in act_record:
 747                                 act_record['email'] = TECH
 748                                 act_record['message'] = message[0]
 749                                 act_record['action'] = ['sendmailagain-waitforoneweekaction' ]
 750                                 act_record['second-mail-at-oneweek'] = True
 751                         else:
 752                                 act_record['message'] = None
 753                                 act_record['action'] = ['waitforoneweekaction' ]
 754                                 print "ignoring this record for: %s" % act_record['nodename']
 755                                 return None                     # don't send if there's no action
 756
 757                 elif 'actintwoweeks' in diag_record['stage']:
 758                         if delta >= 7 * SPERDAY:
 759                                 act_record['email'] = TECH | PI | USER
 760                                 act_record['stage'] = 'stage_waitforever'
 761                                 act_record['message'] = message[2]
 762                                 act_record['action'] = ['suspendslices']
 763                                 act_record['time'] = current_time               # reset clock for waitforever
 764                         elif delta >= 3* SPERDAY and not 'second-mail-at-twoweeks' in act_record:
 765                                 act_record['email'] = TECH | PI
 766                                 act_record['message'] = message[1]
 767                                 act_record['action'] = ['sendmailagain-waitfortwoweeksaction' ]
 768                                 act_record['second-mail-at-twoweeks'] = True
 769                         else:
 770                                 act_record['message'] = None
 771                                 act_record['action'] = ['waitfortwoweeksaction']
 772                                 return None                     # don't send if there's no action
 773
 774                 elif 'ticket_waitforever' in diag_record['stage']:
 775                         act_record['email'] = TECH
 776                         if 'first-found' not in act_record:
 777                                 act_record['first-found'] = True
 778                                 act_record['log'] += " firstfound"
 779                                 act_record['action'] = ['ticket_waitforever']
 780                                 act_record['message'] = message[0]
 781                                 act_record['time'] = current_time
 782                         else:
 783                                 if delta >= 7*SPERDAY:
 784                                         act_record['action'] = ['ticket_waitforever']
 785                                         if 'rt' in act_record and 'Status' in act_record['rt'] and \
 786                                                         act_record['rt']['Status'] == 'new':
 787                                                 act_record['message'] = message[0]
 788                                         else:
 789                                                 act_record['message'] = None
 790
 791                                         act_record['time'] = current_time               # reset clock
 792                                 else:
 793                                         act_record['action'] = ['ticket_waitforever']
 794                                         act_record['message'] = None
 795                                         return None
 796
 797                 elif 'waitforever' in diag_record['stage']:
 798                         # more than 3 days since last action
 799                         # TODO: send only on weekdays.
 800                         # NOTE: expects that 'time' has been reset before entering waitforever stage
 801                         if delta >= 3*SPERDAY:
 802                                 act_record['action'] = ['email-againwaitforever']
 803                                 act_record['message'] = message[2]
 804                                 act_record['time'] = current_time               # reset clock
 805                         else:
 806                                 act_record['action'] = ['waitforever']
 807                                 act_record['message'] = None
 808                                 return None                     # don't send if there's no action
 809
 810                 else:
 811                         # There is no action to be taken, possibly b/c the stage has
 812                         # already been performed, but diagnose picked it up again.
 813                         # two cases,
 814                         #       1. stage is unknown, or
 815                         #       2. delta is not big enough to bump it to the next stage.
 816                         # TODO: figure out which. for now assume 2.
 817                         print "UNKNOWN stage for %s; nothing done" % nodename
 818                         act_record['action'] = ['unknown']
 819                         act_record['message'] = message[0]
 820
 821                         act_record['email'] = TECH
 822                         act_record['action'] = ['noop']
 823                         act_record['message'] = message[0]
 824                         act_record['stage'] = 'stage_actinoneweek'
 825                         act_record['time'] = current_time               # reset clock
 826                         #print "Exiting..."
 827                         #return None
 828                         #sys.exit(1)
 829
 830                 print "%s" % act_record['log'],
 831                 print "%15s" % act_record['action']
 832                 return act_record
 833
 834         def getMaxSlices(self, loginbase):
 835                 # if sickdb has a loginbase, then it will have at least one node.
 836                 site_stats = None
 837
 838                 for nodename in self.diagnose_in[loginbase].keys():
 839                         if nodename in self.findbad['nodes']:
 840                                 site_stats = self.findbad['nodes'][nodename]['values']['plcsite']
 841                                 break
 842
 843                 if site_stats == None:
 844                         raise Exception, "loginbase with no nodes in findbad"
 845                 else:
 846                         return site_stats['max_slices']
 847
 848         def getNumNodes(self, loginbase):
 849                 # if sickdb has a loginbase, then it will have at least one node.
 850                 site_stats = None
 851
 852                 for nodename in self.diagnose_in[loginbase].keys():
 853                         if nodename in self.findbad['nodes']:
 854                                 site_stats = self.findbad['nodes'][nodename]['values']['plcsite']
 855                                 break
 856
 857                 if site_stats == None:
 858                         raise Exception, "loginbase with no nodes in findbad"
 859                 else:
 860                         if 'num_nodes' in site_stats:
 861                                 return site_stats['num_nodes']
 862                         else:
 863                                 return 0
 864
 865         """
 866         Returns number of up nodes as the total number *NOT* in act_all with a
 867         stage other than 'steady-state' .
 868         """
 869         def getUpAtSite(self, loginbase, d_diag_site):
 870                 # TODO: THIS DOESN"T WORK!!! it misses all the 'debug' state nodes
 871                 #               that aren't recorded yet.
 872
 873                 numnodes = self.getNumNodes(loginbase)
 874                 # NOTE: assume nodes we have no record of are ok. (too conservative)
 875                 # TODO: make the 'up' value more representative
 876                 up = numnodes
 877                 for nodename in d_diag_site[loginbase]['nodes'].keys():
 878
 879                         rec = d_diag_site[loginbase]['nodes'][nodename]
 880                         if rec['stage'] != 'monitor-end-record':
 881                                 up -= 1
 882                         else:
 883                                 pass # the node is assumed to be up.
 884
 885                 #if up != numnodes:
 886                 #       print "ERROR: %s total nodes up and down != %d" % (loginbase, numnodes)
 887
 888                 return up
 889
 890
 891 class SiteAction:
 892         def __init__(self, parameter_names=['hostname', 'ticket_id']):
 893                 self.parameter_names = parameter_names
 894         def checkParam(self, args):
 895                 for param in self.parameter_names:
 896                         if param not in args:
 897                                 raise Exception("Parameter %s not provided in args"%param)
 898         def run(self, args):
 899                 self.checkParam(args)
 900                 return self._run(args)
 901         def _run(self, args):
 902                 pass
 903
 904 class SuspendAction(SiteAction):
 905         def _run(self, args):
 906                 return plc.suspendSlices(args['hostname'])
 907
 908 class RemoveSliceCreation(SiteAction):
 909         def _run(self, args):
 910                 return plc.removeSliceCreation(args['hostname'])
 911
 912 class BackoffActions(SiteAction):
 913         def _run(self, args):
 914                 plc.enableSlices(args['hostname'])
 915                 plc.enableSliceCreation(args['hostname'])
 916                 return True
 917
 918 # TODO: create class for each action below,
 919 #               allow for lists of actions to be performed...
 920
 921
 922
 923 def reset_nodemanager(args):
 924         os.system("ssh root@%s /sbin/service nm restart" % nodename)
 925         return
 926
 927 class Action(Thread):
 928         def __init__(self, l_action):
 929                 self.l_action = l_action
 930
 931                 # the hostname to loginbase mapping
 932                 self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")
 933
 934                 # Actions to take.
 935                 self.diagnose_db = database.if_cached_else(1, "diagnose_out", lambda : {})
 936                 # Actions taken.
 937                 self.act_all   = database.if_cached_else(1, "act_all", lambda : {})
 938
 939                 # A dict of actions to specific functions. PICKLE doesnt' like lambdas.
 940                 self.actions = {}
 941                 self.actions['suspendslices'] = lambda args: plc.suspendSlices(args['hostname'])
 942                 self.actions['nocreate'] = lambda args: plc.removeSliceCreation(args['hostname'])
 943                 self.actions['close_rt'] = lambda args: close_rt_backoff(args)
 944                 self.actions['rins'] = lambda args: plc.nodeBootState(args['hostname'], "rins")
 945                 self.actions['noop'] = lambda args: args
 946                 self.actions['reboot_node'] = lambda args: reboot_node(args)
 947                 self.actions['reset_nodemanager'] = lambda args: args # reset_nodemanager(args)
 948
 949                 self.actions['ticket_waitforever'] = lambda args: args
 950                 self.actions['waitforever'] = lambda args: args
 951                 self.actions['unknown'] = lambda args: args
 952                 self.actions['waitforoneweekaction'] = lambda args: args
 953                 self.actions['waitfortwoweeksaction'] = lambda args: args
 954                 self.actions['sendmailagain-waitforoneweekaction'] = lambda args: args
 955                 self.actions['sendmailagain-waitfortwoweeksaction'] = lambda args: args
 956                 self.actions['email-againwaitforever'] = lambda args: args
 957                 self.actions['email-againticket_waitforever'] = lambda args: args
 958
 959
 960                 self.sickdb = {}
 961                 Thread.__init__(self)
 962
 963         def run(self):
 964                 self.accumSites()
 965                 print "Accumulated %d sick sites" % len(self.sickdb.keys())
 966                 logger.debug("Accumulated %d sick sites" % len(self.sickdb.keys()))
 967
 968                 try:
 969                         stats = self.analyseSites()
 970                 except Exception, err:
 971                         print "----------------"
 972                         import traceback
 973                         print traceback.print_exc()
 974                         print err
 975                         if config.policysavedb:
 976                                 print "Saving Databases... act_all"
 977                                 database.dbDump("act_all", self.act_all)
 978                         sys.exit(1)
 979
 980                 print_stats("sites_observed", stats)
 981                 print_stats("sites_diagnosed", stats)
 982                 print_stats("nodes_diagnosed", stats)
 983                 print_stats("sites_emailed", stats)
 984                 print_stats("nodes_actedon", stats)
 985                 print string.join(stats['allsites'], ",")
 986
 987                 if config.policysavedb:
 988                         print "Saving Databases... act_all"
 989                         #database.dbDump("policy.eventlog", self.eventlog)
 990                         # TODO: remove 'diagnose_out',
 991                         #       or at least the entries that were acted on.
 992                         database.dbDump("act_all", self.act_all)
 993
 994         def accumSites(self):
 995                 """
 996                 Take all nodes, from l_action, look them up in the diagnose_db database,
 997                 and insert them into sickdb[] as:
 998
 999                 This way only the given l_action nodes will be acted on regardless
1000                 of how many from diagnose_db are available.
1001
1002                         sickdb[loginbase][nodename] = diag_record
1003                 """
1004                 # TODO: what if l_action == None ?
1005                 for nodename in self.l_action:
1006
1007                         loginbase = self.plcdb_hn2lb[nodename]
1008
1009                         if loginbase in self.diagnose_db and \
1010                                 nodename in self.diagnose_db[loginbase]['nodes']:
1011
1012                                 diag_record = self.diagnose_db[loginbase]['nodes'][nodename]
1013
1014                                 if loginbase not in self.sickdb:
1015                                         self.sickdb[loginbase] = {'nodes' : {}}
1016
1017                                 # NOTE: don't copy all node records, since not all will be in l_action
1018                                 self.sickdb[loginbase]['nodes'][nodename] = diag_record
1019                                 # NOTE: but, we want to get the loginbase config settings,
1020                                 #               this is the easiest way.
1021                                 self.sickdb[loginbase]['config'] = self.diagnose_db[loginbase]['config']
1022                         #else:
1023                                 #print "%s not in diagnose_db!!" % loginbase
1024                 return
1025
1026         def __emailSite(self, loginbase, roles, message, args):
1027                 """
1028                 loginbase is the unique site abbreviation, prepended to slice names.
1029                 roles contains TECH, PI, USER roles, and derive email aliases.
1030                 record contains {'message': [<subj>,<body>], 'args': {...}}
1031                 """
1032                 ticket_id = 0
1033                 args.update({'loginbase':loginbase})
1034
1035                 if not config.mail and not config.debug and config.bcc:
1036                         roles = ADMIN
1037                 if config.mail and config.debug:
1038                         roles = ADMIN
1039
1040                 # build targets
1041                 contacts = []
1042                 if ADMIN & roles:
1043                         contacts += [config.email]
1044                 if TECH & roles:
1045                         contacts += [TECHEMAIL % loginbase]
1046                 if PI & roles:
1047                         contacts += [PIEMAIL % loginbase]
1048                 if USER & roles:
1049                         slices = plc.slices(loginbase)
1050                         if len(slices) >= 1:
1051                                 for slice in slices:
1052                                         contacts += [SLICEMAIL % slice]
1053                                 print "SLIC: %20s : %d slices" % (loginbase, len(slices))
1054                         else:
1055                                 print "SLIC: %20s : 0 slices" % loginbase
1056
1057                 try:
1058                         subject = message[0] % args
1059                         body = message[1] % args
1060                         if ADMIN & roles:
1061                                 # send only to admin
1062                                 if 'ticket_id' in args:
1063                                         subj = "Re: [PL #%s] %s" % (args['ticket_id'], subject)
1064                                 else:
1065                                         subj = "Re: [PL noticket] %s" % subject
1066                                 mailer.email(subj, body, contacts)
1067                                 ticket_id = args['ticket_id']
1068                         else:
1069                                 ticket_id = mailer.emailViaRT(subject, body, contacts, args['ticket_id'])
1070                 except Exception, err:
1071                         print "exception on message:"
1072                         import traceback
1073                         print traceback.print_exc()
1074                         print message
1075
1076                 return ticket_id
1077
1078
1079         def _format_diaginfo(self, diag_node):
1080                 info = diag_node['info']
1081                 if diag_node['stage'] == 'monitor-end-record':
1082                         hlist = "    %s went from '%s' to '%s'\n" % (info[0], info[1], info[2])
1083                 else:
1084                         hlist = "    %s %s - %s\n" % (info[0], info[2], info[1]) #(node,ver,daysdn)
1085                 return hlist
1086
1087
1088         def get_email_args(self, act_recordlist, loginbase=None):
1089
1090                 email_args = {}
1091                 email_args['hostname_list'] = ""
1092
1093                 for act_record in act_recordlist:
1094                         email_args['hostname_list'] += act_record['msg_format']
1095                         email_args['hostname'] = act_record['nodename']
1096                         if  'plcnode' in act_record and \
1097                                 'pcu_ids' in act_record['plcnode'] and \
1098                                 len(act_record['plcnode']['pcu_ids']) > 0:
1099                                 print "setting 'pcu_id' for email_args %s"%email_args['hostname']
1100                                 email_args['pcu_id'] = act_record['plcnode']['pcu_ids'][0]
1101                         else:
1102                                 email_args['pcu_id'] = "-1"
1103
1104                         if 'ticket_id' in act_record:
1105                                 if act_record['ticket_id'] == 0 or act_record['ticket_id'] == '0':
1106                                         print "Enter the ticket_id for %s @ %s" % (loginbase, act_record['nodename'])
1107                                         sys.stdout.flush()
1108                                         line = sys.stdin.readline()
1109                                         try:
1110                                                 ticket_id = int(line)
1111                                         except:
1112                                                 print "could not get ticket_id from stdin..."
1113                                                 os._exit(1)
1114                                 else:
1115                                         ticket_id = act_record['ticket_id']
1116
1117                                 email_args['ticket_id'] = ticket_id
1118
1119                 return email_args
1120
1121         def get_unique_issues(self, act_recordlist):
1122                 # NOTE: only send one email per site, per problem...
1123                 unique_issues = {}
1124                 for act_record in act_recordlist:
1125                         act_key = act_record['action'][0]
1126                         if act_key not in unique_issues:
1127                                 unique_issues[act_key] = []
1128
1129                         unique_issues[act_key] += [act_record]
1130
1131                 return unique_issues
1132
1133
1134         def __actOnSite(self, loginbase, site_record):
1135                 i_nodes_actedon = 0
1136                 i_nodes_emailed = 0
1137
1138                 act_recordlist = []
1139
1140                 for nodename in site_record['nodes'].keys():
1141                         diag_record = site_record['nodes'][nodename]
1142                         act_record  = self.__actOnNode(diag_record)
1143                         #print "nodename: %s %s" % (nodename, act_record)
1144                         if act_record is not None:
1145                                 act_recordlist += [act_record]
1146
1147                 unique_issues = self.get_unique_issues(act_recordlist)
1148
1149                 for issue in unique_issues.keys():
1150                         print "\tworking on issue: %s" % issue
1151                         issue_record_list = unique_issues[issue]
1152                         email_args = self.get_email_args(issue_record_list, loginbase)
1153
1154                         # for each record.
1155                         for act_record in issue_record_list:
1156                                 # if there's a pcu record and email config is set
1157                                 if 'email_pcu' in act_record:
1158                                         if act_record['message'] != None and act_record['email_pcu'] and site_record['config']['email']:
1159                                                 # and 'reboot_node' in act_record['stage']:
1160
1161                                                 email_args['hostname'] = act_record['nodename']
1162                                                 ticket_id = self.__emailSite(loginbase,
1163                                                                                         act_record['email'],
1164                                                                                         emailTxt.mailtxt.pcudown[0],
1165                                                                                         email_args)
1166                                                 if ticket_id == 0:
1167                                                         # error.
1168                                                         print "got a ticket_id == 0!!!! %s" % act_record['nodename']
1169                                                         os._exit(1)
1170                                                         pass
1171                                                 email_args['ticket_id'] = ticket_id
1172
1173
1174                         act_record = issue_record_list[0]
1175                         # send message before squeezing
1176                         print "\t\tconfig.email: %s and %s" % (act_record['message'] != None,
1177                                                                                                 site_record['config']['email'])
1178                         if act_record['message'] != None and site_record['config']['email']:
1179                                 ticket_id = self.__emailSite(loginbase, act_record['email'],
1180                                                                                          act_record['message'], email_args)
1181
1182                                 if ticket_id == 0:
1183                                         # error.
1184                                         print "ticket_id == 0 for %s %s" % (loginbase, act_record['nodename'])
1185                                         os._exit(1)
1186                                         pass
1187
1188                                 # Add ticket_id to ALL nodenames
1189                                 for act_record in issue_record_list:
1190                                         nodename = act_record['nodename']
1191                                         # update node record with RT ticket_id
1192                                         if nodename in self.act_all:
1193                                                 self.act_all[nodename][0]['ticket_id'] = "%s" % ticket_id
1194                                                 # if the ticket was previously resolved, reset it to new.
1195                                                 if 'rt' in act_record and \
1196                                                         'Status' in act_record['rt'] and \
1197                                                         act_record['rt']['Status'] == 'resolved':
1198                                                         mailer.setTicketStatus(ticket_id, "new")
1199                                                 status = mailer.getTicketStatus(ticket_id)
1200                                                 self.act_all[nodename][0]['rt'] = status
1201                                         if config.mail: i_nodes_emailed += 1
1202
1203                         print "\t\tconfig.squeeze: %s and %s" % (config.squeeze,
1204                                                                                                         site_record['config']['squeeze'])
1205                         if config.squeeze and site_record['config']['squeeze']:
1206                                 for act_key in act_record['action']:
1207                                         self.actions[act_key](email_args)
1208                                 i_nodes_actedon += 1
1209
1210                 if config.policysavedb:
1211                         print "Saving Databases... act_all, diagnose_out"
1212                         database.dbDump("act_all", self.act_all)
1213                         # remove site record from diagnose_out, it's in act_all as done.
1214                         del self.diagnose_db[loginbase]
1215                         database.dbDump("diagnose_out", self.diagnose_db)
1216
1217                 print "sleeping for 1 sec"
1218                 time.sleep(1)
1219                 #print "Hit enter to continue..."
1220                 #sys.stdout.flush()
1221                 #line = sys.stdin.readline()
1222
1223                 return (i_nodes_actedon, i_nodes_emailed)
1224
1225         def __actOnNode(self, diag_record):
1226                 nodename = diag_record['nodename']
1227                 message = diag_record['message']
1228
1229                 act_record = {}
1230                 act_record.update(diag_record)
1231                 act_record['nodename'] = nodename
1232                 act_record['msg_format'] = self._format_diaginfo(diag_record)
1233                 print "act_record['stage'] == %s " % act_record['stage']
1234
1235                 # avoid end records, and nmreset records
1236                 # reboot_node_failed, is set below, so don't reboot repeatedly.
1237
1238                 if 'monitor-end-record' not in act_record['stage'] and \
1239                    'nmreset' not in act_record['stage'] and \
1240                    'reboot_node_failed' not in act_record:
1241
1242                         if "DOWN" in act_record['log'] and \
1243                                         'pcu_ids' in act_record['plcnode'] and \
1244                                         len(act_record['plcnode']['pcu_ids']) > 0:
1245
1246                                 print "%s" % act_record['log'],
1247                                 print "%15s" % (['reboot_node'],)
1248                                 # Set node to re-install
1249                                 plc.nodeBootState(act_record['nodename'], "rins")
1250                                 try:
1251                                         ret = reboot_node({'hostname': act_record['nodename']})
1252                                 except Exception, exc:
1253                                         print "exception on reboot_node:"
1254                                         import traceback
1255                                         print traceback.print_exc()
1256                                         ret = False
1257
1258                                 if ret: # and ( 'reboot_node_failed' not in act_record or act_record['reboot_node_failed'] == False):
1259                                         # Reboot Succeeded
1260                                         print "reboot succeeded for %s" % act_record['nodename']
1261                                         act_record2 = {}
1262                                         act_record2.update(act_record)
1263                                         act_record2['action'] = ['reboot_node']
1264                                         act_record2['stage'] = "reboot_node"
1265                                         act_record2['reboot_node_failed'] = False
1266                                         act_record2['email_pcu'] = False
1267
1268                                         if nodename not in self.act_all:
1269                                                 self.act_all[nodename] = []
1270                                         print "inserting 'reboot_node' record into act_all"
1271                                         self.act_all[nodename].insert(0,act_record2)
1272
1273                                         # return None to avoid further action
1274                                         print "Taking no further action"
1275                                         return None
1276                                 else:
1277                                         print "reboot failed for %s" % act_record['nodename']
1278                                         # set email_pcu to also send pcu notice for this record.
1279                                         act_record['reboot_node_failed'] = True
1280                                         act_record['email_pcu'] = True
1281
1282                         print "%s" % act_record['log'],
1283                         print "%15s" % act_record['action']
1284
1285                 if act_record['stage'] is not 'monitor-end-record' and \
1286                    act_record['stage'] is not 'nmreset':
1287                         if nodename not in self.act_all:
1288                                 self.act_all[nodename] = []
1289
1290                         self.act_all[nodename].insert(0,act_record)
1291                 else:
1292                         print "Not recording %s in act_all" % nodename
1293
1294                 return act_record
1295
1296         def analyseSites(self):
1297                 i_sites_observed = 0
1298                 i_sites_diagnosed = 0
1299                 i_nodes_diagnosed = 0
1300                 i_nodes_actedon = 0
1301                 i_sites_emailed = 0
1302                 l_allsites = []
1303
1304                 sorted_sites = self.sickdb.keys()
1305                 sorted_sites.sort()
1306                 for loginbase in sorted_sites:
1307                         site_record = self.sickdb[loginbase]
1308                         print "sites: %s" % loginbase
1309
1310                         i_nodes_diagnosed += len(site_record.keys())
1311                         i_sites_diagnosed += 1
1312
1313                         (na,ne) = self.__actOnSite(loginbase, site_record)
1314
1315                         i_sites_observed += 1
1316                         i_nodes_actedon += na
1317                         i_sites_emailed += ne
1318
1319                         l_allsites += [loginbase]
1320
1321                 return {'sites_observed': i_sites_observed,
1322                                 'sites_diagnosed': i_sites_diagnosed,
1323                                 'nodes_diagnosed': i_nodes_diagnosed,
1324                                 'sites_emailed': i_sites_emailed,
1325                                 'nodes_actedon': i_nodes_actedon,
1326                                 'allsites':l_allsites}
1327
1328         def print_stats(self, key, stats):
1329                 print "%20s : %d" % (key, stats[key])
1330
1331
1332
1333         #"""
1334         #Prints, logs, and emails status of up nodes, down nodes, and buckets.
1335         #"""
1336         #def status(self):
1337         #       sub = "Monitor Summary"
1338         #       msg = "\nThe following nodes were acted upon:  \n\n"
1339         #       for (node, (type, date)) in self.emailed.items():
1340         #               # Print only things acted on today.
1341         #               if (time.gmtime(time.time())[2] == time.gmtime(date)[2]):
1342         #                       msg +="%s\t(%s)\t%s\n" %(node, type, time.ctime(date))
1343         #       msg +="\n\nThe following sites have been 'squeezed':\n\n"
1344         #       for (loginbase, (date, type)) in self.squeezed.items():
1345         #               # Print only things acted on today.
1346         #               if (time.gmtime(time.time())[2] == time.gmtime(date)[2]):
1347         #                       msg +="%s\t(%s)\t%s\n" %(loginbase, type, time.ctime(date))
1348         #       mailer.email(sub, msg, [SUMTO])
1349         #       logger.info(msg)
1350         #       return
1351
1352         #"""
1353         #Store/Load state of emails.  When, where, what.
1354         #"""
1355         #def emailedStore(self, action):
1356         #       try:
1357         #               if action == "LOAD":
1358         #                       f = open(DAT, "r+")
1359         #                       logger.info("POLICY:  Found and reading " + DAT)
1360         #                       self.emailed.update(pickle.load(f))
1361         #               if action == "WRITE":
1362         #                       f = open(DAT, "w")
1363         #                       #logger.debug("Writing " + DAT)
1364         #                       pickle.dump(self.emailed, f)
1365         #               f.close()
1366         #       except Exception, err:
1367         #               logger.info("POLICY:  Problem with DAT, %s" %err)
1368
1369
1370 #class Policy(Thread):
1371
1372 def main():
1373         print "policy.py is a module, not a script for running directly."
1374
1375 if __name__ == '__main__':
1376         import os
1377         import plc
1378         try:
1379                 main()
1380         except KeyboardInterrupt:
1381                 print "Killed.  Exitting."
1382                 logger.info('Monitor Killed')
1383                 os._exit(0)