policy.py

   1 #
   2 # Copyright (c) 2004  The Trustees of Princeton University (Trustees).
   3 #
   4 # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
   5 #
   6 # $Id: policy.py,v 1.17 2007/08/29 17:26:50 soltesz Exp $
   7 #
   8 # Policy Engine.
   9
  10 #from monitor import *
  11 from threading import *
  12 import time
  13 import logging
  14 import mailer
  15 import emailTxt
  16 import pickle
  17 import Queue
  18 import plc
  19 import sys
  20 import os
  21 import reboot
  22 import soltesz
  23 import string
  24 from www.printbadnodes import cmpCategoryVal
  25 from config import config
  26 #print "policy"
  27 config = config()
  28
  29 DAT="./monitor.dat"
  30
  31 logger = logging.getLogger("monitor")
  32
  33 # Time to enforce policy
  34 POLSLEEP = 7200
  35
  36 # Where to email the summary
  37 SUMTO = "soltesz@cs.princeton.edu"
  38 TECHEMAIL="tech-%s@sites.planet-lab.org"
  39 PIEMAIL="pi-%s@sites.planet-lab.org"
  40 SLICEMAIL="%s@slices.planet-lab.org"
  41 PLCEMAIL="support@planet-lab.org"
  42
  43 #Thresholds (DAYS)
  44 SPERMIN = 60
  45 SPERHOUR = 60*60
  46 SPERDAY = 86400
  47 PITHRESH = 7 * SPERDAY
  48 SLICETHRESH = 7 * SPERDAY
  49 # Days before attempting rins again
  50 RINSTHRESH = 5 * SPERDAY
  51
  52 # Days before calling the node dead.
  53 DEADTHRESH = 30 * SPERDAY
  54 # Minimum number of nodes up before squeezing
  55 MINUP = 2
  56
  57 TECH=1
  58 PI=2
  59 USER=4
  60 ADMIN=8
  61
  62 # IF:
  63 #  no SSH, down.
  64 #  bad disk, down
  65 #  DNS, kinda down (sick)
  66 #  clock, kinda down (sick)
  67 #  Full disk, going to be down
  68
  69 # Actions:
  70 #  Email
  71 #  suspend slice creation
  72 #  kill slices
  73 def array_to_priority_map(array):
  74         """ Create a mapping where each entry of array is given a priority equal
  75         to its position in the array.  This is useful for subsequent use in the
  76         cmpMap() function."""
  77         map = {}
  78         count = 0
  79         for i in array:
  80                 map[i] = count
  81                 count += 1
  82         return map
  83
  84 def getdebug():
  85         return config.debug
  86
  87 def print_stats(key, stats):
  88         if key in stats: print "%20s : %d" % (key, stats[key])
  89
  90 def get_ticket_id(record):
  91         if 'ticket_id' in record and record['ticket_id'] is not "" and record['ticket_id'] is not None:
  92                 return record['ticket_id']
  93         elif            'found_rt_ticket' in record and \
  94                  record['found_rt_ticket'] is not "" and \
  95                  record['found_rt_ticket'] is not None:
  96                 return record['found_rt_ticket']
  97         else:
  98                 return None
  99
 100 class Merge(Thread):
 101         def __init__(self, l_merge, toRT):
 102                 self.toRT = toRT
 103                 self.merge_list = l_merge
 104                 # the hostname to loginbase mapping
 105                 self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")
 106
 107                 # Previous actions taken on nodes.
 108                 self.act_all = soltesz.if_cached_else(1, "act_all", lambda : {})
 109                 self.findbad = soltesz.if_cached_else(1, "findbad", lambda : {})
 110
 111                 self.cache_all = soltesz.if_cached_else(1, "act_all", lambda : {})
 112                 self.sickdb = {}
 113                 self.mergedb = {}
 114                 Thread.__init__(self)
 115
 116         def run(self):
 117                 # populate sickdb
 118                 self.accumSickSites()
 119                 # read data from findbad and act_all
 120                 self.mergeActionsAndBadDB()
 121                 # pass node_records to RT
 122                 self.sendToRT()
 123
 124         def accumSickSites(self):
 125                 """
 126                 Take all nodes, from l_diagnose, look them up in the act_all database,
 127                 and insert them into sickdb[] as:
 128
 129                         sickdb[loginbase][nodename] = fb_record
 130                 """
 131                 # look at all problems reported by findbad
 132                 l_nodes = self.findbad['nodes'].keys()
 133                 count = 0
 134                 for nodename in l_nodes:
 135                         if nodename not in self.merge_list:
 136                                 continue                # skip this node, since it's not wanted
 137
 138                         count += 1
 139                         loginbase = self.plcdb_hn2lb[nodename]
 140                         values = self.findbad['nodes'][nodename]['values']
 141
 142                         fb_record = {}
 143                         fb_record['nodename'] = nodename
 144                         try:
 145                                 fb_record['category'] = values['category']
 146                         except:
 147                                 print values
 148                                 print nodename
 149                                 print self.findbad['nodes'][nodename]
 150                                 count -= 1
 151                                 continue
 152                         fb_record['state'] = values['state']
 153                         fb_record['comonstats'] = values['comonstats']
 154                         fb_record['plcnode'] = values['plcnode']
 155                         fb_record['kernel'] = self.getKernel(values['kernel'])
 156                         fb_record['stage'] = "findbad"
 157                         fb_record['message'] = None
 158                         fb_record['bootcd'] = values['bootcd']
 159                         fb_record['args'] = None
 160                         fb_record['info'] = None
 161                         fb_record['time'] = time.time()
 162                         fb_record['date_created'] = time.time()
 163
 164                         if loginbase not in self.sickdb:
 165                                 self.sickdb[loginbase] = {}
 166
 167                         self.sickdb[loginbase][nodename] = fb_record
 168
 169                 print "Found %d nodes" % count
 170
 171         def getKernel(self, unamestr):
 172                 s = unamestr.split()
 173                 if len(s) > 2:
 174                         return s[2]
 175                 else:
 176                         return ""
 177
 178         def mergeActionsAndBadDB(self):
 179                 """
 180                 - Look at the sick node_records as reported in findbad,
 181                 - Then look at the node_records in act_all.
 182
 183                 There are four cases:
 184                 1) Problem in findbad, no problem in act_all
 185                         this ok, b/c it just means it's a new problem
 186                 2) Problem in findbad, problem in act_all
 187                         -Did the problem get better or worse?
 188                                 -If Same, or Worse, then continue looking for open tickets.
 189                                 -If Better, or No problem, then "back-off" penalties.
 190                                         This judgement may need to wait until 'Diagnose()'
 191
 192                 3) No problem in findbad, problem in act_all
 193                         The the node is operational again according to Findbad()
 194
 195                 4) No problem in findbad, no problem in act_all
 196                         There won't be a record in either db, so there's no code.
 197                 """
 198
 199                 sorted_sites = self.sickdb.keys()
 200                 sorted_sites.sort()
 201                 # look at all problems reported by findbad
 202                 for loginbase in sorted_sites:
 203                         d_fb_nodes = self.sickdb[loginbase]
 204                         sorted_nodes = d_fb_nodes.keys()
 205                         sorted_nodes.sort()
 206                         for nodename in sorted_nodes:
 207                                 fb_record = self.sickdb[loginbase][nodename]
 208                                 x = fb_record
 209                                 if loginbase not in self.mergedb:
 210                                         self.mergedb[loginbase] = {}
 211
 212                                 # take the info either from act_all or fb-record.
 213                                 # if node not in act_all
 214                                 #       then take it from fbrecord, obviously.
 215                                 # else node in act_all
 216                                 #   if act_all == 0 length (no previous records)
 217                                 #               then take it from fbrecord.
 218                                 #   else
 219                                 #           take it from act_all.
 220                                 #
 221
 222                                 # We must compare findbad state with act_all state
 223                                 if nodename not in self.act_all:
 224                                         # 1) ok, b/c it's a new problem. set ticket_id to null
 225                                         self.mergedb[loginbase][nodename] = {}
 226                                         self.mergedb[loginbase][nodename].update(x)
 227                                         self.mergedb[loginbase][nodename]['ticket_id'] = ""
 228                                         self.mergedb[loginbase][nodename]['prev_category'] = "NORECORD"
 229                                 else:
 230                                         if len(self.act_all[nodename]) == 0:
 231                                                 self.mergedb[loginbase][nodename] = {}
 232                                                 self.mergedb[loginbase][nodename].update(x)
 233                                                 self.mergedb[loginbase][nodename]['ticket_id'] = ""
 234                                                 self.mergedb[loginbase][nodename]['prev_category'] = "NORECORD"
 235                                         else:
 236                                                 y = self.act_all[nodename][0]
 237                                                 y['prev_category'] = y['category']
 238
 239                                                 self.mergedb[loginbase][nodename] = {}
 240                                                 self.mergedb[loginbase][nodename].update(y)
 241                                                 self.mergedb[loginbase][nodename]['comonstats'] = x['comonstats']
 242                                                 self.mergedb[loginbase][nodename]['category']   = x['category']
 243                                                 self.mergedb[loginbase][nodename]['state'] = x['state']
 244                                                 self.mergedb[loginbase][nodename]['kernel']=x['kernel']
 245                                                 self.mergedb[loginbase][nodename]['bootcd']=x['bootcd']
 246                                                 self.mergedb[loginbase][nodename]['plcnode']=x['plcnode']
 247                                                 ticket = get_ticket_id(self.mergedb[loginbase][nodename])
 248                                                 self.mergedb[loginbase][nodename]['rt'] = mailer.getTicketStatus(ticket)
 249
 250                                         # delete the entry from cache_all to keep it out of case 3)
 251                                         del self.cache_all[nodename]
 252
 253                 # 3) nodes that remin in cache_all were not identified by findbad.
 254                 #        Do we keep them or not?
 255                 #   NOTE: i think that since the categories are performed before this
 256                 #               step now, and by a monitor-controlled agent.
 257
 258                 # TODO: This does not work correctly.  Do we need this?
 259                 #for hn in self.cache_all.keys():
 260                 #       y = self.act_all[hn][0]
 261                 #       if 'monitor' in y['bucket']:
 262                 #               loginbase = self.plcdb_hn2lb[hn]
 263                 #               if loginbase not in self.sickdb:
 264                 #                       self.sickdb[loginbase] = {}
 265                 #               self.sickdb[loginbase][hn] = y
 266                 #       else:
 267                 #               del self.cache_all[hn]
 268
 269                 print "len of cache_all: %d" % len(self.cache_all.keys())
 270                 return
 271
 272         def sendToRT(self):
 273                 sorted_sites = self.mergedb.keys()
 274                 sorted_sites.sort()
 275                 # look at all problems reported by merge
 276                 for loginbase in sorted_sites:
 277                         d_merge_nodes = self.mergedb[loginbase]
 278                         for nodename in d_merge_nodes.keys():
 279                                 record = self.mergedb[loginbase][nodename]
 280                                 self.toRT.put(record)
 281
 282                 # send signal to stop reading
 283                 self.toRT.put(None)
 284                 return
 285
 286 class Diagnose(Thread):
 287         def __init__(self, fromRT):
 288                 self.fromRT = fromRT
 289                 self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")
 290                 self.findbad = soltesz.if_cached_else(1, "findbad", lambda : {})
 291
 292                 self.diagnose_in = {}
 293                 self.diagnose_out = {}
 294                 Thread.__init__(self)
 295
 296
 297         def run(self):
 298                 self.accumSickSites()
 299
 300                 print "Accumulated %d sick sites" % len(self.diagnose_in.keys())
 301                 logger.debug("Accumulated %d sick sites" % len(self.diagnose_in.keys()))
 302
 303                 try:
 304                         stats = self.diagnoseAll()
 305                 except Exception, err:
 306                         print "----------------"
 307                         import traceback
 308                         print traceback.print_exc()
 309                         print err
 310                         #if config.policysavedb:
 311                         sys.exit(1)
 312
 313                 print_stats("sites_observed", stats)
 314                 print_stats("sites_diagnosed", stats)
 315                 print_stats("nodes_diagnosed", stats)
 316
 317                 if config.policysavedb:
 318                         print "Saving Databases... diagnose_out"
 319                         soltesz.dbDump("diagnose_out", self.diagnose_out)
 320
 321         def accumSickSites(self):
 322                 """
 323                 Take all nodes, from l_diagnose, look them up in the diagnose_out database,
 324                 and insert them into diagnose_in[] as:
 325
 326                         diagnose_in[loginbase] = [diag_node1, diag_node2, ...]
 327                 """
 328                 while 1:
 329                         node_record = self.fromRT.get(block = True)
 330                         if node_record == None:
 331                                 break;
 332
 333                         nodename = node_record['nodename']
 334                         loginbase = self.plcdb_hn2lb[nodename]
 335
 336                         if loginbase not in self.diagnose_in:
 337                                 self.diagnose_in[loginbase] = {}
 338
 339                         self.diagnose_in[loginbase][nodename] = node_record
 340
 341                 return
 342
 343         def diagnoseAll(self):
 344                 i_sites_observed = 0
 345                 i_sites_diagnosed = 0
 346                 i_nodes_diagnosed = 0
 347                 i_nodes_actedon = 0
 348                 i_sites_emailed = 0
 349                 l_allsites = []
 350
 351                 sorted_sites = self.diagnose_in.keys()
 352                 sorted_sites.sort()
 353                 self.diagnose_out= {}
 354                 for loginbase in sorted_sites:
 355                         l_allsites += [loginbase]
 356
 357                         d_diag_nodes = self.diagnose_in[loginbase]
 358                         d_act_records = self.__diagnoseSite(loginbase, d_diag_nodes)
 359                         # store records in diagnose_out, for saving later.
 360                         self.diagnose_out.update(d_act_records)
 361
 362                         if len(d_act_records[loginbase]['nodes'].keys()) > 0:
 363                                 i_nodes_diagnosed += (len(d_act_records[loginbase]['nodes'].keys()))
 364                                 i_sites_diagnosed += 1
 365                         i_sites_observed += 1
 366
 367                 return {'sites_observed': i_sites_observed,
 368                                 'sites_diagnosed': i_sites_diagnosed,
 369                                 'nodes_diagnosed': i_nodes_diagnosed,
 370                                 'allsites':l_allsites}
 371
 372                 pass
 373
 374         def getDaysDown(cls, diag_record):
 375                 daysdown = -1
 376                 if diag_record['comonstats']['uptime'] != "null":
 377                         #print "uptime %s" % (int(float(diag_record['comonstats']['uptime'])) // (60*60*24))
 378                         daysdown = - int(float(diag_record['comonstats']['uptime'])) // (60*60*24)
 379                 elif diag_record['comonstats']['sshstatus'] != "null":
 380                         daysdown = int(diag_record['comonstats']['sshstatus']) // (60*60*24)
 381                 elif diag_record['comonstats']['lastcotop'] != "null":
 382                         daysdown = int(diag_record['comonstats']['lastcotop']) // (60*60*24)
 383                 else:
 384                         now = time.time()
 385                         last_contact = diag_record['plcnode']['last_contact']
 386                         if last_contact == None:
 387                                 # the node has never been up, so give it a break
 388                                 daysdown = -1
 389                         else:
 390                                 diff = now - last_contact
 391                                 daysdown = diff // (60*60*24)
 392                 return daysdown
 393         getDaysDown = classmethod(getDaysDown)
 394
 395         def getStrDaysDown(cls, diag_record):
 396                 daysdown = cls.getDaysDown(diag_record)
 397                 if daysdown > 0:
 398                         return "%d days down"%daysdown
 399                 elif daysdown == -1:
 400                         return "Unknown number of days"
 401                 else:
 402                         return "%d days up"% -daysdown
 403         getStrDaysDown = classmethod(getStrDaysDown)
 404
 405         def __getCDVersion(self, diag_record, nodename):
 406                 cdversion = ""
 407                 #print "Getting kernel for: %s" % diag_record['nodename']
 408                 cdversion = diag_record['kernel']
 409                 return cdversion
 410
 411         def __diagnoseSite(self, loginbase, d_diag_nodes):
 412                 """
 413                 d_diag_nodes are diagnose_in entries.
 414                 """
 415                 d_diag_site = {loginbase : { 'config' :
 416                                                                                                 {'squeeze': False,
 417                                                                                                  'email': False
 418                                                                                                 },
 419                                                                         'nodes': {}
 420                                                                         }
 421                                            }
 422                 sorted_nodes = d_diag_nodes.keys()
 423                 sorted_nodes.sort()
 424                 for nodename in sorted_nodes:
 425                         node_record = d_diag_nodes[nodename]
 426                         diag_record = self.__diagnoseNode(loginbase, node_record)
 427
 428                         if diag_record != None:
 429                                 d_diag_site[loginbase]['nodes'][nodename] = diag_record
 430
 431                                 # NOTE: improvement means, we need to act/squeeze and email.
 432                                 #print "DIAG_RECORD", diag_record
 433                                 if 'monitor-end-record' in diag_record['stage'] or \
 434                                    'nmreset' in diag_record['stage']:
 435                                 #       print "resetting loginbase!"
 436                                         d_diag_site[loginbase]['config']['squeeze'] = True
 437                                         d_diag_site[loginbase]['config']['email'] = True
 438                                 #else:
 439                                 #       print "NO IMPROVEMENT!!!!"
 440                         else:
 441                                 pass # there is nothing to do for this node.
 442
 443                 # NOTE: these settings can be overridden by command line arguments,
 444                 #       or the state of a record, i.e. if already in RT's Support Queue.
 445                 nodes_up = self.getUpAtSite(loginbase, d_diag_site)
 446                 if nodes_up < MINUP:
 447                         d_diag_site[loginbase]['config']['squeeze'] = True
 448
 449                 max_slices = self.getMaxSlices(loginbase)
 450                 num_nodes = self.getNumNodes(loginbase)
 451                 # NOTE: when max_slices == 0, this is either a new site (the old way)
 452                 #       or an old disabled site from previous monitor (before site['enabled'])
 453                 if nodes_up < num_nodes and max_slices != 0:
 454                         d_diag_site[loginbase]['config']['email'] = True
 455
 456                 if len(d_diag_site[loginbase]['nodes'].keys()) > 0:
 457                         print "SITE: %20s : %d nodes up, at most" % (loginbase, nodes_up)
 458
 459                 return d_diag_site
 460
 461         def diagRecordByCategory(self, node_record):
 462                 nodename = node_record['nodename']
 463                 category = node_record['category']
 464                 state    = node_record['state']
 465                 loginbase = self.plcdb_hn2lb[nodename]
 466                 diag_record = None
 467
 468                 if  "ERROR" in category:        # i.e. "DOWN"
 469                         diag_record = {}
 470                         diag_record.update(node_record)
 471                         daysdown = self.getDaysDown(diag_record)
 472                         if daysdown < 7:
 473                                 format = "DIAG: %20s : %-40s Down only %s days  NOTHING DONE"
 474                                 print format % (loginbase, nodename, daysdown)
 475                                 return None
 476
 477                         s_daysdown = self.getStrDaysDown(diag_record)
 478                         diag_record['message'] = emailTxt.mailtxt.newdown
 479                         diag_record['args'] = {'nodename': nodename}
 480                         diag_record['info'] = (nodename, s_daysdown, "")
 481
 482                         if 'reboot_node_failed' in node_record:
 483                                 # there was a previous attempt to use the PCU.
 484                                 if node_record['reboot_node_failed'] == False:
 485                                         # then the last attempt apparently, succeeded.
 486                                         # But, the category is still 'ERROR'.  Therefore, the
 487                                         # PCU-to-Node mapping is broken.
 488                                         #print "Setting message for ERROR node to PCU2NodeMapping: %s" % nodename
 489                                         diag_record['message'] = emailTxt.mailtxt.pcutonodemapping
 490                                         diag_record['email_pcu'] = True
 491
 492                         if 'ticket_id' in diag_record:
 493                                 if diag_record['ticket_id'] == "":
 494                                         if 'found_rt_ticket' in diag_record:
 495                                                 ticket_id = diag_record['found_rt_ticket']
 496                                         else:
 497                                                 ticket_id = "None"
 498                                 else:
 499                                         ticket_id = diag_record['ticket_id']
 500                         else:
 501                                 ticket_id = "None"
 502
 503                         diag_record['log'] = "DOWN: %20s : %-40s == %20s %s" % \
 504                                         (loginbase, nodename, diag_record['info'][1:], ticket_id)
 505
 506                 elif "OLDBOOTCD" in category:
 507                         # V2 boot cds as determined by findbad
 508                         s_daysdown = self.getStrDaysDown(node_record)
 509                         s_cdversion = self.__getCDVersion(node_record, nodename)
 510                         diag_record = {}
 511                         diag_record.update(node_record)
 512                         #if "2.4" in diag_record['kernel'] or "v2" in diag_record['bootcd']:
 513                         diag_record['message'] = emailTxt.mailtxt.newbootcd
 514                         diag_record['args'] = {'nodename': nodename}
 515                         diag_record['info'] = (nodename, s_daysdown, s_cdversion)
 516                         if diag_record['ticket_id'] == "":
 517                                 diag_record['log'] = "BTCD: %20s : %-40s == %20s %20s %s" % \
 518                                                                         (loginbase, nodename, diag_record['kernel'],
 519                                                                          diag_record['bootcd'], diag_record['found_rt_ticket'])
 520                         else:
 521                                 diag_record['log'] = "BTCD: %20s : %-40s == %20s %20s %s" % \
 522                                                                         (loginbase, nodename, diag_record['kernel'],
 523                                                                          diag_record['bootcd'], diag_record['ticket_id'])
 524
 525                 elif "PROD" in category:
 526                         if "DEBUG" in state:
 527                                 # Not sure what to do with these yet.  Probably need to
 528                                 # reboot, and email.
 529                                 print "DEBG: %20s : %-40s  NOTHING DONE" % (loginbase, nodename)
 530                                 return None
 531                         elif "BOOT" in state:
 532                                 # no action needed.
 533                                 # TODO: remove penalties, if any are applied.
 534                                 now = time.time()
 535                                 last_contact = node_record['plcnode']['last_contact']
 536                                 if last_contact == None:
 537                                         time_diff = 0
 538                                 else:
 539                                         time_diff = now - last_contact;
 540
 541                                 if 'improvement' in node_record['stage']:
 542                                         # then we need to pass this on to 'action'
 543                                         diag_record = {}
 544                                         diag_record.update(node_record)
 545                                         diag_record['message'] = emailTxt.mailtxt.newthankyou
 546                                         diag_record['args'] = {'nodename': nodename}
 547                                         diag_record['info'] = (nodename, node_record['prev_category'],
 548                                                                                                          node_record['category'])
 549                                         if 'email_pcu' in diag_record:
 550                                                 if diag_record['email_pcu']:
 551                                                         # previously, the pcu failed to reboot, so send
 552                                                         # email. Now, reset these values to try the reboot
 553                                                         # again.
 554                                                         diag_record['email_pcu'] = False
 555                                                         del diag_record['reboot_node_failed']
 556
 557                                         if diag_record['ticket_id'] == "":
 558                                                 diag_record['log'] = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
 559                                                                         (loginbase, nodename, diag_record['stage'],
 560                                                                          state, category, diag_record['found_rt_ticket'])
 561                                         else:
 562                                                 diag_record['log'] = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
 563                                                                         (loginbase, nodename, diag_record['stage'],
 564                                                                          state, category, diag_record['ticket_id'])
 565                                         return diag_record
 566                                 #elif time_diff >= 6*SPERHOUR:
 567                                 #       # heartbeat is older than 30 min.
 568                                 #       # then reset NM.
 569                                 #       #print "Possible NM problem!! %s - %s = %s" % (now, last_contact, time_diff)
 570                                 #       diag_record = {}
 571                                 #       diag_record.update(node_record)
 572                                 #       diag_record['message'] = emailTxt.mailtxt.NMReset
 573                                 #       diag_record['args'] = {'nodename': nodename}
 574                                 #       diag_record['stage'] = "nmreset"
 575                                 #       diag_record['info'] = (nodename,
 576                                 #                                                       node_record['prev_category'],
 577                                 #                                                       node_record['category'])
 578                                 #       if diag_record['ticket_id'] == "":
 579                                 #               diag_record['log'] = "NM  : %20s : %-40s == %20s %20s %s %s" % \
 580                                 #                                       (loginbase, nodename, diag_record['stage'],
 581                                 #                                        state, category, diag_record['found_rt_ticket'])
 582                                 #       else:
 583                                 #               diag_record['log'] = "NM  : %20s : %-40s == %20s" % \
 584                                 #                                       (loginbase, nodename, diag_record['stage'])
 585 #
 586 #                                       return diag_record
 587                                 else:
 588                                         return None
 589                         else:
 590                                 # unknown
 591                                 pass
 592                 elif "ALPHA"    in category:
 593                         pass
 594                 elif "clock_drift" in category:
 595                         pass
 596                 elif "dns"    in category:
 597                         pass
 598                 elif "filerw"    in category:
 599                         pass
 600                 else:
 601                         print "Unknown category!!!! %s" % category
 602                         sys.exit(1)
 603
 604                 return diag_record
 605
 606         def __diagnoseNode(self, loginbase, node_record):
 607                 # TODO: change the format of the hostname in this
 608                 #               record to something more natural.
 609                 nodename                = node_record['nodename']
 610                 category                = node_record['category']
 611                 prev_category   = node_record['prev_category']
 612                 state                   = node_record['state']
 613                 #if 'prev_category' in node_record:
 614                 #       prev_category = node_record['prev_category']
 615                 #else:
 616                 #       prev_category = "ERROR"
 617                 if node_record['prev_category'] != "NORECORD":
 618
 619                         val = cmpCategoryVal(category, prev_category)
 620                         print "%s went from %s -> %s" % (nodename, prev_category, category)
 621                         if val == 1:
 622                                 # improved
 623                                 if node_record['ticket_id'] == "" or node_record['ticket_id'] == None:
 624                                         print "closing record with no ticket: ", node_record['nodename']
 625                                         node_record['action'] = ['close_rt']
 626                                         node_record['message'] = None
 627                                         node_record['stage'] = 'monitor-end-record'
 628                                         return node_record
 629                                 else:
 630                                         node_record['stage'] = 'improvement'
 631
 632                                 #if 'monitor-end-record' in node_record['stage']:
 633                                 #       # just ignore it if it's already ended.
 634                                 #       # otherwise, the status should be worse, and we won't get
 635                                 #       # here.
 636                                 #       print "monitor-end-record: ignoring ", node_record['nodename']
 637                                 #       return None
 638 #
 639 #                                       #return None
 640                         elif val == -1:
 641                                 # current category is worse than previous, carry on
 642                                 pass
 643                         else:
 644                                 #values are equal, carry on.
 645                                 #print "why are we here?"
 646                                 pass
 647
 648                 if 'rt' in node_record and 'Status' in node_record['rt']:
 649                         if node_record['stage'] == 'ticket_waitforever':
 650                                 if 'resolved' in node_record['rt']['Status']:
 651                                         print "ending waitforever record for: ", node_record['nodename']
 652                                         node_record['action'] = ['noop']
 653                                         node_record['message'] = None
 654                                         node_record['stage'] = 'monitor-end-record'
 655                                         print "oldlog: %s" % node_record['log'],
 656                                         print "%15s" % node_record['action']
 657                                         return node_record
 658                                 if 'new' in node_record['rt']['Status'] and \
 659                                         'Queue' in node_record['rt'] and \
 660                                         'Monitor' in node_record['rt']['Queue']:
 661
 662                                         print "RESETTING stage to findbad"
 663                                         node_record['stage'] = 'findbad'
 664
 665                 #### COMPARE category and prev_category
 666                 # if not_equal
 667                 #       then assign a stage based on relative priorities
 668                 # else equal
 669                 #       then check category for stats.
 670                 diag_record = self.diagRecordByCategory(node_record)
 671                 if diag_record == None:
 672                         #print "diag_record == None"
 673                         return None
 674
 675                 #### found_RT_ticket
 676                 # TODO: need to record time found, and maybe add a stage for acting on it...
 677                 # NOTE: after found, if the support ticket is resolved, the block is
 678                 #               not removed. How to remove the block on this?
 679                 if 'found_rt_ticket' in diag_record and \
 680                         diag_record['found_rt_ticket'] is not None:
 681                         if diag_record['stage'] is not 'improvement':
 682                                 diag_record['stage'] = 'ticket_waitforever'
 683
 684                 current_time = time.time()
 685                 # take off four days, for the delay that database caused.
 686                 # TODO: generalize delays at PLC, and prevent enforcement when there
 687                 #               have been no emails.
 688                 # NOTE: 7*SPERDAY exists to offset the 'bad week'
 689                 #delta = current_time - diag_record['time'] - 7*SPERDAY
 690                 delta = current_time - diag_record['time']
 691
 692                 message = diag_record['message']
 693                 act_record = {}
 694                 act_record.update(diag_record)
 695
 696                 #### DIAGNOSE STAGES
 697                 if   'findbad' in diag_record['stage']:
 698                         # The node is bad, and there's no previous record of it.
 699                         act_record['email'] = TECH
 700                         act_record['action'] = ['noop']
 701                         act_record['message'] = message[0]
 702                         act_record['stage'] = 'stage_actinoneweek'
 703
 704                 elif 'nmreset' in diag_record['stage']:
 705                         act_record['email']  = ADMIN
 706                         act_record['action'] = ['reset_nodemanager']
 707                         act_record['message'] = message[0]
 708                         act_record['stage']  = 'nmreset'
 709                         return None
 710
 711                 elif 'reboot_node' in diag_record['stage']:
 712                         act_record['email'] = TECH
 713                         act_record['action'] = ['noop']
 714                         act_record['message'] = message[0]
 715                         act_record['stage'] = 'stage_actinoneweek'
 716
 717                 elif 'improvement' in diag_record['stage']:
 718                         # - backoff previous squeeze actions (slice suspend, nocreate)
 719                         # TODO: add a backoff_squeeze section... Needs to runthrough
 720                         print "backing off of %s" % nodename
 721                         act_record['action'] = ['close_rt']
 722                         act_record['message'] = message[0]
 723                         act_record['stage'] = 'monitor-end-record'
 724
 725                 elif 'actinoneweek' in diag_record['stage']:
 726                         if delta >= 7 * SPERDAY:
 727                                 act_record['email'] = TECH | PI
 728                                 act_record['stage'] = 'stage_actintwoweeks'
 729                                 act_record['message'] = message[1]
 730                                 act_record['action'] = ['nocreate' ]
 731                                 act_record['time'] = current_time               # reset clock for waitforever
 732                         elif delta >= 3* SPERDAY and not 'second-mail-at-oneweek' in act_record:
 733                                 act_record['email'] = TECH
 734                                 act_record['message'] = message[0]
 735                                 act_record['action'] = ['sendmailagain-waitforoneweekaction' ]
 736                                 act_record['second-mail-at-oneweek'] = True
 737                         else:
 738                                 act_record['message'] = None
 739                                 act_record['action'] = ['waitforoneweekaction' ]
 740                                 print "ignoring this record for: %s" % act_record['nodename']
 741                                 return None                     # don't send if there's no action
 742
 743                 elif 'actintwoweeks' in diag_record['stage']:
 744                         if delta >= 7 * SPERDAY:
 745                                 act_record['email'] = TECH | PI | USER
 746                                 act_record['stage'] = 'stage_waitforever'
 747                                 act_record['message'] = message[2]
 748                                 act_record['action'] = ['suspendslices']
 749                                 act_record['time'] = current_time               # reset clock for waitforever
 750                         elif delta >= 3* SPERDAY and not 'second-mail-at-twoweeks' in act_record:
 751                                 act_record['email'] = TECH | PI
 752                                 act_record['message'] = message[1]
 753                                 act_record['action'] = ['sendmailagain-waitfortwoweeksaction' ]
 754                                 act_record['second-mail-at-twoweeks'] = True
 755                         else:
 756                                 act_record['message'] = None
 757                                 act_record['action'] = ['waitfortwoweeksaction']
 758                                 return None                     # don't send if there's no action
 759
 760                 elif 'ticket_waitforever' in diag_record['stage']:
 761                         act_record['email'] = TECH
 762                         if 'first-found' not in act_record:
 763                                 act_record['first-found'] = True
 764                                 act_record['log'] += " firstfound"
 765                                 act_record['action'] = ['ticket_waitforever']
 766                                 act_record['message'] = message[0]
 767                                 act_record['time'] = current_time
 768                         else:
 769                                 if delta >= 7*SPERDAY:
 770                                         act_record['action'] = ['ticket_waitforever']
 771                                         if 'rt' in act_record and 'Status' in act_record['rt'] and \
 772                                                         act_record['rt']['Status'] == 'new':
 773                                                 act_record['message'] = message[0]
 774                                         else:
 775                                                 act_record['message'] = None
 776
 777                                         act_record['time'] = current_time               # reset clock
 778                                 else:
 779                                         act_record['action'] = ['ticket_waitforever']
 780                                         act_record['message'] = None
 781                                         return None
 782
 783                 elif 'waitforever' in diag_record['stage']:
 784                         # more than 3 days since last action
 785                         # TODO: send only on weekdays.
 786                         # NOTE: expects that 'time' has been reset before entering waitforever stage
 787                         if delta >= 3*SPERDAY:
 788                                 act_record['action'] = ['email-againwaitforever']
 789                                 act_record['message'] = message[2]
 790                                 act_record['time'] = current_time               # reset clock
 791                         else:
 792                                 act_record['action'] = ['waitforever']
 793                                 act_record['message'] = None
 794                                 return None                     # don't send if there's no action
 795
 796                 else:
 797                         # There is no action to be taken, possibly b/c the stage has
 798                         # already been performed, but diagnose picked it up again.
 799                         # two cases,
 800                         #       1. stage is unknown, or
 801                         #       2. delta is not big enough to bump it to the next stage.
 802                         # TODO: figure out which. for now assume 2.
 803                         print "UNKNOWN stage for %s; nothing done" % nodename
 804                         act_record['action'] = ['unknown']
 805                         act_record['message'] = message[0]
 806
 807                         act_record['email'] = TECH
 808                         act_record['action'] = ['noop']
 809                         act_record['message'] = message[0]
 810                         act_record['stage'] = 'stage_actinoneweek'
 811                         act_record['time'] = current_time               # reset clock
 812                         #print "Exiting..."
 813                         #return None
 814                         #sys.exit(1)
 815
 816                 print "%s" % act_record['log'],
 817                 print "%15s" % act_record['action']
 818                 return act_record
 819
 820         def getMaxSlices(self, loginbase):
 821                 # if sickdb has a loginbase, then it will have at least one node.
 822                 site_stats = None
 823
 824                 for nodename in self.diagnose_in[loginbase].keys():
 825                         if nodename in self.findbad['nodes']:
 826                                 site_stats = self.findbad['nodes'][nodename]['values']['plcsite']
 827                                 break
 828
 829                 if site_stats == None:
 830                         raise Exception, "loginbase with no nodes in findbad"
 831                 else:
 832                         return site_stats['max_slices']
 833
 834         def getNumNodes(self, loginbase):
 835                 # if sickdb has a loginbase, then it will have at least one node.
 836                 site_stats = None
 837
 838                 for nodename in self.diagnose_in[loginbase].keys():
 839                         if nodename in self.findbad['nodes']:
 840                                 site_stats = self.findbad['nodes'][nodename]['values']['plcsite']
 841                                 break
 842
 843                 if site_stats == None:
 844                         raise Exception, "loginbase with no nodes in findbad"
 845                 else:
 846                         if 'num_nodes' in site_stats:
 847                                 return site_stats['num_nodes']
 848                         else:
 849                                 return 0
 850
 851         """
 852         Returns number of up nodes as the total number *NOT* in act_all with a
 853         stage other than 'steady-state' .
 854         """
 855         def getUpAtSite(self, loginbase, d_diag_site):
 856                 # TODO: THIS DOESN"T WORK!!! it misses all the 'debug' state nodes
 857                 #               that aren't recorded yet.
 858
 859                 numnodes = self.getNumNodes(loginbase)
 860                 # NOTE: assume nodes we have no record of are ok. (too conservative)
 861                 # TODO: make the 'up' value more representative
 862                 up = numnodes
 863                 for nodename in d_diag_site[loginbase]['nodes'].keys():
 864
 865                         rec = d_diag_site[loginbase]['nodes'][nodename]
 866                         if rec['stage'] != 'monitor-end-record':
 867                                 up -= 1
 868                         else:
 869                                 pass # the node is assumed to be up.
 870
 871                 #if up != numnodes:
 872                 #       print "ERROR: %s total nodes up and down != %d" % (loginbase, numnodes)
 873
 874                 return up
 875
 876
 877 class SiteAction:
 878         def __init__(self, parameter_names=['hostname', 'ticket_id']):
 879                 self.parameter_names = parameter_names
 880         def checkParam(self, args):
 881                 for param in self.parameter_names:
 882                         if param not in args:
 883                                 raise Exception("Parameter %s not provided in args"%param)
 884         def run(self, args):
 885                 self.checkParam(args)
 886                 return self._run(args)
 887         def _run(self, args):
 888                 pass
 889
 890 class SuspendAction(SiteAction):
 891         def _run(self, args):
 892                 return plc.suspendSlices(args['hostname'])
 893
 894 class RemoveSliceCreation(SiteAction):
 895         def _run(self, args):
 896                 return plc.removeSliceCreation(args['hostname'])
 897
 898 class BackoffActions(SiteAction):
 899         def _run(self, args):
 900                 plc.enableSlices(args['hostname'])
 901                 plc.enableSliceCreation(args['hostname'])
 902                 return True
 903
 904 # TODO: create class for each action below,
 905 #               allow for lists of actions to be performed...
 906
 907 def close_rt_backoff(args):
 908         if 'ticket_id' in args and (args['ticket_id'] != "" and args['ticket_id'] != None):
 909                 mailer.closeTicketViaRT(args['ticket_id'],
 910                                                                 "Ticket CLOSED automatically by SiteAssist.")
 911                 plc.enableSlices(args['hostname'])
 912                 plc.enableSliceCreation(args['hostname'])
 913         return
 914
 915 def reboot_node(args):
 916         host = args['hostname']
 917         return reboot.reboot_policy(host, True, config.debug)
 918
 919 def reset_nodemanager(args):
 920         os.system("ssh root@%s /sbin/service nm restart" % nodename)
 921         return
 922
 923 class Action(Thread):
 924         def __init__(self, l_action):
 925                 self.l_action = l_action
 926
 927                 # the hostname to loginbase mapping
 928                 self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")
 929
 930                 # Actions to take.
 931                 self.diagnose_db = soltesz.if_cached_else(1, "diagnose_out", lambda : {})
 932                 # Actions taken.
 933                 self.act_all   = soltesz.if_cached_else(1, "act_all", lambda : {})
 934
 935                 # A dict of actions to specific functions. PICKLE doesnt' like lambdas.
 936                 self.actions = {}
 937                 self.actions['suspendslices'] = lambda args: plc.suspendSlices(args['hostname'])
 938                 self.actions['nocreate'] = lambda args: plc.removeSliceCreation(args['hostname'])
 939                 self.actions['close_rt'] = lambda args: close_rt_backoff(args)
 940                 self.actions['rins'] = lambda args: plc.nodeBootState(args['hostname'], "rins")
 941                 self.actions['noop'] = lambda args: args
 942                 self.actions['reboot_node'] = lambda args: reboot_node(args)
 943                 self.actions['reset_nodemanager'] = lambda args: args # reset_nodemanager(args)
 944
 945                 self.actions['ticket_waitforever'] = lambda args: args
 946                 self.actions['waitforever'] = lambda args: args
 947                 self.actions['unknown'] = lambda args: args
 948                 self.actions['waitforoneweekaction'] = lambda args: args
 949                 self.actions['waitfortwoweeksaction'] = lambda args: args
 950                 self.actions['sendmailagain-waitforoneweekaction'] = lambda args: args
 951                 self.actions['sendmailagain-waitfortwoweeksaction'] = lambda args: args
 952                 self.actions['email-againwaitforever'] = lambda args: args
 953                 self.actions['email-againticket_waitforever'] = lambda args: args
 954
 955
 956                 self.sickdb = {}
 957                 Thread.__init__(self)
 958
 959         def run(self):
 960                 self.accumSites()
 961                 print "Accumulated %d sick sites" % len(self.sickdb.keys())
 962                 logger.debug("Accumulated %d sick sites" % len(self.sickdb.keys()))
 963
 964                 try:
 965                         stats = self.analyseSites()
 966                 except Exception, err:
 967                         print "----------------"
 968                         import traceback
 969                         print traceback.print_exc()
 970                         print err
 971                         if config.policysavedb:
 972                                 print "Saving Databases... act_all"
 973                                 soltesz.dbDump("act_all", self.act_all)
 974                         sys.exit(1)
 975
 976                 print_stats("sites_observed", stats)
 977                 print_stats("sites_diagnosed", stats)
 978                 print_stats("nodes_diagnosed", stats)
 979                 print_stats("sites_emailed", stats)
 980                 print_stats("nodes_actedon", stats)
 981                 print string.join(stats['allsites'], ",")
 982
 983                 if config.policysavedb:
 984                         print "Saving Databases... act_all"
 985                         #soltesz.dbDump("policy.eventlog", self.eventlog)
 986                         # TODO: remove 'diagnose_out',
 987                         #       or at least the entries that were acted on.
 988                         soltesz.dbDump("act_all", self.act_all)
 989
 990         def accumSites(self):
 991                 """
 992                 Take all nodes, from l_action, look them up in the diagnose_db database,
 993                 and insert them into sickdb[] as:
 994
 995                 This way only the given l_action nodes will be acted on regardless
 996                 of how many from diagnose_db are available.
 997
 998                         sickdb[loginbase][nodename] = diag_record
 999                 """
1000                 # TODO: what if l_action == None ?
1001                 for nodename in self.l_action:
1002
1003                         loginbase = self.plcdb_hn2lb[nodename]
1004
1005                         if loginbase in self.diagnose_db and \
1006                                 nodename in self.diagnose_db[loginbase]['nodes']:
1007
1008                                 diag_record = self.diagnose_db[loginbase]['nodes'][nodename]
1009
1010                                 if loginbase not in self.sickdb:
1011                                         self.sickdb[loginbase] = {'nodes' : {}}
1012
1013                                 # NOTE: don't copy all node records, since not all will be in l_action
1014                                 self.sickdb[loginbase]['nodes'][nodename] = diag_record
1015                                 # NOTE: but, we want to get the loginbase config settings,
1016                                 #               this is the easiest way.
1017                                 self.sickdb[loginbase]['config'] = self.diagnose_db[loginbase]['config']
1018                         #else:
1019                                 #print "%s not in diagnose_db!!" % loginbase
1020                 return
1021
1022         def __emailSite(self, loginbase, roles, message, args):
1023                 """
1024                 loginbase is the unique site abbreviation, prepended to slice names.
1025                 roles contains TECH, PI, USER roles, and derive email aliases.
1026                 record contains {'message': [<subj>,<body>], 'args': {...}}
1027                 """
1028                 ticket_id = 0
1029                 args.update({'loginbase':loginbase})
1030
1031                 if not config.mail and not config.debug and config.bcc:
1032                         roles = ADMIN
1033                 if config.mail and config.debug:
1034                         roles = ADMIN
1035
1036                 # build targets
1037                 contacts = []
1038                 if ADMIN & roles:
1039                         contacts += [config.email]
1040                 if TECH & roles:
1041                         contacts += [TECHEMAIL % loginbase]
1042                 if PI & roles:
1043                         contacts += [PIEMAIL % loginbase]
1044                 if USER & roles:
1045                         slices = plc.slices(loginbase)
1046                         if len(slices) >= 1:
1047                                 for slice in slices:
1048                                         contacts += [SLICEMAIL % slice]
1049                                 print "SLIC: %20s : %d slices" % (loginbase, len(slices))
1050                         else:
1051                                 print "SLIC: %20s : 0 slices" % loginbase
1052
1053                 try:
1054                         subject = message[0] % args
1055                         body = message[1] % args
1056                         if ADMIN & roles:
1057                                 # send only to admin
1058                                 if 'ticket_id' in args:
1059                                         subj = "Re: [PL #%s] %s" % (args['ticket_id'], subject)
1060                                 else:
1061                                         subj = "Re: [PL noticket] %s" % subject
1062                                 mailer.email(subj, body, contacts)
1063                                 ticket_id = args['ticket_id']
1064                         else:
1065                                 ticket_id = mailer.emailViaRT(subject, body, contacts, args['ticket_id'])
1066                 except Exception, err:
1067                         print "exception on message:"
1068                         import traceback
1069                         print traceback.print_exc()
1070                         print message
1071
1072                 return ticket_id
1073
1074
1075         def _format_diaginfo(self, diag_node):
1076                 info = diag_node['info']
1077                 if diag_node['stage'] == 'monitor-end-record':
1078                         hlist = "    %s went from '%s' to '%s'\n" % (info[0], info[1], info[2])
1079                 else:
1080                         hlist = "    %s %s - %s\n" % (info[0], info[2], info[1]) #(node,ver,daysdn)
1081                 return hlist
1082
1083
1084         def get_email_args(self, act_recordlist, loginbase=None):
1085
1086                 email_args = {}
1087                 email_args['hostname_list'] = ""
1088
1089                 for act_record in act_recordlist:
1090                         email_args['hostname_list'] += act_record['msg_format']
1091                         email_args['hostname'] = act_record['nodename']
1092                         if  'plcnode' in act_record and \
1093                                 'pcu_ids' in act_record['plcnode'] and \
1094                                 len(act_record['plcnode']['pcu_ids']) > 0:
1095                                 print "setting 'pcu_id' for email_args %s"%email_args['hostname']
1096                                 email_args['pcu_id'] = act_record['plcnode']['pcu_ids'][0]
1097                         else:
1098                                 email_args['pcu_id'] = "-1"
1099
1100                         if 'ticket_id' in act_record:
1101                                 if act_record['ticket_id'] == 0 or act_record['ticket_id'] == '0':
1102                                         print "Enter the ticket_id for %s @ %s" % (loginbase, act_record['nodename'])
1103                                         sys.stdout.flush()
1104                                         line = sys.stdin.readline()
1105                                         try:
1106                                                 ticket_id = int(line)
1107                                         except:
1108                                                 print "could not get ticket_id from stdin..."
1109                                                 os._exit(1)
1110                                 else:
1111                                         ticket_id = act_record['ticket_id']
1112
1113                                 email_args['ticket_id'] = ticket_id
1114
1115                 return email_args
1116
1117         def get_unique_issues(self, act_recordlist):
1118                 # NOTE: only send one email per site, per problem...
1119                 unique_issues = {}
1120                 for act_record in act_recordlist:
1121                         act_key = act_record['action'][0]
1122                         if act_key not in unique_issues:
1123                                 unique_issues[act_key] = []
1124
1125                         unique_issues[act_key] += [act_record]
1126
1127                 return unique_issues
1128
1129
1130         def __actOnSite(self, loginbase, site_record):
1131                 i_nodes_actedon = 0
1132                 i_nodes_emailed = 0
1133
1134                 act_recordlist = []
1135
1136                 for nodename in site_record['nodes'].keys():
1137                         diag_record = site_record['nodes'][nodename]
1138                         act_record  = self.__actOnNode(diag_record)
1139                         #print "nodename: %s %s" % (nodename, act_record)
1140                         if act_record is not None:
1141                                 act_recordlist += [act_record]
1142
1143                 unique_issues = self.get_unique_issues(act_recordlist)
1144
1145                 for issue in unique_issues.keys():
1146                         print "\tworking on issue: %s" % issue
1147                         issue_record_list = unique_issues[issue]
1148                         email_args = self.get_email_args(issue_record_list, loginbase)
1149
1150                         # for each record.
1151                         for act_record in issue_record_list:
1152                                 # if there's a pcu record and email config is set
1153                                 if 'email_pcu' in act_record:
1154                                         if act_record['message'] != None and act_record['email_pcu'] and site_record['config']['email']:
1155                                                 # and 'reboot_node' in act_record['stage']:
1156
1157                                                 email_args['hostname'] = act_record['nodename']
1158                                                 ticket_id = self.__emailSite(loginbase,
1159                                                                                         act_record['email'],
1160                                                                                         emailTxt.mailtxt.pcudown[0],
1161                                                                                         email_args)
1162                                                 if ticket_id == 0:
1163                                                         # error.
1164                                                         print "got a ticket_id == 0!!!! %s" % act_record['nodename']
1165                                                         os._exit(1)
1166                                                         pass
1167                                                 email_args['ticket_id'] = ticket_id
1168
1169
1170                         act_record = issue_record_list[0]
1171                         # send message before squeezing
1172                         print "\t\tconfig.email: %s and %s" % (act_record['message'] != None,
1173                                                                                                 site_record['config']['email'])
1174                         if act_record['message'] != None and site_record['config']['email']:
1175                                 ticket_id = self.__emailSite(loginbase, act_record['email'],
1176                                                                                          act_record['message'], email_args)
1177
1178                                 if ticket_id == 0:
1179                                         # error.
1180                                         print "ticket_id == 0 for %s %s" % (loginbase, act_record['nodename'])
1181                                         os._exit(1)
1182                                         pass
1183
1184                                 # Add ticket_id to ALL nodenames
1185                                 for act_record in issue_record_list:
1186                                         nodename = act_record['nodename']
1187                                         # update node record with RT ticket_id
1188                                         if nodename in self.act_all:
1189                                                 self.act_all[nodename][0]['ticket_id'] = "%s" % ticket_id
1190                                                 # if the ticket was previously resolved, reset it to new.
1191                                                 if 'rt' in act_record and \
1192                                                         'Status' in act_record['rt'] and \
1193                                                         act_record['rt']['Status'] == 'resolved':
1194                                                         mailer.setTicketStatus(ticket_id, "new")
1195                                                 status = mailer.getTicketStatus(ticket_id)
1196                                                 self.act_all[nodename][0]['rt'] = status
1197                                         if config.mail: i_nodes_emailed += 1
1198
1199                         print "\t\tconfig.squeeze: %s and %s" % (config.squeeze,
1200                                                                                                         site_record['config']['squeeze'])
1201                         if config.squeeze and site_record['config']['squeeze']:
1202                                 for act_key in act_record['action']:
1203                                         self.actions[act_key](email_args)
1204                                 i_nodes_actedon += 1
1205
1206                 if config.policysavedb:
1207                         print "Saving Databases... act_all, diagnose_out"
1208                         soltesz.dbDump("act_all", self.act_all)
1209                         # remove site record from diagnose_out, it's in act_all as done.
1210                         del self.diagnose_db[loginbase]
1211                         soltesz.dbDump("diagnose_out", self.diagnose_db)
1212
1213                 print "sleeping for 1 sec"
1214                 time.sleep(1)
1215                 #print "Hit enter to continue..."
1216                 #sys.stdout.flush()
1217                 #line = sys.stdin.readline()
1218
1219                 return (i_nodes_actedon, i_nodes_emailed)
1220
1221         def __actOnNode(self, diag_record):
1222                 nodename = diag_record['nodename']
1223                 message = diag_record['message']
1224
1225                 act_record = {}
1226                 act_record.update(diag_record)
1227                 act_record['nodename'] = nodename
1228                 act_record['msg_format'] = self._format_diaginfo(diag_record)
1229                 print "act_record['stage'] == %s " % act_record['stage']
1230
1231                 # avoid end records, and nmreset records
1232                 # reboot_node_failed, is set below, so don't reboot repeatedly.
1233
1234                 if 'monitor-end-record' not in act_record['stage'] and \
1235                    'nmreset' not in act_record['stage'] and \
1236                    'reboot_node_failed' not in act_record:
1237
1238                         if "DOWN" in act_record['log'] and \
1239                                         'pcu_ids' in act_record['plcnode'] and \
1240                                         len(act_record['plcnode']['pcu_ids']) > 0:
1241
1242                                 print "%s" % act_record['log'],
1243                                 print "%15s" % (['reboot_node'],)
1244                                 # Set node to re-install
1245                                 plc.nodeBootState(act_record['nodename'], "rins")
1246                                 try:
1247                                         ret = reboot_node({'hostname': act_record['nodename']})
1248                                 except Exception, exc:
1249                                         print "exception on reboot_node:"
1250                                         import traceback
1251                                         print traceback.print_exc()
1252                                         ret = False
1253
1254                                 if ret: # and ( 'reboot_node_failed' not in act_record or act_record['reboot_node_failed'] == False):
1255                                         # Reboot Succeeded
1256                                         print "reboot succeeded for %s" % act_record['nodename']
1257                                         act_record2 = {}
1258                                         act_record2.update(act_record)
1259                                         act_record2['action'] = ['reboot_node']
1260                                         act_record2['stage'] = "reboot_node"
1261                                         act_record2['reboot_node_failed'] = False
1262                                         act_record2['email_pcu'] = False
1263
1264                                         if nodename not in self.act_all:
1265                                                 self.act_all[nodename] = []
1266                                         print "inserting 'reboot_node' record into act_all"
1267                                         self.act_all[nodename].insert(0,act_record2)
1268
1269                                         # return None to avoid further action
1270                                         print "Taking no further action"
1271                                         return None
1272                                 else:
1273                                         print "reboot failed for %s" % act_record['nodename']
1274                                         # set email_pcu to also send pcu notice for this record.
1275                                         act_record['reboot_node_failed'] = True
1276                                         act_record['email_pcu'] = True
1277
1278                         print "%s" % act_record['log'],
1279                         print "%15s" % act_record['action']
1280
1281                 if act_record['stage'] is not 'monitor-end-record' and \
1282                    act_record['stage'] is not 'nmreset':
1283                         if nodename not in self.act_all:
1284                                 self.act_all[nodename] = []
1285
1286                         self.act_all[nodename].insert(0,act_record)
1287                 else:
1288                         print "Not recording %s in act_all" % nodename
1289
1290                 return act_record
1291
1292         def analyseSites(self):
1293                 i_sites_observed = 0
1294                 i_sites_diagnosed = 0
1295                 i_nodes_diagnosed = 0
1296                 i_nodes_actedon = 0
1297                 i_sites_emailed = 0
1298                 l_allsites = []
1299
1300                 sorted_sites = self.sickdb.keys()
1301                 sorted_sites.sort()
1302                 for loginbase in sorted_sites:
1303                         site_record = self.sickdb[loginbase]
1304                         print "sites: %s" % loginbase
1305
1306                         i_nodes_diagnosed += len(site_record.keys())
1307                         i_sites_diagnosed += 1
1308
1309                         (na,ne) = self.__actOnSite(loginbase, site_record)
1310
1311                         i_sites_observed += 1
1312                         i_nodes_actedon += na
1313                         i_sites_emailed += ne
1314
1315                         l_allsites += [loginbase]
1316
1317                 return {'sites_observed': i_sites_observed,
1318                                 'sites_diagnosed': i_sites_diagnosed,
1319                                 'nodes_diagnosed': i_nodes_diagnosed,
1320                                 'sites_emailed': i_sites_emailed,
1321                                 'nodes_actedon': i_nodes_actedon,
1322                                 'allsites':l_allsites}
1323
1324         def print_stats(self, key, stats):
1325                 print "%20s : %d" % (key, stats[key])
1326
1327
1328
1329         #"""
1330         #Prints, logs, and emails status of up nodes, down nodes, and buckets.
1331         #"""
1332         #def status(self):
1333         #       sub = "Monitor Summary"
1334         #       msg = "\nThe following nodes were acted upon:  \n\n"
1335         #       for (node, (type, date)) in self.emailed.items():
1336         #               # Print only things acted on today.
1337         #               if (time.gmtime(time.time())[2] == time.gmtime(date)[2]):
1338         #                       msg +="%s\t(%s)\t%s\n" %(node, type, time.ctime(date))
1339         #       msg +="\n\nThe following sites have been 'squeezed':\n\n"
1340         #       for (loginbase, (date, type)) in self.squeezed.items():
1341         #               # Print only things acted on today.
1342         #               if (time.gmtime(time.time())[2] == time.gmtime(date)[2]):
1343         #                       msg +="%s\t(%s)\t%s\n" %(loginbase, type, time.ctime(date))
1344         #       mailer.email(sub, msg, [SUMTO])
1345         #       logger.info(msg)
1346         #       return
1347
1348         #"""
1349         #Store/Load state of emails.  When, where, what.
1350         #"""
1351         #def emailedStore(self, action):
1352         #       try:
1353         #               if action == "LOAD":
1354         #                       f = open(DAT, "r+")
1355         #                       logger.info("POLICY:  Found and reading " + DAT)
1356         #                       self.emailed.update(pickle.load(f))
1357         #               if action == "WRITE":
1358         #                       f = open(DAT, "w")
1359         #                       #logger.debug("Writing " + DAT)
1360         #                       pickle.dump(self.emailed, f)
1361         #               f.close()
1362         #       except Exception, err:
1363         #               logger.info("POLICY:  Problem with DAT, %s" %err)
1364
1365
1366 #class Policy(Thread):
1367
def main():
        """Print a notice that this file is a module and not a runnable script."""
        notice = "policy.py is a module, not a script for running directly."
        print(notice)
1370
if __name__ == '__main__':
        # Script entry point: run main() and, when interrupted with Ctrl-C,
        # report it on stdout and in the monitor log before exiting with
        # status 0 via os._exit().
        import plc
        import os
        try:
                main()
        except KeyboardInterrupt:
                print("Killed.  Exitting.")
                logger.info('Monitor Killed')
                os._exit(0)