commands/policy.py

   1 #!/usr/bin/python
   2
   3 # This script is used to manipulate the operational state of nodes in
   4 # different node groups.  These are basically set operations on nodes via the
   5 # PLC api.
   6 #
   7 # Take the ng name as an argument....
   8 # optionally,
   9 #  * get a list of nodes in the given nodegroup.
  10 #  * set some or all in the set to rins.
  11 #  * restart them all.
  12 #  * do something else to them all.
  13 #
  14
  15 import os
  16 import time
  17 import traceback
  18 import sys
  19 from optparse import OptionParser
  20
  21 from monitor import config
  22 from monitor import parser as parsermodule
  23 from monitor.common import *
  24 from monitor.const import MINUP
  25 from monitor.model import *
  26 from monitor.wrapper import plc
  27 from monitor.wrapper import plccache
  28 from monitor.database.info.model import *
  29 from monitor.database.info.interface import *
  30
  31 from monitor.query import verify,query_to_dict,node_select
  32
# Module-level PLC API handle, created at import time by plc.getAuthAPI().
# NOTE(review): 'api' is not referenced anywhere else in this file as shown;
# confirm whether another module relies on it before removing.
api = plc.getAuthAPI()
  34
  35 def logic():
  36
  37     plc.nodeBootState(host, 'reinstall')
  38     node_end_record(host)
  39
  40 def check_node_and_pcu_status_for(loginbase):
  41     """
  42         this function checks whether all the nodes and associated pcus for a
  43         given site are considered 'good'.
  44
  45         If so, the function returns True.
  46         Otherwise, the function returns False.
  47     """
  48
  49     results = []
  50     for node in plccache.plcdb_lb2hn[loginbase]:
  51
  52         noderec  = FindbadNodeRecord.findby_or_create(hostname=node['hostname'])
  53         nodehist = HistoryNodeRecord.findby_or_create(hostname=node['hostname'])
  54         nodebl   = BlacklistRecord.get_by(hostname=node['hostname'])
  55         pcuhist  = HistoryPCURecord.get_by(plc_pcuid=noderec.plc_pcuid)
  56
  57         if (nodehist is not None and nodehist.status == 'good' and \
  58             ((pcuhist is not None and pcuhist.status == 'good') or (pcuhist is None)) ):
  59             if nodebl is None:             # no entry in blacklist table
  60                 results.append(True)
  61             elif nodebl is not None and nodebl.expired():    # expired entry in blacklist table
  62                 results.append(True)
  63             else:
  64                 results.append(False)    # entry that is not expired.
  65         else:
  66             results.append(False)
  67
  68     try:
  69         print "test: %s" % results
  70         # NOTE: incase results is empty, reduce does not work on an empty set.
  71         return reduce(lambda x,y: x&y, results) and len(results) > MINUP
  72     except:
  73         return False
  74
  75 def main(hostnames, sitenames):
  76     # commands:
  77     i = 1
  78     node_count = 1
  79     site_count = 1
  80     #print "hosts: %s" % hostnames
  81     print "apply-policy"
  82     for i,host in enumerate(hostnames):
  83         try:
  84             lb = plccache.plcdb_hn2lb[host]
  85         except:
  86             print "unknown host in plcdb_hn2lb %s" % host
  87             email_exception("%s %s" % (i,host))
  88             continue
  89
  90         nodeblack = BlacklistRecord.get_by(hostname=host)
  91
  92         if nodeblack and not nodeblack.expired():
  93             print "skipping %s due to blacklist.  will expire %s" % (host, nodeblack.willExpire() )
  94             continue
  95
  96         sitehist = SiteInterface.get_or_make(loginbase=lb)
  97
  98         recent_actions = sitehist.getRecentActions(hostname=host)
  99
 100         nodehist = HistoryNodeRecord.findby_or_create(hostname=host)
 101
 102         print "%s %s %s" % (i, nodehist.hostname, nodehist.status)
 103         if nodehist.status == 'good' and \
 104             changed_lessthan(nodehist.last_changed, 1.0) and \
 105             found_within(recent_actions, 'down_notice', 7.0) and \
 106             not found_within(recent_actions, 'online_notice', 0.5):
 107                 # NOTE: chronicly flapping nodes will not get 'online' notices
 108                 #         since, they are never up long enough to be 'good'.
 109                 # NOTE: searching for down_notice proves that the node has
 110                 #         gone through a 'down' state first, rather than just
 111                 #         flapping through: good, offline, online, ...
 112                 #
 113                 # NOTE: there is a narrow window in which this command must be
 114                 #         evaluated, otherwise the notice will not go out.
 115                 #        this is not ideal.
 116                 sitehist.sendMessage('online_notice', hostname=host, viart=False, saveact=True)
 117                 print "send message for host %s online" % host
 118
 119
 120         # if a node is offline and doesn't have a PCU, remind the user that they should have one.
 121         #if not nodehist.haspcu and nodehist.status in ['offline', 'down'] and \
 122         #    changed_greaterthan(nodehist.last_changed,1.0) and \
 123         #    not found_within(recent_actions, 'pcumissing_notice', 7.0):
 124         #
 125         #        sitehist.sendMessage('pcumissing_notice', hostname=host)
 126         #        print "send message for host %s pcumissing_notice" % host
 127
 128         # if it is offline and HAS a PCU, then try to use it.
 129         if nodehist.haspcu and nodehist.status in ['offline', 'down'] and \
 130             changed_greaterthan(nodehist.last_changed,1.0) and \
 131             not nodehist.firewall and \
 132             not found_between(recent_actions, 'try_reboot', 3.5, 1):
 133
 134                 # TODO: there MUST be a better way to do this...
 135                 # get fb node record for pcuid
 136                 fbpcu = None
 137                 fbnode = FindbadNodeRecord.get_latest_by(hostname=host)
 138                 if fbnode:
 139                     fbpcu = FindbadPCURecord.get_latest_by(plc_pcuid=fbnode.plc_pcuid)
 140
 141                 sitehist.attemptReboot(host)
 142                 print "send message for host %s try_reboot" % host
 143                 if False and not fbpcu.test_is_ok() and \
 144                     not found_within(recent_actions, 'pcuerror_notice', 3.0):
 145
 146                     args = {}
 147                     if fbpcu:
 148                         args['pcu_name'] = fbpcu.pcu_name()
 149                         args['pcu_errors'] = fbpcu.pcu_errors()
 150                         args['plc_pcuid'] = fbpcu.plc_pcuid
 151                     else:
 152                         args['pcu_name'] = "error looking up pcu name"
 153                         args['pcu_errors'] = ""
 154                         args['plc_pcuid'] = 0
 155
 156                     args['hostname'] = host
 157                     sitehist.sendMessage('pcuerror_notice', **args)
 158                     print "send message for host %s PCU Failure" % host
 159
 160
 161         # NOTE: non-intuitive is that found_between(try_reboot, 3.5, 1)
 162         #         will be false for a day after the above condition is satisfied
 163         if False and nodehist.haspcu and nodehist.status in ['offline', 'down'] and \
 164             changed_greaterthan(nodehist.last_changed,1.5) and \
 165             not nodehist.firewall and \
 166             found_between(recent_actions, 'try_reboot', 3.5, 1) and \
 167             not found_within(recent_actions, 'pcufailed_notice', 3.5):
 168
 169                 # TODO: there MUST be a better way to do this...
 170                 # get fb node record for pcuid
 171                 fbpcu = None
 172                 fbnode = FindbadNodeRecord.get_latest_by(hostname=host)
 173                 if fbnode:
 174                     fbpcu = FindbadPCURecord.get_latest_by(plc_pcuid=fbnode.plc_pcuid)
 175                 if fbpcu:
 176                     pcu_name = fbpcu.pcu_name()
 177                 else:
 178                     pcu_name = "error looking up pcu name"
 179
 180                 # get fb pcu record for pcuid
 181                 # send pcu failure message
 182                 sitehist.sendMessage('pcufailed_notice', hostname=host, pcu_name=pcu_name)
 183                 print "send message for host %s PCU Failure" % host
 184
 185         if nodehist.status == 'failboot' and \
 186             changed_greaterthan(nodehist.last_changed, 0.25) and \
 187             not found_between(recent_actions, 'bootmanager_restore', 0.5, 0):
 188                 # send down node notice
 189                 # delay 0.5 days before retrying...
 190
 191                 print "send message for host %s bootmanager_restore" % host
 192                 sitehist.runBootManager(host)
 193             #    sitehist.sendMessage('retry_bootman', hostname=host)
 194
 195         if nodehist.status == 'down' and \
 196             changed_greaterthan(nodehist.last_changed, 2):
 197                 if not nodehist.firewall and not found_within(recent_actions, 'down_notice', 3.5):
 198                     # send down node notice
 199                     sitehist.sendMessage('down_notice', hostname=host)
 200                     print "send message for host %s down" % host
 201
 202                 #if nodehist.firewall and not found_within(recent_actions, 'firewall_notice', 3.5):
 203                     # send down node notice
 204                     #email_exception(host, "firewall_notice")
 205                 #    sitehist.sendMessage('firewall_notice', hostname=host)
 206                 #    print "send message for host %s down" % host
 207
 208         node_count = node_count + 1
 209         print "time: ", time.strftime('%Y-%m-%d %H:%M:%S')
 210         sys.stdout.flush()
 211         session.flush()
 212
 213     for i,site in enumerate(sitenames):
 214         sitehist = SiteInterface.get_or_make(loginbase=site)
 215         siteblack = BlacklistRecord.get_by(loginbase=site)
 216         skip_due_to_blacklist=False
 217
 218         try:
 219             site_exempt = plc.isSiteExempt(site)
 220         except:
 221             site_exempt = False
 222
 223         if siteblack and not siteblack.expired() or site_exempt:
 224             if siteblack:
 225                 print "skipping %s due to blacklist.  will expire %s" % (site, siteblack.willExpire() )
 226             else:
 227                 print "skipping %s due to blacklist." % (site)
 228             skip_due_to_blacklist=True
 229             sitehist.clearPenalty()
 230             sitehist.applyPenalty()
 231             continue
 232
 233         # TODO: make query only return records within a certin time range,
 234         #         i.e. greater than 0.5 days ago. or 5 days, etc.
 235         recent_actions = sitehist.getRecentActions(loginbase=site)
 236
 237         print "%s %s %s" % (i, sitehist.db.loginbase, sitehist.db.status)
 238
 239         if sitehist.db.status == 'down':
 240             if sitehist.db.penalty_pause and \
 241                 changed_greaterthan(sitehist.db.penalty_pause_time, 30):
 242
 243                 email_exception("", "clear pause penalty for site: %s" % sitehist.db.loginbase)
 244                 sitehist.closeTicket()
 245                 # NOTE: but preserve the penalty status.
 246                 sitehist.clearPenaltyPause()
 247
 248             if sitehist.db.message_id != 0 and \
 249                 sitehist.db.message_status == 'open' and \
 250                 not sitehist.db.penalty_pause:
 251
 252                 email_exception("", "pause penalty for site: %s" % sitehist.db.loginbase)
 253                 sitehist.setPenaltyPause()
 254
 255             if  not sitehist.db.penalty_pause and \
 256                 not found_within(recent_actions, 'increase_penalty', 7) and \
 257                 changed_greaterthan(sitehist.db.last_changed, 7):
 258
 259                 # TODO: catch errors
 260                 sitehist.increasePenalty()
 261                 sitehist.applyPenalty()
 262                 sitehist.sendMessage('increase_penalty')
 263
 264                 print "send message for site %s penalty increase" % site
 265
 266         if sitehist.db.status == 'good':
 267             # clear penalty
 268             # NOTE: because 'all clear' should have an indefinite status, we
 269             #         have a boolean value rather than a 'recent action'
 270             if sitehist.db.penalty_applied or sitehist.db.penalty_pause:
 271                 # send message that penalties are cleared.
 272
 273                 sitehist.clearPenalty()
 274                 sitehist.applyPenalty()
 275                 sitehist.sendMessage('clear_penalty')
 276                 sitehist.closeTicket()
 277
 278                 print "send message for site %s penalty cleared" % site
 279
 280             # check all nodes and pcus for this site; if they're all ok,
 281             #         close the ticket, else leave it open.
 282             # NOTE: in the case where a PCU reboots and fails, a message is
 283             #         sent, but the PCU may appear to be ok according to tests.
 284             # NOTE: Also, bootmanager sends messages regarding disks,
 285             #         configuration, etc.  So, the conditions here are 'good'
 286             #         rather than 'not down' as it is in sitebad.
 287             close_ticket = check_node_and_pcu_status_for(site)
 288             if close_ticket:
 289                 sitehist.closeTicket()
 290
 291         site_count = site_count + 1
 292
 293         print "time: ", time.strftime('%Y-%m-%d %H:%M:%S')
 294         sys.stdout.flush()
 295         session.flush()
 296
 297     session.flush()
 298     return
 299
 300
 301 if __name__ == "__main__":
 302     parser = parsermodule.getParser(['nodesets'])
 303     parser.set_defaults( timewait=0,
 304                         skip=0,
 305                         rins=False,
 306                         reboot=False,
 307                         findbad=False,
 308                         force=False,
 309                         nosetup=False,
 310                         verbose=False,
 311                         quiet=False,)
 312
 313     parser.add_option("", "--stopselect", dest="stopselect", metavar="",
 314                         help="The select string that must evaluate to true for the node to be considered 'done'")
 315     parser.add_option("", "--findbad", dest="findbad", action="store_true",
 316                         help="Re-run findbad on the nodes we're going to check before acting.")
 317     parser.add_option("", "--force", dest="force", action="store_true",
 318                         help="Force action regardless of previous actions/logs.")
 319     parser.add_option("", "--rins", dest="rins", action="store_true",
 320                         help="Set the boot_state to 'rins' for all nodes.")
 321     parser.add_option("", "--reboot", dest="reboot", action="store_true",
 322                         help="Actively try to reboot the nodes, keeping a log of actions.")
 323
 324     parser.add_option("", "--verbose", dest="verbose", action="store_true",
 325                         help="Extra debug output messages.")
 326     parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
 327                         help="Do not perform the orginary setup phase.")
 328     parser.add_option("", "--skip", dest="skip",
 329                         help="Number of machines to skip on the input queue.")
 330     parser.add_option("", "--timewait", dest="timewait",
 331                         help="Minutes to wait between iterations of 10 nodes.")
 332
 333     parser = parsermodule.getParser(['defaults'], parser)
 334     config = parsermodule.parse_args(parser)
 335
 336     fbquery = HistoryNodeRecord.query.all()
 337     hostnames = [ n.hostname for n in fbquery ]
 338
 339     fbquery = HistorySiteRecord.query.all()
 340     sitenames = [ s.loginbase for s in fbquery ]
 341
 342     if config.site:
 343         # TODO: replace with calls to local db.  the api fails so often that
 344         #         these calls should be regarded as unreliable.
 345         l_nodes = plccache.GetNodesBySite(config.site)
 346         filter_hostnames = [ n['hostname'] for n in l_nodes ]
 347
 348         hostnames = filter(lambda x: x in filter_hostnames, hostnames)
 349         sitenames = [config.site]
 350
 351     if config.node:
 352         hostnames = [ config.node ]
 353         sitenames = [ plccache.plcdb_hn2lb[config.node] ]
 354
 355     try:
 356         main(hostnames, sitenames)
 357         session.flush()
 358     except KeyboardInterrupt:
 359         print "Killed by interrupt"
 360         session.flush()
 361         sys.exit(0)
 362     except:
 363         email_exception()
 364         print traceback.print_exc();
 365         print "fail all..."