commands/policy.py

   1 #!/usr/bin/python
   2
   3 # This script is used to manipulate the operational state of nodes in
   4 # different node groups.  These are basically set operations on nodes via the
   5 # PLC api.
   6 #
   7 # Take the ng name as an argument....
   8 # optionally,
   9 #  * get a list of nodes in the given nodegroup.
  10 #  * set some or all in the set to rins.
  11 #  * restart them all.
  12 #  * do something else to them all.
  13 #
  14
  15 import os
  16 import time
  17 import traceback
  18 import sys
  19 from optparse import OptionParser
  20
  21 from monitor import config
  22 from monitor import parser as parsermodule
  23 from monitor.common import *
  24 from monitor.const import MINUP
  25 from monitor.model import *
  26 from monitor.wrapper import plc
  27 from monitor.wrapper import plccache
  28 from monitor.database.info.model import *
  29 from monitor.database.info.interface import *
  30
  31 from monitor.query import verify,query_to_dict,node_select
  32
  33 api = plc.getAuthAPI()
  34
def logic():
        """
        Set a node's boot state to 'reinstall' through plc.nodeBootState()
        and then call node_end_record() for the same node.

        NOTE(review): `host` is neither a parameter nor assigned at module
        level anywhere in the visible part of this file, and nothing visible
        here calls logic().  Unless one of the star-imports happens to bind
        `host`, calling this function raises NameError -- confirm whether
        this is leftover code before relying on it.
        """

        plc.nodeBootState(host, 'reinstall')
        node_end_record(host)
  39
  40 def check_node_and_pcu_status_for(loginbase):
  41         """
  42                 this function checks whether all the nodes and associated pcus for a
  43                 given site are considered 'good'.
  44
  45                 If so, the function returns True.
  46                 Otherwise, the function returns False.
  47         """
  48
  49         results = []
  50         for node in plccache.plcdb_lb2hn[loginbase]:
  51
  52                 noderec  = FindbadNodeRecord.findby_or_create(hostname=node['hostname'])
  53                 nodehist = HistoryNodeRecord.findby_or_create(hostname=node['hostname'])
  54                 nodebl   = BlacklistRecord.get_by(hostname=node['hostname'])
  55                 pcuhist  = HistoryPCURecord.get_by(plc_pcuid=noderec.plc_pcuid)
  56
  57                 if (nodehist is not None and nodehist.status == 'good' and \
  58                         ((pcuhist is not None and pcuhist.status == 'good') or (pcuhist is None)) ):
  59                         if nodebl is None:                      # no entry in blacklist table
  60                                 results.append(True)
  61                         elif nodebl is not None and nodebl.expired():   # expired entry in blacklist table
  62                                 results.append(True)
  63                         else:
  64                                 results.append(False)   # entry that is not expired.
  65                 else:
  66                         results.append(False)
  67
  68         try:
  69                 print "test: %s" % results
  70                 # NOTE: incase results is empty, reduce does not work on an empty set.
  71                 return reduce(lambda x,y: x&y, results) and len(results) > MINUP
  72         except:
  73                 return False
  74
  75 def main(hostnames, sitenames):
  76         # commands:
  77         i = 1
  78         node_count = 1
  79         site_count = 1
  80         #print "hosts: %s" % hostnames
  81         print "apply-policy"
  82         for i,host in enumerate(hostnames):
  83                 try:
  84                         lb = plccache.plcdb_hn2lb[host]
  85                 except:
  86                         print "unknown host in plcdb_hn2lb %s" % host
  87                         email_exception("%s %s" % (i,host))
  88                         continue
  89
  90                 nodeblack = BlacklistRecord.get_by(hostname=host)
  91
  92                 if nodeblack and not nodeblack.expired():
  93                         print "skipping %s due to blacklist.  will expire %s" % (host, nodeblack.willExpire() )
  94                         continue
  95
  96                 sitehist = SiteInterface.get_or_make(loginbase=lb)
  97
  98                 recent_actions = sitehist.getRecentActions(hostname=host)
  99
 100                 nodehist = HistoryNodeRecord.findby_or_create(hostname=host)
 101
 102                 print "%s %s %s" % (i, nodehist.hostname, nodehist.status)
 103                 if nodehist.status == 'good' and \
 104                         changed_lessthan(nodehist.last_changed, 1.0) and \
 105                         found_within(recent_actions, 'down_notice', 7.0) and \
 106                         not found_within(recent_actions, 'online_notice', 0.5):
 107                                 # NOTE: chronicly flapping nodes will not get 'online' notices
 108                                 #               since, they are never up long enough to be 'good'.
 109                                 # NOTE: searching for down_notice proves that the node has
 110                                 #               gone through a 'down' state first, rather than just
 111                                 #               flapping through: good, offline, online, ...
 112                                 #
 113                                 # NOTE: there is a narrow window in which this command must be
 114                                 #               evaluated, otherwise the notice will not go out.
 115                                 #               this is not ideal.
 116                                 sitehist.sendMessage('online_notice', hostname=host, viart=False, saveact=True)
 117                                 print "send message for host %s online" % host
 118
 119
 120                 # if a node is offline and doesn't have a PCU, remind the user that they should have one.
 121                 #if not nodehist.haspcu and nodehist.status in ['offline', 'down'] and \
 122                 #       changed_greaterthan(nodehist.last_changed,1.0) and \
 123                 #       not found_within(recent_actions, 'pcumissing_notice', 7.0):
 124                 #
 125                 #               sitehist.sendMessage('pcumissing_notice', hostname=host)
 126                 #               print "send message for host %s pcumissing_notice" % host
 127
 128                 # if it is offline and HAS a PCU, then try to use it.
 129                 if nodehist.haspcu and nodehist.status in ['offline', 'down'] and \
 130                         changed_greaterthan(nodehist.last_changed,1.0) and \
 131                         not nodehist.firewall and \
 132                         not found_between(recent_actions, 'try_reboot', 3.5, 1):
 133
 134                                 # TODO: there MUST be a better way to do this...
 135                                 # get fb node record for pcuid
 136                                 fbpcu = None
 137                                 fbnode = FindbadNodeRecord.get_latest_by(hostname=host)
 138                                 if fbnode:
 139                                         fbpcu = FindbadPCURecord.get_latest_by(plc_pcuid=fbnode.plc_pcuid)
 140
 141                                 sitehist.attemptReboot(host)
 142                                 print "send message for host %s try_reboot" % host
 143                                 if False and not fbpcu.test_is_ok() and \
 144                                         not found_within(recent_actions, 'pcuerror_notice', 3.0):
 145
 146                                         args = {}
 147                                         if fbpcu:
 148                                                 args['pcu_name'] = fbpcu.pcu_name()
 149                                                 args['pcu_errors'] = fbpcu.pcu_errors()
 150                                                 args['plc_pcuid'] = fbpcu.plc_pcuid
 151                                         else:
 152                                                 args['pcu_name'] = "error looking up pcu name"
 153                                                 args['pcu_errors'] = ""
 154                                                 args['plc_pcuid'] = 0
 155
 156                                         args['hostname'] = host
 157                                         sitehist.sendMessage('pcuerror_notice', **args)
 158                                         print "send message for host %s PCU Failure" % host
 159
 160
 161                 # NOTE: non-intuitive is that found_between(try_reboot, 3.5, 1)
 162                 #               will be false for a day after the above condition is satisfied
 163                 if False and nodehist.haspcu and nodehist.status in ['offline', 'down'] and \
 164                         changed_greaterthan(nodehist.last_changed,1.5) and \
 165                         not nodehist.firewall and \
 166                         found_between(recent_actions, 'try_reboot', 3.5, 1) and \
 167                         not found_within(recent_actions, 'pcufailed_notice', 3.5):
 168
 169                                 # TODO: there MUST be a better way to do this...
 170                                 # get fb node record for pcuid
 171                                 fbpcu = None
 172                                 fbnode = FindbadNodeRecord.get_latest_by(hostname=host)
 173                                 if fbnode:
 174                                         fbpcu = FindbadPCURecord.get_latest_by(plc_pcuid=fbnode.plc_pcuid)
 175                                 if fbpcu:
 176                                         pcu_name = fbpcu.pcu_name()
 177                                 else:
 178                                         pcu_name = "error looking up pcu name"
 179
 180                                 # get fb pcu record for pcuid
 181                                 # send pcu failure message
 182                                 sitehist.sendMessage('pcufailed_notice', hostname=host, pcu_name=pcu_name)
 183                                 print "send message for host %s PCU Failure" % host
 184
 185                 if nodehist.status == 'failboot' and \
 186                         changed_greaterthan(nodehist.last_changed, 0.25) and \
 187                         not found_between(recent_actions, 'bootmanager_restore', 0.5, 0):
 188                                 # send down node notice
 189                                 # delay 0.5 days before retrying...
 190
 191                                 print "send message for host %s bootmanager_restore" % host
 192                                 sitehist.runBootManager(host)
 193                         #       sitehist.sendMessage('retry_bootman', hostname=host)
 194
 195                 if nodehist.status == 'down' and \
 196                         changed_greaterthan(nodehist.last_changed, 2):
 197                                 if not nodehist.firewall and not found_within(recent_actions, 'down_notice', 3.5):
 198                                         # send down node notice
 199                                         sitehist.sendMessage('down_notice', hostname=host)
 200                                         print "send message for host %s down" % host
 201
 202                                 #if nodehist.firewall and not found_within(recent_actions, 'firewall_notice', 3.5):
 203                                         # send down node notice
 204                                         #email_exception(host, "firewall_notice")
 205                                 #       sitehist.sendMessage('firewall_notice', hostname=host)
 206                                 #       print "send message for host %s down" % host
 207
 208                 node_count = node_count + 1
 209                 print "time: ", time.strftime('%Y-%m-%d %H:%M:%S')
 210                 sys.stdout.flush()
 211                 session.flush()
 212
 213         for i,site in enumerate(sitenames):
 214                 sitehist = SiteInterface.get_or_make(loginbase=site)
 215                 siteblack = BlacklistRecord.get_by(loginbase=site)
 216                 skip_due_to_blacklist=False
 217
 218                 if siteblack and not siteblack.expired():
 219                         print "skipping %s due to blacklist.  will expire %s" % (site, siteblack.willExpire() )
 220                         skip_due_to_blacklist=True
 221                         sitehist.clearPenalty()
 222                         sitehist.applyPenalty()
 223                         continue
 224
 225                 # TODO: make query only return records within a certin time range,
 226                 #               i.e. greater than 0.5 days ago. or 5 days, etc.
 227                 recent_actions = sitehist.getRecentActions(loginbase=site)
 228
 229                 print "%s %s %s" % (i, sitehist.db.loginbase, sitehist.db.status)
 230
 231                 if sitehist.db.status == 'down':
 232                         if sitehist.db.penalty_pause and \
 233                                 changed_greaterthan(sitehist.db.penalty_pause_time, 30):
 234
 235                                 email_exception("", "clear pause penalty for site: %s" % sitehist.db.loginbase)
 236                                 sitehist.closeTicket()
 237                                 # NOTE: but preserve the penalty status.
 238                                 sitehist.clearPenaltyPause()
 239
 240                         if sitehist.db.message_id != 0 and \
 241                                 sitehist.db.message_status == 'open' and \
 242                                 not sitehist.db.penalty_pause:
 243
 244                                 email_exception("", "pause penalty for site: %s" % sitehist.db.loginbase)
 245                                 sitehist.setPenaltyPause()
 246
 247                         if  not sitehist.db.penalty_pause and \
 248                                 not found_within(recent_actions, 'increase_penalty', 7) and \
 249                                 changed_greaterthan(sitehist.db.last_changed, 7):
 250
 251                                 # TODO: catch errors
 252                                 sitehist.increasePenalty()
 253                                 sitehist.applyPenalty()
 254                                 sitehist.sendMessage('increase_penalty')
 255
 256                                 print "send message for site %s penalty increase" % site
 257
 258                 if sitehist.db.status == 'good':
 259                         # clear penalty
 260                         # NOTE: because 'all clear' should have an indefinite status, we
 261                         #               have a boolean value rather than a 'recent action'
 262                         if sitehist.db.penalty_applied or sitehist.db.penalty_pause:
 263                                 # send message that penalties are cleared.
 264
 265                                 sitehist.clearPenalty()
 266                                 sitehist.applyPenalty()
 267                                 sitehist.sendMessage('clear_penalty')
 268                                 sitehist.closeTicket()
 269
 270                                 print "send message for site %s penalty cleared" % site
 271
 272                         # check all nodes and pcus for this site; if they're all ok,
 273                         #               close the ticket, else leave it open.
 274                         # NOTE: in the case where a PCU reboots and fails, a message is
 275                         #               sent, but the PCU may appear to be ok according to tests.
 276                         # NOTE: Also, bootmanager sends messages regarding disks,
 277                         #               configuration, etc.  So, the conditions here are 'good'
 278                         #               rather than 'not down' as it is in sitebad.
 279                         close_ticket = check_node_and_pcu_status_for(site)
 280                         if close_ticket:
 281                                 sitehist.closeTicket()
 282
 283                 site_count = site_count + 1
 284
 285                 print "time: ", time.strftime('%Y-%m-%d %H:%M:%S')
 286                 sys.stdout.flush()
 287                 session.flush()
 288
 289         session.flush()
 290         return
 291
 292
 293 if __name__ == "__main__":
 294         parser = parsermodule.getParser(['nodesets'])
 295         parser.set_defaults( timewait=0,
 296                                                 skip=0,
 297                                                 rins=False,
 298                                                 reboot=False,
 299                                                 findbad=False,
 300                                                 force=False,
 301                                                 nosetup=False,
 302                                                 verbose=False,
 303                                                 quiet=False,)
 304
 305         parser.add_option("", "--stopselect", dest="stopselect", metavar="",
 306                                                 help="The select string that must evaluate to true for the node to be considered 'done'")
 307         parser.add_option("", "--findbad", dest="findbad", action="store_true",
 308                                                 help="Re-run findbad on the nodes we're going to check before acting.")
 309         parser.add_option("", "--force", dest="force", action="store_true",
 310                                                 help="Force action regardless of previous actions/logs.")
 311         parser.add_option("", "--rins", dest="rins", action="store_true",
 312                                                 help="Set the boot_state to 'rins' for all nodes.")
 313         parser.add_option("", "--reboot", dest="reboot", action="store_true",
 314                                                 help="Actively try to reboot the nodes, keeping a log of actions.")
 315
 316         parser.add_option("", "--verbose", dest="verbose", action="store_true",
 317                                                 help="Extra debug output messages.")
 318         parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
 319                                                 help="Do not perform the orginary setup phase.")
 320         parser.add_option("", "--skip", dest="skip",
 321                                                 help="Number of machines to skip on the input queue.")
 322         parser.add_option("", "--timewait", dest="timewait",
 323                                                 help="Minutes to wait between iterations of 10 nodes.")
 324
 325         parser = parsermodule.getParser(['defaults'], parser)
 326         config = parsermodule.parse_args(parser)
 327
 328         fbquery = HistoryNodeRecord.query.all()
 329         hostnames = [ n.hostname for n in fbquery ]
 330
 331         fbquery = HistorySiteRecord.query.all()
 332         sitenames = [ s.loginbase for s in fbquery ]
 333
 334         if config.site:
 335                 # TODO: replace with calls to local db.  the api fails so often that
 336                 #               these calls should be regarded as unreliable.
 337                 l_nodes = plccache.GetNodesBySite(config.site)
 338                 filter_hostnames = [ n['hostname'] for n in l_nodes ]
 339
 340                 hostnames = filter(lambda x: x in filter_hostnames, hostnames)
 341                 sitenames = [config.site]
 342
 343         if config.node:
 344                 hostnames = [ config.node ]
 345                 sitenames = [ plccache.plcdb_hn2lb[config.node] ]
 346
 347         try:
 348                 main(hostnames, sitenames)
 349                 session.flush()
 350         except KeyboardInterrupt:
 351                 print "Killed by interrupt"
 352                 session.flush()
 353                 sys.exit(0)
 354         except:
 355                 email_exception()
 356                 print traceback.print_exc();
 357                 print "fail all..."