policy.py

   1 #!/usr/bin/python
   2
   3 # This script is used to manipulate the operational state of nodes in
   4 # different node groups.  These are basically set operations on nodes via the
   5 # PLC api.
   6 #
   7 # Take the ng name as an argument....
   8 # optionally,
   9 #  * get a list of nodes in the given nodegroup.
  10 #  * set some or all in the set to rins.
  11 #  * restart them all.
  12 #  * do something else to them all.
  13 #
  14
  15 import os
  16 import time
  17 import traceback
  18 import sys
  19 from optparse import OptionParser
  20
  21 from monitor import config
  22 from monitor import parser as parsermodule
  23 from monitor.common import *
  24 from monitor.model import *
  25 from monitor.wrapper import plc
  26 from monitor.wrapper import plccache
  27 from monitor.database.info.model import *
  28 from monitor.database.info.interface import *
  29
  30 from monitor.query import verify,query_to_dict,node_select
  31
# Module-level PLC API handle, obtained from plc.getAuthAPI() at import time.
# NOTE(review): `api` is not referenced by any other visible line in this
# file -- confirm nothing relies on it before removing.
api = plc.getAuthAPI()
  33
  34 def logic():
  35
  36         plc.nodeBootState(host, 'reinstall')
  37         node_end_record(host)
  38
  39 def main(hostnames, sitenames):
  40         # commands:
  41         i = 1
  42         node_count = 1
  43         site_count = 1
  44         #print "hosts: %s" % hostnames
  45         for i,host in enumerate(hostnames):
  46                 try:
  47                         lb = plccache.plcdb_hn2lb[host]
  48                 except:
  49                         print "unknown host in plcdb_hn2lb %s" % host
  50                         email_exception(host)
  51                         continue
  52
  53                 nodeblack = BlacklistRecord.get_by(hostname=host)
  54
  55                 if nodeblack and not nodeblack.expired():
  56                         print "skipping %s due to blacklist.  will expire %s" % (host, nodeblack.willExpire() )
  57                         continue
  58
  59                 sitehist = SiteInterface.get_or_make(loginbase=lb)
  60
  61                 recent_actions = sitehist.getRecentActions(hostname=host)
  62
  63                 nodehist = HistoryNodeRecord.findby_or_create(hostname=host)
  64
  65                 print "%s %s %s" % (i, nodehist.hostname, nodehist.status)
  66                 if nodehist.status == 'good' and \
  67                         changed_lessthan(nodehist.last_changed, 1.0) and \
  68                         found_within(recent_actions, 'down_notice', 7.0) and \
  69                         not found_within(recent_actions, 'online_notice', 0.5):
  70                                 # NOTE: chronicly flapping nodes will not get 'online' notices
  71                                 #               since, they are never up long enough to be 'good'.
  72                             # NOTE: searching for down_notice proves that the node has
  73                                 #               gone through a 'down' state first, rather than just
  74                                 #               flapping through: good, offline, online, ...
  75                                 #
  76                                 # NOTE: there is a narrow window in which this command must be
  77                                 #               evaluated, otherwise the notice will not go out.
  78                                 #               this is not ideal.
  79                                 sitehist.sendMessage('online_notice', hostname=host, viart=False, saveact=True)
  80                                 print "send message for host %s online" % host
  81
  82
  83                 # if a node is offline and doesn't have a PCU, remind the user that they should have one.
  84                 #if not nodehist.haspcu and nodehist.status in ['offline', 'down'] and \
  85                 #       changed_greaterthan(nodehist.last_changed,1.0) and \
  86                 #       not found_within(recent_actions, 'pcumissing_notice', 7.0):
  87                 #
  88                 #               sitehist.sendMessage('pcumissing_notice', hostname=host)
  89                 #               print "send message for host %s pcumissing_notice" % host
  90
  91                 # if it is offline and HAS a PCU, then try to use it.
  92                 if nodehist.haspcu and nodehist.status in ['offline', 'down'] and \
  93                         changed_greaterthan(nodehist.last_changed,1.0) and \
  94                         not nodehist.firewall and \
  95                         not found_between(recent_actions, 'try_reboot', 3.5, 1):
  96
  97                                 # TODO: there MUST be a better way to do this...
  98                                 # get fb node record for pcuid
  99                                 fbpcu = None
 100                                 fbnode = FindbadNodeRecord.get_latest_by(hostname=host)
 101                                 if fbnode:
 102                                         fbpcu = FindbadPCURecord.get_latest_by(plc_pcuid=fbnode.plc_pcuid)
 103
 104                                 sitehist.attemptReboot(host)
 105                                 print "send message for host %s try_reboot" % host
 106                                 if not fbpcu.test_is_ok() and \
 107                                         not found_within(recent_actions, 'pcuerror_notice', 3.0):
 108
 109                                         args = {}
 110                                         if fbpcu:
 111                                                 args['pcu_name'] = fbpcu.pcu_name()
 112                                                 args['pcu_errors'] = fbpcu.pcu_errors()
 113                                                 args['plc_pcuid'] = fbpcu.plc_pcuid
 114                                         else:
 115                                                 args['pcu_name'] = "error looking up pcu name"
 116                                                 args['pcu_errors'] = ""
 117                                                 args['plc_pcuid'] = 0
 118
 119                                         args['hostname'] = host
 120                                         sitehist.sendMessage('pcuerror_notice', **args)
 121                                         print "send message for host %s PCU Failure" % host
 122
 123
 124                 # NOTE: non-intuitive is that found_between(try_reboot, 3.5, 1)
 125                 #               will be false for a day after the above condition is satisfied
 126                 if nodehist.haspcu and nodehist.status in ['offline', 'down'] and \
 127                         changed_greaterthan(nodehist.last_changed,1.5) and \
 128                         not nodehist.firewall and \
 129                         found_between(recent_actions, 'try_reboot', 3.5, 1) and \
 130                         not found_within(recent_actions, 'pcufailed_notice', 3.5):
 131
 132                                 # TODO: there MUST be a better way to do this...
 133                                 # get fb node record for pcuid
 134                                 fbpcu = None
 135                                 fbnode = FindbadNodeRecord.get_latest_by(hostname=host)
 136                                 if fbnode:
 137                                         fbpcu = FindbadPCURecord.get_latest_by(plc_pcuid=fbnode.plc_pcuid)
 138                                 if fbpcu:
 139                                         pcu_name = fbpcu.pcu_name()
 140                                 else:
 141                                         pcu_name = "error looking up pcu name"
 142
 143                                 # get fb pcu record for pcuid
 144                                 # send pcu failure message
 145                                 sitehist.sendMessage('pcufailed_notice', hostname=host, pcu_name=pcu_name)
 146                                 print "send message for host %s PCU Failure" % host
 147
 148                 if nodehist.status == 'failboot' and \
 149                         changed_greaterthan(nodehist.last_changed, 0.25) and \
 150                         not found_between(recent_actions, 'bootmanager_restore', 0.5, 0):
 151                                 # send down node notice
 152                                 # delay 0.5 days before retrying...
 153
 154                                 print "send message for host %s bootmanager_restore" % host
 155                                 sitehist.runBootManager(host)
 156                         #       sitehist.sendMessage('retry_bootman', hostname=host)
 157
 158                 if nodehist.status == 'down' and \
 159                         changed_greaterthan(nodehist.last_changed, 2):
 160                                 if not nodehist.firewall and not found_within(recent_actions, 'down_notice', 3.5):
 161                                         # send down node notice
 162                                         sitehist.sendMessage('down_notice', hostname=host)
 163                                         print "send message for host %s down" % host
 164
 165                                 if nodehist.firewall and not found_within(recent_actions, 'firewall_notice', 3.5):
 166                                         # send down node notice
 167                                         #email_exception(host, "firewall_notice")
 168                                         sitehist.sendMessage('firewall_notice', hostname=host)
 169                                         print "send message for host %s down" % host
 170
 171                 node_count = node_count + 1
 172                 print "time: ", time.strftime('%Y-%m-%d %H:%M:%S')
 173                 sys.stdout.flush()
 174                 session.flush()
 175
 176         for i,site in enumerate(sitenames):
 177                 sitehist = SiteInterface.get_or_make(loginbase=site)
 178                 siteblack = BlacklistRecord.get_by(loginbase=site)
 179                 skip_due_to_blacklist=False
 180
 181                 if siteblack and not siteblack.expired():
 182                         print "skipping %s due to blacklist.  will expire %s" % (site, siteblack.willExpire() )
 183                         skip_due_to_blacklist=True
 184                         sitehist.clearPenalty()
 185                         sitehist.applyPenalty()
 186                         continue
 187
 188                 # TODO: make query only return records within a certin time range,
 189                 #               i.e. greater than 0.5 days ago. or 5 days, etc.
 190                 recent_actions = sitehist.getRecentActions(loginbase=site)
 191
 192                 print "%s %s %s" % (i, sitehist.db.loginbase, sitehist.db.status)
 193
 194                 if sitehist.db.status == 'down':
 195                         if sitehist.db.penalty_pause and \
 196                                 changed_greaterthan(sitehist.db.penalty_pause_time, 30):
 197
 198                                 email_exception("", "clear pause penalty for site: %s" % sitehist.db.loginbase)
 199                                 sitehist.closeTicket()
 200                                 # NOTE: but preserve the penalty status.
 201                                 sitehist.clearPenaltyPause()
 202
 203                         if sitehist.db.message_id != 0 and \
 204                                 sitehist.db.message_status == 'open' and \
 205                                 not sitehist.db.penalty_pause:
 206
 207                                 email_exception("", "pause penalty for site: %s" % sitehist.db.loginbase)
 208                                 sitehist.setPenaltyPause()
 209
 210                         if  not sitehist.db.penalty_pause and \
 211                                 not found_within(recent_actions, 'increase_penalty', 7) and \
 212                                 changed_greaterthan(sitehist.db.last_changed, 7):
 213
 214                                 # TODO: catch errors
 215                                 sitehist.increasePenalty()
 216                                 sitehist.applyPenalty()
 217                                 sitehist.sendMessage('increase_penalty')
 218
 219                                 print "send message for site %s penalty increase" % site
 220
 221                 if sitehist.db.status == 'good':
 222                         # clear penalty
 223                         # NOTE: because 'all clear' should have an indefinite status, we
 224                         #               have a boolean value rather than a 'recent action'
 225                         if sitehist.db.penalty_applied or sitehist.db.penalty_pause:
 226                                 # send message that penalties are cleared.
 227
 228                                 sitehist.clearPenalty()
 229                                 sitehist.applyPenalty()
 230                                 sitehist.sendMessage('clear_penalty')
 231                                 sitehist.closeTicket()
 232
 233                                 print "send message for site %s penalty cleared" % site
 234
 235
 236                 site_count = site_count + 1
 237
 238                 print "time: ", time.strftime('%Y-%m-%d %H:%M:%S')
 239                 sys.stdout.flush()
 240                 session.flush()
 241
 242         session.flush()
 243         return
 244
 245
 246 if __name__ == "__main__":
 247         parser = parsermodule.getParser(['nodesets'])
 248         parser.set_defaults( timewait=0,
 249                                                 skip=0,
 250                                                 rins=False,
 251                                                 reboot=False,
 252                                                 findbad=False,
 253                                                 force=False,
 254                                                 nosetup=False,
 255                                                 verbose=False,
 256                                                 quiet=False,)
 257
 258         parser.add_option("", "--stopselect", dest="stopselect", metavar="",
 259                                                 help="The select string that must evaluate to true for the node to be considered 'done'")
 260         parser.add_option("", "--findbad", dest="findbad", action="store_true",
 261                                                 help="Re-run findbad on the nodes we're going to check before acting.")
 262         parser.add_option("", "--force", dest="force", action="store_true",
 263                                                 help="Force action regardless of previous actions/logs.")
 264         parser.add_option("", "--rins", dest="rins", action="store_true",
 265                                                 help="Set the boot_state to 'rins' for all nodes.")
 266         parser.add_option("", "--reboot", dest="reboot", action="store_true",
 267                                                 help="Actively try to reboot the nodes, keeping a log of actions.")
 268
 269         parser.add_option("", "--verbose", dest="verbose", action="store_true",
 270                                                 help="Extra debug output messages.")
 271         parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
 272                                                 help="Do not perform the orginary setup phase.")
 273         parser.add_option("", "--skip", dest="skip",
 274                                                 help="Number of machines to skip on the input queue.")
 275         parser.add_option("", "--timewait", dest="timewait",
 276                                                 help="Minutes to wait between iterations of 10 nodes.")
 277
 278         parser = parsermodule.getParser(['defaults'], parser)
 279         config = parsermodule.parse_args(parser)
 280
 281         fbquery = HistoryNodeRecord.query.all()
 282         hostnames = [ n.hostname for n in fbquery ]
 283
 284         fbquery = HistorySiteRecord.query.all()
 285         sitenames = [ s.loginbase for s in fbquery ]
 286
 287         if config.site:
 288                 # TODO: replace with calls to local db.  the api fails so often that
 289                 #               these calls should be regarded as unreliable.
 290                 l_nodes = plccache.GetNodesBySite(config.site)
 291                 filter_hostnames = [ n['hostname'] for n in l_nodes ]
 292
 293                 hostnames = filter(lambda x: x in filter_hostnames, hostnames)
 294                 sitenames = [config.site]
 295
 296         if config.node:
 297                 hostnames = [ config.node ]
 298                 sitenames = [ plccache.plcdb_hn2lb[config.node] ]
 299
 300         try:
 301                 main(hostnames, sitenames)
 302                 session.flush()
 303         except KeyboardInterrupt:
 304                 print "Killed by interrupt"
 305                 session.flush()
 306                 sys.exit(0)
 307         except:
 308                 email_exception()
 309                 print traceback.print_exc();
 310                 print "fail all..."