policy.py

   1 #!/usr/bin/python
   2
   3 # This script is used to manipulate the operational state of nodes in
   4 # different node groups.  These are basically set operations on nodes via the
   5 # PLC api.
   6 #
   7 # Take the ng name as an argument....
   8 # optionally,
   9 #  * get a list of nodes in the given nodegroup.
  10 #  * set some or all in the set to rins.
  11 #  * restart them all.
  12 #  * do something else to them all.
  13 #
  14
  15 import os
  16 import time
  17 import traceback
  18 import sys
  19 from optparse import OptionParser
  20
  21 from monitor import config
  22 from monitor import parser as parsermodule
  23 from monitor.common import *
  24 from monitor.model import *
  25 from monitor.wrapper import plc
  26 from monitor.wrapper import plccache
  27 from monitor.database.info.model import *
  28 from monitor.database.info.interface import *
  29
  30 from nodequery import verify,query_to_dict,node_select
  31
# Handle returned by plc.getAuthAPI(), created as a side effect of loading
# this module.
# NOTE(review): `api` is not referenced again in the visible part of this
# file (the code below goes through `plc` / `plccache` directly) -- confirm
# whether anything else relies on it before removing.
api = plc.getAuthAPI()
  33
def logic():
        """Set a node's PLC boot state to 'reinstall' and end its node record.

        NOTE(review): `host` is neither a parameter nor assigned at module
        level anywhere in the visible part of this file, so calling this
        function raises NameError unless a global named `host` is supplied
        from elsewhere (e.g. by one of the star imports).  Nothing in the
        visible code calls logic() -- confirm whether it is still needed.
        """
        # Ask PLC to put the node into the 'reinstall' boot state.
        plc.nodeBootState(host, 'reinstall')
        # node_end_record is not defined in this file; presumably it comes from
        # one of the star imports -- TODO confirm what it records.
        node_end_record(host)
  38
  39 def main(hostnames, sitenames):
  40         # commands:
  41         i = 1
  42         node_count = 1
  43         site_count = 1
  44         #print "hosts: %s" % hostnames
  45         for i,host in enumerate(hostnames):
  46                 try:
  47                         lb = plccache.plcdb_hn2lb[host]
  48                 except:
  49                         print "unknown host in plcdb_hn2lb %s" % host
  50                         email_exception(host)
  51                         continue
  52
  53                 nodeblack = BlacklistRecord.get_by(hostname=host)
  54
  55                 if nodeblack and not nodeblack.expired():
  56                         print "skipping %s due to blacklist.  will expire %s" % (host, nodeblack.willExpire() )
  57                         continue
  58
  59                 sitehist = SiteInterface.get_or_make(loginbase=lb)
  60
  61                 recent_actions = sitehist.getRecentActions(hostname=host)
  62
  63                 nodehist = HistoryNodeRecord.findby_or_create(hostname=host)
  64
  65                 print "%s %s %s" % (i, nodehist.hostname, nodehist.status)
  66                 if nodehist.status == 'good' and \
  67                         changed_lessthan(nodehist.last_changed, 1.0) and \
  68                         found_within(recent_actions, 'down_notice', 7.0) and \
  69                         not found_within(recent_actions, 'online_notice', 0.5):
  70                                 # NOTE: chronicly flapping nodes will not get 'online' notices
  71                                 #               since, they are never up long enough to be 'good'.
  72                             # NOTE: searching for down_notice proves that the node has
  73                                 #               gone through a 'down' state first, rather than just
  74                                 #               flapping through: good, offline, online, ...
  75                                 #
  76                                 # NOTE: there is a narrow window in which this command must be
  77                                 #               evaluated, otherwise the notice will not go out.
  78                                 #               this is not ideal.
  79                                 sitehist.sendMessage('online_notice', hostname=host, viart=False, saveact=True)
  80                                 print "send message for host %s online" % host
  81
  82
  83                 # if a node is offline and doesn't have a PCU, remind the user that they should have one.
  84                 #if not nodehist.haspcu and nodehist.status in ['offline', 'down'] and \
  85                 #       changed_greaterthan(nodehist.last_changed,1.0) and \
  86                 #       not found_within(recent_actions, 'pcumissing_notice', 7.0):
  87                 #
  88                 #               sitehist.sendMessage('pcumissing_notice', hostname=host)
  89                 #               print "send message for host %s pcumissing_notice" % host
  90
  91                 # if it is offline and HAS a PCU, then try to use it.
  92                 if nodehist.haspcu and nodehist.status in ['offline', 'down'] and \
  93                         changed_greaterthan(nodehist.last_changed,1.0) and \
  94                         not nodehist.firewall and \
  95                         not found_between(recent_actions, 'try_reboot', 3.5, 1):
  96
  97                                 # TODO: there MUST be a better way to do this...
  98                                 # get fb node record for pcuid
  99                                 fbpcu = None
 100                                 fbnode = FindbadNodeRecord.get_latest_by(hostname=host)
 101                                 if fbnode:
 102                                         fbpcu = FindbadPCURecord.get_latest_by(plc_pcuid=fbnode.plc_pcuid)
 103
 104                                 sitehist.attemptReboot(host)
 105                                 print "send message for host %s try_reboot" % host
 106                                 if not fbpcu.test_is_ok() and \
 107                                         not found_within(recent_actions, 'pcuerror_notice', 3.0):
 108
 109                                         args = {}
 110                                         if fbpcu:
 111                                                 args['pcu_name'] = fbpcu.pcu_name()
 112                                                 args['pcu_errors'] = fbpcu.pcu_errors()
 113                                         else:
 114                                                 args['pcu_name'] = "error looking up pcu name"
 115                                                 args['pcu_errors'] = ""
 116
 117                                         args['hostname'] = host
 118                                         sitehist.sendMessage('pcuerror_notice', **args)
 119                                         print "send message for host %s PCU Failure" % host
 120
 121
 122                 # NOTE: non-intuitive is that found_between(try_reboot, 3.5, 1)
 123                 #               will be false for a day after the above condition is satisfied
 124                 if nodehist.haspcu and nodehist.status in ['offline', 'down'] and \
 125                         changed_greaterthan(nodehist.last_changed,1.5) and \
 126                         not nodehist.firewall and \
 127                         found_between(recent_actions, 'try_reboot', 3.5, 1) and \
 128                         not found_within(recent_actions, 'pcufailed_notice', 3.5):
 129
 130                                 # TODO: there MUST be a better way to do this...
 131                                 # get fb node record for pcuid
 132                                 fbpcu = None
 133                                 fbnode = FindbadNodeRecord.get_latest_by(hostname=host)
 134                                 if fbnode:
 135                                         fbpcu = FindbadPCURecord.get_latest_by(plc_pcuid=fbnode.plc_pcuid)
 136                                 if fbpcu:
 137                                         pcu_name = fbpcu.pcu_name()
 138                                 else:
 139                                         pcu_name = "error looking up pcu name"
 140
 141                                 # get fb pcu record for pcuid
 142                                 # send pcu failure message
 143                                 sitehist.sendMessage('pcufailed_notice', hostname=host, pcu_name=pcu_name)
 144                                 print "send message for host %s PCU Failure" % host
 145
 146                 if nodehist.status == 'failboot' and \
 147                         changed_greaterthan(nodehist.last_changed, 0.25) and \
 148                         not found_between(recent_actions, 'bootmanager_restore', 0.5, 0):
 149                                 # send down node notice
 150                                 # delay 0.5 days before retrying...
 151
 152                                 print "send message for host %s bootmanager_restore" % host
 153                                 sitehist.runBootManager(host)
 154                         #       sitehist.sendMessage('retry_bootman', hostname=host)
 155
 156                 if nodehist.status == 'down' and \
 157                         changed_greaterthan(nodehist.last_changed, 2):
 158                                 if not nodehist.firewall and not found_within(recent_actions, 'down_notice', 3.5):
 159                                         # send down node notice
 160                                         sitehist.sendMessage('down_notice', hostname=host)
 161                                         print "send message for host %s down" % host
 162
 163                                 if nodehist.firewall and not found_within(recent_actions, 'firewall_notice', 3.5):
 164                                         # send down node notice
 165                                         #email_exception(host, "firewall_notice")
 166                                         sitehist.sendMessage('firewall_notice', hostname=host)
 167                                         print "send message for host %s down" % host
 168
 169                 node_count = node_count + 1
 170                 print "time: ", time.strftime('%Y-%m-%d %H:%M:%S')
 171                 sys.stdout.flush()
 172                 session.flush()
 173
 174         for i,site in enumerate(sitenames):
 175                 sitehist = SiteInterface.get_or_make(loginbase=site)
 176                 siteblack = BlacklistRecord.get_by(loginbase=site)
 177                 skip_due_to_blacklist=False
 178
 179                 if siteblack and not siteblack.expired():
 180                         print "skipping %s due to blacklist.  will expire %s" % (site, siteblack.willExpire() )
 181                         skip_due_to_blacklist=True
 182                         sitehist.clearPenalty()
 183                         sitehist.applyPenalty()
 184                         continue
 185
 186                 # TODO: make query only return records within a certin time range,
 187                 #               i.e. greater than 0.5 days ago. or 5 days, etc.
 188                 recent_actions = sitehist.getRecentActions(loginbase=site)
 189
 190                 print "%s %s %s" % (i, sitehist.db.loginbase, sitehist.db.status)
 191
 192                 # determine if there are penalties within the last 30 days?
 193                 # if so, add a 'pause_penalty' action.
 194                 if sitehist.db.message_id != 0 and sitehist.db.message_status == 'open' and \
 195                         sitehist.db.penalty_level > 0 and not found_within(recent_actions, 'pause_penalty', 30):
 196                         #       pause escalation
 197                         print "Pausing penalties for %s" % site
 198                         sitehist.pausePenalty()
 199                 else:
 200
 201                         if sitehist.db.status == 'down':
 202                                 if  not found_within(recent_actions, 'pause_penalty', 30) and \
 203                                         not found_within(recent_actions, 'increase_penalty', 7) and \
 204                                         changed_greaterthan(sitehist.db.last_changed, 7):
 205
 206                                         # TODO: catch errors
 207                                         sitehist.increasePenalty()
 208                                         sitehist.applyPenalty()
 209                                         sitehist.sendMessage('increase_penalty')
 210
 211                                         print "send message for site %s penalty increase" % site
 212
 213                         if sitehist.db.status == 'good':
 214                                 # clear penalty
 215                                 # NOTE: because 'all clear' should have an indefinite status, we
 216                                 #               have a boolean value rather than a 'recent action'
 217                                 if sitehist.db.penalty_applied:
 218                                         # send message that penalties are cleared.
 219
 220                                         sitehist.clearPenalty()
 221                                         sitehist.applyPenalty()
 222                                         sitehist.sendMessage('clear_penalty')
 223                                         sitehist.closeTicket()
 224
 225                                         print "send message for site %s penalty cleared" % site
 226
 227
 228                 site_count = site_count + 1
 229
 230                 print "time: ", time.strftime('%Y-%m-%d %H:%M:%S')
 231                 sys.stdout.flush()
 232                 session.flush()
 233
 234         session.flush()
 235         return
 236
 237
 238 if __name__ == "__main__":
 239         parser = parsermodule.getParser(['nodesets'])
 240         parser.set_defaults( timewait=0,
 241                                                 skip=0,
 242                                                 rins=False,
 243                                                 reboot=False,
 244                                                 findbad=False,
 245                                                 force=False,
 246                                                 nosetup=False,
 247                                                 verbose=False,
 248                                                 quiet=False,)
 249
 250         parser.add_option("", "--stopselect", dest="stopselect", metavar="",
 251                                                 help="The select string that must evaluate to true for the node to be considered 'done'")
 252         parser.add_option("", "--findbad", dest="findbad", action="store_true",
 253                                                 help="Re-run findbad on the nodes we're going to check before acting.")
 254         parser.add_option("", "--force", dest="force", action="store_true",
 255                                                 help="Force action regardless of previous actions/logs.")
 256         parser.add_option("", "--rins", dest="rins", action="store_true",
 257                                                 help="Set the boot_state to 'rins' for all nodes.")
 258         parser.add_option("", "--reboot", dest="reboot", action="store_true",
 259                                                 help="Actively try to reboot the nodes, keeping a log of actions.")
 260
 261         parser.add_option("", "--verbose", dest="verbose", action="store_true",
 262                                                 help="Extra debug output messages.")
 263         parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
 264                                                 help="Do not perform the orginary setup phase.")
 265         parser.add_option("", "--skip", dest="skip",
 266                                                 help="Number of machines to skip on the input queue.")
 267         parser.add_option("", "--timewait", dest="timewait",
 268                                                 help="Minutes to wait between iterations of 10 nodes.")
 269
 270         parser = parsermodule.getParser(['defaults'], parser)
 271         config = parsermodule.parse_args(parser)
 272
 273         fbquery = HistoryNodeRecord.query.all()
 274         hostnames = [ n.hostname for n in fbquery ]
 275
 276         fbquery = HistorySiteRecord.query.all()
 277         sitenames = [ s.loginbase for s in fbquery ]
 278
 279         if config.site:
 280                 # TODO: replace with calls to local db.  the api fails so often that
 281                 #               these calls should be regarded as unreliable.
 282                 l_nodes = plccache.GetNodesBySite(config.site)
 283                 filter_hostnames = [ n['hostname'] for n in l_nodes ]
 284
 285                 hostnames = filter(lambda x: x in filter_hostnames, hostnames)
 286                 sitenames = [config.site]
 287
 288         if config.node:
 289                 hostnames = [ config.node ]
 290                 sitenames = [ plccache.plcdb_hn2lb[config.node] ]
 291
 292         try:
 293                 main(hostnames, sitenames)
 294                 session.flush()
 295         except KeyboardInterrupt:
 296                 print "Killed by interrupt"
 297                 session.flush()
 298                 sys.exit(0)
 299         except:
 300                 email_exception()
 301                 print traceback.print_exc();
 302                 print "fail all..."