policy.py

   1 #!/usr/bin/python
   2
   3 # This script is used to manipulate the operational state of nodes in
   4 # different node groups.  These are basically set operations on nodes via the
   5 # PLC api.
   6 #
   7 # Take the ng name as an argument....
   8 # optionally,
   9 #  * get a list of nodes in the given nodegroup.
  10 #  * set some or all in the set to rins.
  11 #  * restart them all.
  12 #  * do something else to them all.
  13 #
  14
  15 import os
  16 import time
  17 import traceback
  18 import sys
  19 from optparse import OptionParser
  20
  21 from monitor import config
  22 from monitor import parser as parsermodule
  23 from monitor.common import *
  24 from monitor.model import *
  25 from monitor.wrapper import plc
  26 from monitor.wrapper import plccache
  27 from monitor.database.info.model import *
  28 from monitor.database.info.interface import *
  29
  30 from nodequery import verify,query_to_dict,node_select
  31
# Module-level handle returned by plc.getAuthAPI(); created once, at import time.
# NOTE(review): `api` is not referenced anywhere in the visible part of this
#               file -- confirm whether other code relies on it before removing.
api = plc.getAuthAPI()
  33
  34 def logic():
  35
  36         plc.nodeBootState(host, 'reinstall')
  37         node_end_record(host)
  38
  39 def main(hostnames, sitenames):
  40         # commands:
  41         i = 1
  42         node_count = 1
  43         site_count = 1
  44         #print "hosts: %s" % hostnames
  45         for i,host in enumerate(hostnames):
  46                 try:
  47                         lb = plccache.plcdb_hn2lb[host]
  48                 except:
  49                         print "unknown host in plcdb_hn2lb %s" % host
  50                         email_exception(host)
  51                         continue
  52
  53                 nodeblack = BlacklistRecord.get_by(hostname=host)
  54
  55                 if nodeblack and not nodeblack.expired():
  56                         print "skipping %s due to blacklist.  will expire %s" % (host, nodeblack.willExpire() )
  57                         continue
  58
  59                 sitehist = SiteInterface.get_or_make(loginbase=lb)
  60
  61                 recent_actions = sitehist.getRecentActions(hostname=host)
  62
  63                 nodehist = HistoryNodeRecord.findby_or_create(hostname=host)
  64
  65                 print "%s %s %s" % (i, nodehist.hostname, nodehist.status)
  66                 if nodehist.status == 'good' and \
  67                         changed_lessthan(nodehist.last_changed, 1.0) and \
  68                         found_within(recent_actions, 'down_notice', 7.0) and \
  69                         not found_within(recent_actions, 'online_notice', 0.5):
  70                                 # NOTE: chronicly flapping nodes will not get 'online' notices
  71                                 #               since, they are never up long enough to be 'good'.
  72                             # NOTE: searching for down_notice proves that the node has
  73                                 #               gone through a 'down' state first, rather than just
  74                                 #               flapping through: good, offline, online, ...
  75                                 #
  76                                 # NOTE: there is a narrow window in which this command must be
  77                                 #               evaluated, otherwise the notice will not go out.
  78                                 #               this is not ideal.
  79                                 sitehist.sendMessage('online_notice', hostname=host, viart=False, saveact=True)
  80                                 print "send message for host %s online" % host
  81
  82
  83                 # if a node is offline and doesn't have a PCU, remind the user that they should have one.
  84                 #if not nodehist.haspcu and nodehist.status in ['offline', 'down'] and \
  85                 #       changed_greaterthan(nodehist.last_changed,1.0) and \
  86                 #       not found_within(recent_actions, 'pcumissing_notice', 7.0):
  87                 #
  88                 #               sitehist.sendMessage('pcumissing_notice', hostname=host)
  89                 #               print "send message for host %s pcumissing_notice" % host
  90
  91                 # if it is offline and HAS a PCU, then try to use it.
  92                 if nodehist.haspcu and nodehist.status in ['offline', 'down'] and \
  93                         changed_greaterthan(nodehist.last_changed,1.0) and \
  94                         not nodehist.firewall and \
  95                         not found_between(recent_actions, 'try_reboot', 3.5, 1):
  96
  97                                 sitehist.attemptReboot(host)
  98                                 print "send message for host %s try_reboot" % host
  99
 100                 # NOTE: non-intuitive is that found_between(try_reboot, 3.5, 1)
 101                 #               will be false for a day after the above condition is satisfied
 102                 if nodehist.haspcu and nodehist.status in ['offline', 'down'] and \
 103                         changed_greaterthan(nodehist.last_changed,1.5) and \
 104                         not nodehist.firewall and \
 105                         found_between(recent_actions, 'try_reboot', 3.5, 1) and \
 106                         not found_within(recent_actions, 'pcufailed_notice', 3.5):
 107
 108                                 # send pcu failure message
 109                                 #act = ActionRecord(**kwargs)
 110                                 sitehist.sendMessage('pcufailed_notice', hostname=host)
 111                                 print "send message for host %s PCU Failure" % host
 112
 113                 if nodehist.status == 'failboot' and \
 114                         changed_greaterthan(nodehist.last_changed, 1) and \
 115                         not found_between(recent_actions, 'bootmanager_restore', 0.5, 0):
 116                                 # send down node notice
 117                                 # delay 0.5 days before retrying...
 118
 119                                 print "send message for host %s bootmanager_restore" % host
 120                                 sitehist.runBootManager(host)
 121                         #       sitehist.sendMessage('retry_bootman', hostname=host)
 122
 123                 if nodehist.status == 'down' and \
 124                         changed_greaterthan(nodehist.last_changed, 2):
 125                                 if not nodehist.firewall and not found_within(recent_actions, 'down_notice', 3.5):
 126                                         # send down node notice
 127                                         sitehist.sendMessage('down_notice', hostname=host)
 128                                         print "send message for host %s down" % host
 129
 130                                 if nodehist.firewall and not found_within(recent_actions, 'firewall_notice', 3.5):
 131                                         # send down node notice
 132                                         #email_exception(host, "firewall_notice")
 133                                         sitehist.sendMessage('firewall_notice', hostname=host)
 134                                         print "send message for host %s down" % host
 135
 136                 node_count = node_count + 1
 137                 print "time: ", time.strftime('%Y-%m-%d %H:%M:%S')
 138                 sys.stdout.flush()
 139                 session.flush()
 140
 141         for i,site in enumerate(sitenames):
 142                 sitehist = SiteInterface.get_or_make(loginbase=site)
 143                 siteblack = BlacklistRecord.get_by(loginbase=site)
 144                 skip_due_to_blacklist=False
 145
 146                 if siteblack and not siteblack.expired():
 147                         print "skipping %s due to blacklist.  will expire %s" % (site, siteblack.willExpire() )
 148                         skip_due_to_blacklist=True
 149                         sitehist.clearPenalty()
 150                         sitehist.applyPenalty()
 151                         continue
 152
 153                 # TODO: make query only return records within a certin time range,
 154                 #               i.e. greater than 0.5 days ago. or 5 days, etc.
 155                 recent_actions = sitehist.getRecentActions(loginbase=site)
 156
 157                 print "%s %s %s" % (i, sitehist.db.loginbase, sitehist.db.status)
 158
 159                 # determine if there are penalties within the last 30 days?
 160                 # if so, add a 'pause_penalty' action.
 161                 if sitehist.db.message_id != 0 and sitehist.db.message_status == 'open' and \
 162                         sitehist.db.penalty_level > 0 and not found_within(recent_actions, 'pause_penalty', 30):
 163                         #       pause escalation
 164                         print "Pausing penalties for %s" % site
 165                         sitehist.pausePenalty()
 166                 else:
 167
 168                         if sitehist.db.status == 'down':
 169                                 if  not found_within(recent_actions, 'pause_penalty', 30) and \
 170                                         not found_within(recent_actions, 'increase_penalty', 7) and \
 171                                         changed_greaterthan(sitehist.db.last_changed, 7):
 172
 173                                         # TODO: catch errors
 174                                         sitehist.increasePenalty()
 175                                         sitehist.applyPenalty()
 176                                         sitehist.sendMessage('increase_penalty')
 177
 178                                         print "send message for site %s penalty increase" % site
 179
 180                         if sitehist.db.status == 'good':
 181                                 # clear penalty
 182                                 # NOTE: because 'all clear' should have an indefinite status, we
 183                                 #               have a boolean value rather than a 'recent action'
 184                                 if sitehist.db.penalty_applied:
 185                                         # send message that penalties are cleared.
 186
 187                                         sitehist.clearPenalty()
 188                                         sitehist.applyPenalty()
 189                                         sitehist.sendMessage('clear_penalty')
 190                                         sitehist.closeTicket()
 191
 192                                         print "send message for site %s penalty cleared" % site
 193
 194
 195                 site_count = site_count + 1
 196
 197                 print "time: ", time.strftime('%Y-%m-%d %H:%M:%S')
 198                 sys.stdout.flush()
 199                 session.flush()
 200
 201         session.flush()
 202         return
 203
 204
 205 if __name__ == "__main__":
 206         parser = parsermodule.getParser(['nodesets'])
 207         parser.set_defaults( timewait=0,
 208                                                 skip=0,
 209                                                 rins=False,
 210                                                 reboot=False,
 211                                                 findbad=False,
 212                                                 force=False,
 213                                                 nosetup=False,
 214                                                 verbose=False,
 215                                                 quiet=False,)
 216
 217         parser.add_option("", "--stopselect", dest="stopselect", metavar="",
 218                                                 help="The select string that must evaluate to true for the node to be considered 'done'")
 219         parser.add_option("", "--findbad", dest="findbad", action="store_true",
 220                                                 help="Re-run findbad on the nodes we're going to check before acting.")
 221         parser.add_option("", "--force", dest="force", action="store_true",
 222                                                 help="Force action regardless of previous actions/logs.")
 223         parser.add_option("", "--rins", dest="rins", action="store_true",
 224                                                 help="Set the boot_state to 'rins' for all nodes.")
 225         parser.add_option("", "--reboot", dest="reboot", action="store_true",
 226                                                 help="Actively try to reboot the nodes, keeping a log of actions.")
 227
 228         parser.add_option("", "--verbose", dest="verbose", action="store_true",
 229                                                 help="Extra debug output messages.")
 230         parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
 231                                                 help="Do not perform the orginary setup phase.")
 232         parser.add_option("", "--skip", dest="skip",
 233                                                 help="Number of machines to skip on the input queue.")
 234         parser.add_option("", "--timewait", dest="timewait",
 235                                                 help="Minutes to wait between iterations of 10 nodes.")
 236
 237         parser = parsermodule.getParser(['defaults'], parser)
 238         config = parsermodule.parse_args(parser)
 239
 240         fbquery = HistoryNodeRecord.query.all()
 241         hostnames = [ n.hostname for n in fbquery ]
 242
 243         fbquery = HistorySiteRecord.query.all()
 244         sitenames = [ s.loginbase for s in fbquery ]
 245
 246         if config.site:
 247                 # TODO: replace with calls to local db.  the api fails so often that
 248                 #               these calls should be regarded as unreliable.
 249                 l_nodes = plccache.GetNodesBySite(config.site)
 250                 filter_hostnames = [ n['hostname'] for n in l_nodes ]
 251
 252                 hostnames = filter(lambda x: x in filter_hostnames, hostnames)
 253                 sitenames = [config.site]
 254
 255         if config.node:
 256                 hostnames = [ config.node ]
 257                 sitenames = [ plccache.plcdb_hn2lb[config.node] ]
 258
 259         try:
 260                 main(hostnames, sitenames)
 261                 session.flush()
 262         except KeyboardInterrupt:
 263                 print "Killed by interrupt"
 264                 session.flush()
 265                 sys.exit(0)
 266         except:
 267                 email_exception()
 268                 print traceback.print_exc();
 269                 print "fail all..."