policy.py

   1 #!/usr/bin/python
   2
   3 # This script is used to manipulate the operational state of nodes in
   4 # different node groups.  These are basically set operations on nodes via the
   5 # PLC api.
   6 #
   7 # Take the ng name as an argument....
   8 # optionally,
   9 #  * get a list of nodes in the given nodegroup.
  10 #  * set some or all in the set to rins.
  11 #  * restart them all.
  12 #  * do something else to them all.
  13 #
  14
  15 import os
  16 import time
  17 import traceback
  18 import sys
  19 from optparse import OptionParser
  20
  21 from monitor import config
  22 from monitor import parser as parsermodule
  23 from monitor.common import *
  24 from monitor.model import *
  25 from monitor.wrapper import plc
  26 from monitor.wrapper import plccache
  27 from monitor.database.info.model import *
  28 from monitor.database.info.interface import *
  29
  30 from nodequery import verify,query_to_dict,node_select
  31
# PLC API handle obtained once, at import time, via plc.getAuthAPI().
# NOTE(review): `api` is not referenced anywhere else in the visible code;
# confirm whether other modules rely on it before removing.
api = plc.getAuthAPI()
  33
  34 def logic():
  35
  36         plc.nodeBootState(host, 'reinstall')
  37         node_end_record(host)
  38
  39 def main(hostnames, sitenames):
  40         # commands:
  41         i = 1
  42         node_count = 1
  43         site_count = 1
  44         #print "hosts: %s" % hostnames
  45         for i,host in enumerate(hostnames):
  46                 try:
  47                         lb = plccache.plcdb_hn2lb[host]
  48                 except:
  49                         print "unknown host in plcdb_hn2lb %s" % host
  50                         email_exception(host)
  51                         continue
  52
  53                 nodeblack = BlacklistRecord.get_by(hostname=host)
  54
  55                 if nodeblack and not nodeblack.expired():
  56                         print "skipping %s due to blacklist.  will expire %s" % (host, nodeblack.willExpire() )
  57                         continue
  58
  59                 sitehist = SiteInterface.get_or_make(loginbase=lb)
  60
  61                 recent_actions = sitehist.getRecentActions(hostname=host)
  62
  63                 nodehist = HistoryNodeRecord.findby_or_create(hostname=host)
  64
  65                 print "%s %s %s" % (i, nodehist.hostname, nodehist.status)
  66                 if nodehist.status == 'good' and \
  67                         changed_lessthan(nodehist.last_changed, 1.0) and \
  68                         found_within(recent_actions, 'down_notice', 7.0) and \
  69                         not found_within(recent_actions, 'online_notice', 0.5):
  70                                 # NOTE: chronicly flapping nodes will not get 'online' notices
  71                                 #               since, they are never up long enough to be 'good'.
  72                             # NOTE: searching for down_notice proves that the node has
  73                                 #               gone through a 'down' state first, rather than just
  74                                 #               flapping through: good, offline, online, ...
  75                                 #
  76                                 # NOTE: there is a narrow window in which this command must be
  77                                 #               evaluated, otherwise the notice will not go out.
  78                                 #               this is not ideal.
  79                                 sitehist.sendMessage('online_notice', hostname=host, viart=False, saveact=True)
  80                                 print "send message for host %s online" % host
  81
  82
  83                 # if a node is offline and doesn't have a PCU, remind the user that they should have one.
  84                 #if not nodehist.haspcu and nodehist.status in ['offline', 'down'] and \
  85                 #       changed_greaterthan(nodehist.last_changed,1.0) and \
  86                 #       not found_within(recent_actions, 'pcumissing_notice', 7.0):
  87                 #
  88                 #               sitehist.sendMessage('pcumissing_notice', hostname=host)
  89                 #               print "send message for host %s pcumissing_notice" % host
  90
  91                 # if it is offline and HAS a PCU, then try to use it.
  92                 if nodehist.haspcu and nodehist.status in ['offline', 'down'] and \
  93                         changed_greaterthan(nodehist.last_changed,1.0) and \
  94                         not found_between(recent_actions, 'try_reboot', 3.5, 1):
  95
  96                                 sitehist.attemptReboot(host)
  97                                 print "send message for host %s try_reboot" % host
  98
  99                 # NOTE: non-intuitive is that found_between(try_reboot, 3.5, 1)
 100                 #               will be false for a day after the above condition is satisfied
 101                 if nodehist.haspcu and nodehist.status in ['offline', 'down'] and \
 102                         changed_greaterthan(nodehist.last_changed,1.5) and \
 103                         found_between(recent_actions, 'try_reboot', 3.5, 1) and \
 104                         not found_within(recent_actions, 'pcufailed_notice', 3.5):
 105
 106                                 # send pcu failure message
 107                                 #act = ActionRecord(**kwargs)
 108                                 sitehist.sendMessage('pcufailed_notice', hostname=host)
 109                                 print "send message for host %s PCU Failure" % host
 110
 111                 if nodehist.status == 'monitordebug' and \
 112                         changed_greaterthan(nodehist.last_changed, 1) and \
 113                         not found_between(recent_actions, 'bootmanager_restore', 0.5, 0):
 114                                 # send down node notice
 115                                 # delay 0.5 days before retrying...
 116
 117                                 print "send message for host %s bootmanager_restore" % host
 118                                 sitehist.runBootManager(host)
 119                         #       sitehist.sendMessage('retry_bootman', hostname=host)
 120
 121                 if nodehist.status == 'down' and \
 122                         changed_greaterthan(nodehist.last_changed, 2):
 123                                 if not nodehist.firewall and not found_within(recent_actions, 'down_notice', 3.5):
 124                                         # send down node notice
 125                                         sitehist.sendMessage('down_notice', hostname=host)
 126                                         print "send message for host %s down" % host
 127
 128                                 if nodehist.firewall and not found_within(recent_actions, 'firewall_notice', 3.5):
 129                                         # send down node notice
 130                                         email_exception(host, "firewall_notice")
 131                                         sitehist.sendMessage('firewall_notice', hostname=host)
 132                                         print "send message for host %s down" % host
 133
 134                 node_count = node_count + 1
 135                 print "time: ", time.strftime('%Y-%m-%d %H:%M:%S')
 136                 sys.stdout.flush()
 137                 session.flush()
 138
 139         for i,site in enumerate(sitenames):
 140                 sitehist = SiteInterface.get_or_make(loginbase=site)
 141                 siteblack = BlacklistRecord.get_by(loginbase=site)
 142                 skip_due_to_blacklist=False
 143
 144                 if siteblack and not siteblack.expired():
 145                         print "skipping %s due to blacklist.  will expire %s" % (site, siteblack.willExpire() )
 146                         skip_due_to_blacklist=True
 147                         sitehist.clearPenalty()
 148                         sitehist.applyPenalty()
 149                         continue
 150
 151                 # TODO: make query only return records within a certin time range,
 152                 #               i.e. greater than 0.5 days ago. or 5 days, etc.
 153                 recent_actions = sitehist.getRecentActions(loginbase=site)
 154
 155                 print "%s %s %s" % (i, sitehist.db.loginbase, sitehist.db.status)
 156
 157                 # determine if there are penalties within the last 30 days?
 158                 # if so, add a 'pause_penalty' action.
 159                 if sitehist.db.message_id != 0 and sitehist.db.message_status == 'open' and \
 160                         sitehist.db.penalty_level > 0 and not found_within(recent_actions, 'pause_penalty', 30):
 161                         #       pause escalation
 162                         print "Pausing penalties for %s" % site
 163                         sitehist.pausePenalty()
 164                 else:
 165
 166                         if sitehist.db.status == 'down':
 167                                 if  not found_within(recent_actions, 'pause_penalty', 30) and \
 168                                         not found_within(recent_actions, 'increase_penalty', 7) and \
 169                                         changed_greaterthan(sitehist.db.last_changed, 7):
 170
 171                                         # TODO: catch errors
 172                                         sitehist.increasePenalty()
 173                                         sitehist.applyPenalty()
 174                                         sitehist.sendMessage('increase_penalty')
 175
 176                                         print "send message for site %s penalty increase" % site
 177
 178                         if sitehist.db.status == 'good':
 179                                 # clear penalty
 180                                 # NOTE: because 'all clear' should have an indefinite status, we
 181                                 #               have a boolean value rather than a 'recent action'
 182                                 if sitehist.db.penalty_applied:
 183                                         # send message that penalties are cleared.
 184
 185                                         sitehist.clearPenalty()
 186                                         sitehist.applyPenalty()
 187                                         sitehist.sendMessage('clear_penalty')
 188                                         sitehist.closeTicket()
 189
 190                                         print "send message for site %s penalty cleared" % site
 191
 192
 193                 site_count = site_count + 1
 194
 195                 print "time: ", time.strftime('%Y-%m-%d %H:%M:%S')
 196                 sys.stdout.flush()
 197                 session.flush()
 198
 199         session.flush()
 200         return
 201
 202
 203 if __name__ == "__main__":
 204         parser = parsermodule.getParser(['nodesets'])
 205         parser.set_defaults( timewait=0,
 206                                                 skip=0,
 207                                                 rins=False,
 208                                                 reboot=False,
 209                                                 findbad=False,
 210                                                 force=False,
 211                                                 nosetup=False,
 212                                                 verbose=False,
 213                                                 quiet=False,)
 214
 215         parser.add_option("", "--stopselect", dest="stopselect", metavar="",
 216                                                 help="The select string that must evaluate to true for the node to be considered 'done'")
 217         parser.add_option("", "--findbad", dest="findbad", action="store_true",
 218                                                 help="Re-run findbad on the nodes we're going to check before acting.")
 219         parser.add_option("", "--force", dest="force", action="store_true",
 220                                                 help="Force action regardless of previous actions/logs.")
 221         parser.add_option("", "--rins", dest="rins", action="store_true",
 222                                                 help="Set the boot_state to 'rins' for all nodes.")
 223         parser.add_option("", "--reboot", dest="reboot", action="store_true",
 224                                                 help="Actively try to reboot the nodes, keeping a log of actions.")
 225
 226         parser.add_option("", "--verbose", dest="verbose", action="store_true",
 227                                                 help="Extra debug output messages.")
 228         parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
 229                                                 help="Do not perform the orginary setup phase.")
 230         parser.add_option("", "--skip", dest="skip",
 231                                                 help="Number of machines to skip on the input queue.")
 232         parser.add_option("", "--timewait", dest="timewait",
 233                                                 help="Minutes to wait between iterations of 10 nodes.")
 234
 235         parser = parsermodule.getParser(['defaults'], parser)
 236         config = parsermodule.parse_args(parser)
 237
 238         fbquery = HistoryNodeRecord.query.all()
 239         hostnames = [ n.hostname for n in fbquery ]
 240
 241         fbquery = HistorySiteRecord.query.all()
 242         sitenames = [ s.loginbase for s in fbquery ]
 243
 244         if config.site:
 245                 # TODO: replace with calls to local db.  the api fails so often that
 246                 #               these calls should be regarded as unreliable.
 247                 l_nodes = plccache.GetNodesBySite(config.site)
 248                 filter_hostnames = [ n['hostname'] for n in l_nodes ]
 249
 250                 hostnames = filter(lambda x: x in filter_hostnames, hostnames)
 251                 sitenames = [config.site]
 252
 253         if config.node:
 254                 hostnames = [ config.node ]
 255                 sitenames = [ plccache.plcdb_hn2lb[config.node] ]
 256
 257         try:
 258                 main(hostnames, sitenames)
 259                 session.flush()
 260         except KeyboardInterrupt:
 261                 print "Killed by interrupt"
 262                 session.flush()
 263                 sys.exit(0)
 264         except:
 265                 email_exception()
 266                 print traceback.print_exc();
 267                 print "fail all..."