commands/policy.py

   1 #!/usr/bin/python
   2
   3 # This script is used to manipulate the operational state of nodes in
   4 # different node groups.  These are basically set operations on nodes via the
   5 # PLC api.
   6 #
   7 # Take the ng name as an argument....
   8 # optionally,
   9 #  * get a list of nodes in the given nodegroup.
  10 #  * set some or all in the set to rins.
  11 #  * restart them all.
  12 #  * do something else to them all.
  13 #
  14
  15 import os
  16 import time
  17 import traceback
  18 import sys
  19 from optparse import OptionParser
  20
  21 from monitor import config
  22 from monitor import parser as parsermodule
  23 from monitor.common import *
  24 from monitor.const import MINUP
  25 from monitor.model import *
  26 from monitor.wrapper import plc
  27 from monitor.wrapper import plccache
  28 from monitor.database.info.model import *
  29 from monitor.database.info.interface import *
  30
  31 from monitor.query import verify,query_to_dict,node_select
  32
# Module-level PLC API handle obtained from plc.getAuthAPI() at import time.
# NOTE(review): `api` is not referenced anywhere else in this file as shown;
# confirm nothing relies on it (or on the call's side effects) before removing.
api = plc.getAuthAPI()
  34
  35 def logic():
  36
  37         plc.nodeBootState(host, 'reinstall')
  38         node_end_record(host)
  39
  40 def check_node_and_pcu_status_for(loginbase):
  41         """
  42                 this function checks whether all the nodes and associated pcus for a
  43                 given site are considered 'good'.
  44
  45                 If so, the function returns True.
  46                 Otherwise, the function returns False.
  47         """
  48
  49         results = []
  50         for node in plccache.plcdb_lb2hn[loginbase]:
  51
  52                 noderec  = FindbadNodeRecord.findby_or_create(hostname=node['hostname'])
  53                 nodehist = HistoryNodeRecord.findby_or_create(hostname=node['hostname'])
  54                 nodebl   = BlacklistRecord.get_by(hostname=node['hostname'])
  55                 pcuhist  = HistoryPCURecord.get_by(plc_pcuid=noderec.plc_pcuid)
  56
  57                 if (nodehist is not None and nodehist.status == 'good' and \
  58                         ((pcuhist is not None and pcuhist.status == 'good') or (pcuhist is None)) ):
  59                         if nodebl is None:                      # no entry in blacklist table
  60                                 results.append(True)
  61                         elif nodebl is not None and nodebl.expired():   # expired entry in blacklist table
  62                                 results.append(True)
  63                         else:
  64                                 results.append(False)   # entry that is not expired.
  65                 else:
  66                         results.append(False)
  67
  68         try:
  69                 print "test: %s" % results
  70                 # NOTE: incase results is empty, reduce does not work on an empty set.
  71                 return reduce(lambda x,y: x&y, results) and len(results) > MINUP
  72         except:
  73                 return False
  74
  75 def main(hostnames, sitenames):
  76         # commands:
  77         i = 1
  78         node_count = 1
  79         site_count = 1
  80         #print "hosts: %s" % hostnames
  81         for i,host in enumerate(hostnames):
  82                 try:
  83                         lb = plccache.plcdb_hn2lb[host]
  84                 except:
  85                         print "unknown host in plcdb_hn2lb %s" % host
  86                         email_exception(host)
  87                         continue
  88
  89                 nodeblack = BlacklistRecord.get_by(hostname=host)
  90
  91                 if nodeblack and not nodeblack.expired():
  92                         print "skipping %s due to blacklist.  will expire %s" % (host, nodeblack.willExpire() )
  93                         continue
  94
  95                 sitehist = SiteInterface.get_or_make(loginbase=lb)
  96
  97                 recent_actions = sitehist.getRecentActions(hostname=host)
  98
  99                 nodehist = HistoryNodeRecord.findby_or_create(hostname=host)
 100
 101                 print "%s %s %s" % (i, nodehist.hostname, nodehist.status)
 102                 if nodehist.status == 'good' and \
 103                         changed_lessthan(nodehist.last_changed, 1.0) and \
 104                         found_within(recent_actions, 'down_notice', 7.0) and \
 105                         not found_within(recent_actions, 'online_notice', 0.5):
 106                                 # NOTE: chronicly flapping nodes will not get 'online' notices
 107                                 #               since, they are never up long enough to be 'good'.
 108                             # NOTE: searching for down_notice proves that the node has
 109                                 #               gone through a 'down' state first, rather than just
 110                                 #               flapping through: good, offline, online, ...
 111                                 #
 112                                 # NOTE: there is a narrow window in which this command must be
 113                                 #               evaluated, otherwise the notice will not go out.
 114                                 #               this is not ideal.
 115                                 sitehist.sendMessage('online_notice', hostname=host, viart=False, saveact=True)
 116                                 print "send message for host %s online" % host
 117
 118
 119                 # if a node is offline and doesn't have a PCU, remind the user that they should have one.
 120                 #if not nodehist.haspcu and nodehist.status in ['offline', 'down'] and \
 121                 #       changed_greaterthan(nodehist.last_changed,1.0) and \
 122                 #       not found_within(recent_actions, 'pcumissing_notice', 7.0):
 123                 #
 124                 #               sitehist.sendMessage('pcumissing_notice', hostname=host)
 125                 #               print "send message for host %s pcumissing_notice" % host
 126
 127                 # if it is offline and HAS a PCU, then try to use it.
 128                 if nodehist.haspcu and nodehist.status in ['offline', 'down'] and \
 129                         changed_greaterthan(nodehist.last_changed,1.0) and \
 130                         not nodehist.firewall and \
 131                         not found_between(recent_actions, 'try_reboot', 3.5, 1):
 132
 133                                 # TODO: there MUST be a better way to do this...
 134                                 # get fb node record for pcuid
 135                                 fbpcu = None
 136                                 fbnode = FindbadNodeRecord.get_latest_by(hostname=host)
 137                                 if fbnode:
 138                                         fbpcu = FindbadPCURecord.get_latest_by(plc_pcuid=fbnode.plc_pcuid)
 139
 140                                 sitehist.attemptReboot(host)
 141                                 print "send message for host %s try_reboot" % host
 142                                 if not fbpcu.test_is_ok() and \
 143                                         not found_within(recent_actions, 'pcuerror_notice', 3.0):
 144
 145                                         args = {}
 146                                         if fbpcu:
 147                                                 args['pcu_name'] = fbpcu.pcu_name()
 148                                                 args['pcu_errors'] = fbpcu.pcu_errors()
 149                                                 args['plc_pcuid'] = fbpcu.plc_pcuid
 150                                         else:
 151                                                 args['pcu_name'] = "error looking up pcu name"
 152                                                 args['pcu_errors'] = ""
 153                                                 args['plc_pcuid'] = 0
 154
 155                                         args['hostname'] = host
 156                                         sitehist.sendMessage('pcuerror_notice', **args)
 157                                         print "send message for host %s PCU Failure" % host
 158
 159
 160                 # NOTE: non-intuitive is that found_between(try_reboot, 3.5, 1)
 161                 #               will be false for a day after the above condition is satisfied
 162                 if nodehist.haspcu and nodehist.status in ['offline', 'down'] and \
 163                         changed_greaterthan(nodehist.last_changed,1.5) and \
 164                         not nodehist.firewall and \
 165                         found_between(recent_actions, 'try_reboot', 3.5, 1) and \
 166                         not found_within(recent_actions, 'pcufailed_notice', 3.5):
 167
 168                                 # TODO: there MUST be a better way to do this...
 169                                 # get fb node record for pcuid
 170                                 fbpcu = None
 171                                 fbnode = FindbadNodeRecord.get_latest_by(hostname=host)
 172                                 if fbnode:
 173                                         fbpcu = FindbadPCURecord.get_latest_by(plc_pcuid=fbnode.plc_pcuid)
 174                                 if fbpcu:
 175                                         pcu_name = fbpcu.pcu_name()
 176                                 else:
 177                                         pcu_name = "error looking up pcu name"
 178
 179                                 # get fb pcu record for pcuid
 180                                 # send pcu failure message
 181                                 sitehist.sendMessage('pcufailed_notice', hostname=host, pcu_name=pcu_name)
 182                                 print "send message for host %s PCU Failure" % host
 183
 184                 if nodehist.status == 'failboot' and \
 185                         changed_greaterthan(nodehist.last_changed, 0.25) and \
 186                         not found_between(recent_actions, 'bootmanager_restore', 0.5, 0):
 187                                 # send down node notice
 188                                 # delay 0.5 days before retrying...
 189
 190                                 print "send message for host %s bootmanager_restore" % host
 191                                 sitehist.runBootManager(host)
 192                         #       sitehist.sendMessage('retry_bootman', hostname=host)
 193
 194                 if nodehist.status == 'down' and \
 195                         changed_greaterthan(nodehist.last_changed, 2):
 196                                 if not nodehist.firewall and not found_within(recent_actions, 'down_notice', 3.5):
 197                                         # send down node notice
 198                                         sitehist.sendMessage('down_notice', hostname=host)
 199                                         print "send message for host %s down" % host
 200
 201                                 if nodehist.firewall and not found_within(recent_actions, 'firewall_notice', 3.5):
 202                                         # send down node notice
 203                                         #email_exception(host, "firewall_notice")
 204                                         sitehist.sendMessage('firewall_notice', hostname=host)
 205                                         print "send message for host %s down" % host
 206
 207                 node_count = node_count + 1
 208                 print "time: ", time.strftime('%Y-%m-%d %H:%M:%S')
 209                 sys.stdout.flush()
 210                 session.flush()
 211
 212         for i,site in enumerate(sitenames):
 213                 sitehist = SiteInterface.get_or_make(loginbase=site)
 214                 siteblack = BlacklistRecord.get_by(loginbase=site)
 215                 skip_due_to_blacklist=False
 216
 217                 if siteblack and not siteblack.expired():
 218                         print "skipping %s due to blacklist.  will expire %s" % (site, siteblack.willExpire() )
 219                         skip_due_to_blacklist=True
 220                         sitehist.clearPenalty()
 221                         sitehist.applyPenalty()
 222                         continue
 223
 224                 # TODO: make query only return records within a certin time range,
 225                 #               i.e. greater than 0.5 days ago. or 5 days, etc.
 226                 recent_actions = sitehist.getRecentActions(loginbase=site)
 227
 228                 print "%s %s %s" % (i, sitehist.db.loginbase, sitehist.db.status)
 229
 230                 if sitehist.db.status == 'down':
 231                         if sitehist.db.penalty_pause and \
 232                                 changed_greaterthan(sitehist.db.penalty_pause_time, 30):
 233
 234                                 email_exception("", "clear pause penalty for site: %s" % sitehist.db.loginbase)
 235                                 sitehist.closeTicket()
 236                                 # NOTE: but preserve the penalty status.
 237                                 sitehist.clearPenaltyPause()
 238
 239                         if sitehist.db.message_id != 0 and \
 240                                 sitehist.db.message_status == 'open' and \
 241                                 not sitehist.db.penalty_pause:
 242
 243                                 email_exception("", "pause penalty for site: %s" % sitehist.db.loginbase)
 244                                 sitehist.setPenaltyPause()
 245
 246                         if  not sitehist.db.penalty_pause and \
 247                                 not found_within(recent_actions, 'increase_penalty', 7) and \
 248                                 changed_greaterthan(sitehist.db.last_changed, 7):
 249
 250                                 # TODO: catch errors
 251                                 sitehist.increasePenalty()
 252                                 sitehist.applyPenalty()
 253                                 sitehist.sendMessage('increase_penalty')
 254
 255                                 print "send message for site %s penalty increase" % site
 256
 257                 if sitehist.db.status == 'good':
 258                         # clear penalty
 259                         # NOTE: because 'all clear' should have an indefinite status, we
 260                         #               have a boolean value rather than a 'recent action'
 261                         if sitehist.db.penalty_applied or sitehist.db.penalty_pause:
 262                                 # send message that penalties are cleared.
 263
 264                                 sitehist.clearPenalty()
 265                                 sitehist.applyPenalty()
 266                                 sitehist.sendMessage('clear_penalty')
 267                                 sitehist.closeTicket()
 268
 269                                 print "send message for site %s penalty cleared" % site
 270
 271                         # check all nodes and pcus for this site; if they're all ok,
 272                         #               close the ticket, else leave it open.
 273                         # NOTE: in the case where a PCU reboots and fails, a message is
 274                         #               sent, but the PCU may appear to be ok according to tests.
 275                         # NOTE: Also, bootmanager sends messages regarding disks,
 276                         #               configuration, etc.  So, the conditions here are 'good'
 277                         #               rather than 'not down' as it is in sitebad.
 278                         close_ticket = check_node_and_pcu_status_for(site)
 279                         if close_ticket:
 280                                 sitehist.closeTicket()
 281
 282                 site_count = site_count + 1
 283
 284                 print "time: ", time.strftime('%Y-%m-%d %H:%M:%S')
 285                 sys.stdout.flush()
 286                 session.flush()
 287
 288         session.flush()
 289         return
 290
 291
 292 if __name__ == "__main__":
 293         parser = parsermodule.getParser(['nodesets'])
 294         parser.set_defaults( timewait=0,
 295                                                 skip=0,
 296                                                 rins=False,
 297                                                 reboot=False,
 298                                                 findbad=False,
 299                                                 force=False,
 300                                                 nosetup=False,
 301                                                 verbose=False,
 302                                                 quiet=False,)
 303
 304         parser.add_option("", "--stopselect", dest="stopselect", metavar="",
 305                                                 help="The select string that must evaluate to true for the node to be considered 'done'")
 306         parser.add_option("", "--findbad", dest="findbad", action="store_true",
 307                                                 help="Re-run findbad on the nodes we're going to check before acting.")
 308         parser.add_option("", "--force", dest="force", action="store_true",
 309                                                 help="Force action regardless of previous actions/logs.")
 310         parser.add_option("", "--rins", dest="rins", action="store_true",
 311                                                 help="Set the boot_state to 'rins' for all nodes.")
 312         parser.add_option("", "--reboot", dest="reboot", action="store_true",
 313                                                 help="Actively try to reboot the nodes, keeping a log of actions.")
 314
 315         parser.add_option("", "--verbose", dest="verbose", action="store_true",
 316                                                 help="Extra debug output messages.")
 317         parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
 318                                                 help="Do not perform the orginary setup phase.")
 319         parser.add_option("", "--skip", dest="skip",
 320                                                 help="Number of machines to skip on the input queue.")
 321         parser.add_option("", "--timewait", dest="timewait",
 322                                                 help="Minutes to wait between iterations of 10 nodes.")
 323
 324         parser = parsermodule.getParser(['defaults'], parser)
 325         config = parsermodule.parse_args(parser)
 326
 327         fbquery = HistoryNodeRecord.query.all()
 328         hostnames = [ n.hostname for n in fbquery ]
 329
 330         fbquery = HistorySiteRecord.query.all()
 331         sitenames = [ s.loginbase for s in fbquery ]
 332
 333         if config.site:
 334                 # TODO: replace with calls to local db.  the api fails so often that
 335                 #               these calls should be regarded as unreliable.
 336                 l_nodes = plccache.GetNodesBySite(config.site)
 337                 filter_hostnames = [ n['hostname'] for n in l_nodes ]
 338
 339                 hostnames = filter(lambda x: x in filter_hostnames, hostnames)
 340                 sitenames = [config.site]
 341
 342         if config.node:
 343                 hostnames = [ config.node ]
 344                 sitenames = [ plccache.plcdb_hn2lb[config.node] ]
 345
 346         try:
 347                 main(hostnames, sitenames)
 348                 session.flush()
 349         except KeyboardInterrupt:
 350                 print "Killed by interrupt"
 351                 session.flush()
 352                 sys.exit(0)
 353         except:
 354                 email_exception()
 355                 print traceback.print_exc();
 356                 print "fail all..."