+ # get fb pcu record for pcuid
+ # send pcu failure message
+ sitehist.sendMessage('pcufailed_notice', hostname=host, pcu_name=pcu_name)
+ print "send message for host %s PCU Failure" % host
+
+ if nodehist.status == 'failboot' and \
+ changed_greaterthan(nodehist.last_changed, 0.25) and \
+ not found_between(recent_actions, 'bootmanager_restore', 0.5, 0):
+ # send down node notice
+ # delay 0.5 days before retrying...
+
+ print "send message for host %s bootmanager_restore" % host
+ sitehist.runBootManager(host)
+ # sitehist.sendMessage('retry_bootman', hostname=host)
+
+ if nodehist.status == 'down' and \
+ changed_greaterthan(nodehist.last_changed, 2):
+ if not nodehist.firewall and not found_within(recent_actions, 'down_notice', 3.5):
+ # send down node notice
+ sitehist.sendMessage('down_notice', hostname=host)
+ print "send message for host %s down" % host
+
+ if nodehist.firewall and not found_within(recent_actions, 'firewall_notice', 3.5):
+ # send down node notice
+ #email_exception(host, "firewall_notice")
+ sitehist.sendMessage('firewall_notice', hostname=host)
+ print "send message for host %s down" % host
+
+ node_count = node_count + 1
+ print "time: ", time.strftime('%Y-%m-%d %H:%M:%S')
+ sys.stdout.flush()
+ session.flush()
+
+ for i,site in enumerate(sitenames):
+ sitehist = SiteInterface.get_or_make(loginbase=site)
+ siteblack = BlacklistRecord.get_by(loginbase=site)
+ skip_due_to_blacklist=False
+
+ if siteblack and not siteblack.expired():
+ print "skipping %s due to blacklist. will expire %s" % (site, siteblack.willExpire() )
+ skip_due_to_blacklist=True
+ sitehist.clearPenalty()
+ sitehist.applyPenalty()
+ continue
+
+ # TODO: make query only return records within a certin time range,
+ # i.e. greater than 0.5 days ago. or 5 days, etc.
+ recent_actions = sitehist.getRecentActions(loginbase=site)
+
+ print "%s %s %s" % (i, sitehist.db.loginbase, sitehist.db.status)
+
+ if sitehist.db.status == 'down':
+ if sitehist.db.penalty_pause and \
+ changed_greaterthan(sitehist.db.penalty_pause_time, 30):
+
+ email_exception("", "clear pause penalty for site: %s" % sitehist.db.loginbase)
+ sitehist.closeTicket()
+ # NOTE: but preserve the penalty status.
+ sitehist.clearPenaltyPause()
+
+ if sitehist.db.message_id != 0 and \
+ sitehist.db.message_status == 'open' and \
+ not sitehist.db.penalty_pause:
+
+ email_exception("", "pause penalty for site: %s" % sitehist.db.loginbase)
+ sitehist.setPenaltyPause()
+
+ if not sitehist.db.penalty_pause and \
+ not found_within(recent_actions, 'increase_penalty', 7) and \
+ changed_greaterthan(sitehist.db.last_changed, 7):
+
+ # TODO: catch errors
+ sitehist.increasePenalty()
+ sitehist.applyPenalty()
+ sitehist.sendMessage('increase_penalty')
+
+ print "send message for site %s penalty increase" % site
+
+ if sitehist.db.status == 'good':
+ # clear penalty
+ # NOTE: because 'all clear' should have an indefinite status, we
+ # have a boolean value rather than a 'recent action'
+ if sitehist.db.penalty_applied or sitehist.db.penalty_pause:
+ # send message that penalties are cleared.
+
+ sitehist.clearPenalty()
+ sitehist.applyPenalty()
+ sitehist.sendMessage('clear_penalty')
+ sitehist.closeTicket()
+
+ print "send message for site %s penalty cleared" % site
+
+ # check all nodes and pcus for this site; if they're all ok,
+ # close the ticket, else leave it open.
+ # NOTE: in the case where a PCU reboots and fails, a message is
+ # sent, but the PCU may appear to be ok according to tests.
+ # NOTE: Also, bootmanager sends messages regarding disks,
+ # configuration, etc. So, the conditions here are 'good'
+ # rather than 'not down' as it is in sitebad.
+ close_ticket = check_node_and_pcu_status_for(site)
+ if close_ticket:
+ sitehist.closeTicket()
+
+ site_count = site_count + 1
+
+ print "time: ", time.strftime('%Y-%m-%d %H:%M:%S')
+ sys.stdout.flush()
+ session.flush()
+
+ session.flush()
+ return
+
+
+if __name__ == "__main__":
+ parser = parsermodule.getParser(['nodesets'])
+ parser.set_defaults( timewait=0,
+ skip=0,
+ rins=False,
+ reboot=False,
+ findbad=False,
+ force=False,
+ nosetup=False,
+ verbose=False,
+ quiet=False,)
+
+ parser.add_option("", "--stopselect", dest="stopselect", metavar="",
+ help="The select string that must evaluate to true for the node to be considered 'done'")
+ parser.add_option("", "--findbad", dest="findbad", action="store_true",
+ help="Re-run findbad on the nodes we're going to check before acting.")
+ parser.add_option("", "--force", dest="force", action="store_true",
+ help="Force action regardless of previous actions/logs.")
+ parser.add_option("", "--rins", dest="rins", action="store_true",
+ help="Set the boot_state to 'rins' for all nodes.")
+ parser.add_option("", "--reboot", dest="reboot", action="store_true",
+ help="Actively try to reboot the nodes, keeping a log of actions.")
+
+ parser.add_option("", "--verbose", dest="verbose", action="store_true",
+ help="Extra debug output messages.")
+ parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
+ help="Do not perform the orginary setup phase.")
+ parser.add_option("", "--skip", dest="skip",
+ help="Number of machines to skip on the input queue.")
+ parser.add_option("", "--timewait", dest="timewait",
+ help="Minutes to wait between iterations of 10 nodes.")
+
+ parser = parsermodule.getParser(['defaults'], parser)
+ config = parsermodule.parse_args(parser)
+
+ fbquery = HistoryNodeRecord.query.all()
+ hostnames = [ n.hostname for n in fbquery ]
+
+ fbquery = HistorySiteRecord.query.all()
+ sitenames = [ s.loginbase for s in fbquery ]