3 # This script is used to manipulate the operational state of nodes in
4 # different node groups. These are basically set operations on nodes via the
7 # Take the nodegroup (ng) name as an argument....
9 # * get a list of nodes in the given nodegroup.
10 # * set some or all in the set to rins.
12 # * do something else to them all.
19 from optparse import OptionParser
21 from monitor import config
22 from monitor import parser as parsermodule
23 from monitor.common import *
24 from monitor.const import MINUP
25 from monitor.model import *
26 from monitor.wrapper import plc
27 from monitor.wrapper import plccache
28 from monitor.database.info.model import *
29 from monitor.database.info.interface import *
31 from monitor.query import verify,query_to_dict,node_select
33 def main(hostnames, config):
# Walk `hostnames`; for each host whose history status is 'failboot' (and
# which passes the timing checks below), ask the host's site interface to
# run the boot manager.
#
# hostnames -- iterable of hostname strings; each one is used as a key into
#              plccache.plcdb_hn2lb and as the hostname= lookup argument.
# config    -- parsed options object; no visible line of this function
#              reads it.
#
# NOTE(review): this listing has lost its indentation and some statements
# appear to be missing (the condition below never closes its parenthesis,
# and node_count is incremented without a visible initialisation).
# Confirm control flow against the complete file before relying on these
# notes.
37 print "failboot-repair"
38 for i,host in enumerate(hostnames):
# Look the host up in plccache.plcdb_hn2lb; the result is passed as
# loginbase= to SiteInterface.get_or_make further down.
40 lb = plccache.plcdb_hn2lb[host]
# Report a host that is unknown to plcdb_hn2lb and send the index/hostname
# via email_exception. Presumably this is the handler for a failed lookup
# above -- the guarding statement is not visible here.
42 print "unknown host in plcdb_hn2lb %s" % host
43 email_exception("%s %s" % (i,host))
# Fetch the blacklist record for this host (falsy when there is none).
46 nodeblack = BlacklistRecord.get_by(hostname=host)
# A record that has not expired produces a "skipping" message; the skip
# itself (e.g. a `continue`) is not visible in this listing.
48 if nodeblack and not nodeblack.expired():
49 print "skipping %s due to blacklist. will expire %s" % (host, nodeblack.willExpire() )
# Gather the per-site interface, the actions recently recorded for this
# host, and the host's history record (created if it does not exist, going
# by the findby_or_create name -- TODO confirm).
52 sitehist = SiteInterface.get_or_make(loginbase=lb)
54 recent_actions = sitehist.getRecentActions(hostname=host)
56 nodehist = HistoryNodeRecord.findby_or_create(hostname=host)
58 print "%s %s %s" % (i, nodehist.hostname, nodehist.status)
# Run the boot manager only when the node is in 'failboot',
# changed_greaterthan(last_changed, 0.25) holds, and found_between() finds
# no 'bootmanager_restore' action for the (0.5, 0) window. The thresholds
# are presumably in days, per the "delay 0.5 days" comment below -- TODO
# confirm. The tail of this condition is not visible in this listing.
60 if nodehist.status == 'failboot' and \
61 changed_greaterthan(nodehist.last_changed, 0.25) and \
62 ( not found_between(recent_actions, 'bootmanager_restore', 0.5, 0) \
64 # send down node notice
65 # delay 0.5 days before retrying...
66 print "send message for host %s bootmanager_restore" % host
67 sitehist.runBootManager(host)
# Bump the counter and print a wall-clock timestamp (presumably once per
# host; indentation is lost). NOTE(review): node_count has no visible
# initialisation and is not read anywhere in the visible code.
69 node_count = node_count + 1
70 print "time: ", time.strftime('%Y-%m-%d %H:%M:%S')
78 if __name__ == "__main__":
# Script entry point: build the option parser, work out which hostnames to
# process, then hand them to main().
#
# NOTE(review): this listing has lost its indentation and some statements
# appear to be missing (the set_defaults call is never closed, and the
# `except` below has no visible `try`). Confirm against the complete file.
79 parser = parsermodule.getParser(['nodesets'])
# Default option values; only rins=False is visible, the remaining keyword
# arguments and the closing parenthesis are not shown.
80 parser.set_defaults(rins=False,
# Command-line flags, each stored as a boolean on the parsed options.
# NOTE(review): no visible line of main() reads any of these -- confirm
# where they are consumed.
87 parser.add_option("", "--force", dest="force", action="store_true",
88 help="Force action regardless of previous actions/logs.")
89 parser.add_option("", "--rins", dest="rins", action="store_true",
90 help="Set the boot_state to 'rins' for all nodes.")
91 parser.add_option("", "--reboot", dest="reboot", action="store_true",
92 help="Actively try to reboot the nodes, keeping a log of actions.")
94 parser.add_option("", "--verbose", dest="verbose", action="store_true",
95 help="Extra debug output messages.")
# Pass the parser through getParser again with the 'defaults' set
# (presumably this adds the shared default options), then parse the
# command line into `config`.
97 parser = parsermodule.getParser(['defaults'], parser)
98 config = parsermodule.parse_args(parser)
# Start from the hostname of every HistoryNodeRecord in the database.
100 fbquery = HistoryNodeRecord.query.all()
101 hostnames = [ n.hostname for n in fbquery ]
104 # TODO: replace with calls to local db. the api fails so often that
105 # these calls should be regarded as unreliable.
# Keep only the hostnames that belong to config.site. Presumably this runs
# only when a site was given -- the guarding condition is not visible.
106 l_nodes = plccache.GetNodesBySite(config.site)
107 filter_hostnames = [ n['hostname'] for n in l_nodes ]
109 hostnames = filter(lambda x: x in filter_hostnames, hostnames)
# Replace the list with the single host in config.node. Presumably this
# runs only when a node was given -- the guarding condition is not visible.
112 hostnames = [ config.node ]
# Process the selected hosts. An interrupt prints a short message; the
# matching `try` and any other handler lines are not visible here.
115 main(hostnames, config)
117 except KeyboardInterrupt:
118 print "Killed by interrupt"
# traceback.print_exc() writes the traceback itself and returns None, so
# wrapping it in `print` additionally emits the text "None".
123 print traceback.print_exc();