3 # This script is used to manipulate the operational state of nodes in
4 # different node groups. These are basically set operations on nodes via the
7 # Take the ng name as an argument....
9 # * get a list of nodes in the given nodegroup.
10 # * set some or all in the set to rins.
12 # * do something else to them all.
19 from optparse import OptionParser
21 import bootman # debug nodes
23 from monitor import util
24 from monitor import const
25 from monitor import reboot
26 from monitor import config
27 from monitor import database
28 from monitor import parser as parsermodule
29 from monitor.common import *
30 from monitor.model import *
31 from monitor.wrapper import plc
32 from monitor.wrapper import plccache
33 from monitor.wrapper.emailTxt import mailtxt
34 from monitor.database.info.model import *
36 from nodequery import verify,query_to_dict,node_select
38 api = plc.getAuthAPI()
41 class SiteInterface(HistorySiteRecord):
def get_or_make(cls, if_new_set=None, **kwargs):
    """Find or create the site history record and wrap it in a SiteInterface.

    if_new_set -- passed through as the first positional argument of
                  HistorySiteRecord.findby_or_create(); when omitted, a fresh
                  empty dict is used for every call (the previous shared
                  ``{}`` default would leak any mutation into later calls).
    kwargs     -- lookup keys for findby_or_create().  A 'hostname' key is
                  replaced by the 'loginbase' that plccache.plcdb_hn2lb
                  holds for that hostname.

    Returns a SiteInterface built from the record.  An unknown hostname
    propagates whatever error the plccache.plcdb_hn2lb lookup raises.
    """
    # NOTE(review): callers invoke this on the class itself
    # (SiteInterface.get_or_make(...)), so a @classmethod decorator is
    # expected directly above this def -- it is not visible in this copy.
    if if_new_set is None:
        if_new_set = {}
    if 'hostname' in kwargs:
        # Records are keyed by site, so translate the host into its site.
        kwargs['loginbase'] = plccache.plcdb_hn2lb[kwargs.pop('hostname')]
    res = HistorySiteRecord.findby_or_create(if_new_set, **kwargs)
    return SiteInterface(res)
# Constructor.  get_or_make() calls it with the record returned by
# HistorySiteRecord.findby_or_create().
# NOTE(review): the body is not visible in this copy; presumably it stores
# `sitehist` as self.db, which the other methods read -- confirm.
50 def __init__(self, sitehist):
def getRecentActions(self, **kwargs):
    """Return the ActionRecords for a site or a host, newest first.

    Keyword arguments (the first one present wins):
      loginbase -- select the actions recorded for that site.
      hostname  -- select the actions recorded for that host.

    Returns the ActionRecord query ordered by date_created descending, or
    an empty list when neither selector is given.
    """
    # TODO: make query only return records within a certain time range,
    # i.e. greater than 0.5 days ago. or 5 days, etc.
    for key in ('loginbase', 'hostname'):
        if key in kwargs:
            selected = ActionRecord.query.filter_by(**{key: kwargs[key]})
            return selected.order_by(ActionRecord.date_created.desc())
    # NOTE(review): the copy this was restored from showed neither a return
    # statement nor a fallback value; an empty list for "no selector" is an
    # assumption -- confirm against the original file.
    return []
def increasePenalty(self):
    """Raise the site's penalty level by one step, saturating at 2.

    The cap keeps penalty_level a valid index into the three-entry penalty
    table that applyPenalty builds.  No ActionRecord is created here; the
    site record is simply flagged as having a penalty applied.
    """
    # NOTE(review): indentation was lost in the copy this was restored from;
    # penalty_applied is assumed to be set on every call, not only when the
    # cap is hit -- confirm against the original file.
    site_record = self.db
    site_record.penalty_level = min(site_record.penalty_level + 1, 2)
    site_record.penalty_applied = True
def applyPenalty(self):
    """Bring the site's PLC restrictions in line with self.db.penalty_level.

    Penalties are cumulative.  Every penalty above the current level is
    lifted first (highest index first), then every penalty from index 0 up
    to and including the current level is enabled.  Each step is announced
    on stdout.

    Raises IndexError if penalty_level exceeds the last table index;
    increasePenalty caps the level at 2 to avoid that.
    """
    # Table index == penalty level.  The callables take the site loginbase.
    # `penalty_map` was appended to without ever being created in the copy
    # this was restored from, so it is built here as one literal.
    penalty_map = [
        {'name': 'noop',
         'enable': lambda site: None,
         'disable': lambda site: None},
        {'name': 'nocreate',
         'enable': lambda site: plc.removeSiteSliceCreation(site),
         'disable': lambda site: plc.enableSiteSliceCreation(site)},
        {'name': 'suspendslices',
         'enable': lambda site: plc.suspendSiteSlices(site),
         'disable': lambda site: plc.enableSiteSlices(site)},
    ]

    # Lift everything stricter than the current level, strictest first.
    for i in range(len(penalty_map) - 1, self.db.penalty_level, -1):
        print("\tdisabling %s on %s" % (penalty_map[i]['name'], self.db.loginbase))
        penalty_map[i]['disable'](self.db.loginbase)

    # (Re)apply every penalty up to and including the current level.
    for i in range(0, self.db.penalty_level + 1):
        print("\tapplying %s on %s" % (penalty_map[i]['name'], self.db.loginbase))
        penalty_map[i]['enable'](self.db.loginbase)
    # NOTE(review): a few lines after this loop were not visible in the copy
    # this was restored from -- confirm nothing else belongs here.
# Record a 'pause_penalty' action for this site.  main() checks the recent
# actions for this action type before increasing a site's penalty.
# NOTE(review): a keyword-argument line between loginbase= and action_type=
# appears to be missing from this copy (the commented-out ActionRecord calls
# in increasePenalty/clearPenalty pass action='penalty') -- confirm.
94 def pausePenalty(self):
# The new record is bound to `act` but not used further in the lines shown.
95 act = ActionRecord(loginbase=self.db.loginbase,
97 action_type='pause_penalty',)
def clearPenalty(self):
    """Reset the site record to the no-penalty state.

    Sets penalty_level back to 0 and clears the penalty_applied flag.  No
    ActionRecord is created and no PLC call is made here.
    """
    site_record = self.db
    site_record.penalty_level = 0
    site_record.penalty_applied = False
def getTicketStatus(self):
    """Refresh the cached ticket fields on the site record from the mailer.

    Does nothing when self.db.message_id is 0 (the value closeTicket resets
    it to).  Otherwise the ticket is looked up through
    mailer.getTicketStatus() and its 'Status', 'Queue' and 'Created' fields
    are copied to message_status, message_queue and message_created.
    """
    ticket_id = self.db.message_id
    if ticket_id == 0:
        return
    rtstatus = mailer.getTicketStatus(ticket_id)
    self.db.message_status = rtstatus['Status']
    self.db.message_queue = rtstatus['Queue']
    # 'Created' is handed to datetime.fromtimestamp(), i.e. it is treated
    # as a POSIX timestamp.
    self.db.message_created = datetime.fromtimestamp(rtstatus['Created'])
def setTicketStatus(self, status):
    """Set the status of the RT ticket tied to this site, if there is one.

    status -- the status value handed to mailer.setTicketStatus().

    The requested status is always echoed to stdout; the mailer is only
    contacted when self.db.message_id is non-zero.  The mailer's return
    value is not used.
    """
    # Single-argument parenthesised form prints the same text on Python 2
    # and Python 3.
    print('SETTING status %s' % status)
    ticket_id = self.db.message_id
    if ticket_id != 0:
        mailer.setTicketStatus(ticket_id, status)
def getContacts(self):
    """Return the e-mail addresses to notify at the current penalty level.

    The audience widens with self.db.penalty_level:
      >= 0  tech contacts   (plc.getTechEmails)
      >= 1  plus PIs        (plc.getPIEmails)
      >= 2  plus slice users (plc.getSliceUserEmails)

    Returns a new list; it is empty when the level is negative.
    """
    level = self.db.penalty_level
    loginbase = self.db.loginbase
    # The accumulator was neither created nor returned in the copy this was
    # restored from, although sendMessage() uses the return value.
    contacts = []
    if level >= 0:
        contacts += plc.getTechEmails(loginbase)
    if level >= 1:
        contacts += plc.getPIEmails(loginbase)
    if level >= 2:
        contacts += plc.getSliceUserEmails(loginbase)
    return contacts
# Send the notice named `type` (an attribute of emailTxt.mailtxt) about this
# site and record it as an ActionRecord with action='notice'.
#   type   -- name of the message template; note it shadows the builtin.
#   kwargs -- 'viart' and (per the callers in main) 'hostname' are read.
# If mailtxt has no attribute called `type`, a warning banner is printed.
# NOTE(review): several control-flow lines (else:/if viart:/defaults for
# hostname and viart, any merge of kwargs into args) are missing from this
# copy, so the exact branching cannot be confirmed from here.
129 def sendMessage(self, type, **kwargs):
131 # NOTE: evidently changing an RT message's subject opens the ticket.
132 # the logic in this policy depends upon a ticket only being 'open'
133 # if a user has replied to it.
134 # So, to preserve these semantics, we check the status before
135 # sending, then after sending, reset the status to the
137 # There is a very tiny race here, where a user sends a reply
138 # within the time it takes to check, send, and reset.
139 # This sucks. It's almost certainly fragile.
142 # TODO: catch any errors here, and add an ActionRecord that contains
# Substitution values for the message templates below.
145 args = {'loginbase' : self.db.loginbase, 'penalty_level' : self.db.penalty_level}
# NOTE(review): as shown, args never contains 'hostname'; presumably kwargs
# are merged into args on a line that is not visible -- confirm.
149 if 'hostname' in args:
150 hostname = args['hostname']
152 if hasattr(mailtxt, type):
# The template is indexed as message[0] and message[1] when the Message is
# built below, each %-formatted with args.
154 message = getattr(mailtxt, type)
156 if 'viart' in kwargs:
157 viart = kwargs['viart']
160 self.getTicketStatus() # get current message status
162 m = Message(message[0] % args, message[1] % args, viart, self.db.message_id)
# The looked-up contacts are immediately replaced by the cc address, so only
# config.cc_email receives mail while the TODO below stands.
164 contacts = self.getContacts()
165 contacts = [config.cc_email] # TODO: remove after testing...
167 print "sending message: %s to site %s for host %s" % (type, self.db.loginbase, hostname)
169 ret = m.send(contacts)
# The value returned by send() becomes the site's ticket id.
171 self.db.message_id = ret
172 # reset to previous status, since a new subject 'opens' RT tickets.
173 self.setTicketStatus(self.db.message_status)
175 # NOTE: only make a record of it if it's in RT.
176 act = ActionRecord(loginbase=self.db.loginbase, hostname=hostname, action='notice',
177 action_type=type, message_id=self.db.message_id)
# Reached when mailtxt has no template called `type` (the matching else:
# line is not visible in this copy).
180 print "+-- WARNING! ------------------------------"
181 print "| No such message name in emailTxt.mailtxt: %s" % type
182 print "+------------------------------------------"
def closeTicket(self):
    """Close the site's RT ticket and reset the ticket bookkeeping.

    Steps, in order:
      1. close the ticket through mailer.closeTicketViaRT();
      2. create an ActionRecord (action='notice', action_type='end_notice')
         that still carries the ticket id being closed;
      3. reset self.db.message_id to 0 and self.db.message_status to "new".
    """
    # NOTE(review): unlike getTicketStatus/setTicketStatus there is no
    # guard for message_id == 0 here -- confirm the mailer tolerates it.
    ticket_id = self.db.message_id
    mailer.closeTicketViaRT(ticket_id, "Ticket Closed by Monitor")
    # The record is not used afterwards; presumably constructing it is what
    # registers it with the database session -- confirm.
    ActionRecord(loginbase=self.db.loginbase, action='notice',
                 action_type='end_notice', message_id=ticket_id)
    self.db.message_id = 0
    self.db.message_status = "new"
# Run bootman.restore() for `hostname` and log the attempt as an
# ActionRecord with action_type='bootmanager_restore'.
# NOTE(review): the try:/except: lines around the restore call and the
# remaining ActionRecord arguments are missing from this copy.
194 def runBootManager(self, hostname):
195 print "attempting BM reboot of %s" % hostname
# The SiteInterface itself is passed as the first argument.
198 ret = bootman.restore(self, hostname)
# Formatted traceback of the failure; presumably inside an except: branch
# whose header is not visible -- confirm.
201 err = traceback.format_exc()
204 act = ActionRecord(loginbase=self.db.loginbase,
207 action_type='bootmanager_restore',
# Try to reboot `hostname` via reboot.reboot_str() and log the attempt as an
# ActionRecord with action_type='first_try_reboot'.
# NOTE(review): lines are missing from this copy (the body of the success
# check and the remaining ActionRecord arguments).
211 def attemptReboot(self, hostname):
212 print "attempting PCU reboot of %s" % hostname
213 ret = reboot.reboot_str(hostname)
# Both the integer 0 and the string "0" are checked for.
# NOTE(review): presumably these mean success -- the branch body is not
# visible, confirm.
214 if ret == 0 or ret == "0":
216 act = ActionRecord(loginbase=self.db.loginbase,
219 action_type='first_try_reboot',
224 plc.nodeBootState(host, 'rins')
225 node_end_record(host)
# Policy driver: walk every host, then every site, and decide from the
# current status plus the recent ActionRecords which notice or action is due.
#   hostnames -- iterable of node hostnames to evaluate.
#   sitenames -- iterable of site loginbases to evaluate.
# NOTE(review): this copy is missing lines (try:/except: around the
# hostname lookup, initialisation of node_count/site_count, any continue
# statements), and indentation is lost, so nesting is inferred from content.
# NOTE(review): the numeric arguments to changed_*/found_* look like days
# (see the "0.5 days" comments) -- confirm against those helpers.
230 def main(hostnames, sitenames):
# Cached blacklist, defaulting to an empty list; not used in the lines shown.
231 l_blacklist = database.if_cached_else(1, "l_blacklist", lambda : [])
236 #print "hosts: %s" % hostnames
237 for host in hostnames:
# Map the host to its site's loginbase; the message on the next line is for
# hosts absent from the map (its except: header is not visible).
239 lb = plccache.plcdb_hn2lb[host]
241 print "unknown host in plcdb_hn2lb %s" % host
244 sitehist = SiteInterface.get_or_make(loginbase=lb)
246 recent_actions = sitehist.getRecentActions(hostname=host)
248 nodehist = HistoryNodeRecord.findby_or_create(hostname=host)
250 print "%s %s" % ( nodehist.hostname, nodehist.status)
# Rule 1: node is 'good', changed less than 1.0 ago, and no 'online_notice'
# within 0.5 -> send the online notice.
251 if nodehist.status == 'good' and \
252 changed_lessthan(nodehist.last_changed, 1.0) and \
253 not found_within(recent_actions, 'online_notice', 0.5):
254 # NOTE: there is a narrow window in which this command must be
255 # evaluated, otherwise the notice will not go out. this is not ideal.
256 sitehist.sendMessage('online_notice', hostname=host)
257 print "send message for host %s online" % host
# Rule 2: node is 'offline' or 'down' for more than 1.0 and no
# 'first_try_reboot' found between 3.5 and 1 -> attempt a PCU reboot.
261 if ( nodehist.status == 'offline' or nodehist.status == 'down' ) and \
262 changed_greaterthan(nodehist.last_changed,1.0) and \
263 not found_between(recent_actions, 'first_try_reboot', 3.5, 1):
265 sitehist.attemptReboot(host)
266 print "send message for host %s first_try_reboot" % host
269 # NOTE: non-intuitive is that found_between(first_try_reboot, 3.5, 1)
270 # will be false for a day after the above condition is satisfied
# Rule 3: still 'offline'/'down' after 1.5, a reboot attempt exists between
# 3.5 and 1, and no 'pcufailed_notice' within 3.5 -> send the PCU notice.
271 if ( nodehist.status == 'offline' or nodehist.status == 'down' ) and \
272 changed_greaterthan(nodehist.last_changed,1.5) and \
273 found_between(recent_actions, 'first_try_reboot', 3.5, 1) and \
274 not found_within(recent_actions, 'pcufailed_notice', 3.5):
275 # found_within(recent_actions, 'first_try_reboot', 3.5) and \
277 # send pcu failure message
278 #act = ActionRecord(**kwargs)
279 sitehist.sendMessage('pcufailed_notice', hostname=host)
280 print "send message for host %s PCU Failure" % host
# Rule 4: node in 'monitordebug' for more than 1 and no
# 'bootmanager_restore' between 0.5 and 0 -> run BootManager.
283 if nodehist.status == 'monitordebug' and \
284 changed_greaterthan(nodehist.last_changed, 1) and \
285 not found_between(recent_actions, 'bootmanager_restore', 0.5, 0):
286 # send down node notice
287 # delay 0.5 days before retrying...
289 print "send message for host %s bootmanager_restore" % host
290 sitehist.runBootManager(host)
291 # sitehist.sendMessage('retry_bootman', hostname=host)
# Rule 5: node 'down' for more than 2 and no 'down_notice' within 3.5 ->
# send the down notice.
293 if nodehist.status == 'down' and \
294 changed_greaterthan(nodehist.last_changed, 2) and \
295 not found_within(recent_actions, 'down_notice', 3.5):
296 # send down node notice
298 sitehist.sendMessage('down_notice', hostname=host)
299 print "send message for host %s offline" % host
# NOTE(review): node_count is incremented but its initialisation is not
# visible in this copy.
302 node_count = node_count + 1
# Second pass: site-level penalties.
304 for site in sitenames:
305 sitehist = SiteInterface.get_or_make(loginbase=site)
306 # TODO: make query only return records within a certain time range,
307 # i.e. greater than 0.5 days ago. or 5 days, etc.
308 recent_actions = sitehist.getRecentActions(loginbase=site)
310 #sitehist.sendMessage('test_notice', host)
312 print "%s %s" % ( sitehist.db.loginbase , sitehist.db.status)
# Site 'down': raise the penalty unless penalties were paused within 30,
# a penalty increase was already made within 7, or the status is newer
# than 7.  The 'increase_penalty' action searched for here is the one that
# sendMessage('increase_penalty') records (action_type=type).
313 if sitehist.db.status == 'down':
314 if not found_within(recent_actions, 'pause_penalty', 30) and \
315 not found_within(recent_actions, 'increase_penalty', 7) and \
316 changed_greaterthan(sitehist.db.last_changed, 7):
319 sitehist.increasePenalty()
320 #sitehist.applyPenalty()
321 sitehist.sendMessage('increase_penalty')
323 print "send message for site %s penalty increase" % site
# Site 'good': if a penalty is flagged, clear it, notify, close the ticket.
325 if sitehist.db.status == 'good':
327 # NOTE: because 'all clear' should have an indefinite status, we
328 # have a boolean value rather than a 'recent action'
329 if sitehist.db.penalty_applied:
330 # send message that penalties are cleared.
332 sitehist.clearPenalty()
333 #sitehist.applyPenalty()
334 sitehist.sendMessage('clear_penalty')
335 sitehist.closeTicket()
337 print "send message for site %s penalty cleared" % site
339 # find all ticket ids for site ( could be on the site record? )
340 # determine if there are penalties within the last 30 days?
341 # if so, add a 'pause_penalty' action.
# A ticket whose status is 'open' (see the note in sendMessage: a user has
# replied) pauses further penalties while a penalty level is in force.
342 if sitehist.db.message_id != 0 and sitehist.db.message_status == 'open' and sitehist.db.penalty_level > 0:
344 print "Pausing penalties for %s" % site
345 sitehist.pausePenalty()
# NOTE(review): site_count is incremented but its initialisation is not
# visible in this copy.
347 site_count = site_count + 1
# Script entry point: build the option parser, choose the hosts and sites to
# evaluate, and run main() on them.
# NOTE(review): this copy is missing lines (the rest of set_defaults(...),
# the if config.site:/if config.node: tests, try: and a second except:), so
# the branching below is inferred from content only.
354 if __name__ == "__main__":
355 parser = parsermodule.getParser(['nodesets'])
356 parser.set_defaults( timewait=0,
367 parser.add_option("", "--stopselect", dest="stopselect", metavar="",
368 help="The select string that must evaluate to true for the node to be considered 'done'")
369 parser.add_option("", "--findbad", dest="findbad", action="store_true",
370 help="Re-run findbad on the nodes we're going to check before acting.")
371 parser.add_option("", "--force", dest="force", action="store_true",
372 help="Force action regardless of previous actions/logs.")
373 parser.add_option("", "--rins", dest="rins", action="store_true",
374 help="Set the boot_state to 'rins' for all nodes.")
375 parser.add_option("", "--reboot", dest="reboot", action="store_true",
376 help="Actively try to reboot the nodes, keeping a log of actions.")
378 parser.add_option("", "--verbose", dest="verbose", action="store_true",
379 help="Extra debug output messages.")
380 parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
381 help="Do not perform the orginary setup phase.")
382 parser.add_option("", "--skip", dest="skip",
383 help="Number of machines to skip on the input queue.")
384 parser.add_option("", "--timewait", dest="timewait",
385 help="Minutes to wait between iterations of 10 nodes.")
# Layer the 'defaults' option group on top of the parser built above.
387 parser = parsermodule.getParser(['defaults'], parser)
# NOTE(review): this rebinds the module-level name `config` (imported from
# monitor at the top of the file) to the parsed options; sendMessage() reads
# config.cc_email through the same global -- confirm that is intended.
388 config = parsermodule.parse_args(parser)
390 # # COLLECT nodegroups, nodes and node lists
391 # if config.nodegroup:
392 # ng = api.GetNodeGroups({'name' : config.nodegroup})
393 # nodelist = api.GetNodes(ng[0]['node_ids'])
394 # hostnames = [ n['hostname'] for n in nodelist ]
396 # if config.node or config.nodelist:
397 # if config.node: hostnames = [ config.node ]
398 # else: hostnames = util.file.getListFromFile(config.nodelist)
400 # fbquery = FindbadNodeRecord.get_all_latest()
401 # fb_nodelist = [ n.hostname for n in fbquery ]
403 # if config.nodeselect:
404 # hostnames = node_select(config.nodeselect, fb_nodelist)
# Default working set: every host and every site that has a history record.
406 fbquery = HistoryNodeRecord.query.all()
407 hostnames = [ n.hostname for n in fbquery ]
409 fbquery = HistorySiteRecord.query.all()
410 sitenames = [ s.loginbase for s in fbquery ]
# Restrict to one site: keep only the hosts PLC lists for config.site.
# NOTE(review): presumably guarded by a test on config.site -- not visible.
413 site = api.GetSites(config.site)
414 l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname'])
415 filter_hostnames = [ n['hostname'] for n in l_nodes ]
417 hostnames = filter(lambda x: x in filter_hostnames, hostnames)
418 sitenames = [config.site]
# Restrict to one node and the site that owns it.
# NOTE(review): presumably guarded by a test on config.node -- not visible.
421 hostnames = [ config.node ]
422 sitenames = [ plccache.plcdb_hn2lb[config.node] ]
425 main(hostnames, sitenames)
426 except KeyboardInterrupt:
427 print "Killed by interrupt"
# traceback.print_exc() writes the traceback itself and returns None, so the
# surrounding print additionally emits the text "None".
431 print traceback.print_exc();
432 print "Continuing..."