# This script is used to manipulate the operational state of nodes in
# different node groups. These are basically set operations on nodes via the
# PLC API.
#
# Take the nodegroup (ng) name as an argument, then:
#  * get a list of nodes in the given nodegroup.
#  * set some or all in the set to rins.
#  * do something else to them all.
19 from optparse import OptionParser
21 import bootman # debug nodes
23 from monitor import util
24 from monitor import const
25 from monitor import reboot
26 from monitor import config
27 from monitor import database
28 from monitor import parser as parsermodule
29 from monitor.common import *
30 from monitor.model import *
31 from monitor.wrapper import plc
32 from monitor.wrapper import plccache
33 from monitor.wrapper.emailTxt import mailtxt
34 from monitor.database.info.model import *
36 from nodequery import verify,query_to_dict,node_select
38 api = plc.getAuthAPI()
41 class SiteInterface(HistorySiteRecord):
@classmethod
def get_or_make(cls, if_new_set=None, **kwargs):
    """Find or create a site history record and wrap it.

    Args:
        if_new_set: forwarded as the first argument of
            HistorySiteRecord.findby_or_create; an empty dict when omitted.
            (Presumably values applied only to a newly created record --
            confirm against findby_or_create.)
        **kwargs: lookup criteria for findby_or_create.  A 'hostname'
            entry is replaced by the 'loginbase' that
            plccache.plcdb_hn2lb maps it to.

    Returns:
        An instance of this class wrapping the found or created record.

    Raises:
        Whatever plccache.plcdb_hn2lb raises for an unknown hostname
        (KeyError if it is a plain mapping).
    """
    # Build a fresh dict per call; a shared `{}` default would be reused
    # across calls and could carry state from one site to the next.
    if if_new_set is None:
        if_new_set = {}

    if 'hostname' in kwargs:
        kwargs['loginbase'] = plccache.plcdb_hn2lb[kwargs.pop('hostname')]

    record = HistorySiteRecord.findby_or_create(if_new_set, **kwargs)
    return cls(record)
def __init__(self, sitehist):
    """Wrap an existing site history record.

    Args:
        sitehist: the record to operate on; the other methods read and
            update it through ``self.db``.
    """
    # NOTE(review): the HistorySiteRecord initializer is not invoked here;
    # confirm that wrapping without initializing the base class is intended.
    self.db = sitehist
def getRecentActions(self, **kwargs):
    """Return the recorded actions for a site or a host, newest first.

    Args:
        **kwargs: 'loginbase' selects the records of a site; otherwise
            'hostname' selects the records of one node.  'loginbase' takes
            precedence when both are supplied.

    Returns:
        The ActionRecord query filtered on the chosen key and ordered by
        date_created descending, or an empty list when neither key is
        supplied.
    """
    # TODO: make query only return records within a certain time range,
    #       i.e. greater than 0.5 days ago. or 5 days, etc.
    for key in ('loginbase', 'hostname'):
        if key in kwargs:
            newest_first = ActionRecord.date_created.desc()
            matching = ActionRecord.query.filter_by(**{key: kwargs[key]})
            return matching.order_by(newest_first)

    # Nothing to filter on: callers still get an iterable back.
    return []
def increasePenalty(self):
    """Raise the site's penalty level by one step and flag it as penalized.

    The level saturates at 2.  No ActionRecord is written by this method.
    """
    site = self.db
    # NOTE: the cap keeps the level a valid index for the three-entry
    #       penalty table used by applyPenalty.
    site.penalty_level = min(site.penalty_level + 1, 2)
    site.penalty_applied = True
def applyPenalty(self):
    """Make the site's PLC restrictions match its current penalty level.

    The penalty table is ordered by severity (0 'noop', 1 'nocreate',
    2 'suspendslices').  Every penalty above the current level is disabled,
    most severe first, then every penalty up to and including the current
    level is enabled, least severe first.  A line is printed per step.
    """
    loginbase = self.db.loginbase

    # Lambdas defer the plc attribute lookup until a penalty is actually
    # toggled, so building the table never touches PLC.
    penalty_map = [
        {'name': 'noop',
         'enable': lambda site: None,
         'disable': lambda site: None},
        {'name': 'nocreate',
         'enable': lambda site: plc.removeSiteSliceCreation(site),
         'disable': lambda site: plc.enableSiteSliceCreation(site)},
        {'name': 'suspendslices',
         'enable': lambda site: plc.suspendSiteSlices(site),
         'disable': lambda site: plc.enableSiteSlices(site)},
    ]

    # Clamp so a stored level outside the table cannot index past it
    # (-1 means "nothing enabled, everything disabled").
    top = len(penalty_map) - 1
    level = max(-1, min(self.db.penalty_level, top))

    for penalty in reversed(penalty_map[level + 1:]):
        print("\tdisabling %s on %s" % (penalty['name'], loginbase))
        penalty['disable'](loginbase)

    for penalty in penalty_map[:level + 1]:
        print("\tapplying %s on %s" % (penalty['name'], loginbase))
        penalty['enable'](loginbase)
def pausePenalty(self):
    """Record a 'pause_penalty' action for this site.

    Only the ActionRecord is created; the site's penalty level is not
    changed here.
    """
    # NOTE(review): action='penalty' follows the pattern of the other
    # penalty records in this file -- confirm it is the intended value.
    ActionRecord(loginbase=self.db.loginbase,
                 action='penalty',
                 action_type='pause_penalty')
def clearPenalty(self):
    """Reset the site to penalty level 0 and drop the penalized flag.

    No ActionRecord is written by this method.
    """
    site = self.db
    site.penalty_applied = False
    site.penalty_level = 0
def getTicketStatus(self):
    """Refresh the cached ticket fields on the site record.

    Does nothing when the record's message_id is 0.  Otherwise the status
    returned by mailer.getTicketStatus is copied into message_status,
    message_queue and message_created (the latter converted from a
    timestamp).
    """
    record = self.db
    if record.message_id == 0:
        return

    ticket = mailer.getTicketStatus(record.message_id)
    record.message_status = ticket['Status']
    record.message_queue = ticket['Queue']
    record.message_created = datetime.fromtimestamp(ticket['Created'])
def setTicketStatus(self, status):
    """Set the status of the site's ticket through the mailer.

    The requested status is always printed; the mailer is only called when
    the record's message_id is non-zero.

    Args:
        status: status value handed to mailer.setTicketStatus.
    """
    print('SETTING status %s' % status)
    ticket_id = self.db.message_id
    if ticket_id == 0:
        return
    mailer.setTicketStatus(ticket_id, status)
def getContacts(self):
    """Collect the e-mail contacts for the site's current penalty level.

    The list widens as the level rises: plc.getTechEmails at level >= 0,
    plus plc.getPIEmails at level >= 1, plus plc.getSliceUserEmails at
    level >= 2.

    Returns:
        A new list of contacts; empty when the level is below 0.
    """
    level = self.db.penalty_level
    loginbase = self.db.loginbase

    contacts = []
    if level >= 0:
        contacts += plc.getTechEmails(loginbase)
    if level >= 1:
        contacts += plc.getPIEmails(loginbase)
    if level >= 2:
        contacts += plc.getSliceUserEmails(loginbase)
    return contacts
def sendMessage(self, type, **kwargs):
    """Send the named notice for this site and, for RT mail, record it.

    Args:
        type: name of a template attribute on emailTxt.mailtxt.  The
            template is indexed as [0] and [1] (presumably subject and
            body -- confirm against mailtxt) and each part is %-formatted
            with the message arguments.
        **kwargs: extra template arguments, merged over 'loginbase' and
            'penalty_level' taken from the site record.  'hostname', if
            given, is also stored on the ActionRecord.  'viart' selects
            whether the message goes through RT.

    When no template with that name exists a warning is printed and
    nothing is sent.
    """
    # NOTE: evidently changing an RT message's subject opens the ticket.
    #       the logic in this policy depends on a ticket only being 'open'
    #       if a user has replied to it.
    #       So, to preserve these semantics, we check the status before
    #       sending, then after sending, reset the status to the
    #       previous status.
    #       There is a very tiny race here, where a user sends a reply
    #       within the time it takes to check, send, and reset.
    #       This sucks. It's almost certainly fragile.
    #
    # TODO: catch any errors here, and add an ActionRecord that contains
    #       those errors.

    args = {'loginbase': self.db.loginbase,
            'penalty_level': self.db.penalty_level}
    args.update(kwargs)
    hostname = args.get('hostname')

    if not hasattr(mailtxt, type):
        print("+-- WARNING! ------------------------------")
        print("| No such message name in emailTxt.mailtxt: %s" % type)
        print("+------------------------------------------")
        return

    message = getattr(mailtxt, type)

    # NOTE(review): defaulting to RT is assumed -- main() never passes
    # 'viart' yet relies on the ActionRecord written below.  Confirm.
    viart = kwargs.get('viart', True)

    if viart:
        self.getTicketStatus()  # get current message status

    m = Message(message[0] % args, message[1] % args, viart, self.db.message_id)

    contacts = self.getContacts()
    contacts = [config.cc_email]  # TODO: remove after testing...

    print("sending message: %s to site %s for host %s" % (type, self.db.loginbase, hostname))

    ret = m.send(contacts)
    if viart:
        self.db.message_id = ret
        # reset to previous status, since a new subject 'opens' RT tickets.
        self.setTicketStatus(self.db.message_status)

        # NOTE: only make a record of it if it's in RT.
        ActionRecord(loginbase=self.db.loginbase, hostname=hostname, action='notice',
                     action_type=type, message_id=self.db.message_id)
def closeTicket(self):
    """Close the site's RT ticket, log an 'end_notice', and forget the ticket.

    The ticket is closed and recorded under its current id before the
    record's message_id is reset to 0 and message_status to "new".
    """
    record = self.db
    ticket_id = record.message_id

    mailer.closeTicketViaRT(ticket_id, "Ticket Closed by Monitor")
    ActionRecord(loginbase=record.loginbase, action='notice',
                 action_type='end_notice', message_id=ticket_id)

    record.message_id = 0
    record.message_status = "new"
def runBootManager(self, hostname):
    """Try a BootManager restore of a node and record the attempt.

    Args:
        hostname: node to restore; passed to bootman.restore together with
            this SiteInterface.

    Returns:
        Whatever bootman.restore returned, or "" if it raised.

    A 'bootmanager_restore' ActionRecord is written whether or not the
    restore raised.
    """
    print("attempting BM reboot of %s" % hostname)

    ret = ""
    err = ""
    try:
        ret = bootman.restore(self, hostname)
    except Exception:
        # Keep going: the failure is captured for the action record.
        err = traceback.format_exc()
        print(err)

    # NOTE(review): `action='reboot'` and `error_string` are assumed field
    # names/values for ActionRecord -- confirm against the model.
    ActionRecord(loginbase=self.db.loginbase,
                 hostname=hostname,
                 action='reboot',
                 action_type='bootmanager_restore',
                 error_string=err)
    return ret
def attemptReboot(self, hostname):
    """Try a PCU reboot of a node and record the attempt.

    Args:
        hostname: node to reboot via reboot.reboot_str.

    Returns:
        "" when reboot_str reported 0 or "0", otherwise the value it
        returned, or the formatted traceback if it raised.

    A 'first_try_reboot' ActionRecord is written in every case.
    """
    print("attempting PCU reboot of %s" % hostname)

    try:
        ret = reboot.reboot_str(hostname)
    except Exception:
        # Keep going: the failure text becomes the recorded error.
        ret = traceback.format_exc()
        print(ret)

    # reboot_str appears to signal success with 0 / "0"; store that as
    # "no error".
    if ret == 0 or ret == "0":
        ret = ""

    # NOTE(review): `action='reboot'` and `error_string` are assumed field
    # names/values for ActionRecord -- confirm against the model.
    ActionRecord(loginbase=self.db.loginbase,
                 hostname=hostname,
                 action='reboot',
                 action_type='first_try_reboot',
                 error_string=ret)
    return ret
231 plc.nodeBootState(host, 'rins')
232 node_end_record(host)
def _apply_node_policy(host):
    """Run the per-node rules for one host; return False if it was skipped."""
    try:
        lb = plccache.plcdb_hn2lb[host]
    except KeyError:
        print("unknown host in plcdb_hn2lb %s" % host)
        return False

    nodeblack = BlacklistRecord.get_by(hostname=host)
    if nodeblack and not nodeblack.expired():
        print("skipping %s due to blacklist. will expire %s" % (host, nodeblack.willExpire()))
        return False

    sitehist = SiteInterface.get_or_make(loginbase=lb)
    recent_actions = sitehist.getRecentActions(hostname=host)
    nodehist = HistoryNodeRecord.findby_or_create(hostname=host)

    print("%s %s" % (nodehist.hostname, nodehist.status))
    unreachable = nodehist.status in ('offline', 'down')

    if (nodehist.status == 'good' and
            changed_lessthan(nodehist.last_changed, 1.0) and
            not found_within(recent_actions, 'online_notice', 0.5)):
        # NOTE: there is a narrow window in which this command must be
        #       evaluated, otherwise the notice will not go out. this is
        #       not ideal.
        sitehist.sendMessage('online_notice', hostname=host)
        print("send message for host %s online" % host)

    if (unreachable and
            changed_greaterthan(nodehist.last_changed, 1.0) and
            not found_between(recent_actions, 'first_try_reboot', 3.5, 1)):
        sitehist.attemptReboot(host)
        print("send message for host %s first_try_reboot" % host)

    # NOTE: non-intuitive is that found_between(first_try_reboot, 3.5, 1)
    #       will be false for a day after the above condition is satisfied
    if (unreachable and
            changed_greaterthan(nodehist.last_changed, 1.5) and
            found_between(recent_actions, 'first_try_reboot', 3.5, 1) and
            not found_within(recent_actions, 'pcufailed_notice', 3.5)):
        # send pcu failure message
        sitehist.sendMessage('pcufailed_notice', hostname=host)
        print("send message for host %s PCU Failure" % host)

    if (nodehist.status == 'monitordebug' and
            changed_greaterthan(nodehist.last_changed, 1) and
            not found_between(recent_actions, 'bootmanager_restore', 0.5, 0)):
        # delay 0.5 days before retrying...
        print("send message for host %s bootmanager_restore" % host)
        sitehist.runBootManager(host)

    if (nodehist.status == 'down' and
            changed_greaterthan(nodehist.last_changed, 2) and
            not found_within(recent_actions, 'down_notice', 3.5)):
        # send down node notice
        sitehist.sendMessage('down_notice', hostname=host)
        print("send message for host %s down" % host)

    return True


def _apply_site_policy(site):
    """Run the per-site penalty rules for one loginbase."""
    sitehist = SiteInterface.get_or_make(loginbase=site)
    # TODO: make query only return records within a certain time range,
    #       i.e. greater than 0.5 days ago. or 5 days, etc.
    recent_actions = sitehist.getRecentActions(loginbase=site)

    print("%s %s" % (sitehist.db.loginbase, sitehist.db.status))

    if sitehist.db.status == 'down':
        if (not found_within(recent_actions, 'pause_penalty', 30) and
                not found_within(recent_actions, 'increase_penalty', 7) and
                changed_greaterthan(sitehist.db.last_changed, 7)):
            # The level is recorded and announced; applyPenalty() is not
            # called here (the call is left disabled, as it was).
            sitehist.increasePenalty()
            sitehist.sendMessage('increase_penalty')
            print("send message for site %s penalty increase" % site)

    if sitehist.db.status == 'good':
        # NOTE: because 'all clear' should have an indefinite status, we
        #       have a boolean value rather than a 'recent action'
        if sitehist.db.penalty_applied:
            # send message that penalties are cleared.
            sitehist.clearPenalty()
            sitehist.sendMessage('clear_penalty')
            sitehist.closeTicket()
            print("send message for site %s penalty cleared" % site)

    # An 'open' ticket on a penalized site pauses further penalties.
    if (sitehist.db.message_id != 0 and
            sitehist.db.message_status == 'open' and
            sitehist.db.penalty_level > 0):
        print("Pausing penalties for %s" % site)
        sitehist.pausePenalty()


def main(hostnames, sitenames):
    """Apply the monitoring policy to the given nodes, then to the sites.

    Args:
        hostnames: iterable of node hostnames.  Hosts missing from
            plccache.plcdb_hn2lb and hosts with an unexpired blacklist
            entry are reported and skipped.
        sitenames: iterable of site loginbases.

    Numeric thresholds passed to changed_*/found_* appear to be in days
    (per the comments in this file) -- confirm against those helpers.
    """
    # Counters are tallied only; nothing in the visible code reads them.
    node_count = 0
    site_count = 0

    for host in hostnames:
        if _apply_node_policy(host):
            node_count = node_count + 1

    for site in sitenames:
        _apply_site_policy(site)
        site_count = site_count + 1
if __name__ == "__main__":
    # Command-line entry point: build the option parser, pick the hosts and
    # sites to examine, and run the policy over them.
    parser = parsermodule.getParser(['nodesets'])
    # NOTE(review): only the `timewait` default is visible in the source
    # this was rebuilt from; restore any further defaults the original
    # call listed.
    parser.set_defaults(timewait=0)

    parser.add_option("", "--stopselect", dest="stopselect", metavar="",
                      help="The select string that must evaluate to true for the node to be considered 'done'")
    parser.add_option("", "--findbad", dest="findbad", action="store_true",
                      help="Re-run findbad on the nodes we're going to check before acting.")
    parser.add_option("", "--force", dest="force", action="store_true",
                      help="Force action regardless of previous actions/logs.")
    parser.add_option("", "--rins", dest="rins", action="store_true",
                      help="Set the boot_state to 'rins' for all nodes.")
    parser.add_option("", "--reboot", dest="reboot", action="store_true",
                      help="Actively try to reboot the nodes, keeping a log of actions.")
    parser.add_option("", "--verbose", dest="verbose", action="store_true",
                      help="Extra debug output messages.")
    parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
                      help="Do not perform the orginary setup phase.")
    parser.add_option("", "--skip", dest="skip",
                      help="Number of machines to skip on the input queue.")
    parser.add_option("", "--timewait", dest="timewait",
                      help="Minutes to wait between iterations of 10 nodes.")

    parser = parsermodule.getParser(['defaults'], parser)
    # NOTE(review): this rebinds the module-level name `config`, which
    # SiteInterface.sendMessage reads as `config.cc_email` -- confirm the
    # parsed result still exposes that attribute.
    config = parsermodule.parse_args(parser)

    # Default scope: every node and every site known to the local database.
    hostnames = [n.hostname for n in HistoryNodeRecord.query.all()]
    sitenames = [s.loginbase for s in HistorySiteRecord.query.all()]

    if config.site:
        # TODO: replace with calls to local db. the api fails so often that
        #       these calls should be regarded as unreliable.
        site = api.GetSites(config.site)
        l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname'])
        # A set keeps the membership test O(1) per known host.
        site_hostnames = set(n['hostname'] for n in l_nodes)

        hostnames = [h for h in hostnames if h in site_hostnames]
        sitenames = [config.site]

    if config.node:
        hostnames = [config.node]
        sitenames = [plccache.plcdb_hn2lb[config.node]]

    try:
        main(hostnames, sitenames)
    except KeyboardInterrupt:
        print("Killed by interrupt")
    except Exception:
        # print_exc() writes the traceback itself and returns None, so its
        # result must not be printed.
        traceback.print_exc()
        # Report the failure to the caller through the exit status.
        raise SystemExit(1)