3 # This script is used to manipulate the operational state of nodes in
4 # different node groups. These are basically set operations on nodes via the
7 # Take the ng name as an argument....
9 # * get a list of nodes in the given nodegroup.
10 # * set some or all in the set to rins.
12 # * do something else to them all.
19 from optparse import OptionParser
21 import bootman # debug nodes
23 from monitor import util
24 from monitor import const
25 from monitor import reboot
26 from monitor import config
27 from monitor import database
28 from monitor import parser as parsermodule
29 from monitor.common import *
30 from monitor.model import *
31 from monitor.wrapper import plc
32 from monitor.wrapper import plccache
33 from monitor.wrapper.emailTxt import mailtxt
34 from monitor.database.info.model import *
36 from nodequery import verify,query_to_dict,node_select
38 api = plc.getAuthAPI()
41 class SiteInterface(HistorySiteRecord):
def get_or_make(cls, if_new_set=None, **kwargs):
    """Find or create the site history record and wrap it in a SiteInterface.

    NOTE(review): callers in this file invoke SiteInterface.get_or_make(...)
    on the class, so this is presumably decorated with @classmethod on the
    line above (not visible in this excerpt) -- confirm.

    if_new_set -- passed as the first argument to
                  HistorySiteRecord.findby_or_create; a fresh empty dict is
                  used when the caller supplies nothing.
    kwargs     -- lookup keys forwarded to findby_or_create.  A 'hostname'
                  key is replaced by the matching 'loginbase' taken from
                  plccache.plcdb_hn2lb (raises KeyError for unknown hosts).

    Returns a SiteInterface built from the found/created record.
    """
    # A literal {} default would be one dict shared by every call; build a
    # new one per call so state can never leak between invocations.
    if if_new_set is None:
        if_new_set = {}
    if 'hostname' in kwargs:
        # Sites are keyed by loginbase, so translate and drop the hostname.
        kwargs['loginbase'] = plccache.plcdb_hn2lb[kwargs.pop('hostname')]
    res = HistorySiteRecord.findby_or_create(if_new_set, **kwargs)
    return SiteInterface(res)
50 def __init__(self, sitehist):
# Build an ActionRecord query for one site or one host, ordered newest first
# (date_created descending).  Pass loginbase=... to filter by site or
# hostname=... to filter by host; 'loginbase' takes precedence when both
# are supplied.
# NOTE(review): the lines that initialise and return recent_actions are not
# visible in this excerpt -- confirm the query is what gets returned.
53 def getRecentActions(self, **kwargs):
54 # TODO: make query only return records within a certain time range,
55 # i.e. greater than 0.5 days ago. or 5 days, etc.
57 #print "kwargs: ", kwargs
60 if 'loginbase' in kwargs:
61 recent_actions = ActionRecord.query.filter_by(loginbase=kwargs['loginbase']).order_by(ActionRecord.date_created.desc())
62 elif 'hostname' in kwargs:
63 recent_actions = ActionRecord.query.filter_by(hostname=kwargs['hostname']).order_by(ActionRecord.date_created.desc())
# Raise the site's penalty level by one step, never letting it exceed 2,
# and flag the penalty as applied on self.db.
# NOTE(review): indentation is lost in this excerpt, so it cannot be told
# whether penalty_applied is set only when the cap is reached or on every
# call -- confirm against the real file.
66 def increasePenalty(self):
67 #act = ActionRecord(loginbase=self.db.loginbase, action='penalty', action_type='increase_penalty',)
68 self.db.penalty_level += 1
69 # NOTE: this is to prevent overflow or index errors in applyPenalty.
70 # there's probably a better approach to this.
71 if self.db.penalty_level >= 2:
72 self.db.penalty_level = 2
73 self.db.penalty_applied = True
# Bring the site's PLC restrictions in line with self.db.penalty_level.
# penalty_map is an ordered list of penalties (index 0 'noop', 1 'nocreate',
# 2 'suspendslices'); each entry carries 'enable' and 'disable' callables
# that take the site's loginbase.
# NOTE(review): the statement that creates penalty_map is not visible in
# this excerpt -- presumably an empty list; confirm.
75 def applyPenalty(self):
77 penalty_map.append( { 'name': 'noop', 'enable' : lambda site: None,
78 'disable' : lambda site: None } )
79 penalty_map.append( { 'name': 'nocreate', 'enable' : lambda site: plc.removeSiteSliceCreation(site),
80 'disable' : lambda site: plc.enableSiteSliceCreation(site) } )
81 penalty_map.append( { 'name': 'suspendslices', 'enable' : lambda site: plc.suspendSiteSlices(site),
82 'disable' : lambda site: plc.enableSiteSlices(site) } )
# Walk from the highest index down to penalty_level+1, lifting every
# penalty that is above the current level.
84 for i in range(len(penalty_map)-1,self.db.penalty_level,-1):
85 print "\tdisabling %s on %s" % (penalty_map[i]['name'], self.db.loginbase)
86 penalty_map[i]['disable'](self.db.loginbase)
# Then enforce every penalty from index 0 up to and including penalty_level.
88 for i in range(0,self.db.penalty_level+1):
89 print "\tapplying %s on %s" % (penalty_map[i]['name'], self.db.loginbase)
90 penalty_map[i]['enable'](self.db.loginbase)
# Create an ActionRecord with action_type 'pause_penalty' for this site's
# loginbase.
# NOTE(review): one argument line of the ActionRecord call is not visible
# in this excerpt.
94 def pausePenalty(self):
95 act = ActionRecord(loginbase=self.db.loginbase,
97 action_type='pause_penalty',)
def clearPenalty(self):
    """Reset the site's penalty state on self.db.

    Sets penalty_level back to 0 and penalty_applied to False.  No
    ActionRecord is written here (that call is disabled in this file).
    """
    for field, cleared in (('penalty_level', 0), ('penalty_applied', False)):
        setattr(self.db, field, cleared)
def getTicketStatus(self):
    """Refresh the ticket fields cached on self.db from the mailer.

    When self.db.message_id is 0 nothing is queried and nothing changes.
    Otherwise message_status, message_queue and message_created are
    overwritten from the 'Status', 'Queue' and 'Created' entries of the
    mailer's answer ('Created' is converted with datetime.fromtimestamp).
    """
    if self.db.message_id == 0:
        return
    rtstatus = mailer.getTicketStatus(self.db.message_id)
    record = self.db
    record.message_status = rtstatus['Status']
    record.message_queue = rtstatus['Queue']
    record.message_created = datetime.fromtimestamp(rtstatus['Created'])
def setTicketStatus(self, status):
    """Print the requested status, then push it to the ticket if one exists.

    The mailer is only contacted when self.db.message_id is not 0.
    """
    print('SETTING status %s' % status)
    ticket_id = self.db.message_id
    if ticket_id == 0:
        return
    mailer.setTicketStatus(ticket_id, status)
# Collect the e-mail addresses to notify for this site; the audience widens
# with the penalty level: tech contacts at level >= 0, PIs added at
# level >= 1, slice users added at level >= 2.
# NOTE(review): the lines that initialise and return `contacts` are not
# visible in this excerpt -- presumably it starts as an empty list and is
# returned; confirm.
116 def getContacts(self):
118 if self.db.penalty_level >= 0:
119 contacts += plc.getTechEmails(self.db.loginbase)
121 if self.db.penalty_level >= 1:
122 contacts += plc.getPIEmails(self.db.loginbase)
124 if self.db.penalty_level >= 2:
125 contacts += plc.getSliceUserEmails(self.db.loginbase)
# Send the notice named `type` (an attribute of emailTxt.mailtxt) for this
# site, optionally about one host.
#   type   -- attribute name looked up on mailtxt; the looked-up value is
#             indexed as message[0] and message[1], each %-formatted with args.
#   kwargs -- 'viart' is read when present; a 'hostname' entry is expected to
#             end up in args (the merging line is not visible here).
# When no such attribute exists on mailtxt a warning banner is printed.
# NOTE(review): several lines of this method are not visible in this
# excerpt (defaults for hostname/viart, the branch around the RT
# bookkeeping, the else: before the warning) -- confirm against the file.
129 def sendMessage(self, type, **kwargs):
131 # NOTE: evidently changing an RT message's subject opens the ticket.
132 # the logic in this policy depends up a ticket only being 'open'
133 # if a user has replied to it.
134 # So, to preserve these semantics, we check the status before
135 # sending, then after sending, reset the status to the
137 # There is a very tiny race here, where a user sends a reply
138 # within the time it takes to check, send, and reset.
139 # This sucks. It's almost certainly fragile.
142 # TODO: catch any errors here, and add an ActionRecord that contains
# Values made available to the message templates' % substitution.
145 args = {'loginbase' : self.db.loginbase, 'penalty_level' : self.db.penalty_level}
149 if 'hostname' in args:
150 hostname = args['hostname']
152 if hasattr(mailtxt, type):
154 message = getattr(mailtxt, type)
156 if 'viart' in kwargs:
157 viart = kwargs['viart']
160 self.getTicketStatus() # get current message status
162 m = Message(message[0] % args, message[1] % args, viart, self.db.message_id)
# The real contact list is computed and then immediately replaced by the
# cc address (testing override, see the TODO on the next line).
164 contacts = self.getContacts()
165 contacts = [config.cc_email] # TODO: remove after testing...
167 print "sending message: %s to site %s for host %s" % (type, self.db.loginbase, hostname)
169 ret = m.send(contacts)
171 self.db.message_id = ret
172 # reset to previous status, since a new subject 'opens' RT tickets.
173 self.setTicketStatus(self.db.message_status)
175 # NOTE: only make a record of it if it's in RT.
176 act = ActionRecord(loginbase=self.db.loginbase, hostname=hostname, action='notice',
177 action_type=type, message_id=self.db.message_id)
180 print "+-- WARNING! ------------------------------"
181 print "| No such message name in emailTxt.mailtxt: %s" % type
182 print "+------------------------------------------"
def closeTicket(self):
    """Close the site's RT ticket and reset the ticket state on self.db.

    Closes the ticket through the mailer, records an 'end_notice'
    ActionRecord carrying the old message_id, then sets message_id to 0
    and message_status to "new".
    """
    # TODO: close the rt ticket before overwriting the message_id
    record = self.db
    mailer.closeTicketViaRT(record.message_id, "Ticket Closed by Monitor")
    # The constructor call itself is what matters; the result is not used.
    ActionRecord(
        loginbase=record.loginbase,
        action='notice',
        action_type='end_notice',
        message_id=record.message_id,
    )
    record.message_id = 0
    record.message_status = "new"
# Try to bring `hostname` back via bootman.restore and log the attempt as
# an ActionRecord with action_type 'bootmanager_restore'.
# NOTE(review): the try/except scaffolding and most ActionRecord arguments
# are not visible in this excerpt; the traceback text captured in `err` is
# presumably stored on the record -- confirm.
194 def runBootManager(self, hostname):
195 print "attempting BM reboot of %s" % hostname
198 ret = bootman.restore(self, hostname)
201 err = traceback.format_exc()
204 act = ActionRecord(loginbase=self.db.loginbase,
207 action_type='bootmanager_restore',
# Try a PCU reboot of `hostname` through reboot.reboot_str and log the
# attempt as an ActionRecord with action_type 'first_try_reboot'.
# The result is compared against both 0 and "0".
# NOTE(review): the body of the ret check and most ActionRecord arguments
# are not visible in this excerpt -- confirm how success/failure is recorded.
211 def attemptReboot(self, hostname):
212 print "attempting PCU reboot of %s" % hostname
213 ret = reboot.reboot_str(hostname)
214 if ret == 0 or ret == "0":
216 act = ActionRecord(loginbase=self.db.loginbase,
219 action_type='first_try_reboot',
224 plc.nodeBootState(host, 'rins')
225 node_end_record(host)
# Apply the monitoring policy to every host in `hostnames`, then to every
# site in `sitenames`.
#   hostnames -- iterable of node hostnames; each is mapped to its site via
#                plccache.plcdb_hn2lb and skipped while a non-expired
#                BlacklistRecord exists for it.
#   sitenames -- iterable of site loginbases.
# Decisions are driven by the node/site status, how long ago it changed,
# and which actions were recently recorded.
# NOTE(review): the numeric thresholds passed to changed_*/found_* are
# presumably days (a comment below says "delay 0.5 days") -- confirm.
# NOTE(review): several lines (counter initialisation, try/except around
# the loginbase lookup, continue statements) are not visible in this excerpt.
230 def main(hostnames, sitenames):
235 #print "hosts: %s" % hostnames
236 for host in hostnames:
238 lb = plccache.plcdb_hn2lb[host]
240 print "unknown host in plcdb_hn2lb %s" % host
243 nodeblack = BlacklistRecord.get_by(hostname=host)
245 if nodeblack and not nodeblack.expired():
246 print "skipping %s due to blacklist. will expire %s" % (host, nodeblack.willExpire() )
249 sitehist = SiteInterface.get_or_make(loginbase=lb)
251 recent_actions = sitehist.getRecentActions(hostname=host)
253 nodehist = HistoryNodeRecord.findby_or_create(hostname=host)
255 print "%s %s" % ( nodehist.hostname, nodehist.status)
# Node recently came back 'good' and no online notice was sent lately.
256 if nodehist.status == 'good' and \
257 changed_lessthan(nodehist.last_changed, 1.0) and \
258 not found_within(recent_actions, 'online_notice', 0.5):
259 # NOTE: there is a narrow window in which this command must be
260 # evaluated, otherwise the notice will not go out. this is not ideal.
261 sitehist.sendMessage('online_notice', hostname=host)
262 print "send message for host %s online" % host
# Node has been offline/down for a while and no reboot attempt is on record
# in the checked window: try a PCU reboot.
266 if ( nodehist.status == 'offline' or nodehist.status == 'down' ) and \
267 changed_greaterthan(nodehist.last_changed,1.0) and \
268 not found_between(recent_actions, 'first_try_reboot', 3.5, 1):
270 sitehist.attemptReboot(host)
271 print "send message for host %s first_try_reboot" % host
274 # NOTE: non-intuitive is that found_between(first_try_reboot, 3.5, 1)
275 # will be false for a day after the above condition is satisfied
276 if ( nodehist.status == 'offline' or nodehist.status == 'down' ) and \
277 changed_greaterthan(nodehist.last_changed,1.5) and \
278 found_between(recent_actions, 'first_try_reboot', 3.5, 1) and \
279 not found_within(recent_actions, 'pcufailed_notice', 3.5):
280 # found_within(recent_actions, 'first_try_reboot', 3.5) and \
282 # send pcu failure message
283 #act = ActionRecord(**kwargs)
284 sitehist.sendMessage('pcufailed_notice', hostname=host)
285 print "send message for host %s PCU Failure" % host
288 if nodehist.status == 'monitordebug' and \
289 changed_greaterthan(nodehist.last_changed, 1) and \
290 not found_between(recent_actions, 'bootmanager_restore', 0.5, 0):
291 # send down node notice
292 # delay 0.5 days before retrying...
294 print "send message for host %s bootmanager_restore" % host
295 sitehist.runBootManager(host)
296 # sitehist.sendMessage('retry_bootman', hostname=host)
298 if nodehist.status == 'down' and \
299 changed_greaterthan(nodehist.last_changed, 2) and \
300 not found_within(recent_actions, 'down_notice', 3.5):
301 # send down node notice
303 sitehist.sendMessage('down_notice', hostname=host)
304 print "send message for host %s offline" % host
307 node_count = node_count + 1
# Per-site pass: escalate, clear, or pause penalties.
309 for site in sitenames:
310 sitehist = SiteInterface.get_or_make(loginbase=site)
311 # TODO: make query only return records within a certain time range,
312 # i.e. greater than 0.5 days ago. or 5 days, etc.
313 recent_actions = sitehist.getRecentActions(loginbase=site)
315 #sitehist.sendMessage('test_notice', host)
317 print "%s %s" % ( sitehist.db.loginbase , sitehist.db.status)
318 if sitehist.db.status == 'down':
319 if not found_within(recent_actions, 'pause_penalty', 30) and \
320 not found_within(recent_actions, 'increase_penalty', 7) and \
321 changed_greaterthan(sitehist.db.last_changed, 7):
324 sitehist.increasePenalty()
325 #sitehist.applyPenalty()
326 sitehist.sendMessage('increase_penalty')
328 print "send message for site %s penalty increase" % site
330 if sitehist.db.status == 'good':
332 # NOTE: because 'all clear' should have an indefinite status, we
333 # have a boolean value rather than a 'recent action'
334 if sitehist.db.penalty_applied:
335 # send message that penalties are cleared.
337 sitehist.clearPenalty()
338 #sitehist.applyPenalty()
339 sitehist.sendMessage('clear_penalty')
340 sitehist.closeTicket()
342 print "send message for site %s penalty cleared" % site
344 # find all ticket ids for site ( could be on the site record? )
345 # determine if there are penalties within the last 30 days?
346 # if so, add a 'pause_penalty' action.
347 if sitehist.db.message_id != 0 and sitehist.db.message_status == 'open' and sitehist.db.penalty_level > 0:
349 print "Pausing penalties for %s" % site
350 sitehist.pausePenalty()
352 site_count = site_count + 1
# Script entry point: build the option parser, load every known hostname
# and site loginbase from the history tables, optionally narrow them to one
# site or one node, then run main().
# NOTE(review): several lines are not visible in this excerpt (the rest of
# set_defaults, the conditions guarding the site/node narrowing, and the
# try:/except: lines around main) -- confirm against the real file.
359 if __name__ == "__main__":
360 parser = parsermodule.getParser(['nodesets'])
361 parser.set_defaults( timewait=0,
372 parser.add_option("", "--stopselect", dest="stopselect", metavar="",
373 help="The select string that must evaluate to true for the node to be considered 'done'")
374 parser.add_option("", "--findbad", dest="findbad", action="store_true",
375 help="Re-run findbad on the nodes we're going to check before acting.")
376 parser.add_option("", "--force", dest="force", action="store_true",
377 help="Force action regardless of previous actions/logs.")
378 parser.add_option("", "--rins", dest="rins", action="store_true",
379 help="Set the boot_state to 'rins' for all nodes.")
380 parser.add_option("", "--reboot", dest="reboot", action="store_true",
381 help="Actively try to reboot the nodes, keeping a log of actions.")
383 parser.add_option("", "--verbose", dest="verbose", action="store_true",
384 help="Extra debug output messages.")
385 parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
386 help="Do not perform the orginary setup phase.")
387 parser.add_option("", "--skip", dest="skip",
388 help="Number of machines to skip on the input queue.")
389 parser.add_option("", "--timewait", dest="timewait",
390 help="Minutes to wait between iterations of 10 nodes.")
# Layer the 'defaults' option group on top of the parser, then parse.
# Note that this rebinds the module-level name `config` to the parsed options.
392 parser = parsermodule.getParser(['defaults'], parser)
393 config = parsermodule.parse_args(parser)
395 # # COLLECT nodegroups, nodes and node lists
396 # if config.nodegroup:
397 # ng = api.GetNodeGroups({'name' : config.nodegroup})
398 # nodelist = api.GetNodes(ng[0]['node_ids'])
399 # hostnames = [ n['hostname'] for n in nodelist ]
# Default working set: every node and every site known to the local history tables.
401 fbquery = HistoryNodeRecord.query.all()
402 hostnames = [ n.hostname for n in fbquery ]
404 fbquery = HistorySiteRecord.query.all()
405 sitenames = [ s.loginbase for s in fbquery ]
408 # TODO: replace with calls to local db. the api fails so often that
409 # these calls should be regarded as unreliable.
# Narrow to the nodes of config.site (presumably only when a site was given).
410 site = api.GetSites(config.site)
411 l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname'])
412 filter_hostnames = [ n['hostname'] for n in l_nodes ]
414 hostnames = filter(lambda x: x in filter_hostnames, hostnames)
415 sitenames = [config.site]
# Narrow to the single node config.node and its site (presumably only when
# a node was given).
418 hostnames = [ config.node ]
419 sitenames = [ plccache.plcdb_hn2lb[config.node] ]
422 main(hostnames, sitenames)
423 except KeyboardInterrupt:
424 print "Killed by interrupt"
# traceback.print_exc() writes the traceback itself and returns None, so
# this statement additionally prints "None".
428 print traceback.print_exc();
429 print "Continuing..."