From ee740a3ff286a9720cd1656cd60a3c85f0f14b29 Mon Sep 17 00:00:00 2001 From: Stephen Soltesz Date: Tue, 26 Aug 2008 02:02:06 +0000 Subject: [PATCH] update of all changes in the last week that fine-tuned the behavior of Monitor with the new clean_policy and unified_model. --- bootman.py | 6 +- clean_policy.py | 230 +++++++++++++++++++++++++++++++---------------- emailTxt.py | 32 ++++--- findbadpcu.py | 11 ++- grouprins.py | 70 ++++++++++----- mailer.py | 2 +- mailmonitor.py | 10 +-- nodecommon.py | 7 +- nodegroups.py | 3 +- nodeinfo.py | 3 +- nodequery.py | 27 ++++-- reboot.py | 59 ++++++++---- ssh/pexpect.py | 4 + unified_model.py | 56 ++++++++---- 14 files changed, 356 insertions(+), 164 deletions(-) diff --git a/bootman.py b/bootman.py index a278afe..87d8b71 100755 --- a/bootman.py +++ b/bootman.py @@ -34,9 +34,12 @@ class Sopen(subprocess.Popen): #from Rpyc import SocketConnection, Async from Rpyc import SocketConnection, Async from Rpyc.Utils import * +fb = None def get_fbnode(node): - fb = database.dbLoad("findbad") + global fb + if fb is None: + fb = database.dbLoad("findbad") fbnode = fb['nodes'][node]['values'] return fbnode @@ -359,7 +362,6 @@ def reboot(hostname, config=None, forced_action=None): except: print traceback.print_exc() return False - if forced_action == "reboot": conn.restart_node('rins') diff --git a/clean_policy.py b/clean_policy.py index d2bde41..a14016e 100644 --- a/clean_policy.py +++ b/clean_policy.py @@ -31,6 +31,9 @@ def get_ticket_id(record): return None class MonitorMergeDiagnoseSendEscellate: + act_all = None + fb = None + def __init__(self, hostname, act): self.hostname = hostname self.act = act @@ -41,7 +44,11 @@ class MonitorMergeDiagnoseSendEscellate: return def getFBRecord(self): - fb = database.dbLoad("findbad") + if MonitorMergeDiagnoseSendEscellate.fb == None: + MonitorMergeDiagnoseSendEscellate.fb = database.dbLoad("findbad") + + fb = MonitorMergeDiagnoseSendEscellate.fb + if self.hostname in fb['nodes']: fbnode = fb['nodes'][self.hostname]['values'] else: @@ -50,12 +57,15 @@ class MonitorMergeDiagnoseSendEscellate: def getActionRecord(self): # update ticket status - act_all = database.dbLoad("act_all") + if MonitorMergeDiagnoseSendEscellate.act_all == None: + MonitorMergeDiagnoseSendEscellate.act_all = database.dbLoad("act_all") + + act_all = MonitorMergeDiagnoseSendEscellate.act_all + if self.hostname in act_all and len(act_all[self.hostname]) > 0: actnode = act_all[self.hostname][0] else: actnode = None - del act_all return actnode def getKernel(self, unamestr): @@ -73,13 +83,15 @@ class MonitorMergeDiagnoseSendEscellate: fbnode['info'] = None fbnode['log'] = None fbnode['time'] = time.time() + fbnode['email'] = TECH + fbnode['action'] = ['noop'] fbnode['date_created'] = time.time() - if actnode is None: + if actnode is None: # there is no entry in act_all actnode = {} actnode.update(fbnode) actnode['ticket_id'] = "" - actnode['prev_category'] = "NORECORD" + actnode['prev_category'] = "ERROR" else: actnode['prev_category']= actnode['category'] actnode['comonstats'] = fbnode['comonstats'] @@ -111,29 +123,40 @@ class MonitorMergeDiagnoseSendEscellate: diag = PersistFlags(record.hostname, 60*60*24, db='persist_diagnose_flags') # NOTE: change record stage based on RT status. - diag.setFlag('ResetStage') + #diag.setFlag('ResetStage') if record.stageIswaitforever(): ticket = record.data['rt'] if 'new' in ticket['Status']: - diag.setFlag('ResetStage') + print "Resetting Stage!!!!!" 
+ # diag.setFlag('ResetStage') + record.reset_stage() + #if diag.getFlag('ResetStage'): + # print "diagnose: resetting stage" + # diag.resetFlag('ResetStage') if 'resolved' in ticket['Status']: - diag.setFlag('EndRecord') + diag.setFlag('RTEndRecord') # NOTE: take category, and prepare action category = record.getCategory() if category == "error": diag.setFlag('SendNodedown') - record.data['message'] = emailTxt.mailtxt.newdown + record.data['message_series'] = emailTxt.mailtxt.newdown record.data['log'] = self.getDownLog(record) - elif category == "prod": + elif category == "prod" or category == "alpha": state = record.getState() if state == "boot": - diag.setFlag('SendThankyou') - record.data['message'] = emailTxt.mailtxt.newthankyou - record.data['log'] = self.getThankyouLog(record) - + if record.severity() != 0: + diag.setFlag('SendThankyou') + print "RESETTING STAGE: improvement" + record.data['stage'] = 'improvement' + record.data['message_series'] = emailTxt.mailtxt.newthankyou + record.data['log'] = self.getThankyouLog(record) + else: + # NOTE: do nothing, since we've already done the above. + print "DIAGNOSED: %s is boot. no further action necessary." % record.hostname + return None elif state == "debug": pass else: @@ -141,55 +164,79 @@ class MonitorMergeDiagnoseSendEscellate: else: print "unknown category: %s" % category - if diag.getFlag('ResetStage'): - print "resetting stage" - record.reset_stage() + # TODO: how to not send email?... record = self.checkStageAndTime(diag,record) - if record: - print "checkStageAndTime Returned Valid Record" - site = PersistFlags(self.loginbase, 1, db='site_persistflags') + #if record: + print "diagnose: checkStageAndTime Returned Valid Record" + site = PersistFlags(self.loginbase, 1, db='site_persistflags') - if site.status is not "good": - print "Setting site %s for 'squeeze'" % self.loginbase - diag.setFlag('Squeeze') - else: - print "Setting site %s for 'backoff'" % self.loginbase - diag.setFlag('BackOff') - - diag.save() - return diag + if site.status != "good": + print "diagnose: Setting site %s for 'squeeze'" % self.loginbase + diag.setFlag('Squeeze') else: - print "checkStageAndTime Returned NULL Record" - return None + print "diagnose: Setting site %s for 'backoff'" % self.loginbase + diag.setFlag('BackOff') + + diag.save() + return diag + #else: + # print "checkStageAndTime Returned NULL Record" + # return None def action(self, record, diag): - if record.improved() or diag.getFlag('EndRecord'): - print "end record for %s" % self.hostname - record.end_record() - diag.setFlag('CloseRT') - return None - - if self.getSendEmailFlag(record): - print "sending email" + + message = None + + #print record.data['stage'] + #print "improvement" in record.data['stage'] + #print self.getSendEmailFlag(record) + if self.getSendEmailFlag(record) or "monitor-end-record" in record.data['stage']: + print "action: getting message" message = record.getMessage(record.data['ticket_id']) - message.reset() - message.send(record.getContacts()) - if message.rt.ticket_id: - print "setting record ticket_id" - record.data['ticket_id'] = message.rt.ticket_id - if diag.getFlag('CloseRT'): - message.rt.closeTicket() + if message: + #message.reset() + print "action: sending email" + message.send(record.getContacts()) + #print "DEBUG NOT SENDING MESSAGE WHEN I SHOULD BE!!!!!" + #print "DEBUG NOT SENDING MESSAGE WHEN I SHOULD BE!!!!!" + #print "DEBUG NOT SENDING MESSAGE WHEN I SHOULD BE!!!!!" 
+ #print message + if message.rt.ticket_id: + print "action: setting record ticket_id" + record.data['ticket_id'] = message.rt.ticket_id + + if (record.data['takeaction'] and diag.getFlag('Squeeze') ) or diag.getFlag('BackOff'): + print "action: taking action" + record.takeAction() + diag.resetFlag('Squeeze') + diag.resetFlag('BackOff') + diag.save() + + if record.saveAction(): + print "action: saving act_all db" + self.add_and_save_act_all(record) + else: + print "action: NOT saving act_all db" + print "stage: %s %s" % ( record.data['stage'], record.data['save-act-all'] ) + + if record.improved() or diag.getFlag('RTEndRecord'): + print "action: end record for %s" % self.hostname + record.end_record() + diag.setFlag('CloseRT') + diag.resetFlag('RTEndRecord') + diag.save() + #return None + + if message: + if diag.getFlag('CloseRT'): + message.rt.closeTicket() + diag.resetFlag('CloseRT') + diag.save() + else: print "NOT sending email : %s %s" % (config.mail, record.data['rt']) - if record.data['takeaction'] and diag.getFlag('Squeeze'): - print "taking action" - record.takeAction() - - print "saving act_all db" - self.add_and_save_act_all(record) - return def getSendEmailFlag(self, record): @@ -200,13 +247,16 @@ class MonitorMergeDiagnoseSendEscellate: if 'rt' in record.data and \ 'Status' in record.data['rt'] and \ "open" in record.data['rt']['Status'] and \ - record.data['rt']['Created'] < 60*60*24*30: + record.data['rt']['Created'] > int(time.time() - 60*60*24*30): + # if created-time is greater than the thirty days ago from the current time return False return True def add_and_save_act_all(self, record): self.act_all = database.dbLoad("act_all") + if self.hostname not in self.act_all: + self.act_all[self.hostname] = [] self.act_all[self.hostname].insert(0,record.data) database.dbDump("act_all", self.act_all) @@ -218,7 +268,7 @@ class MonitorMergeDiagnoseSendEscellate: #for key in record.data.keys(): # print "%10s %s %s " % (key, "==", record.data[key]) - if record.data['ticket_id'] == "": + if record.data['ticket_id'] == "" and 'found_rt_ticket' in record.data: log = "DOWN: %20s : %-40s == %20s %s" % \ (self.loginbase, self.hostname, record.data['info'][1:], record.data['found_rt_ticket']) else: @@ -231,79 +281,99 @@ class MonitorMergeDiagnoseSendEscellate: record.data['args'] = {'nodename': self.hostname} record.data['info'] = (self.hostname, record.data['prev_category'], record.data['category']) - if record.data['ticket_id'] == "": - log = "IMPR: %20s : %-40s == %20s %20s %s %s" % \ + try: + if record.data['ticket_id'] == "" and 'found_rt_ticket' in record.data: + log = "IMPR: %20s : %-40s == %20s %20s %s %s" % \ (self.loginbase, self.hostname, record.data['stage'], - state, category, record.data['found_rt_ticket']) - else: - log = "IMPR: %20s : %-40s == %20s %20s %s %s" % \ + record.data['prev_category'], record.data['category'], record.data['found_rt_ticket']) + else: + log = "IMPR: %20s : %-40s == %20s %20s %s %s" % \ (self.loginbase, self.hostname, record.data['stage'], - state, category, record.data['ticket_id']) + record.data['prev_category'], record.data['category'], record.data['ticket_id']) + except: + log = "IMPR: %s improved to %s " % (self.hostname, record.data['category']) return log def checkStageAndTime(self, diag, record): current_time = time.time() delta = current_time - record.data['time'] + #print record.data if 'findbad' in record.data['stage']: # The node is bad, and there's no previous record of it. 
record.data['email'] = TECH record.data['action'] = ['noop'] record.data['takeaction'] = False - record.data['message'] = record.data['message'][0] + record.data['message'] = record.data['message_series'][0] record.data['stage'] = 'stage_actinoneweek' + record.data['save-act-all'] = True elif 'reboot_node' in record.data['stage']: record.data['email'] = TECH record.data['action'] = ['noop'] - record.data['message'] = record.data['message'][0] + record.data['message'] = record.data['message_series'][0] record.data['stage'] = 'stage_actinoneweek' record.data['takeaction'] = False + record.data['save-act-all'] = False elif 'improvement' in record.data['stage']: - print "backing off of %s" % self.hostname + print "checkStageAndTime: backing off of %s" % self.hostname record.data['action'] = ['close_rt'] record.data['takeaction'] = True - record.data['message'] = record.data['message'][0] + record.data['message'] = record.data['message_series'][0] record.data['stage'] = 'monitor-end-record' + record.data['save-act-all'] = True elif 'actinoneweek' in record.data['stage']: if delta >= 7 * SPERDAY: + print "checkStageAndTime: transition to next stage actintwoweeks" record.data['email'] = TECH | PI record.data['stage'] = 'stage_actintwoweeks' - record.data['message'] = record.data['message'][1] + record.data['message'] = record.data['message_series'][1] record.data['action'] = ['nocreate' ] record.data['time'] = current_time # reset clock for waitforever record.data['takeaction'] = True + record.data['save-act-all'] = True elif delta >= 3* SPERDAY and not 'second-mail-at-oneweek' in record.data: + print "checkStageAndTime: second message in one week" record.data['email'] = TECH - record.data['message'] = record.data['message'][0] + record.data['message'] = record.data['message_series'][0] record.data['action'] = ['sendmailagain-waitforoneweekaction' ] record.data['second-mail-at-oneweek'] = True record.data['takeaction'] = False + record.data['save-act-all'] = True else: record.data['message'] = None record.data['action'] = ['waitforoneweekaction' ] - print "ignoring this record for: %s" % self.hostname - return None # don't send if there's no action + record.data['takeaction'] = False + record.data['save-act-all'] = False + print "checkStageAndTime: ignoring this record for: %s" % self.hostname + #return None # don't send if there's no action elif 'actintwoweeks' in record.data['stage']: if delta >= 7 * SPERDAY: + print "checkStageAndTime: transition to next stage waitforever" record.data['email'] = TECH | PI | USER record.data['stage'] = 'stage_waitforever' - record.data['message'] = record.data['message'][2] + record.data['message'] = record.data['message_series'][2] record.data['action'] = ['suspendslices'] record.data['time'] = current_time # reset clock for waitforever record.data['takeaction'] = True + record.data['save-act-all'] = True elif delta >= 3* SPERDAY and not 'second-mail-at-twoweeks' in record.data: + print "checkStageAndTime: second message in one week for stage two" record.data['email'] = TECH | PI - record.data['message'] = record.data['message'][1] + record.data['message'] = record.data['message_series'][1] record.data['action'] = ['sendmailagain-waitfortwoweeksaction' ] record.data['second-mail-at-twoweeks'] = True record.data['takeaction'] = False + record.data['save-act-all'] = True else: record.data['message'] = None + record.data['takeaction'] = False record.data['action'] = ['waitfortwoweeksaction'] - return None # don't send if there's no action + 
record.data['save-act-all'] = False + print "checkStageAndTime: second message in one week for stage two" + #return None # don't send if there's no action elif 'ticket_waitforever' in record.data['stage']: record.data['email'] = TECH @@ -314,14 +384,18 @@ class MonitorMergeDiagnoseSendEscellate: record.data['action'] = ['ticket_waitforever'] record.data['message'] = None record.data['time'] = current_time + record.data['save-act-all'] = True else: if delta >= 7*SPERDAY: record.data['action'] = ['ticket_waitforever'] record.data['message'] = None record.data['time'] = current_time # reset clock + record.data['save-act-all'] = True else: record.data['action'] = ['ticket_waitforever'] record.data['message'] = None + record.data['takeaction'] = False + record.data['save-act-all'] = False return None elif 'waitforever' in record.data['stage']: @@ -331,12 +405,15 @@ class MonitorMergeDiagnoseSendEscellate: record.data['takeaction'] = True if delta >= 3*SPERDAY: record.data['action'] = ['email-againwaitforever'] - record.data['message'] = record.data['message'][2] + record.data['message'] = record.data['message_series'][2] record.data['time'] = current_time # reset clock + record.data['save-act-all'] = True else: record.data['action'] = ['waitforever'] record.data['message'] = None - return None # don't send if there's no action + record.data['takeaction'] = False + record.data['save-act-all'] = False + #return None # don't send if there's no action else: # There is no action to be taken, possibly b/c the stage has @@ -347,14 +424,15 @@ class MonitorMergeDiagnoseSendEscellate: # TODO: figure out which. for now assume 2. print "UNKNOWN stage for %s; nothing done" % self.hostname record.data['action'] = ['unknown'] - record.data['message'] = record.data['message'][0] + record.data['message'] = record.data['message_series'][0] record.data['email'] = TECH record.data['action'] = ['noop'] - record.data['message'] = record.data['message'][0] + record.data['message'] = record.data['message_series'][0] record.data['stage'] = 'stage_actinoneweek' record.data['time'] = current_time # reset clock record.data['takeaction'] = False + record.data['save-act-all'] = True print "%s" % record.data['log'], print "%15s" % record.data['action'] diff --git a/emailTxt.py b/emailTxt.py index c2e147f..cfbf112 100644 --- a/emailTxt.py +++ b/emailTxt.py @@ -30,7 +30,7 @@ If the machine has booted successfully, you may check it more quickly by logging sudo /usr/sbin/vps ax -If you have a BootCD older than 3.0, you will need to create burn a new BootImage to CD or USB. You can find instructions for this at the Technical Contact's Guide: +If you have a BootCD older than 3.0, you will need to create a new BootImage on CD or USB. You can find instructions for this at the Technical Contact's Guide: https://www.planet-lab.org/doc/guides/bootcdsetup @@ -204,24 +204,32 @@ Monitor restarted NM on the following machines: """) pcudown_one =("""Could not use PCU to reboot %(hostname)s""", -"""As part of our machine monitoring and maintenance, we tried to use the PCU -registered below, but could not for the following reason at the link below: +"""As part of PlanetLab node monitoring and maintenance, we tried to use the PCU +registered below, but could not for the reasons at the link below: https://monitor.planet-lab.org/cgi-bin/printbadpcus.php?id=%(pcu_id)s -We need your help resolving this issue in two ways: +We need your help resolving this issue in a few ways: + + 1. First, we need your help rebooting %(hostname)s. 
Because the above PCU does
+ not appear to work, please manually reboot this machine. If it turns out that
+ there is a problem with the PCU configuration, we can help you
+ resolve that independently.
-* First, we need your help rebooting %(hostname)s. Because we cannot leverage
- the above PCU, please manually reboot this machine and we can help you
- resolve any configuration errors with the PCU independently.
+ 2. If there is nothing apparently wrong with the PCU, or the mapping between
+ the PCU and the host, then there is likely a problem with our bootstrap
+ software on your machine. To help us, please make a note of any text on
+ the console and report it to mailto:support@planet-lab.org . An example
+ might be that the console hangs waiting for a module to unload. The last
+ reported name or any error messages on the screen would be very helpful.
-* Second, if it is possible, please correcct the above PCU problem.
- By enabling us to take administrative actions automatically from
- PlanetLab Central without local intervention, you can trade a small
- amount of time now for a time savings in the future.
+ 3. Alternatively, if it is possible, please correct the above PCU problem, or
+ let us know what steps you are taking. By enabling us to take administrative
+ actions automatically from PlanetLab Central without your intervention, you
+ can trade a small amount of time now for a time savings in the future.
If the PCU is up and running, but behind a firewall, please make it accessible
-from address block 128.112.139.0/25. You can confirm that this is the address
+from address block 128.112.139.0/24. You can confirm that this is the address
space from which the PlanetLab Central servers run. If the above PCU is no
longer in service, please delete it by visiting:
diff --git a/findbadpcu.py b/findbadpcu.py
index e3d160d..ca65344 100755
--- a/findbadpcu.py
+++ b/findbadpcu.py
@@ -12,6 +12,7 @@ import sets
import signal
import traceback
+from nodequery import pcu_select
#old_handler = signal.getsignal(signal.SIGCHLD)
@@ -329,7 +330,7 @@ def checkAndRecordState(l_pcus, cohash):
global count
global_round = externalState['round']
- tp = threadpool.ThreadPool(20)
+ tp = threadpool.ThreadPool(10)
# CREATE all the work requests
for pcuname in l_pcus:
@@ -390,6 +391,11 @@ def main():
pcus = []
for node in l_nodes:
pcus += node['pcu_ids']
+ # clear out dups.
+ l_pcus = [pcu for pcu in sets.Set(pcus)]
+ elif config.pcuselect is not None:
+ n, pcus = pcu_select(config.pcuselect)
+ # clear out dups. 
l_pcus = [pcu for pcu in sets.Set(pcus)] elif config.nodelist == None and config.pcuid == None: @@ -421,6 +427,7 @@ if __name__ == '__main__': parser.set_defaults(nodelist=None, increment=False, pcuid=None, + pcuselect=None, site=None, dbname="findbadpcus", cachenodes=False, @@ -430,6 +437,8 @@ if __name__ == '__main__': help="Provide the input file for the node list") parser.add_option("", "--site", dest="site", metavar="FILE", help="Get all pcus associated with the given site's nodes") + parser.add_option("", "--pcuselect", dest="pcuselect", metavar="FILE", + help="Query string to apply to the findbad pcus") parser.add_option("", "--pcuid", dest="pcuid", metavar="id", help="Provide the id for a single pcu") diff --git a/grouprins.py b/grouprins.py index 1896f41..d859727 100755 --- a/grouprins.py +++ b/grouprins.py @@ -64,13 +64,35 @@ class Reboot(object): self.action = "reboot.reboot('%s')" % host pflags = PersistFlags(host, 2*60*60*24, db='pcu_persistflags') - pflags.resetRecentFlag('pcutried') + #pflags.resetRecentFlag('pcutried') if not pflags.getRecentFlag('pcutried'): - pflags.setRecentFlag('pcutried') try: print "CALLING REBOOT!!!" ret = reboot.reboot(host) + pflags.setRecentFlag('pcutried') + pflags.save() + return ret + + except Exception,e: + print traceback.print_exc(); print e + + # NOTE: this failure could be an implementation issue on + # our end. So, extra notices are confusing... + # self._send_pcunotice(host) + + pflags.setRecentFlag('pcufailed') + pflags.save() + return False + + elif not pflags.getRecentFlag('pcu_rins_tried'): + try: + # set node to 'rins' boot state. + print "CALLING REBOOT +++ RINS" + plc.nodeBootState(host, 'rins') + ret = reboot.reboot(host) + + pflags.setRecentFlag('pcu_rins_tried') pflags.save() return ret @@ -93,12 +115,12 @@ class Reboot(object): pflags.setRecentFlag('pcumessagesent') pflags.save() - # NOTE: this will result in just one message sent at a time. - return True - else: - print "GetRecentFlag()" - return False + # This will result in mail() being called next, to try to + # engage the technical contact to take care of it also. + print "RETURNING FALSE" + return False + else: print "NO PCUOK" self.action = "None" @@ -174,8 +196,6 @@ parser.set_defaults( timewait=0, force=False, nosetup=False, verbose=False, - stopkey=None, - stopvalue=None, quiet=False, ) @@ -210,7 +230,7 @@ if config.nodegroup: if config.node or config.nodelist: if config.node: hostnames = [ config.node ] - else: hostnames = config.getListFromFile(config.nodelist) + else: hostnames = util.file.getListFromFile(config.nodelist) fb = database.dbLoad("findbad") @@ -221,14 +241,18 @@ if config.findbad: # rerun findbad with the nodes in the given nodes. file = "findbad.txt" util.file.setFileFromList(file, hostnames) - os.system("./findbad.py --cachenodes --debug=0 --dbname=findbad --increment --nodelist %s" % file) + os.system("./findbad.py --cachenodes --increment --nodelist %s" % file) + # TODO: shouldn't we reload the node list now? +l_blacklist = database.if_cached_else(1, "l_blacklist", lambda : []) # commands: i = 1 count = 1 +#print "hosts: %s" % hostnames for host in hostnames: #if 'echo' in host or 'hptest-1' in host: continue + try: try: node = api.GetNodes(host)[0] @@ -240,6 +264,9 @@ for host in hostnames: print "%-2d" % i, nodegroup_display(node, fb) i += 1 if i-1 <= int(config.skip): continue + if host in l_blacklist: + print "%s is blacklisted. Skipping." 
% host + continue if config.stopselect: dict_query = query_to_dict(config.stopselect) @@ -249,20 +276,17 @@ for host in hostnames: if verify(dict_query, fbnode) and observed_state != "dbg ": # evaluates to true, therefore skip. print "%s evaluates true for %s ; skipping..." % ( config.stopselect, host ) - continue - - if config.stopkey and config.stopvalue: - fbnode = fb['nodes'][host]['values'] - observed_state = get_current_state(fbnode) + try: + # todo: clean up act_all record here. + # todo: send thank you, etc. + mailmonitor.reboot(host) + except Exception, e: + print traceback.print_exc(); print e - if config.stopkey in fbnode: - if config.stopvalue in fbnode[config.stopkey] and observed_state != "dbg ": - print "%s has stopvalue; skipping..." % host - continue - else: - print "stopkey %s not in fbnode record for %s; skipping..." % (config.stopkey, host) - print fbnode continue + #else: + #print "%s failed to match %s: -%s-" % ( host, dict_query, observed_state ) + #sys.exit(1) if not config.force and rebootlog.find(host, {'action' : ".*reboot"}, 60*60*2): print "recently rebooted %s. skipping... " % host diff --git a/mailer.py b/mailer.py index da6249d..97bd173 100755 --- a/mailer.py +++ b/mailer.py @@ -171,7 +171,7 @@ def closeTicketViaRT(ticket_id, comment): return def emailViaRT(subject, text, to, ticket_id=None): - if ticket_id == None or ticket_id == "": + if ticket_id == None or ticket_id == "" or ticket_id == 0: print "No TICKET" return emailViaRT_NoTicket(subject, text, to) diff --git a/mailmonitor.py b/mailmonitor.py index 87b301f..c9c1750 100644 --- a/mailmonitor.py +++ b/mailmonitor.py @@ -17,7 +17,7 @@ api = plc.getAuthAPI() from clean_policy import * def reboot(hostname): - print "calling reboot!!! %s " % hostname + print "CALLING: mailmonitor.reboot(%s)" % hostname l_nodes = api.GetNodes(hostname) if len(l_nodes) == 0: @@ -30,11 +30,11 @@ def reboot(hostname): if len(l_nodes) == 0: raise Exception("Host removed via blacklist: %s" % hostname) - ad_dbTickets = database.if_cached_else_refresh(True, False, "ad_dbTickets", lambda : []) - if ad_dbTickets == None: - raise Exception("Could not find cached dbTickets") + #ad_dbTickets = database.if_cached_else_refresh(True, False, "ad_dbTickets", lambda : []) + #if ad_dbTickets == None: + # raise Exception("Could not find cached dbTickets") - print "starting new thing" + #print "starting new thing" mon = MonitorMergeDiagnoseSendEscellate(hostname, True) mon.run() diff --git a/nodecommon.py b/nodecommon.py index ba67625..a8b82ea 100644 --- a/nodecommon.py +++ b/nodecommon.py @@ -132,13 +132,16 @@ def nodegroup_display(node, fb, conf=None): node['pcu'] = fb['nodes'][node['hostname']]['values']['pcu'] node['lastupdate'] = diff_time(node['last_contact']) pf = PersistFlags(node['hostname'], 1, db='node_persistflags') - node['lc'] = diff_time(pf.last_changed) + try: + node['lc'] = diff_time(pf.last_changed) + except: + node['lc'] = "err" ut = fb['nodes'][node['hostname']]['values']['comonstats']['uptime'] if ut != "null": ut = diff_time(float(fb['nodes'][node['hostname']]['values']['comonstats']['uptime']), False) node['uptime'] = ut - return "%(hostname)-42s %(boot_state)8s %(current)5s %(pcu)6s %(key)10.10s... %(kernel)33s %(lastupdate)12s, %(lc)s, %(uptime)s" % node + return "%(hostname)-42s %(boot_state)8s %(current)5s %(pcu)6s %(key)10.10s... 
%(kernel)35.35s %(lastupdate)12s, %(lc)s, %(uptime)s" % node def datetime_fromstr(str): if '-' in str: diff --git a/nodegroups.py b/nodegroups.py index e96e7b4..3f4b980 100755 --- a/nodegroups.py +++ b/nodegroups.py @@ -22,6 +22,7 @@ from nodequery import verify,query_to_dict,node_select from nodecommon import * import database +import util.file def main(): fb = database.dbLoad("findbad") @@ -52,7 +53,7 @@ def main(): if config.node: hostlist = [ config.node ] else: - hostlist = config.getListFromFile(config.nodelist) + hostlist = util.file.getListFromFile(config.nodelist) # NOTE: preserve order given in file. Otherwise, return values are not in order # given to GetNodes diff --git a/nodeinfo.py b/nodeinfo.py index 30838f1..fee8eb3 100755 --- a/nodeinfo.py +++ b/nodeinfo.py @@ -76,8 +76,7 @@ def act_print_nodeinfo(actnode, header): if 'rt' in actnode and 'Status' in actnode['rt']: print "\t %5.5s %5.5s | %8.8s | %15.15s | %s" % \ (actnode['rt']['Status'], actnode['rt']['id'][7:], - actnode['category'], actnode['action'][0], - actnode['msg_format'][:-1]) + actnode['category'], actnode['action'][0], actnode['info'][1:]) else: if type(actnode['action']) == type([]): action = actnode['action'][0] diff --git a/nodequery.py b/nodequery.py index c3f7ab8..e746e5b 100755 --- a/nodequery.py +++ b/nodequery.py @@ -18,8 +18,8 @@ import time import re #fb = {} -fb = {} -fbpcu = {} +fb = None +fbpcu = None class NoKeyException(Exception): pass @@ -46,7 +46,10 @@ def fb_print_nodeinfo(fbnode, hostname, fields=None): fbnode['kernel'] = fbnode['kernel'].split()[2] fbnode['boot_state'] = fbnode['plcnode']['boot_state'] - print "%(hostname)-39s | %(checked)11.11s | %(boot_state)5.5s| %(state)8.8s | %(ssh)5.5s | %(pcu)6.6s | %(bootcd)6.6s | %(category)8.8s | %(kernel)s" % fbnode + if len(fbnode['nodegroups']) > 0: + fbnode['category'] = fbnode['nodegroups'][0] + + print "%(hostname)-45s | %(checked)11.11s | %(boot_state)5.5s| %(state)8.8s | %(ssh)5.5s | %(pcu)6.6s | %(bootcd)6.6s | %(category)8.8s | %(kernel)s" % fbnode else: format = "" for f in fields: @@ -143,7 +146,13 @@ def verify(constraints, data): #print "looking at key: %s" % key if key in data: value_re = re.compile(con[key]) - con_and_true = con_and_true & (value_re.search(data[key]) is not None) + if type([]) == type(data[key]): + local_or_true = False + for val in data[key]: + local_or_true = local_or_true | (value_re.search(val) is not None) + con_and_true = con_and_true & local_or_true + else: + con_and_true = con_and_true & (value_re.search(data[key]) is not None) elif key not in data: print "missing key %s" % key, pass @@ -180,10 +189,17 @@ def pcu_in(fbdata): return False def pcu_select(str_query, nodelist=None): + global fb + global fbpcu pcunames = [] nodenames = [] if str_query is None: return (nodenames, pcunames) + if fb is None: + fb = database.dbLoad("findbad") + if fbpcu is None: + fbpcu = database.dbLoad("findbadpcus") + #print str_query dict_query = query_to_dict(str_query) #print dict_query @@ -199,7 +215,8 @@ def pcu_select(str_query, nodelist=None): nodenames.append(node) str = "cmdhttps/locfg.pl -s %s -f iloxml/License.xml -u %s -p '%s' | grep MESSAGE" % \ (pcu_name(pcuinfo), pcuinfo['username'], pcuinfo['password']) - pcunames.append(str) + #pcunames.append(str) + pcunames.append(pcuinfo['pcu_id']) return (nodenames, pcunames) def node_select(str_query, nodelist=None, fbdb=None): diff --git a/reboot.py b/reboot.py index 337b0b4..e876a76 100755 --- a/reboot.py +++ b/reboot.py @@ -704,9 +704,8 @@ class 
BayTechCtrlCUnibe(PCUControl): # Control Outlets (5 ,1).........5 try: - print s - print "Enter Request" in s.before - index = s.expect("Enter Request") + #index = s.expect("Enter Request") + index = s.expect(["Enter Request :"]) if index == 0: print "3" @@ -720,7 +719,8 @@ class BayTechCtrlCUnibe(PCUControl): print "Reboot %d" % node_port s.send("Reboot %d\r\n" % node_port) - index = s.expect(["(Y/N)?"]) + time.sleep(5) + index = s.expect(["\(Y/N\)\?", "Port in use", "DS-RPC>"]) if index == 0: if dryrun: print "sending N" @@ -728,16 +728,21 @@ class BayTechCtrlCUnibe(PCUControl): else: print "sending Y" s.send("Y\r\n") + elif index == 1: + raise ExceptionPrompt("PCU Reported 'Port in use.'") + elif index == 2: + raise ExceptionSequence("Issued command 'Reboot' failed.") - #index = s.expect(["DS-RPC>"]) + time.sleep(5) + index = s.expect(["DS-RPC>"]) #print "got prompt back" s.close() except pexpect.EOF: - raise ExceptionPrompt("EOF before 'Enter Request' Prompt") + raise ExceptionPrompt("EOF before expected Prompt") except pexpect.TIMEOUT: - raise ExceptionPrompt("Timeout before 'Enter Request' Prompt") + raise ExceptionPrompt("Timeout before expected Prompt") return 0 @@ -757,40 +762,54 @@ class BayTechCtrlC(PCUControl): # Otherwise, the login succeeded. # Send a ctrl-c to the remote process. - print "sending ctrl-c" + print "SENDING ctrl-c" s.send(chr(3)) # Control Outlets (5 ,1).........5 try: + print "EXPECTING: ", "Enter Request :" index = s.expect(["Enter Request :"]) if index == 0: - print "5" + print "SENDING: 5" s.send("5\r\n") - index = s.expect(["DS-RPC>", "Enter user name:"]) + print "EXPECTING: ", "DS-RPC>" + index = s.expect(["DS-RPC>", "Enter user name:", "Port in use."]) if index == 1: print "sending username" s.send(self.username + "\r\n") index = s.expect(["DS-RPC>"]) + elif index == 2: + raise ExceptionPrompt("PCU Reported 'Port in use.'") if index == 0: - print "Reboot %d" % node_port + print "SENDING: Reboot %d" % node_port s.send("Reboot %d\r\n" % node_port) - index = s.expect(["(Y/N)?"]) + print "SLEEPING: 5" + time.sleep(5) + print "EXPECTING: ", "Y/N?" + index = s.expect(["\(Y/N\)\?", "Port in use", "DS-RPC>"]) if index == 0: if dryrun: print "sending N" s.send("N\r\n") else: - print "sending Y" + print "SENDING: Y" s.send("Y\r\n") + elif index == 1: + raise ExceptionPrompt("PCU Reported 'Port in use.'") + elif index == 2: + raise ExceptionSequence("Issued command 'Reboot' failed.") # NOTE: for some reason, the script times out with the # following line. In manual tests, it works correctly, but # with automated tests, evidently it fails. - #index = s.expect(["DS-RPC>"]) - #print "got prompt back" + print "SLEEPING: 5" + time.sleep(5) + #print "TOTAL--", s.allstr, "--EOT" + index = s.expect(["DS-RPC>"]) + print "got prompt back" s.close() @@ -817,6 +836,7 @@ class BayTech(PCUControl): # even after login... print "msg: %s" % msg self.transport.write(self.username + "\r\n") + time.sleep(5) self.ifThenSend("DS-RPC>", "Reboot %d" % node_port) # Reboot Outlet N (Y/N)? 
@@ -824,6 +844,7 @@ class BayTech(PCUControl): self.ifThenSend("(Y/N)?", "N") else: self.ifThenSend("(Y/N)?", "Y") + time.sleep(5) self.ifThenSend("DS-RPC>", "") self.close() @@ -1227,7 +1248,7 @@ def reboot_test(nodename, values, continue_probe, verbose, dryrun): print values # TODO: make a more robust version of APC - if values['pcu_id'] in [1163,1055,1111,1231,1113,1127,1128,1148]: + if values['pcu_id'] in [1102,1163,1055,1111,1231,1113,1127,1128,1148]: apc = APCEurope(values, verbose, ['22', '23']) rb_ret = apc.reboot(values[nodename], dryrun) @@ -1235,11 +1256,11 @@ def reboot_test(nodename, values, continue_probe, verbose, dryrun): apc = APCBrazil(values, verbose, ['22', '23']) rb_ret = apc.reboot(values[nodename], dryrun) - elif values['pcu_id'] in [1221,1225]: + elif values['pcu_id'] in [1221,1225,1220]: apc = APCBerlin(values, verbose, ['22', '23']) rb_ret = apc.reboot(values[nodename], dryrun) - elif values['pcu_id'] in [1173,1221,1220]: + elif values['pcu_id'] in [1173,1240]: apc = APCFolsom(values, verbose, ['22', '23']) rb_ret = apc.reboot(values[nodename], dryrun) @@ -1249,7 +1270,7 @@ def reboot_test(nodename, values, continue_probe, verbose, dryrun): # BayTech DS4-RPC elif continue_probe and values['model'].find("DS4-RPC") >= 0: - if values['pcu_id'] in [1237,1052,1209,1002,1008,1041,1013,1022]: + if values['pcu_id'] in [1056,1237,1052,1209,1002,1008,1041,1013,1022]: # These require a 'ctrl-c' to be sent... baytech = BayTechCtrlC(values, verbose, ['22', '23']) rb_ret = baytech.reboot(values[nodename], dryrun) diff --git a/ssh/pexpect.py b/ssh/pexpect.py index 19ee230..4eab532 100644 --- a/ssh/pexpect.py +++ b/ssh/pexpect.py @@ -342,6 +342,7 @@ class spawn (object): self.env = env self.__irix_hack = sys.platform.lower().find('irix') >= 0 # This flags if we are running on irix self.use_native_pty_fork = not (sys.platform.lower().find('solaris') >= 0) # Solaris uses internal __fork_pty(). All other use pty.fork(). + self.allstr = "" # allow dummy instances for subclasses that may not use command or args. 
if command is None: @@ -1108,6 +1109,7 @@ class spawn (object): self.buffer = incoming[self.match.end() : ] self.before = incoming[ : self.match.start()] self.after = incoming[self.match.start() : self.match.end()] + #print "MATCH--", self.after, "--EOM" return self.match_index # No match at this point if timeout < 0 and timeout is not None: @@ -1116,6 +1118,8 @@ class spawn (object): c = self.read_nonblocking (self.maxread, timeout) time.sleep (0.0001) incoming = incoming + c + self.allstr += c + #print "INCOMING--", c, "--EOI" if timeout is not None: timeout = end_time - time.time() except EOF, e: diff --git a/unified_model.py b/unified_model.py index acc89d8..8c5fb7f 100755 --- a/unified_model.py +++ b/unified_model.py @@ -91,7 +91,7 @@ class RT(object): return self.status def closeTicket(self): - mailer.closeTicketViaRT(self.ticket_id) + mailer.closeTicketViaRT(self.ticket_id, "Ticket CLOSED automatically by SiteAssist.") def email(self, subject, body, to): self.ticket_id = mailer.emailViaRT(subject, body, to, self.ticket_id) @@ -229,10 +229,10 @@ class PersistMessage(Message): #print pm if id in pm: - print "Using existing object" + #print "Using existing object" obj = pm[id] else: - print "creating new object" + #print "creating new object" obj = super(PersistMessage, typ).__new__(typ, [id, subject, message, via_rt], **kwargs) obj.id = id obj.actiontracker = Recent(3*60*60*24) @@ -252,18 +252,19 @@ class PersistMessage(Message): def reset(self): self.actiontracker.unsetRecent() + def save(self): + pm = database.dbLoad(self.db) + pm[self.id] = self + database.dbDump(self.db, pm) + def send(self, to): if not self.actiontracker.isRecent(): self.ticket_id = Message.send(self, to) self.actiontracker.setRecent() - - #print "recording object for persistance" - pm = database.dbLoad(self.db) - pm[self.id] = self - database.dbDump(self.db, pm) + self.save() else: # NOTE: only send a new message every week, regardless. 
- print "Not sending to host b/c not within window of %s days" % (self.actiontracker.withintime // 60*60*24) + print "Not sending to host b/c not within window of %s days" % (self.actiontracker.withintime // (60*60*24)) class MonitorMessage(object): def __new__(typ, id, *args, **kwargs): @@ -427,6 +428,7 @@ class Record(object): def severity(self): category = self.data['category'] prev_category = self.data['prev_category'] + #print "SEVERITY: ", category, prev_category val = cmpCategoryVal(category, prev_category) return val @@ -504,33 +506,46 @@ class Record(object): def takeAction(self): pp = PersistSitePenalty(self.hostname, 0, db='persistpenalty_hostnames') - if 'improvement' in self.data['stage'] or self.improved(): - print "decreasing penalty for %s"%self.hostname + if 'improvement' in self.data['stage'] or self.improved() or \ + 'monitor-end-record' in self.data['stage']: + print "takeAction: decreasing penalty for %s"%self.hostname + pp.decrease() pp.decrease() else: - print "increasing penalty for %s"%self.hostname + print "takeAction: increasing penalty for %s"%self.hostname pp.increase() pp.apply(self.hostname) pp.save() def _format_diaginfo(self): info = self.data['info'] + print "FORMAT : STAGE: ", self.data['stage'] if self.data['stage'] == 'monitor-end-record': + if info[2] == "ALPHA": info = (info[0], info[1], "PROD") hlist = " %s went from '%s' to '%s'\n" % (info[0], info[1], info[2]) else: hlist = " %s %s - %s\n" % (info[0], info[2], info[1]) #(node,ver,daysdn) return hlist + def saveAction(self): + if 'save-act-all' in self.data and self.data['save-act-all'] == True: + return True + else: + return False def getMessage(self, ticket_id=None): self.data['args']['hostname'] = self.hostname self.data['args']['loginbase'] = self.loginbase self.data['args']['hostname_list'] = self._format_diaginfo() - message = PersistMessage(self.hostname, + #print self.data['message'] + if self.data['message']: + message = PersistMessage(self.hostname, self.data['message'][0] % self.data['args'], self.data['message'][1] % self.data['args'], True, db='monitor_persistmessages', ticket_id=ticket_id) - return message + return message + else: + return None def getContacts(self): roles = self.data['email'] @@ -579,6 +594,7 @@ class NodeRecord: def severity(self): category = self.data['category'] prev_category = self.data['prev_category'] + print "IMPROVED: ", category, prev_category val = cmpCategoryVal(category, prev_category) return val @@ -659,6 +675,15 @@ def node_end_record(node): del act_all return False + pm = database.dbLoad("monitor_persistmessages") + if node not in pm: + del pm + return False + else: + print "deleting node record" + del pm[node] + database.dbDump("monitor_persistmessages", pm) + a = Action(node, act_all[node][0]) a.delField('rt') a.delField('found_rt_ticket') @@ -667,8 +692,9 @@ def node_end_record(node): a.delField('first-found') rec = a.get() rec['action'] = ["close_rt"] - rec['category'] = "UNKNOWN" + rec['category'] = "ALPHA" # assume that it's up... rec['stage'] = "monitor-end-record" + rec['ticket_id'] = None rec['time'] = time.time() - 7*60*60*24 act_all[node].insert(0,rec) database.dbDump("act_all", act_all) -- 2.43.0