update of all changes in the last week that fine-tuned the behavior of Monitor
author Stephen Soltesz <soltesz@cs.princeton.edu>
Tue, 26 Aug 2008 02:02:06 +0000 (02:02 +0000)
committer Stephen Soltesz <soltesz@cs.princeton.edu>
Tue, 26 Aug 2008 02:02:06 +0000 (02:02 +0000)
with the new clean_policy and unified_model.

14 files changed:
bootman.py
clean_policy.py
emailTxt.py
findbadpcu.py
grouprins.py
mailer.py
mailmonitor.py
nodecommon.py
nodegroups.py
nodeinfo.py
nodequery.py
reboot.py
ssh/pexpect.py
unified_model.py

index a278afe..87d8b71 100755 (executable)
@@ -34,9 +34,12 @@ class Sopen(subprocess.Popen):
 #from Rpyc import SocketConnection, Async
 from Rpyc import SocketConnection, Async
 from Rpyc.Utils import *
+fb = None
 
 def get_fbnode(node):
-       fb = database.dbLoad("findbad")
+       global fb
+       if fb is None:
+               fb = database.dbLoad("findbad")
        fbnode = fb['nodes'][node]['values']
        return fbnode
 
@@ -359,7 +362,6 @@ def reboot(hostname, config=None, forced_action=None):
                except:
                        print traceback.print_exc()
                        return False
-                       
 
        if forced_action == "reboot":
                conn.restart_node('rins')
index d2bde41..a14016e 100644 (file)
@@ -31,6 +31,9 @@ def get_ticket_id(record):
                return None
 
 class MonitorMergeDiagnoseSendEscellate:
+       act_all = None
+       fb = None
+
        def __init__(self, hostname, act):
                self.hostname = hostname
                self.act = act
@@ -41,7 +44,11 @@ class MonitorMergeDiagnoseSendEscellate:
                return
 
        def getFBRecord(self):
-               fb = database.dbLoad("findbad")
+               if MonitorMergeDiagnoseSendEscellate.fb == None:
+                       MonitorMergeDiagnoseSendEscellate.fb = database.dbLoad("findbad")
+
+               fb = MonitorMergeDiagnoseSendEscellate.fb
+
                if self.hostname in fb['nodes']:
                        fbnode = fb['nodes'][self.hostname]['values']
                else:
@@ -50,12 +57,15 @@ class MonitorMergeDiagnoseSendEscellate:
 
        def getActionRecord(self):
                # update ticket status
-               act_all = database.dbLoad("act_all")
+               if MonitorMergeDiagnoseSendEscellate.act_all == None:
+                       MonitorMergeDiagnoseSendEscellate.act_all = database.dbLoad("act_all")
+
+               act_all = MonitorMergeDiagnoseSendEscellate.act_all 
+
                if self.hostname in act_all and len(act_all[self.hostname]) > 0:
                        actnode = act_all[self.hostname][0]
                else:
                        actnode = None
-               del act_all
                return actnode
 
        def getKernel(self, unamestr):
@@ -73,13 +83,15 @@ class MonitorMergeDiagnoseSendEscellate:
                fbnode['info'] = None
                fbnode['log'] = None
                fbnode['time'] = time.time()
+               fbnode['email'] = TECH
+               fbnode['action'] = ['noop']
                fbnode['date_created'] = time.time()
 
-               if actnode is None:
+               if actnode is None: # there is no entry in act_all
                        actnode = {} 
                        actnode.update(fbnode)
                        actnode['ticket_id'] = ""
-                       actnode['prev_category'] = "NORECORD
+                       actnode['prev_category'] = "ERROR
                else:
                        actnode['prev_category']= actnode['category']
                        actnode['comonstats']   = fbnode['comonstats']
@@ -111,29 +123,40 @@ class MonitorMergeDiagnoseSendEscellate:
 
                diag = PersistFlags(record.hostname, 60*60*24, db='persist_diagnose_flags')
                # NOTE: change record stage based on RT status.
-               diag.setFlag('ResetStage')
+               #diag.setFlag('ResetStage')
                if record.stageIswaitforever():
                        ticket = record.data['rt']
                        if 'new' in ticket['Status']:
-                               diag.setFlag('ResetStage')
+                               print "Resetting Stage!!!!!"
+                       #       diag.setFlag('ResetStage')
+                               record.reset_stage()
+                       #if diag.getFlag('ResetStage'):
+                       #       print "diagnose: resetting stage"
+                       #       diag.resetFlag('ResetStage')
                                
                        if 'resolved' in ticket['Status']:
-                               diag.setFlag('EndRecord')
+                               diag.setFlag('RTEndRecord')
 
                # NOTE: take category, and prepare action
                category = record.getCategory()
                if category == "error":
                        diag.setFlag('SendNodedown')
-                       record.data['message'] = emailTxt.mailtxt.newdown
+                       record.data['message_series'] = emailTxt.mailtxt.newdown
                        record.data['log'] = self.getDownLog(record)
 
-               elif category == "prod":
+               elif category == "prod" or category == "alpha":
                        state = record.getState()
                        if state == "boot":
-                               diag.setFlag('SendThankyou')
-                               record.data['message'] = emailTxt.mailtxt.newthankyou
-                               record.data['log'] = self.getThankyouLog(record)
-
+                               if record.severity() != 0:
+                                       diag.setFlag('SendThankyou')
+                                       print "RESETTING STAGE: improvement"
+                                       record.data['stage'] = 'improvement'
+                                       record.data['message_series'] = emailTxt.mailtxt.newthankyou
+                                       record.data['log'] = self.getThankyouLog(record)
+                               else:
+                                       # NOTE: do nothing, since we've already done the above.
+                                       print "DIAGNOSED: %s is boot. no further action necessary." % record.hostname
+                                       return None
                        elif state == "debug":
                                pass
                        else:
@@ -141,55 +164,79 @@ class MonitorMergeDiagnoseSendEscellate:
                else:
                        print "unknown category: %s" % category
 
-               if diag.getFlag('ResetStage'):
-                       print "resetting stage"
-                       record.reset_stage()
 
+               # TODO: how to not send email?...
                record = self.checkStageAndTime(diag,record)
-               if record:
-                       print "checkStageAndTime Returned Valid Record"
-                       site = PersistFlags(self.loginbase, 1, db='site_persistflags')
+               #if record:
+               print "diagnose: checkStageAndTime Returned Valid Record"
+               site = PersistFlags(self.loginbase, 1, db='site_persistflags')
 
-                       if site.status is not "good":
-                               print "Setting site %s for 'squeeze'" % self.loginbase
-                               diag.setFlag('Squeeze')
-                       else:
-                               print "Setting site %s for 'backoff'" % self.loginbase
-                               diag.setFlag('BackOff')
-
-                       diag.save()
-                       return diag
+               if site.status != "good":
+                       print "diagnose: Setting site %s for 'squeeze'" % self.loginbase
+                       diag.setFlag('Squeeze')
                else:
-                       print "checkStageAndTime Returned NULL Record"
-                       return None
+                       print "diagnose: Setting site %s for 'backoff'" % self.loginbase
+                       diag.setFlag('BackOff')
+
+               diag.save()
+               return diag
+               #else:
+               #       print "checkStageAndTime Returned NULL Record"
+               #       return None
 
        def action(self, record, diag):
-               if record.improved() or diag.getFlag('EndRecord'):
-                       print "end record for %s" % self.hostname
-                       record.end_record()
-                       diag.setFlag('CloseRT')
-                       return None
-
-               if self.getSendEmailFlag(record): 
-                       print "sending email"
+
+               message = None
+
+               #print record.data['stage']
+               #print "improvement" in record.data['stage']
+               #print self.getSendEmailFlag(record)
+               if self.getSendEmailFlag(record) or "monitor-end-record" in record.data['stage']
+                       print "action: getting message"
                        message = record.getMessage(record.data['ticket_id'])
-                       message.reset()
-                       message.send(record.getContacts())
-                       if message.rt.ticket_id:
-                               print "setting record ticket_id"
-                               record.data['ticket_id'] = message.rt.ticket_id
-                       if diag.getFlag('CloseRT'):
-                               message.rt.closeTicket()
+                       if message:
+                               #message.reset()
+                               print "action: sending email"
+                               message.send(record.getContacts())
+                               #print "DEBUG NOT SENDING MESSAGE WHEN I SHOULD BE!!!!!"
+                               #print "DEBUG NOT SENDING MESSAGE WHEN I SHOULD BE!!!!!"
+                               #print "DEBUG NOT SENDING MESSAGE WHEN I SHOULD BE!!!!!"
+                               #print message
+                               if message.rt.ticket_id:
+                                       print "action: setting record ticket_id"
+                                       record.data['ticket_id'] = message.rt.ticket_id
+
+                       if (record.data['takeaction'] and diag.getFlag('Squeeze') ) or diag.getFlag('BackOff'):
+                               print "action: taking action"
+                               record.takeAction()
+                               diag.resetFlag('Squeeze')
+                               diag.resetFlag('BackOff')
+                               diag.save()
+
+                       if record.saveAction():
+                               print "action: saving act_all db"
+                               self.add_and_save_act_all(record)
+                       else:
+                               print "action: NOT saving act_all db"
+                               print "stage: %s %s" % ( record.data['stage'], record.data['save-act-all'] )
+
+                       if record.improved() or diag.getFlag('RTEndRecord'):
+                               print "action: end record for %s" % self.hostname
+                               record.end_record()
+                               diag.setFlag('CloseRT')
+                               diag.resetFlag('RTEndRecord')
+                               diag.save()
+                               #return None
+
+                       if message:
+                               if diag.getFlag('CloseRT'):
+                                       message.rt.closeTicket()
+                                       diag.resetFlag('CloseRT')
+                                       diag.save()
+
                else:
                        print "NOT sending email : %s %s" % (config.mail, record.data['rt'])
 
-               if record.data['takeaction'] and diag.getFlag('Squeeze'):
-                       print "taking action"
-                       record.takeAction()
-
-               print "saving act_all db"
-               self.add_and_save_act_all(record)
-
                return
 
        def getSendEmailFlag(self, record):
@@ -200,13 +247,16 @@ class MonitorMergeDiagnoseSendEscellate:
                if  'rt' in record.data and \
                        'Status' in record.data['rt'] and \
                        "open" in record.data['rt']['Status'] and \
-                       record.data['rt']['Created'] < 60*60*24*30:
+                       record.data['rt']['Created'] > int(time.time() - 60*60*24*30):
+                       # i.e. the ticket was created within the last thirty days
                        return False
 
                return True
 
        def add_and_save_act_all(self, record):
                self.act_all = database.dbLoad("act_all")
+               if self.hostname not in self.act_all:
+                       self.act_all[self.hostname] = []
                self.act_all[self.hostname].insert(0,record.data)
                database.dbDump("act_all", self.act_all)
                
@@ -218,7 +268,7 @@ class MonitorMergeDiagnoseSendEscellate:
                #for key in record.data.keys():
                #       print "%10s %s %s " % (key, "==", record.data[key])
 
-               if record.data['ticket_id'] == "":
+               if record.data['ticket_id'] == "" and 'found_rt_ticket' in record.data:
                        log = "DOWN: %20s : %-40s == %20s %s" % \
                                (self.loginbase, self.hostname, record.data['info'][1:], record.data['found_rt_ticket'])
                else:
@@ -231,79 +281,99 @@ class MonitorMergeDiagnoseSendEscellate:
                record.data['args'] = {'nodename': self.hostname}
                record.data['info'] = (self.hostname, record.data['prev_category'], record.data['category'])
 
-               if record.data['ticket_id'] == "":
-                       log = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
+               try:
+                       if record.data['ticket_id'] == "" and 'found_rt_ticket' in record.data:
+                               log = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
                                                (self.loginbase, self.hostname, record.data['stage'], 
-                                                state, category, record.data['found_rt_ticket'])
-               else:
-                       log = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
+                                                record.data['prev_category'], record.data['category'], record.data['found_rt_ticket'])
+                       else:
+                               log = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
                                                (self.loginbase, self.hostname, record.data['stage'], 
-                                                state, category, record.data['ticket_id'])
+                                                record.data['prev_category'], record.data['category'], record.data['ticket_id'])
+               except:
+                       log = "IMPR: %s improved to %s " % (self.hostname, record.data['category'])
                return log
 
        def checkStageAndTime(self, diag, record):
                current_time = time.time()
                delta = current_time - record.data['time']
+               #print record.data
                if   'findbad' in record.data['stage']:
                        # The node is bad, and there's no previous record of it.
                        record.data['email'] = TECH
                        record.data['action'] = ['noop']
                        record.data['takeaction'] = False
-                       record.data['message'] = record.data['message'][0]
+                       record.data['message'] = record.data['message_series'][0]
                        record.data['stage'] = 'stage_actinoneweek'
+                       record.data['save-act-all'] = True
 
                elif 'reboot_node' in record.data['stage']:
                        record.data['email'] = TECH
                        record.data['action'] = ['noop']
-                       record.data['message'] = record.data['message'][0]
+                       record.data['message'] = record.data['message_series'][0]
                        record.data['stage'] = 'stage_actinoneweek'
                        record.data['takeaction'] = False
+                       record.data['save-act-all'] = False
                        
                elif 'improvement' in record.data['stage']:
-                       print "backing off of %s" % self.hostname
+                       print "checkStageAndTime: backing off of %s" % self.hostname
                        record.data['action'] = ['close_rt']
                        record.data['takeaction'] = True
-                       record.data['message'] = record.data['message'][0]
+                       record.data['message'] = record.data['message_series'][0]
                        record.data['stage'] = 'monitor-end-record'
+                       record.data['save-act-all'] = True
 
                elif 'actinoneweek' in record.data['stage']:
                        if delta >= 7 * SPERDAY: 
+                               print "checkStageAndTime: transition to next stage actintwoweeks"
                                record.data['email'] = TECH | PI
                                record.data['stage'] = 'stage_actintwoweeks'
-                               record.data['message'] = record.data['message'][1]
+                               record.data['message'] = record.data['message_series'][1]
                                record.data['action'] = ['nocreate' ]
                                record.data['time'] = current_time              # reset clock for waitforever
                                record.data['takeaction'] = True
+                               record.data['save-act-all'] = True
                        elif delta >= 3* SPERDAY and not 'second-mail-at-oneweek' in record.data:
+                               print "checkStageAndTime: second message in one week"
                                record.data['email'] = TECH 
-                               record.data['message'] = record.data['message'][0]
+                               record.data['message'] = record.data['message_series'][0]
                                record.data['action'] = ['sendmailagain-waitforoneweekaction' ]
                                record.data['second-mail-at-oneweek'] = True
                                record.data['takeaction'] = False
+                               record.data['save-act-all'] = True
                        else:
                                record.data['message'] = None
                                record.data['action'] = ['waitforoneweekaction' ]
-                               print "ignoring this record for: %s" % self.hostname
-                               return None                     # don't send if there's no action
+                               record.data['takeaction'] = False
+                               record.data['save-act-all'] = False
+                               print "checkStageAndTime: ignoring this record for: %s" % self.hostname
+                               #return None                    # don't send if there's no action
 
                elif 'actintwoweeks' in record.data['stage']:
                        if delta >= 7 * SPERDAY:
+                               print "checkStageAndTime: transition to next stage waitforever"
                                record.data['email'] = TECH | PI | USER
                                record.data['stage'] = 'stage_waitforever'
-                               record.data['message'] = record.data['message'][2]
+                               record.data['message'] = record.data['message_series'][2]
                                record.data['action'] = ['suspendslices']
                                record.data['time'] = current_time              # reset clock for waitforever
                                record.data['takeaction'] = True
+                               record.data['save-act-all'] = True
                        elif delta >= 3* SPERDAY and not 'second-mail-at-twoweeks' in record.data:
+                               print "checkStageAndTime: second message in one week for stage two"
                                record.data['email'] = TECH | PI
-                               record.data['message'] = record.data['message'][1]
+                               record.data['message'] = record.data['message_series'][1]
                                record.data['action'] = ['sendmailagain-waitfortwoweeksaction' ]
                                record.data['second-mail-at-twoweeks'] = True
                                record.data['takeaction'] = False
+                               record.data['save-act-all'] = True
                        else:
                                record.data['message'] = None
+                               record.data['takeaction'] = False
                                record.data['action'] = ['waitfortwoweeksaction']
-                               return None                     # don't send if there's no action
+                               record.data['save-act-all'] = False
+                               print "checkStageAndTime: second message in one week for stage two"
+                               #return None                    # don't send if there's no action
 
                elif 'ticket_waitforever' in record.data['stage']:
                        record.data['email'] = TECH
@@ -314,14 +384,18 @@ class MonitorMergeDiagnoseSendEscellate:
                                record.data['action'] = ['ticket_waitforever']
                                record.data['message'] = None
                                record.data['time'] = current_time
+                               record.data['save-act-all'] = True
                        else:
                                if delta >= 7*SPERDAY:
                                        record.data['action'] = ['ticket_waitforever']
                                        record.data['message'] = None
                                        record.data['time'] = current_time              # reset clock
+                                       record.data['save-act-all'] = True
                                else:
                                        record.data['action'] = ['ticket_waitforever']
                                        record.data['message'] = None
+                                       record.data['takeaction'] = False
+                                       record.data['save-act-all'] = False
                                        return None
 
                elif 'waitforever' in record.data['stage']:
@@ -331,12 +405,15 @@ class MonitorMergeDiagnoseSendEscellate:
                        record.data['takeaction'] = True
                        if delta >= 3*SPERDAY:
                                record.data['action'] = ['email-againwaitforever']
-                               record.data['message'] = record.data['message'][2]
+                               record.data['message'] = record.data['message_series'][2]
                                record.data['time'] = current_time              # reset clock
+                               record.data['save-act-all'] = True
                        else:
                                record.data['action'] = ['waitforever']
                                record.data['message'] = None
-                               return None                     # don't send if there's no action
+                               record.data['takeaction'] = False
+                               record.data['save-act-all'] = False
+                               #return None                    # don't send if there's no action
 
                else:
                        # There is no action to be taken, possibly b/c the stage has
@@ -347,14 +424,15 @@ class MonitorMergeDiagnoseSendEscellate:
                        # TODO: figure out which. for now assume 2.
                        print "UNKNOWN stage for %s; nothing done" % self.hostname
                        record.data['action'] = ['unknown']
-                       record.data['message'] = record.data['message'][0]
+                       record.data['message'] = record.data['message_series'][0]
 
                        record.data['email'] = TECH
                        record.data['action'] = ['noop']
-                       record.data['message'] = record.data['message'][0]
+                       record.data['message'] = record.data['message_series'][0]
                        record.data['stage'] = 'stage_actinoneweek'
                        record.data['time'] = current_time              # reset clock
                        record.data['takeaction'] = False
+                       record.data['save-act-all'] = True
 
                print "%s" % record.data['log'],
                print "%15s" % record.data['action']
index c2e147f..cfbf112 100644 (file)
@@ -30,7 +30,7 @@ If the machine has booted successfully, you may check it more quickly by logging
 
     sudo /usr/sbin/vps ax
 
-If you have a BootCD older than 3.0, you will need to create burn a new BootImage to CD or USB.  You can find instructions for this at the Technical Contact's Guide:
+If you have a BootCD older than 3.0, you will need to create a new BootImage on CD or USB.  You can find instructions for this at the Technical Contact's Guide:
 
     https://www.planet-lab.org/doc/guides/bootcdsetup
 
@@ -204,24 +204,32 @@ Monitor restarted NM on the following machines:
        """)
        pcudown_one =("""Could not use PCU to reboot %(hostname)s""",
 
-"""As part of our machine monitoring and maintenance, we tried to use the PCU
-registered below, but could not for the following reason at the link below:
+"""As part of PlanetLab node monitoring and maintenance, we tried to use the PCU
+registered below, but could not for the reasons at the link below:
 
        https://monitor.planet-lab.org/cgi-bin/printbadpcus.php?id=%(pcu_id)s
 
-We need your help resolving this issue in two ways:  
+We need your help resolving this issue in a few ways:  
+
+ 1. First, we need your help rebooting %(hostname)s.  Because the above PCU does 
+    not appear to work, please manually reboot this machine.  If it turns out that 
+    there is a problem with the PCU configuration, we can help you
+    resolve that independently.
 
-* First, we need your help rebooting %(hostname)s.  Because we cannot leverage
-  the above PCU, please manually reboot this machine and we can help you
-  resolve any configuration errors with the PCU independently.
+ 2. If there is nothing apparently wrong with the PCU, or the mapping between
+    the PCU and the host, then there is likely a problem with our bootstrap
+    software on your machine.  To help us, please make a note of any text on
+    the console and report it to mailto:support@planet-lab.org .  An example
+    might be that the console hangs waiting for a module to unload.  The last
+    reported name or any error messages on the screen would be very helpful.
 
-* Second, if it is possible, please correcct the above PCU problem.  
-  By enabling us to take administrative actions automatically from
-  PlanetLab Central without local intervention, you can trade a small
-  amount of time now for a time savings in the future. 
+ 3. Alternately, if it is possible, please correct the above PCU problem, or
+    let us know what steps you are taking.  By enabling us to take administrative 
+    actions automatically from PlanetLab Central without your intervention, you 
+    can trade a small amount of time now for a time savings in the future. 
 
 If the PCU is up and running, but behind a firewall, please make it accessible
-from address block 128.112.139.0/25.  You can confirm that this is the address
+from address block 128.112.139.0/24.  You can confirm that this is the address
 space from which the PlanetLab Central servers run.
 
 If the above PCU is no longer in service, please delete it by visiting:
index e3d160d..ca65344 100755 (executable)
@@ -12,6 +12,7 @@ import sets
     
 import signal
 import traceback
+from nodequery import pcu_select
 
 #old_handler = signal.getsignal(signal.SIGCHLD)
 
@@ -329,7 +330,7 @@ def checkAndRecordState(l_pcus, cohash):
        global count
        global_round = externalState['round']
 
-       tp = threadpool.ThreadPool(20)
+       tp = threadpool.ThreadPool(10)
 
        # CREATE all the work requests
        for pcuname in l_pcus:
@@ -390,6 +391,11 @@ def main():
                pcus = []
                for node in l_nodes:
                        pcus += node['pcu_ids']
+               # clear out dups.
+               l_pcus = [pcu for pcu in sets.Set(pcus)]
+       elif config.pcuselect is not None:
+               n, pcus = pcu_select(config.pcuselect)
+               # clear out dups.
                l_pcus = [pcu for pcu in sets.Set(pcus)]
 
        elif config.nodelist == None and config.pcuid == None:
@@ -421,6 +427,7 @@ if __name__ == '__main__':
        parser.set_defaults(nodelist=None, 
                                                increment=False, 
                                                pcuid=None,
+                                               pcuselect=None,
                                                site=None,
                                                dbname="findbadpcus", 
                                                cachenodes=False,
@@ -430,6 +437,8 @@ if __name__ == '__main__':
                                                help="Provide the input file for the node list")
        parser.add_option("", "--site", dest="site", metavar="FILE", 
                                                help="Get all pcus associated with the given site's nodes")
+       parser.add_option("", "--pcuselect", dest="pcuselect", metavar="FILE", 
+                                               help="Query string to apply to the findbad pcus")
        parser.add_option("", "--pcuid", dest="pcuid", metavar="id", 
                                                help="Provide the id for a single pcu")
 
index 1896f41..d859727 100755 (executable)
@@ -64,13 +64,35 @@ class Reboot(object):
                        self.action = "reboot.reboot('%s')" % host
 
                        pflags = PersistFlags(host, 2*60*60*24, db='pcu_persistflags')
-                       pflags.resetRecentFlag('pcutried')
+                       #pflags.resetRecentFlag('pcutried')
                        if not pflags.getRecentFlag('pcutried'):
-                               pflags.setRecentFlag('pcutried')
                                try:
                                        print "CALLING REBOOT!!!"
                                        ret = reboot.reboot(host)
 
+                                       pflags.setRecentFlag('pcutried')
+                                       pflags.save()
+                                       return ret
+
+                               except Exception,e:
+                                       print traceback.print_exc(); print e
+
+                                       # NOTE: this failure could be an implementation issue on
+                                       #               our end.  So, extra notices are confusing...
+                                       # self._send_pcunotice(host) 
+
+                                       pflags.setRecentFlag('pcufailed')
+                                       pflags.save()
+                                       return False
+
+                       elif not pflags.getRecentFlag('pcu_rins_tried'):
+                               try:
+                                       # set node to 'rins' boot state.
+                                       print "CALLING REBOOT +++ RINS"
+                                       plc.nodeBootState(host, 'rins')
+                                       ret = reboot.reboot(host)
+
+                                       pflags.setRecentFlag('pcu_rins_tried')
                                        pflags.save()
                                        return ret
 
@@ -93,12 +115,12 @@ class Reboot(object):
 
                                        pflags.setRecentFlag('pcumessagesent')
                                        pflags.save()
-                                       # NOTE: this will result in just one message sent at a time.
-                                       return True
 
-                               else:
-                                       print "GetRecentFlag()"
-                                       return False
+                               # This will result in mail() being called next, to try to
+                               # engage the technical contact to take care of it also.
+                               print "RETURNING FALSE"
+                               return False
+
                else:
                        print "NO PCUOK"
                        self.action = "None"
@@ -174,8 +196,6 @@ parser.set_defaults( timewait=0,
                                        force=False, 
                                        nosetup=False, 
                                        verbose=False, 
-                                       stopkey=None,
-                                       stopvalue=None,
                                        quiet=False,
                                        )
 
@@ -210,7 +230,7 @@ if config.nodegroup:
 
 if config.node or config.nodelist:
        if config.node: hostnames = [ config.node ] 
-       else: hostnames = config.getListFromFile(config.nodelist)
+       else: hostnames = util.file.getListFromFile(config.nodelist)
 
 fb = database.dbLoad("findbad")
 
@@ -221,14 +241,18 @@ if config.findbad:
        # rerun findbad with the nodes in the given nodes.
        file = "findbad.txt"
        util.file.setFileFromList(file, hostnames)
-       os.system("./findbad.py --cachenodes --debug=0 --dbname=findbad --increment --nodelist %s" % file)
+       os.system("./findbad.py --cachenodes --increment --nodelist %s" % file)
+       # TODO: shouldn't we reload the node list now?
 
+l_blacklist = database.if_cached_else(1, "l_blacklist", lambda : [])
 # commands:
 i = 1
 count = 1
+#print "hosts: %s" % hostnames
 for host in hostnames:
 
        #if 'echo' in host or 'hptest-1' in host: continue
+
        try:
                try:
                        node = api.GetNodes(host)[0]
@@ -240,6 +264,9 @@ for host in hostnames:
                print "%-2d" % i, nodegroup_display(node, fb)
                i += 1
                if i-1 <= int(config.skip): continue
+               if host in l_blacklist:
+                       print "%s is blacklisted.  Skipping." % host
+                       continue
 
                if config.stopselect:
                        dict_query = query_to_dict(config.stopselect)
@@ -249,20 +276,17 @@ for host in hostnames:
                        if verify(dict_query, fbnode) and observed_state != "dbg ":
                                # evaluates to true, therefore skip.
                                print "%s evaluates true for %s ; skipping..." % ( config.stopselect, host )
-                               continue
-
-               if config.stopkey and config.stopvalue:
-                       fbnode = fb['nodes'][host]['values']
-                       observed_state = get_current_state(fbnode)
+                               try:
+                                       # todo: clean up act_all record here.
+                                       # todo: send thank you, etc.
+                                       mailmonitor.reboot(host)
+                               except Exception, e:
+                                       print traceback.print_exc(); print e
 
-                       if config.stopkey in fbnode:
-                               if config.stopvalue in fbnode[config.stopkey] and observed_state != "dbg ":
-                                       print "%s has stopvalue; skipping..." % host
-                                       continue
-                       else:
-                               print "stopkey %s not in fbnode record for %s; skipping..." % (config.stopkey, host)
-                               print fbnode
                                continue
+                       #else:
+                               #print "%s failed to match %s: -%s-" % ( host, dict_query, observed_state )
+                               #sys.exit(1)
 
                if not config.force and rebootlog.find(host, {'action' : ".*reboot"}, 60*60*2):
                        print "recently rebooted %s.  skipping... " % host
index da6249d..97bd173 100755 (executable)
--- a/mailer.py
+++ b/mailer.py
@@ -171,7 +171,7 @@ def closeTicketViaRT(ticket_id, comment):
        return
 
 def emailViaRT(subject, text, to, ticket_id=None):
-       if ticket_id == None or ticket_id == "":
+       if ticket_id == None or ticket_id == "" or ticket_id == 0:
                print "No TICKET"
                return emailViaRT_NoTicket(subject, text, to)
 
index 87b301f..c9c1750 100644 (file)
@@ -17,7 +17,7 @@ api = plc.getAuthAPI()
 from clean_policy import *
 
 def reboot(hostname):
-       print "calling reboot!!! %s " % hostname
+       print "CALLING: mailmonitor.reboot(%s)" % hostname
 
        l_nodes = api.GetNodes(hostname)
        if len(l_nodes) == 0:
@@ -30,11 +30,11 @@ def reboot(hostname):
        if len(l_nodes) == 0:
                raise Exception("Host removed via blacklist: %s" % hostname)
 
-       ad_dbTickets = database.if_cached_else_refresh(True, False, "ad_dbTickets", lambda : [])
-       if ad_dbTickets == None:
-               raise Exception("Could not find cached dbTickets")
+       #ad_dbTickets = database.if_cached_else_refresh(True, False, "ad_dbTickets", lambda : [])
+       #if ad_dbTickets == None:
+       #       raise Exception("Could not find cached dbTickets")
 
-       print "starting new thing"
+       #print "starting new thing"
        mon = MonitorMergeDiagnoseSendEscellate(hostname, True)
        mon.run()
 
index ba67625..a8b82ea 100644 (file)
@@ -132,13 +132,16 @@ def nodegroup_display(node, fb, conf=None):
        node['pcu'] = fb['nodes'][node['hostname']]['values']['pcu']
        node['lastupdate'] = diff_time(node['last_contact'])
        pf = PersistFlags(node['hostname'], 1, db='node_persistflags')
-       node['lc'] = diff_time(pf.last_changed)
+       try:
+               node['lc'] = diff_time(pf.last_changed)
+       except:
+               node['lc'] = "err"
        ut = fb['nodes'][node['hostname']]['values']['comonstats']['uptime']
        if ut != "null":
                ut = diff_time(float(fb['nodes'][node['hostname']]['values']['comonstats']['uptime']), False)
        node['uptime'] = ut
 
-       return "%(hostname)-42s %(boot_state)8s %(current)5s %(pcu)6s %(key)10.10s... %(kernel)33s %(lastupdate)12s, %(lc)s, %(uptime)s" % node
+       return "%(hostname)-42s %(boot_state)8s %(current)5s %(pcu)6s %(key)10.10s... %(kernel)35.35s %(lastupdate)12s, %(lc)s, %(uptime)s" % node
 
 def datetime_fromstr(str):
        if '-' in str:
index e96e7b4..3f4b980 100755 (executable)
@@ -22,6 +22,7 @@ from nodequery import verify,query_to_dict,node_select
 
 from nodecommon import *
 import database
+import util.file
 
 def main():
        fb = database.dbLoad("findbad")
@@ -52,7 +53,7 @@ def main():
                if config.node: 
                        hostlist = [ config.node ] 
                else: 
-                       hostlist = config.getListFromFile(config.nodelist)
+                       hostlist = util.file.getListFromFile(config.nodelist)
 
                # NOTE: preserve order given in file.  Otherwise, return values are not in order
                # given to GetNodes
index 30838f1..fee8eb3 100755 (executable)
@@ -76,8 +76,7 @@ def act_print_nodeinfo(actnode, header):
        if 'rt' in actnode and 'Status' in actnode['rt']:
                print "\t %5.5s %5.5s | %8.8s | %15.15s | %s" % \
                        (actnode['rt']['Status'], actnode['rt']['id'][7:],
-                        actnode['category'], actnode['action'][0], 
-                        actnode['msg_format'][:-1])
+                        actnode['category'], actnode['action'][0], actnode['info'][1:])
        else:
                if type(actnode['action']) == type([]):
                        action = actnode['action'][0]
index c3f7ab8..e746e5b 100755 (executable)
@@ -18,8 +18,8 @@ import time
 import re
 
 #fb = {}
-fb = {}
-fbpcu = {}
+fb = None
+fbpcu = None
 
 class NoKeyException(Exception): pass
 
@@ -46,7 +46,10 @@ def fb_print_nodeinfo(fbnode, hostname, fields=None):
                        fbnode['kernel'] = fbnode['kernel'].split()[2]
                fbnode['boot_state'] = fbnode['plcnode']['boot_state']
 
-               print "%(hostname)-39s | %(checked)11.11s | %(boot_state)5.5s| %(state)8.8s | %(ssh)5.5s | %(pcu)6.6s | %(bootcd)6.6s | %(category)8.8s | %(kernel)s" % fbnode
+               if len(fbnode['nodegroups']) > 0:
+                       fbnode['category'] = fbnode['nodegroups'][0]
+
+               print "%(hostname)-45s | %(checked)11.11s | %(boot_state)5.5s| %(state)8.8s | %(ssh)5.5s | %(pcu)6.6s | %(bootcd)6.6s | %(category)8.8s | %(kernel)s" % fbnode
        else:
                format = ""
                for f in fields:
@@ -143,7 +146,13 @@ def verify(constraints, data):
                        #print "looking at key: %s" % key
                        if key in data: 
                                value_re = re.compile(con[key])
-                               con_and_true = con_and_true & (value_re.search(data[key]) is not None)
+                               if type([]) == type(data[key]):
+                                       local_or_true = False
+                                       for val in data[key]:
+                                               local_or_true = local_or_true | (value_re.search(val) is not None)
+                                       con_and_true = con_and_true & local_or_true
+                               else:
+                                       con_and_true = con_and_true & (value_re.search(data[key]) is not None)
                        elif key not in data:
                                print "missing key %s" % key,
                                pass
@@ -180,10 +189,17 @@ def pcu_in(fbdata):
        return False
 
 def pcu_select(str_query, nodelist=None):
+       global fb
+       global fbpcu
        pcunames = []
        nodenames = []
        if str_query is None: return (nodenames, pcunames)
 
+       if fb is None:
+               fb = database.dbLoad("findbad")
+       if fbpcu is None:
+               fbpcu = database.dbLoad("findbadpcus")
+
        #print str_query
        dict_query = query_to_dict(str_query)
        #print dict_query
@@ -199,7 +215,8 @@ def pcu_select(str_query, nodelist=None):
                                nodenames.append(node)
                                str = "cmdhttps/locfg.pl -s %s -f iloxml/License.xml -u %s -p '%s' | grep MESSAGE" % \
                                                        (pcu_name(pcuinfo), pcuinfo['username'], pcuinfo['password'])
-                               pcunames.append(str)
+                               #pcunames.append(str)
+                               pcunames.append(pcuinfo['pcu_id'])
        return (nodenames, pcunames)
 
 def node_select(str_query, nodelist=None, fbdb=None):
index 337b0b4..e876a76 100755 (executable)
--- a/reboot.py
+++ b/reboot.py
@@ -704,9 +704,8 @@ class BayTechCtrlCUnibe(PCUControl):
 
                # Control Outlets  (5 ,1).........5
                try:
-                       print s
-                       print "Enter Request" in s.before
-                       index = s.expect("Enter Request")
+                       #index = s.expect("Enter Request")
+                       index = s.expect(["Enter Request :"])
 
                        if index == 0:
                                print "3"
@@ -720,7 +719,8 @@ class BayTechCtrlCUnibe(PCUControl):
                                        print "Reboot %d" % node_port
                                        s.send("Reboot %d\r\n" % node_port)
 
-                                       index = s.expect(["(Y/N)?"])
+                                       time.sleep(5)
+                                       index = s.expect(["\(Y/N\)\?", "Port in use", "DS-RPC>"])
                                        if index == 0:
                                                if dryrun:
                                                        print "sending N"
@@ -728,16 +728,21 @@ class BayTechCtrlCUnibe(PCUControl):
                                                else:
                                                        print "sending Y"
                                                        s.send("Y\r\n")
+                                       elif index == 1:
+                                               raise ExceptionPrompt("PCU Reported 'Port in use.'")
+                                       elif index == 2:
+                                               raise ExceptionSequence("Issued command 'Reboot' failed.")
 
-                               #index = s.expect(["DS-RPC>"])
+                               time.sleep(5)
+                               index = s.expect(["DS-RPC>"])
                                #print "got prompt back"
 
                        s.close()
 
                except pexpect.EOF:
-                       raise ExceptionPrompt("EOF before 'Enter Request' Prompt")
+                       raise ExceptionPrompt("EOF before expected Prompt")
                except pexpect.TIMEOUT:
-                       raise ExceptionPrompt("Timeout before 'Enter Request' Prompt")
+                       raise ExceptionPrompt("Timeout before expected Prompt")
 
                return 0
 
@@ -757,40 +762,54 @@ class BayTechCtrlC(PCUControl):
                # Otherwise, the login succeeded.
 
                # Send a ctrl-c to the remote process.
-               print "sending ctrl-c"
+               print "SENDING ctrl-c"
                s.send(chr(3))
 
                # Control Outlets  (5 ,1).........5
                try:
+                       print "EXPECTING: ", "Enter Request :"
                        index = s.expect(["Enter Request :"])
 
                        if index == 0:
-                               print "5"
+                               print "SENDING: 5"
                                s.send("5\r\n")
-                               index = s.expect(["DS-RPC>", "Enter user name:"])
+                               print "EXPECTING: ", "DS-RPC>"
+                               index = s.expect(["DS-RPC>", "Enter user name:", "Port in use."])
                                if index == 1:
                                        print "sending username"
                                        s.send(self.username + "\r\n")
                                        index = s.expect(["DS-RPC>"])
+                               elif index == 2:
+                                       raise ExceptionPrompt("PCU Reported 'Port in use.'")
 
                                if index == 0:
-                                       print "Reboot %d" % node_port
+                                       print "SENDING: Reboot %d" % node_port
                                        s.send("Reboot %d\r\n" % node_port)
 
-                                       index = s.expect(["(Y/N)?"])
+                                       print "SLEEPING: 5"
+                                       time.sleep(5)
+                                       print "EXPECTING: ", "Y/N?"
+                                       index = s.expect(["\(Y/N\)\?", "Port in use", "DS-RPC>"])
                                        if index == 0:
                                                if dryrun:
                                                        print "sending N"
                                                        s.send("N\r\n")
                                                else:
-                                                       print "sending Y"
+                                                       print "SENDING: Y"
                                                        s.send("Y\r\n")
+                                       elif index == 1:
+                                               raise ExceptionPrompt("PCU Reported 'Port in use.'")
+                                       elif index == 2:
+                                               raise ExceptionSequence("Issued command 'Reboot' failed.")
 
                                # NOTE: for some reason, the script times out with the
                                # following line.  In manual tests, it works correctly, but
                                # with automated tests, evidently it fails.
-                               #index = s.expect(["DS-RPC>"])
-                               #print "got prompt back"
+                               print "SLEEPING: 5"
+                               time.sleep(5)
+                               #print "TOTAL--", s.allstr, "--EOT"
+                               index = s.expect(["DS-RPC>"])
+                               print "got prompt back"
 
                        s.close()
 
@@ -817,6 +836,7 @@ class BayTech(PCUControl):
                        # even after login...
                        print "msg: %s" % msg
                        self.transport.write(self.username + "\r\n")
+                       time.sleep(5)
                        self.ifThenSend("DS-RPC>", "Reboot %d" % node_port)
 
                # Reboot Outlet  N        (Y/N)?
@@ -824,6 +844,7 @@ class BayTech(PCUControl):
                        self.ifThenSend("(Y/N)?", "N")
                else:
                        self.ifThenSend("(Y/N)?", "Y")
+               time.sleep(5)
                self.ifThenSend("DS-RPC>", "")
 
                self.close()
@@ -1227,7 +1248,7 @@ def reboot_test(nodename, values, continue_probe, verbose, dryrun):
                        print values
 
                        # TODO: make a more robust version of APC
-                       if values['pcu_id'] in [1163,1055,1111,1231,1113,1127,1128,1148]:
+                       if values['pcu_id'] in [1102,1163,1055,1111,1231,1113,1127,1128,1148]:
                                apc = APCEurope(values, verbose, ['22', '23'])
                                rb_ret = apc.reboot(values[nodename], dryrun)
 
@@ -1235,11 +1256,11 @@ def reboot_test(nodename, values, continue_probe, verbose, dryrun):
                                apc = APCBrazil(values, verbose, ['22', '23'])
                                rb_ret = apc.reboot(values[nodename], dryrun)
 
-                       elif values['pcu_id'] in [1221,1225]:
+                       elif values['pcu_id'] in [1221,1225,1220]:
                                apc = APCBerlin(values, verbose, ['22', '23'])
                                rb_ret = apc.reboot(values[nodename], dryrun)
 
-                       elif values['pcu_id'] in [1173,1221,1220]:
+                       elif values['pcu_id'] in [1173,1240]:
                                apc = APCFolsom(values, verbose, ['22', '23'])
                                rb_ret = apc.reboot(values[nodename], dryrun)
 
@@ -1249,7 +1270,7 @@ def reboot_test(nodename, values, continue_probe, verbose, dryrun):
 
                # BayTech DS4-RPC
                elif continue_probe and values['model'].find("DS4-RPC") >= 0:
-                       if values['pcu_id'] in [1237,1052,1209,1002,1008,1041,1013,1022]:
+                       if values['pcu_id'] in [1056,1237,1052,1209,1002,1008,1041,1013,1022]:
                                # These  require a 'ctrl-c' to be sent... 
                                baytech = BayTechCtrlC(values, verbose, ['22', '23'])
                                rb_ret = baytech.reboot(values[nodename], dryrun)
index 19ee230..4eab532 100644 (file)
@@ -342,6 +342,7 @@ class spawn (object):
         self.env = env
         self.__irix_hack = sys.platform.lower().find('irix') >= 0 # This flags if we are running on irix
         self.use_native_pty_fork = not (sys.platform.lower().find('solaris') >= 0) # Solaris uses internal __fork_pty(). All other use pty.fork().
+        self.allstr = ""
 
         # allow dummy instances for subclasses that may not use command or args.
         if command is None:
@@ -1108,6 +1109,7 @@ class spawn (object):
                     self.buffer = incoming[self.match.end() : ]
                     self.before = incoming[ : self.match.start()]
                     self.after = incoming[self.match.start() : self.match.end()]
+                    #print "MATCH--", self.after, "--EOM"
                     return self.match_index
                 # No match at this point
                 if timeout < 0 and timeout is not None:
@@ -1116,6 +1118,8 @@ class spawn (object):
                 c = self.read_nonblocking (self.maxread, timeout)
                 time.sleep (0.0001)
                 incoming = incoming + c
+                self.allstr += c
+                #print "INCOMING--", c, "--EOI"
                 if timeout is not None:
                     timeout = end_time - time.time()
         except EOF, e:
index acc89d8..8c5fb7f 100755 (executable)
@@ -91,7 +91,7 @@ class RT(object):
                return self.status
 
        def closeTicket(self):
-               mailer.closeTicketViaRT(self.ticket_id) 
+               mailer.closeTicketViaRT(self.ticket_id, "Ticket CLOSED automatically by SiteAssist."
 
        def email(self, subject, body, to):
                self.ticket_id = mailer.emailViaRT(subject, body, to, self.ticket_id)
@@ -229,10 +229,10 @@ class PersistMessage(Message):
 
                #print pm
                if id in pm:
-                       print "Using existing object"
+                       #print "Using existing object"
                        obj = pm[id]
                else:
-                       print "creating new object"
+                       #print "creating new object"
                        obj = super(PersistMessage, typ).__new__(typ, [id, subject, message, via_rt], **kwargs)
                        obj.id = id
                        obj.actiontracker = Recent(3*60*60*24)
@@ -252,18 +252,19 @@ class PersistMessage(Message):
        def reset(self):
                self.actiontracker.unsetRecent()
 
+       def save(self):
+               pm = database.dbLoad(self.db)
+               pm[self.id] = self
+               database.dbDump(self.db, pm)
+
        def send(self, to):
                if not self.actiontracker.isRecent():
                        self.ticket_id = Message.send(self, to)
                        self.actiontracker.setRecent()
-
-                       #print "recording object for persistance"
-                       pm = database.dbLoad(self.db)
-                       pm[self.id] = self
-                       database.dbDump(self.db, pm)
+                       self.save()
                else:
                        # NOTE: only send a new message every week, regardless.
-                       print "Not sending to host b/c not within window of %s days" % (self.actiontracker.withintime // 60*60*24)
+                       print "Not sending to host b/c not within window of %s days" % (self.actiontracker.withintime // (60*60*24))
 
 class MonitorMessage(object):
        def __new__(typ, id, *args, **kwargs):
@@ -427,6 +428,7 @@ class Record(object):
        def severity(self):
                category = self.data['category']
                prev_category = self.data['prev_category']
+               #print "SEVERITY: ", category, prev_category
                val = cmpCategoryVal(category, prev_category)
                return val 
 
@@ -504,33 +506,46 @@ class Record(object):
 
        def takeAction(self):
                pp = PersistSitePenalty(self.hostname, 0, db='persistpenalty_hostnames')
-               if 'improvement' in self.data['stage'] or self.improved():
-                       print "decreasing penalty for %s"%self.hostname
+               if 'improvement' in self.data['stage'] or self.improved() or \
+                       'monitor-end-record' in self.data['stage']:
+                       print "takeAction: decreasing penalty for %s"%self.hostname
+                       pp.decrease()
                        pp.decrease()
                else:
-                       print "increasing penalty for %s"%self.hostname
+                       print "takeAction: increasing penalty for %s"%self.hostname
                        pp.increase()
                pp.apply(self.hostname)
                pp.save()
 
        def _format_diaginfo(self):
                info = self.data['info']
+               print "FORMAT : STAGE: ", self.data['stage']
                if self.data['stage'] == 'monitor-end-record':
+                       if info[2] == "ALPHA": info = (info[0], info[1], "PROD")
                        hlist = "    %s went from '%s' to '%s'\n" % (info[0], info[1], info[2]) 
                else:
                        hlist = "    %s %s - %s\n" % (info[0], info[2], info[1]) #(node,ver,daysdn)
                return hlist
+       def saveAction(self):
+               if 'save-act-all' in self.data and self.data['save-act-all'] == True:
+                       return True
+               else:
+                       return False
 
        def getMessage(self, ticket_id=None):
                self.data['args']['hostname'] = self.hostname
                self.data['args']['loginbase'] = self.loginbase
                self.data['args']['hostname_list'] = self._format_diaginfo()
-               message = PersistMessage(self.hostname, 
+               #print self.data['message']
+               if self.data['message']:
+                       message = PersistMessage(self.hostname, 
                                                                 self.data['message'][0] % self.data['args'],
                                                                 self.data['message'][1] % self.data['args'],
                                                                 True, db='monitor_persistmessages',
                                                                 ticket_id=ticket_id)
-               return message
+                       return message
+               else:
+                       return None
        
        def getContacts(self):
                roles = self.data['email']
@@ -579,6 +594,7 @@ class NodeRecord:
        def severity(self):
                category = self.data['category']
                prev_category = self.data['prev_category']
+               print "IMPROVED: ", category, prev_category
                val = cmpCategoryVal(category, prev_category)
                return val 
 
@@ -659,6 +675,15 @@ def node_end_record(node):
                del act_all
                return False
 
+       pm = database.dbLoad("monitor_persistmessages")
+       if node not in pm:
+               del pm
+               return False
+       else:
+               print "deleting node record"
+               del pm[node]
+               database.dbDump("monitor_persistmessages", pm)
+
        a = Action(node, act_all[node][0])
        a.delField('rt')
        a.delField('found_rt_ticket')
@@ -667,8 +692,9 @@ def node_end_record(node):
        a.delField('first-found')
        rec = a.get()
        rec['action'] = ["close_rt"]
-       rec['category'] = "UNKNOWN"
+       rec['category'] = "ALPHA"       # assume that it's up...
        rec['stage'] = "monitor-end-record"
+       rec['ticket_id'] = None
        rec['time'] = time.time() - 7*60*60*24
        act_all[node].insert(0,rec)
        database.dbDump("act_all", act_all)