update of all changes in the last week that fine-tuned the behavior of Monitor

author Stephen Soltesz <soltesz@cs.princeton.edu>

Tue, 26 Aug 2008 02:02:06 +0000 (02:02 +0000)

committer Stephen Soltesz <soltesz@cs.princeton.edu>

Tue, 26 Aug 2008 02:02:06 +0000 (02:02 +0000)
author Stephen Soltesz <soltesz@cs.princeton.edu>
Tue, 26 Aug 2008 02:02:06 +0000 (02:02 +0000)
committer Stephen Soltesz <soltesz@cs.princeton.edu>
Tue, 26 Aug 2008 02:02:06 +0000 (02:02 +0000)
diff --git a/bootman.py b/bootman.py

index a278afe..87d8b71 100755 (executable)
--- a/bootman.py
+++ b/bootman.py
@@ -34,9 +34,12 @@ class Sopen(subprocess.Popen):
  #from Rpyc import SocketConnection, Async
  from Rpyc import SocketConnection, Async
  from Rpyc.Utils import *
+fb = None
  
  def get_fbnode(node):
-       fb = database.dbLoad("findbad")
+       global fb
+       if fb is None:
+               fb = database.dbLoad("findbad")
         fbnode = fb['nodes'][node]['values']
         return fbnode
  
@@ -359,7 +362,6 @@ def reboot(hostname, config=None, forced_action=None):
                 except:
                         print traceback.print_exc()
                         return False
-                       
  
         if forced_action == "reboot":
                 conn.restart_node('rins')
diff --git a/clean_policy.py b/clean_policy.py

index d2bde41..a14016e 100644 (file)
--- a/clean_policy.py
+++ b/clean_policy.py
@@ -31,6 +31,9 @@ def get_ticket_id(record):
                 return None
  
  class MonitorMergeDiagnoseSendEscellate:
+       act_all = None
+       fb = None
+
         def __init__(self, hostname, act):
                 self.hostname = hostname
                 self.act = act
@@ -41,7 +44,11 @@ class MonitorMergeDiagnoseSendEscellate:
                 return
  
         def getFBRecord(self):
-               fb = database.dbLoad("findbad")
+               if MonitorMergeDiagnoseSendEscellate.fb == None:
+                       MonitorMergeDiagnoseSendEscellate.fb = database.dbLoad("findbad")
+
+               fb = MonitorMergeDiagnoseSendEscellate.fb
+
                 if self.hostname in fb['nodes']:
                         fbnode = fb['nodes'][self.hostname]['values']
                 else:
@@ -50,12 +57,15 @@ class MonitorMergeDiagnoseSendEscellate:
  
         def getActionRecord(self):
                 # update ticket status
-               act_all = database.dbLoad("act_all")
+               if MonitorMergeDiagnoseSendEscellate.act_all == None:
+                       MonitorMergeDiagnoseSendEscellate.act_all = database.dbLoad("act_all")
+
+               act_all = MonitorMergeDiagnoseSendEscellate.act_all 
+
                 if self.hostname in act_all and len(act_all[self.hostname]) > 0:
                         actnode = act_all[self.hostname][0]
                 else:
                         actnode = None
-               del act_all
                 return actnode
  
         def getKernel(self, unamestr):
@@ -73,13 +83,15 @@ class MonitorMergeDiagnoseSendEscellate:
                 fbnode['info'] = None
                 fbnode['log'] = None
                 fbnode['time'] = time.time()
+               fbnode['email'] = TECH
+               fbnode['action'] = ['noop']
                 fbnode['date_created'] = time.time()
  
-               if actnode is None:
+               if actnode is None: # there is no entry in act_all
                         actnode = {} 
                         actnode.update(fbnode)
                         actnode['ticket_id'] = ""
-                       actnode['prev_category'] = "NORECORD" 
+                       actnode['prev_category'] = "ERROR" 
                 else:
                         actnode['prev_category']= actnode['category']
                         actnode['comonstats']   = fbnode['comonstats']
@@ -111,29 +123,40 @@ class MonitorMergeDiagnoseSendEscellate:
  
                 diag = PersistFlags(record.hostname, 60*60*24, db='persist_diagnose_flags')
                 # NOTE: change record stage based on RT status.
-               diag.setFlag('ResetStage')
+               #diag.setFlag('ResetStage')
                 if record.stageIswaitforever():
                         ticket = record.data['rt']
                         if 'new' in ticket['Status']:
-                               diag.setFlag('ResetStage')
+                               print "Resetting Stage!!!!!"
+                       #       diag.setFlag('ResetStage')
+                               record.reset_stage()
+                       #if diag.getFlag('ResetStage'):
+                       #       print "diagnose: resetting stage"
+                       #       diag.resetFlag('ResetStage')
                                 
                         if 'resolved' in ticket['Status']:
-                               diag.setFlag('EndRecord')
+                               diag.setFlag('RTEndRecord')
  
                 # NOTE: take category, and prepare action
                 category = record.getCategory()
                 if category == "error":
                         diag.setFlag('SendNodedown')
-                       record.data['message'] = emailTxt.mailtxt.newdown
+                       record.data['message_series'] = emailTxt.mailtxt.newdown
                         record.data['log'] = self.getDownLog(record)
  
-               elif category == "prod":
+               elif category == "prod" or category == "alpha":
                         state = record.getState()
                         if state == "boot":
-                               diag.setFlag('SendThankyou')
-                               record.data['message'] = emailTxt.mailtxt.newthankyou
-                               record.data['log'] = self.getThankyouLog(record)
-
+                               if record.severity() != 0:
+                                       diag.setFlag('SendThankyou')
+                                       print "RESETTING STAGE: improvement"
+                                       record.data['stage'] = 'improvement'
+                                       record.data['message_series'] = emailTxt.mailtxt.newthankyou
+                                       record.data['log'] = self.getThankyouLog(record)
+                               else:
+                                       # NOTE: do nothing, since we've already done the above.
+                                       print "DIAGNOSED: %s is boot. no further action necessary." % record.hostname
+                                       return None
                         elif state == "debug":
                                 pass
                         else:
@@ -141,55 +164,79 @@ class MonitorMergeDiagnoseSendEscellate:
                 else:
                         print "unknown category: %s" % category
  
-               if diag.getFlag('ResetStage'):
-                       print "resetting stage"
-                       record.reset_stage()
  
+               # TODO: how to not send email?...
                 record = self.checkStageAndTime(diag,record)
-               if record:
-                       print "checkStageAndTime Returned Valid Record"
-                       site = PersistFlags(self.loginbase, 1, db='site_persistflags')
+               #if record:
+               print "diagnose: checkStageAndTime Returned Valid Record"
+               site = PersistFlags(self.loginbase, 1, db='site_persistflags')
  
-                       if site.status is not "good":
-                               print "Setting site %s for 'squeeze'" % self.loginbase
-                               diag.setFlag('Squeeze')
-                       else:
-                               print "Setting site %s for 'backoff'" % self.loginbase
-                               diag.setFlag('BackOff')
-
-                       diag.save()
-                       return diag
+               if site.status != "good":
+                       print "diagnose: Setting site %s for 'squeeze'" % self.loginbase
+                       diag.setFlag('Squeeze')
                 else:
-                       print "checkStageAndTime Returned NULL Record"
-                       return None
+                       print "diagnose: Setting site %s for 'backoff'" % self.loginbase
+                       diag.setFlag('BackOff')
+
+               diag.save()
+               return diag
+               #else:
+               #       print "checkStageAndTime Returned NULL Record"
+               #       return None
  
         def action(self, record, diag):
-               if record.improved() or diag.getFlag('EndRecord'):
-                       print "end record for %s" % self.hostname
-                       record.end_record()
-                       diag.setFlag('CloseRT')
-                       return None
-
-               if self.getSendEmailFlag(record): 
-                       print "sending email"
+
+               message = None
+
+               #print record.data['stage']
+               #print "improvement" in record.data['stage']
+               #print self.getSendEmailFlag(record)
+               if self.getSendEmailFlag(record) or "monitor-end-record" in record.data['stage']: 
+                       print "action: getting message"
                         message = record.getMessage(record.data['ticket_id'])
-                       message.reset()
-                       message.send(record.getContacts())
-                       if message.rt.ticket_id:
-                               print "setting record ticket_id"
-                               record.data['ticket_id'] = message.rt.ticket_id
-                       if diag.getFlag('CloseRT'):
-                               message.rt.closeTicket()
+                       if message:
+                               #message.reset()
+                               print "action: sending email"
+                               message.send(record.getContacts())
+                               #print "DEBUG NOT SENDING MESSAGE WHEN I SHOULD BE!!!!!"
+                               #print "DEBUG NOT SENDING MESSAGE WHEN I SHOULD BE!!!!!"
+                               #print "DEBUG NOT SENDING MESSAGE WHEN I SHOULD BE!!!!!"
+                               #print message
+                               if message.rt.ticket_id:
+                                       print "action: setting record ticket_id"
+                                       record.data['ticket_id'] = message.rt.ticket_id
+
+                       if (record.data['takeaction'] and diag.getFlag('Squeeze') ) or diag.getFlag('BackOff'):
+                               print "action: taking action"
+                               record.takeAction()
+                               diag.resetFlag('Squeeze')
+                               diag.resetFlag('BackOff')
+                               diag.save()
+
+                       if record.saveAction():
+                               print "action: saving act_all db"
+                               self.add_and_save_act_all(record)
+                       else:
+                               print "action: NOT saving act_all db"
+                               print "stage: %s %s" % ( record.data['stage'], record.data['save-act-all'] )
+
+                       if record.improved() or diag.getFlag('RTEndRecord'):
+                               print "action: end record for %s" % self.hostname
+                               record.end_record()
+                               diag.setFlag('CloseRT')
+                               diag.resetFlag('RTEndRecord')
+                               diag.save()
+                               #return None
+
+                       if message:
+                               if diag.getFlag('CloseRT'):
+                                       message.rt.closeTicket()
+                                       diag.resetFlag('CloseRT')
+                                       diag.save()
+
                 else:
                         print "NOT sending email : %s %s" % (config.mail, record.data['rt'])
  
-               if record.data['takeaction'] and diag.getFlag('Squeeze'):
-                       print "taking action"
-                       record.takeAction()
-
-               print "saving act_all db"
-               self.add_and_save_act_all(record)
-
                 return
  
         def getSendEmailFlag(self, record):
@@ -200,13 +247,16 @@ class MonitorMergeDiagnoseSendEscellate:
                 if  'rt' in record.data and \
                         'Status' in record.data['rt'] and \
                         "open" in record.data['rt']['Status'] and \
-                       record.data['rt']['Created'] < 60*60*24*30:
+                       record.data['rt']['Created'] > int(time.time() - 60*60*24*30):
+                       # true when the ticket's created-time is within the last thirty days
                         return False
  
                 return True
  
         def add_and_save_act_all(self, record):
                 self.act_all = database.dbLoad("act_all")
+               if self.hostname not in self.act_all:
+                       self.act_all[self.hostname] = []
                 self.act_all[self.hostname].insert(0,record.data)
                 database.dbDump("act_all", self.act_all)
                 
@@ -218,7 +268,7 @@ class MonitorMergeDiagnoseSendEscellate:
                 #for key in record.data.keys():
                 #       print "%10s %s %s " % (key, "==", record.data[key])
  
-               if record.data['ticket_id'] == "":
+               if record.data['ticket_id'] == "" and 'found_rt_ticket' in record.data:
                         log = "DOWN: %20s : %-40s == %20s %s" % \
                                 (self.loginbase, self.hostname, record.data['info'][1:], record.data['found_rt_ticket'])
                 else:
@@ -231,79 +281,99 @@ class MonitorMergeDiagnoseSendEscellate:
                 record.data['args'] = {'nodename': self.hostname}
                 record.data['info'] = (self.hostname, record.data['prev_category'], record.data['category'])
  
-               if record.data['ticket_id'] == "":
-                       log = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
+               try:
+                       if record.data['ticket_id'] == "" and 'found_rt_ticket' in record.data:
+                               log = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
                                                 (self.loginbase, self.hostname, record.data['stage'], 
-                                                state, category, record.data['found_rt_ticket'])
-               else:
-                       log = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
+                                                record.data['prev_category'], record.data['category'], record.data['found_rt_ticket'])
+                       else:
+                               log = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
                                                 (self.loginbase, self.hostname, record.data['stage'], 
-                                                state, category, record.data['ticket_id'])
+                                                record.data['prev_category'], record.data['category'], record.data['ticket_id'])
+               except:
+                       log = "IMPR: %s improved to %s " % (self.hostname, record.data['category'])
                 return log
  
         def checkStageAndTime(self, diag, record):
                 current_time = time.time()
                 delta = current_time - record.data['time']
+               #print record.data
                 if   'findbad' in record.data['stage']:
                         # The node is bad, and there's no previous record of it.
                         record.data['email'] = TECH
                         record.data['action'] = ['noop']
                         record.data['takeaction'] = False
-                       record.data['message'] = record.data['message'][0]
+                       record.data['message'] = record.data['message_series'][0]
                         record.data['stage'] = 'stage_actinoneweek'
+                       record.data['save-act-all'] = True
  
                 elif 'reboot_node' in record.data['stage']:
                         record.data['email'] = TECH
                         record.data['action'] = ['noop']
-                       record.data['message'] = record.data['message'][0]
+                       record.data['message'] = record.data['message_series'][0]
                         record.data['stage'] = 'stage_actinoneweek'
                         record.data['takeaction'] = False
+                       record.data['save-act-all'] = False
                         
                 elif 'improvement' in record.data['stage']:
-                       print "backing off of %s" % self.hostname
+                       print "checkStageAndTime: backing off of %s" % self.hostname
                         record.data['action'] = ['close_rt']
                         record.data['takeaction'] = True
-                       record.data['message'] = record.data['message'][0]
+                       record.data['message'] = record.data['message_series'][0]
                         record.data['stage'] = 'monitor-end-record'
+                       record.data['save-act-all'] = True
  
                 elif 'actinoneweek' in record.data['stage']:
                         if delta >= 7 * SPERDAY: 
+                               print "checkStageAndTime: transition to next stage actintwoweeks"
                                 record.data['email'] = TECH | PI
                                 record.data['stage'] = 'stage_actintwoweeks'
-                               record.data['message'] = record.data['message'][1]
+                               record.data['message'] = record.data['message_series'][1]
                                 record.data['action'] = ['nocreate' ]
                                 record.data['time'] = current_time              # reset clock for waitforever
                                 record.data['takeaction'] = True
+                               record.data['save-act-all'] = True
                         elif delta >= 3* SPERDAY and not 'second-mail-at-oneweek' in record.data:
+                               print "checkStageAndTime: second message in one week"
                                 record.data['email'] = TECH 
-                               record.data['message'] = record.data['message'][0]
+                               record.data['message'] = record.data['message_series'][0]
                                 record.data['action'] = ['sendmailagain-waitforoneweekaction' ]
                                 record.data['second-mail-at-oneweek'] = True
                                 record.data['takeaction'] = False
+                               record.data['save-act-all'] = True
                         else:
                                 record.data['message'] = None
                                 record.data['action'] = ['waitforoneweekaction' ]
-                               print "ignoring this record for: %s" % self.hostname
-                               return None                     # don't send if there's no action
+                               record.data['takeaction'] = False
+                               record.data['save-act-all'] = False
+                               print "checkStageAndTime: ignoring this record for: %s" % self.hostname
+                               #return None                    # don't send if there's no action
  
                 elif 'actintwoweeks' in record.data['stage']:
                         if delta >= 7 * SPERDAY:
+                               print "checkStageAndTime: transition to next stage waitforever"
                                 record.data['email'] = TECH | PI | USER
                                 record.data['stage'] = 'stage_waitforever'
-                               record.data['message'] = record.data['message'][2]
+                               record.data['message'] = record.data['message_series'][2]
                                 record.data['action'] = ['suspendslices']
                                 record.data['time'] = current_time              # reset clock for waitforever
                                 record.data['takeaction'] = True
+                               record.data['save-act-all'] = True
                         elif delta >= 3* SPERDAY and not 'second-mail-at-twoweeks' in record.data:
+                               print "checkStageAndTime: second message in one week for stage two"
                                 record.data['email'] = TECH | PI
-                               record.data['message'] = record.data['message'][1]
+                               record.data['message'] = record.data['message_series'][1]
                                 record.data['action'] = ['sendmailagain-waitfortwoweeksaction' ]
                                 record.data['second-mail-at-twoweeks'] = True
                                 record.data['takeaction'] = False
+                               record.data['save-act-all'] = True
                         else:
                                 record.data['message'] = None
+                               record.data['takeaction'] = False
                                 record.data['action'] = ['waitfortwoweeksaction']
-                               return None                     # don't send if there's no action
+                               record.data['save-act-all'] = False
+                               print "checkStageAndTime: second message in one week for stage two"
+                               #return None                    # don't send if there's no action
  
                 elif 'ticket_waitforever' in record.data['stage']:
                         record.data['email'] = TECH
@@ -314,14 +384,18 @@ class MonitorMergeDiagnoseSendEscellate:
                                 record.data['action'] = ['ticket_waitforever']
                                 record.data['message'] = None
                                 record.data['time'] = current_time
+                               record.data['save-act-all'] = True
                         else:
                                 if delta >= 7*SPERDAY:
                                         record.data['action'] = ['ticket_waitforever']
                                         record.data['message'] = None
                                         record.data['time'] = current_time              # reset clock
+                                       record.data['save-act-all'] = True
                                 else:
                                         record.data['action'] = ['ticket_waitforever']
                                         record.data['message'] = None
+                                       record.data['takeaction'] = False
+                                       record.data['save-act-all'] = False
                                         return None
  
                 elif 'waitforever' in record.data['stage']:
@@ -331,12 +405,15 @@ class MonitorMergeDiagnoseSendEscellate:
                         record.data['takeaction'] = True
                         if delta >= 3*SPERDAY:
                                 record.data['action'] = ['email-againwaitforever']
-                               record.data['message'] = record.data['message'][2]
+                               record.data['message'] = record.data['message_series'][2]
                                 record.data['time'] = current_time              # reset clock
+                               record.data['save-act-all'] = True
                         else:
                                 record.data['action'] = ['waitforever']
                                 record.data['message'] = None
-                               return None                     # don't send if there's no action
+                               record.data['takeaction'] = False
+                               record.data['save-act-all'] = False
+                               #return None                    # don't send if there's no action
  
                 else:
                         # There is no action to be taken, possibly b/c the stage has
@@ -347,14 +424,15 @@ class MonitorMergeDiagnoseSendEscellate:
                         # TODO: figure out which. for now assume 2.
                         print "UNKNOWN stage for %s; nothing done" % self.hostname
                         record.data['action'] = ['unknown']
-                       record.data['message'] = record.data['message'][0]
+                       record.data['message'] = record.data['message_series'][0]
  
                         record.data['email'] = TECH
                         record.data['action'] = ['noop']
-                       record.data['message'] = record.data['message'][0]
+                       record.data['message'] = record.data['message_series'][0]
                         record.data['stage'] = 'stage_actinoneweek'
                         record.data['time'] = current_time              # reset clock
                         record.data['takeaction'] = False
+                       record.data['save-act-all'] = True
  
                 print "%s" % record.data['log'],
                 print "%15s" % record.data['action']
diff --git a/emailTxt.py b/emailTxt.py

index c2e147f..cfbf112 100644 (file)
--- a/emailTxt.py
+++ b/emailTxt.py
@@ -30,7 +30,7 @@ If the machine has booted successfully, you may check it more quickly by logging
  
      sudo /usr/sbin/vps ax
  
-If you have a BootCD older than 3.0, you will need to create burn a new BootImage to CD or USB.  You can find instructions for this at the Technical Contact's Guide:
+If you have a BootCD older than 3.0, you will need to create a new BootImage on CD or USB.  You can find instructions for this at the Technical Contact's Guide:
  
      https://www.planet-lab.org/doc/guides/bootcdsetup
  
@@ -204,24 +204,32 @@ Monitor restarted NM on the following machines:
         """)
         pcudown_one =("""Could not use PCU to reboot %(hostname)s""",
  
-"""As part of our machine monitoring and maintenance, we tried to use the PCU
-registered below, but could not for the following reason at the link below:
+"""As part of PlanetLab node monitoring and maintenance, we tried to use the PCU
+registered below, but could not for the reasons at the link below:
  
         https://monitor.planet-lab.org/cgi-bin/printbadpcus.php?id=%(pcu_id)s
  
-We need your help resolving this issue in two ways:  
+We need your help resolving this issue in a few ways:  
+
+ 1. First, we need your help rebooting %(hostname)s.  Because the above PCU does 
+    not appear to work, please manually reboot this machine.  If it turns out that 
+    there is a problem with the PCU configuration, we can help you
+    resolve that independently.
  
-* First, we need your help rebooting %(hostname)s.  Because we cannot leverage
-  the above PCU, please manually reboot this machine and we can help you
-  resolve any configuration errors with the PCU independently.
+ 2. If there is nothing apparently wrong with the PCU, or the mapping between
+    the PCU and the host, then there is likely a problem with our bootstrap
+    software on your machine.  To help us, please make a note of any text on
+    the console and report it to mailto:support@planet-lab.org .  An example
+    might be that the console hangs waiting for a module to unload.  The last
+    reported name or any error messages on the screen would be very helpful.
  
-* Second, if it is possible, please correcct the above PCU problem.  
-  By enabling us to take administrative actions automatically from
-  PlanetLab Central without local intervention, you can trade a small
-  amount of time now for a time savings in the future. 
+ 3. Alternately, if it is possible, please correcct the above PCU problem, or
+    let us know what steps you are taking.  By enabling us to take administrative 
+    actions automatically from PlanetLab Central without your intervention, you 
+    can trade a small amount of time now for a time savings in the future. 
  
  If the PCU is up and running, but behind a firewall, please make it accessible
-from address block 128.112.139.0/25.  You can confirm that this is the address
+from address block 128.112.139.0/24.  You can confirm that this is the address
  space from which the PlanetLab Central servers run.
  
  If the above PCU is no longer in service, please delete it by visiting:
diff --git a/findbadpcu.py b/findbadpcu.py

index e3d160d..ca65344 100755 (executable)
--- a/findbadpcu.py
+++ b/findbadpcu.py
@@ -12,6 +12,7 @@ import sets
      
  import signal
  import traceback
+from nodequery import pcu_select
  
  #old_handler = signal.getsignal(signal.SIGCHLD)
  
@@ -329,7 +330,7 @@ def checkAndRecordState(l_pcus, cohash):
         global count
         global_round = externalState['round']
  
-       tp = threadpool.ThreadPool(20)
+       tp = threadpool.ThreadPool(10)
  
         # CREATE all the work requests
         for pcuname in l_pcus:
@@ -390,6 +391,11 @@ def main():
                 pcus = []
                 for node in l_nodes:
                         pcus += node['pcu_ids']
+               # clear out dups.
+               l_pcus = [pcu for pcu in sets.Set(pcus)]
+       elif config.pcuselect is not None:
+               n, pcus = pcu_select(config.pcuselect)
+               # clear out dups.
                 l_pcus = [pcu for pcu in sets.Set(pcus)]
  
         elif config.nodelist == None and config.pcuid == None:
@@ -421,6 +427,7 @@ if __name__ == '__main__':
         parser.set_defaults(nodelist=None, 
                                                 increment=False, 
                                                 pcuid=None,
+                                               pcuselect=None,
                                                 site=None,
                                                 dbname="findbadpcus", 
                                                 cachenodes=False,
@@ -430,6 +437,8 @@ if __name__ == '__main__':
                                                 help="Provide the input file for the node list")
         parser.add_option("", "--site", dest="site", metavar="FILE", 
                                                 help="Get all pcus associated with the given site's nodes")
+       parser.add_option("", "--pcuselect", dest="pcuselect", metavar="FILE", 
+                                               help="Query string to apply to the findbad pcus")
         parser.add_option("", "--pcuid", dest="pcuid", metavar="id", 
                                                 help="Provide the id for a single pcu")
  
diff --git a/grouprins.py b/grouprins.py

index 1896f41..d859727 100755 (executable)
--- a/grouprins.py
+++ b/grouprins.py
@@ -64,13 +64,35 @@ class Reboot(object):
                         self.action = "reboot.reboot('%s')" % host
  
                         pflags = PersistFlags(host, 2*60*60*24, db='pcu_persistflags')
-                       pflags.resetRecentFlag('pcutried')
+                       #pflags.resetRecentFlag('pcutried')
                         if not pflags.getRecentFlag('pcutried'):
-                               pflags.setRecentFlag('pcutried')
                                 try:
                                         print "CALLING REBOOT!!!"
                                         ret = reboot.reboot(host)
  
+                                       pflags.setRecentFlag('pcutried')
+                                       pflags.save()
+                                       return ret
+
+                               except Exception,e:
+                                       print traceback.print_exc(); print e
+
+                                       # NOTE: this failure could be an implementation issue on
+                                       #               our end.  So, extra notices are confusing...
+                                       # self._send_pcunotice(host) 
+
+                                       pflags.setRecentFlag('pcufailed')
+                                       pflags.save()
+                                       return False
+
+                       elif not pflags.getRecentFlag('pcu_rins_tried'):
+                               try:
+                                       # set node to 'rins' boot state.
+                                       print "CALLING REBOOT +++ RINS"
+                                       plc.nodeBootState(host, 'rins')
+                                       ret = reboot.reboot(host)
+
+                                       pflags.setRecentFlag('pcu_rins_tried')
                                         pflags.save()
                                         return ret
  
@@ -93,12 +115,12 @@ class Reboot(object):
  
                                         pflags.setRecentFlag('pcumessagesent')
                                         pflags.save()
-                                       # NOTE: this will result in just one message sent at a time.
-                                       return True
  
-                               else:
-                                       print "GetRecentFlag()"
-                                       return False
+                               # This will result in mail() being called next, to try to
+                               # engage the technical contact to take care of it also.
+                               print "RETURNING FALSE"
+                               return False
+
                 else:
                         print "NO PCUOK"
                         self.action = "None"
@@ -174,8 +196,6 @@ parser.set_defaults( timewait=0,
                                         force=False, 
                                         nosetup=False, 
                                         verbose=False, 
-                                       stopkey=None,
-                                       stopvalue=None,
                                         quiet=False,
                                         )
  
@@ -210,7 +230,7 @@ if config.nodegroup:
  
  if config.node or config.nodelist:
         if config.node: hostnames = [ config.node ] 
-       else: hostnames = config.getListFromFile(config.nodelist)
+       else: hostnames = util.file.getListFromFile(config.nodelist)
  
  fb = database.dbLoad("findbad")
  
@@ -221,14 +241,18 @@ if config.findbad:
         # rerun findbad with the nodes in the given nodes.
         file = "findbad.txt"
         util.file.setFileFromList(file, hostnames)
-       os.system("./findbad.py --cachenodes --debug=0 --dbname=findbad --increment --nodelist %s" % file)
+       os.system("./findbad.py --cachenodes --increment --nodelist %s" % file)
+       # TODO: shouldn't we reload the node list now?
  
+l_blacklist = database.if_cached_else(1, "l_blacklist", lambda : [])
  # commands:
  i = 1
  count = 1
+#print "hosts: %s" % hostnames
  for host in hostnames:
  
         #if 'echo' in host or 'hptest-1' in host: continue
+
         try:
                 try:
                         node = api.GetNodes(host)[0]
@@ -240,6 +264,9 @@ for host in hostnames:
                 print "%-2d" % i, nodegroup_display(node, fb)
                 i += 1
                 if i-1 <= int(config.skip): continue
+               if host in l_blacklist:
+                       print "%s is blacklisted.  Skipping." % host
+                       continue
  
                 if config.stopselect:
                         dict_query = query_to_dict(config.stopselect)
@@ -249,20 +276,17 @@ for host in hostnames:
                         if verify(dict_query, fbnode) and observed_state != "dbg ":
                                 # evaluates to true, therefore skip.
                                 print "%s evaluates true for %s ; skipping..." % ( config.stopselect, host )
-                               continue
-
-               if config.stopkey and config.stopvalue:
-                       fbnode = fb['nodes'][host]['values']
-                       observed_state = get_current_state(fbnode)
+                               try:
+                                       # todo: clean up act_all record here.
+                                       # todo: send thank you, etc.
+                                       mailmonitor.reboot(host)
+                               except Exception, e:
+                                       print traceback.print_exc(); print e
  
-                       if config.stopkey in fbnode:
-                               if config.stopvalue in fbnode[config.stopkey] and observed_state != "dbg ":
-                                       print "%s has stopvalue; skipping..." % host
-                                       continue
-                       else:
-                               print "stopkey %s not in fbnode record for %s; skipping..." % (config.stopkey, host)
-                               print fbnode
                                 continue
+                       #else:
+                               #print "%s failed to match %s: -%s-" % ( host, dict_query, observed_state )
+                               #sys.exit(1)
  
                 if not config.force and rebootlog.find(host, {'action' : ".*reboot"}, 60*60*2):
                         print "recently rebooted %s.  skipping... " % host
diff --git a/mailer.py b/mailer.py

index da6249d..97bd173 100755 (executable)
--- a/mailer.py
+++ b/mailer.py
@@ -171,7 +171,7 @@ def closeTicketViaRT(ticket_id, comment):
         return
  
  def emailViaRT(subject, text, to, ticket_id=None):
-       if ticket_id == None or ticket_id == "":
+       if ticket_id == None or ticket_id == "" or ticket_id == 0:
                 print "No TICKET"
                 return emailViaRT_NoTicket(subject, text, to)
  
diff --git a/mailmonitor.py b/mailmonitor.py

index 87b301f..c9c1750 100644 (file)
--- a/mailmonitor.py
+++ b/mailmonitor.py
@@ -17,7 +17,7 @@ api = plc.getAuthAPI()
  from clean_policy import *
  
  def reboot(hostname):
-       print "calling reboot!!! %s " % hostname
+       print "CALLING: mailmonitor.reboot(%s)" % hostname
  
         l_nodes = api.GetNodes(hostname)
         if len(l_nodes) == 0:
@@ -30,11 +30,11 @@ def reboot(hostname):
         if len(l_nodes) == 0:
                 raise Exception("Host removed via blacklist: %s" % hostname)
  
-       ad_dbTickets = database.if_cached_else_refresh(True, False, "ad_dbTickets", lambda : [])
-       if ad_dbTickets == None:
-               raise Exception("Could not find cached dbTickets")
+       #ad_dbTickets = database.if_cached_else_refresh(True, False, "ad_dbTickets", lambda : [])
+       #if ad_dbTickets == None:
+       #       raise Exception("Could not find cached dbTickets")
  
-       print "starting new thing"
+       #print "starting new thing"
         mon = MonitorMergeDiagnoseSendEscellate(hostname, True)
         mon.run()
  
diff --git a/nodecommon.py b/nodecommon.py

index ba67625..a8b82ea 100644 (file)
--- a/nodecommon.py
+++ b/nodecommon.py
@@ -132,13 +132,16 @@ def nodegroup_display(node, fb, conf=None):
         node['pcu'] = fb['nodes'][node['hostname']]['values']['pcu']
         node['lastupdate'] = diff_time(node['last_contact'])
         pf = PersistFlags(node['hostname'], 1, db='node_persistflags')
-       node['lc'] = diff_time(pf.last_changed)
+       try:
+               node['lc'] = diff_time(pf.last_changed)
+       except:
+               node['lc'] = "err"
         ut = fb['nodes'][node['hostname']]['values']['comonstats']['uptime']
         if ut != "null":
                 ut = diff_time(float(fb['nodes'][node['hostname']]['values']['comonstats']['uptime']), False)
         node['uptime'] = ut
  
-       return "%(hostname)-42s %(boot_state)8s %(current)5s %(pcu)6s %(key)10.10s... %(kernel)33s %(lastupdate)12s, %(lc)s, %(uptime)s" % node
+       return "%(hostname)-42s %(boot_state)8s %(current)5s %(pcu)6s %(key)10.10s... %(kernel)35.35s %(lastupdate)12s, %(lc)s, %(uptime)s" % node
  
  def datetime_fromstr(str):
         if '-' in str:
diff --git a/nodegroups.py b/nodegroups.py

index e96e7b4..3f4b980 100755 (executable)
--- a/nodegroups.py
+++ b/nodegroups.py
@@ -22,6 +22,7 @@ from nodequery import verify,query_to_dict,node_select
  
  from nodecommon import *
  import database
+import util.file
  
  def main():
         fb = database.dbLoad("findbad")
@@ -52,7 +53,7 @@ def main():
                 if config.node: 
                         hostlist = [ config.node ] 
                 else: 
-                       hostlist = config.getListFromFile(config.nodelist)
+                       hostlist = util.file.getListFromFile(config.nodelist)
  
                 # NOTE: preserve order given in file.  Otherwise, return values are not in order
                 # given to GetNodes
diff --git a/nodeinfo.py b/nodeinfo.py

index 30838f1..fee8eb3 100755 (executable)
--- a/nodeinfo.py
+++ b/nodeinfo.py
@@ -76,8 +76,7 @@ def act_print_nodeinfo(actnode, header):
         if 'rt' in actnode and 'Status' in actnode['rt']:
                 print "\t %5.5s %5.5s | %8.8s | %15.15s | %s" % \
                         (actnode['rt']['Status'], actnode['rt']['id'][7:],
-                        actnode['category'], actnode['action'][0], 
-                        actnode['msg_format'][:-1])
+                        actnode['category'], actnode['action'][0], actnode['info'][1:])
         else:
                 if type(actnode['action']) == type([]):
                         action = actnode['action'][0]
diff --git a/nodequery.py b/nodequery.py

index c3f7ab8..e746e5b 100755 (executable)
--- a/nodequery.py
+++ b/nodequery.py
@@ -18,8 +18,8 @@ import time
  import re
  
  #fb = {}
-fb = {}
-fbpcu = {}
+fb = None
+fbpcu = None
  
  class NoKeyException(Exception): pass
  
@@ -46,7 +46,10 @@ def fb_print_nodeinfo(fbnode, hostname, fields=None):
                         fbnode['kernel'] = fbnode['kernel'].split()[2]
                 fbnode['boot_state'] = fbnode['plcnode']['boot_state']
  
-               print "%(hostname)-39s | %(checked)11.11s | %(boot_state)5.5s| %(state)8.8s | %(ssh)5.5s | %(pcu)6.6s | %(bootcd)6.6s | %(category)8.8s | %(kernel)s" % fbnode
+               if len(fbnode['nodegroups']) > 0:
+                       fbnode['category'] = fbnode['nodegroups'][0]
+
+               print "%(hostname)-45s | %(checked)11.11s | %(boot_state)5.5s| %(state)8.8s | %(ssh)5.5s | %(pcu)6.6s | %(bootcd)6.6s | %(category)8.8s | %(kernel)s" % fbnode
         else:
                 format = ""
                 for f in fields:
@@ -143,7 +146,13 @@ def verify(constraints, data):
                         #print "looking at key: %s" % key
                         if key in data: 
                                 value_re = re.compile(con[key])
-                               con_and_true = con_and_true & (value_re.search(data[key]) is not None)
+                               if type([]) == type(data[key]):
+                                       local_or_true = False
+                                       for val in data[key]:
+                                               local_or_true = local_or_true | (value_re.search(val) is not None)
+                                       con_and_true = con_and_true & local_or_true
+                               else:
+                                       con_and_true = con_and_true & (value_re.search(data[key]) is not None)
                         elif key not in data:
                                 print "missing key %s" % key,
                                 pass
@@ -180,10 +189,17 @@ def pcu_in(fbdata):
         return False
  
  def pcu_select(str_query, nodelist=None):
+       global fb
+       global fbpcu
         pcunames = []
         nodenames = []
         if str_query is None: return (nodenames, pcunames)
  
+       if fb is None:
+               fb = database.dbLoad("findbad")
+       if fbpcu is None:
+               fbpcu = database.dbLoad("findbadpcus")
+
         #print str_query
         dict_query = query_to_dict(str_query)
         #print dict_query
@@ -199,7 +215,8 @@ def pcu_select(str_query, nodelist=None):
                                 nodenames.append(node)
                                 str = "cmdhttps/locfg.pl -s %s -f iloxml/License.xml -u %s -p '%s' | grep MESSAGE" % \
                                                         (pcu_name(pcuinfo), pcuinfo['username'], pcuinfo['password'])
-                               pcunames.append(str)
+                               #pcunames.append(str)
+                               pcunames.append(pcuinfo['pcu_id'])
         return (nodenames, pcunames)
  
  def node_select(str_query, nodelist=None, fbdb=None):
diff --git a/reboot.py b/reboot.py

index 337b0b4..e876a76 100755 (executable)
--- a/reboot.py
+++ b/reboot.py
@@ -704,9 +704,8 @@ class BayTechCtrlCUnibe(PCUControl):
  
                 # Control Outlets  (5 ,1).........5
                 try:
-                       print s
-                       print "Enter Request" in s.before
-                       index = s.expect("Enter Request")
+                       #index = s.expect("Enter Request")
+                       index = s.expect(["Enter Request :"])
  
                         if index == 0:
                                 print "3"
@@ -720,7 +719,8 @@ class BayTechCtrlCUnibe(PCUControl):
                                         print "Reboot %d" % node_port
                                         s.send("Reboot %d\r\n" % node_port)
  
-                                       index = s.expect(["(Y/N)?"])
+                                       time.sleep(5)
+                                       index = s.expect(["\(Y/N\)\?", "Port in use", "DS-RPC>"])
                                         if index == 0:
                                                 if dryrun:
                                                         print "sending N"
@@ -728,16 +728,21 @@ class BayTechCtrlCUnibe(PCUControl):
                                                 else:
                                                         print "sending Y"
                                                         s.send("Y\r\n")
+                                       elif index == 1:
+                                               raise ExceptionPrompt("PCU Reported 'Port in use.'")
+                                       elif index == 2:
+                                               raise ExceptionSequence("Issued command 'Reboot' failed.")
  
-                               #index = s.expect(["DS-RPC>"])
+                               time.sleep(5)
+                               index = s.expect(["DS-RPC>"])
                                 #print "got prompt back"
  
                         s.close()
  
                 except pexpect.EOF:
-                       raise ExceptionPrompt("EOF before 'Enter Request' Prompt")
+                       raise ExceptionPrompt("EOF before expected Prompt")
                 except pexpect.TIMEOUT:
-                       raise ExceptionPrompt("Timeout before 'Enter Request' Prompt")
+                       raise ExceptionPrompt("Timeout before expected Prompt")
  
                 return 0
  
@@ -757,40 +762,54 @@ class BayTechCtrlC(PCUControl):
                 # Otherwise, the login succeeded.
  
                 # Send a ctrl-c to the remote process.
-               print "sending ctrl-c"
+               print "SENDING ctrl-c"
                 s.send(chr(3))
  
                 # Control Outlets  (5 ,1).........5
                 try:
+                       print "EXPECTING: ", "Enter Request :"
                         index = s.expect(["Enter Request :"])
  
                         if index == 0:
-                               print "5"
+                               print "SENDING: 5"
                                 s.send("5\r\n")
-                               index = s.expect(["DS-RPC>", "Enter user name:"])
+                               print "EXPECTING: ", "DS-RPC>"
+                               index = s.expect(["DS-RPC>", "Enter user name:", "Port in use."])
                                 if index == 1:
                                         print "sending username"
                                         s.send(self.username + "\r\n")
                                         index = s.expect(["DS-RPC>"])
+                               elif index == 2:
+                                       raise ExceptionPrompt("PCU Reported 'Port in use.'")
  
                                 if index == 0:
-                                       print "Reboot %d" % node_port
+                                       print "SENDING: Reboot %d" % node_port
                                         s.send("Reboot %d\r\n" % node_port)
  
-                                       index = s.expect(["(Y/N)?"])
+                                       print "SLEEPING: 5"
+                                       time.sleep(5)
+                                       print "EXPECTING: ", "Y/N?"
+                                       index = s.expect(["\(Y/N\)\?", "Port in use", "DS-RPC>"])
                                         if index == 0:
                                                 if dryrun:
                                                         print "sending N"
                                                         s.send("N\r\n")
                                                 else:
-                                                       print "sending Y"
+                                                       print "SENDING: Y"
                                                         s.send("Y\r\n")
+                                       elif index == 1:
+                                               raise ExceptionPrompt("PCU Reported 'Port in use.'")
+                                       elif index == 2:
+                                               raise ExceptionSequence("Issued command 'Reboot' failed.")
  
                                 # NOTE: for some reason, the script times out with the
                                 # following line.  In manual tests, it works correctly, but
                                 # with automated tests, evidently it fails.
-                               #index = s.expect(["DS-RPC>"])
-                               #print "got prompt back"
+                               print "SLEEPING: 5"
+                               time.sleep(5)
+                               #print "TOTAL--", s.allstr, "--EOT"
+                               index = s.expect(["DS-RPC>"])
+                               print "got prompt back"
  
                         s.close()
  
@@ -817,6 +836,7 @@ class BayTech(PCUControl):
                         # even after login...
                         print "msg: %s" % msg
                         self.transport.write(self.username + "\r\n")
+                       time.sleep(5)
                         self.ifThenSend("DS-RPC>", "Reboot %d" % node_port)
  
                 # Reboot Outlet  N        (Y/N)?
@@ -824,6 +844,7 @@ class BayTech(PCUControl):
                         self.ifThenSend("(Y/N)?", "N")
                 else:
                         self.ifThenSend("(Y/N)?", "Y")
+               time.sleep(5)
                 self.ifThenSend("DS-RPC>", "")
  
                 self.close()
@@ -1227,7 +1248,7 @@ def reboot_test(nodename, values, continue_probe, verbose, dryrun):
                         print values
  
                         # TODO: make a more robust version of APC
-                       if values['pcu_id'] in [1163,1055,1111,1231,1113,1127,1128,1148]:
+                       if values['pcu_id'] in [1102,1163,1055,1111,1231,1113,1127,1128,1148]:
                                 apc = APCEurope(values, verbose, ['22', '23'])
                                 rb_ret = apc.reboot(values[nodename], dryrun)
  
@@ -1235,11 +1256,11 @@ def reboot_test(nodename, values, continue_probe, verbose, dryrun):
                                 apc = APCBrazil(values, verbose, ['22', '23'])
                                 rb_ret = apc.reboot(values[nodename], dryrun)
  
-                       elif values['pcu_id'] in [1221,1225]:
+                       elif values['pcu_id'] in [1221,1225,1220]:
                                 apc = APCBerlin(values, verbose, ['22', '23'])
                                 rb_ret = apc.reboot(values[nodename], dryrun)
  
-                       elif values['pcu_id'] in [1173,1221,1220]:
+                       elif values['pcu_id'] in [1173,1240]:
                                 apc = APCFolsom(values, verbose, ['22', '23'])
                                 rb_ret = apc.reboot(values[nodename], dryrun)
  
@@ -1249,7 +1270,7 @@ def reboot_test(nodename, values, continue_probe, verbose, dryrun):
  
                 # BayTech DS4-RPC
                 elif continue_probe and values['model'].find("DS4-RPC") >= 0:
-                       if values['pcu_id'] in [1237,1052,1209,1002,1008,1041,1013,1022]:
+                       if values['pcu_id'] in [1056,1237,1052,1209,1002,1008,1041,1013,1022]:
                                 # These  require a 'ctrl-c' to be sent... 
                                 baytech = BayTechCtrlC(values, verbose, ['22', '23'])
                                 rb_ret = baytech.reboot(values[nodename], dryrun)
diff --git a/ssh/pexpect.py b/ssh/pexpect.py

index 19ee230..4eab532 100644 (file)
--- a/ssh/pexpect.py
+++ b/ssh/pexpect.py
@@ -342,6 +342,7 @@ class spawn (object):
          self.env = env
          self.__irix_hack = sys.platform.lower().find('irix') >= 0 # This flags if we are running on irix
          self.use_native_pty_fork = not (sys.platform.lower().find('solaris') >= 0) # Solaris uses internal __fork_pty(). All other use pty.fork().
+        self.allstr = ""
  
          # allow dummy instances for subclasses that may not use command or args.
          if command is None:
@@ -1108,6 +1109,7 @@ class spawn (object):
                      self.buffer = incoming[self.match.end() : ]
                      self.before = incoming[ : self.match.start()]
                      self.after = incoming[self.match.start() : self.match.end()]
+                    #print "MATCH--", self.after, "--EOM"
                      return self.match_index
                  # No match at this point
                  if timeout < 0 and timeout is not None:
@@ -1116,6 +1118,8 @@ class spawn (object):
                  c = self.read_nonblocking (self.maxread, timeout)
                  time.sleep (0.0001)
                  incoming = incoming + c
+                self.allstr += c
+                #print "INCOMING--", c, "--EOI"
                  if timeout is not None:
                      timeout = end_time - time.time()
          except EOF, e:
diff --git a/unified_model.py b/unified_model.py

index acc89d8..8c5fb7f 100755 (executable)
--- a/unified_model.py
+++ b/unified_model.py
@@ -91,7 +91,7 @@ class RT(object):
                 return self.status
  
         def closeTicket(self):
-               mailer.closeTicketViaRT(self.ticket_id) 
+               mailer.closeTicketViaRT(self.ticket_id, "Ticket CLOSED automatically by SiteAssist.") 
  
         def email(self, subject, body, to):
                 self.ticket_id = mailer.emailViaRT(subject, body, to, self.ticket_id)
@@ -229,10 +229,10 @@ class PersistMessage(Message):
  
                 #print pm
                 if id in pm:
-                       print "Using existing object"
+                       #print "Using existing object"
                         obj = pm[id]
                 else:
-                       print "creating new object"
+                       #print "creating new object"
                         obj = super(PersistMessage, typ).__new__(typ, [id, subject, message, via_rt], **kwargs)
                         obj.id = id
                         obj.actiontracker = Recent(3*60*60*24)
@@ -252,18 +252,19 @@ class PersistMessage(Message):
         def reset(self):
                 self.actiontracker.unsetRecent()
  
+       def save(self):
+               pm = database.dbLoad(self.db)
+               pm[self.id] = self
+               database.dbDump(self.db, pm)
+
         def send(self, to):
                 if not self.actiontracker.isRecent():
                         self.ticket_id = Message.send(self, to)
                         self.actiontracker.setRecent()
-
-                       #print "recording object for persistance"
-                       pm = database.dbLoad(self.db)
-                       pm[self.id] = self
-                       database.dbDump(self.db, pm)
+                       self.save()
                 else:
                         # NOTE: only send a new message every week, regardless.
-                       print "Not sending to host b/c not within window of %s days" % (self.actiontracker.withintime // 60*60*24)
+                       print "Not sending to host b/c not within window of %s days" % (self.actiontracker.withintime // (60*60*24))
  
  class MonitorMessage(object):
         def __new__(typ, id, *args, **kwargs):
@@ -427,6 +428,7 @@ class Record(object):
         def severity(self):
                 category = self.data['category']
                 prev_category = self.data['prev_category']
+               #print "SEVERITY: ", category, prev_category
                 val = cmpCategoryVal(category, prev_category)
                 return val 
  
@@ -504,33 +506,46 @@ class Record(object):
  
         def takeAction(self):
                 pp = PersistSitePenalty(self.hostname, 0, db='persistpenalty_hostnames')
-               if 'improvement' in self.data['stage'] or self.improved():
-                       print "decreasing penalty for %s"%self.hostname
+               if 'improvement' in self.data['stage'] or self.improved() or \
+                       'monitor-end-record' in self.data['stage']:
+                       print "takeAction: decreasing penalty for %s"%self.hostname
+                       pp.decrease()
                         pp.decrease()
                 else:
-                       print "increasing penalty for %s"%self.hostname
+                       print "takeAction: increasing penalty for %s"%self.hostname
                         pp.increase()
                 pp.apply(self.hostname)
                 pp.save()
  
         def _format_diaginfo(self):
                 info = self.data['info']
+               print "FORMAT : STAGE: ", self.data['stage']
                 if self.data['stage'] == 'monitor-end-record':
+                       if info[2] == "ALPHA": info = (info[0], info[1], "PROD")
                         hlist = "    %s went from '%s' to '%s'\n" % (info[0], info[1], info[2]) 
                 else:
                         hlist = "    %s %s - %s\n" % (info[0], info[2], info[1]) #(node,ver,daysdn)
                 return hlist
+       def saveAction(self):
+               if 'save-act-all' in self.data and self.data['save-act-all'] == True:
+                       return True
+               else:
+                       return False
  
         def getMessage(self, ticket_id=None):
                 self.data['args']['hostname'] = self.hostname
                 self.data['args']['loginbase'] = self.loginbase
                 self.data['args']['hostname_list'] = self._format_diaginfo()
-               message = PersistMessage(self.hostname, 
+               #print self.data['message']
+               if self.data['message']:
+                       message = PersistMessage(self.hostname, 
                                                                  self.data['message'][0] % self.data['args'],
                                                                  self.data['message'][1] % self.data['args'],
                                                                  True, db='monitor_persistmessages',
                                                                  ticket_id=ticket_id)
-               return message
+                       return message
+               else:
+                       return None
         
         def getContacts(self):
                 roles = self.data['email']
@@ -579,6 +594,7 @@ class NodeRecord:
         def severity(self):
                 category = self.data['category']
                 prev_category = self.data['prev_category']
+               print "IMPROVED: ", category, prev_category
                 val = cmpCategoryVal(category, prev_category)
                 return val 
  
@@ -659,6 +675,15 @@ def node_end_record(node):
                 del act_all
                 return False
  
+       pm = database.dbLoad("monitor_persistmessages")
+       if node not in pm:
+               del pm
+               return False
+       else:
+               print "deleting node record"
+               del pm[node]
+               database.dbDump("monitor_persistmessages", pm)
+
         a = Action(node, act_all[node][0])
         a.delField('rt')
         a.delField('found_rt_ticket')
@@ -667,8 +692,9 @@ def node_end_record(node):
         a.delField('first-found')
         rec = a.get()
         rec['action'] = ["close_rt"]
-       rec['category'] = "UNKNOWN"
+       rec['category'] = "ALPHA"       # assume that it's up...
         rec['stage'] = "monitor-end-record"
+       rec['ticket_id'] = None
         rec['time'] = time.time() - 7*60*60*24
         act_all[node].insert(0,rec)
         database.dbDump("act_all", act_all)
author	Stephen Soltesz <soltesz@cs.princeton.edu>
	Tue, 26 Aug 2008 02:02:06 +0000 (02:02 +0000)
committer	Stephen Soltesz <soltesz@cs.princeton.edu>
	Tue, 26 Aug 2008 02:02:06 +0000 (02:02 +0000)
bootman.py		patch \| blob \| history
clean_policy.py		patch \| blob \| history
emailTxt.py		patch \| blob \| history
findbadpcu.py		patch \| blob \| history
grouprins.py		patch \| blob \| history
mailer.py		patch \| blob \| history
mailmonitor.py		patch \| blob \| history
nodecommon.py		patch \| blob \| history
nodegroups.py		patch \| blob \| history
nodeinfo.py		patch \| blob \| history
nodequery.py		patch \| blob \| history
reboot.py		patch \| blob \| history
ssh/pexpect.py		patch \| blob \| history
unified_model.py		patch \| blob \| history