add pcu_name to pcufailed_notice
author Stephen Soltesz <soltesz@cs.princeton.edu>
Thu, 20 Aug 2009 17:55:12 +0000 (17:55 +0000)
committer Stephen Soltesz <soltesz@cs.princeton.edu>
Thu, 20 Aug 2009 17:55:12 +0000 (17:55 +0000)
added pcuerror_notice
added convenience functions to FindbadPCURecord
added pcuerror_notice to policy.py
create a different action type for each bootman_restore operation.
added better formatting to actionsummary, so all types of actions are displayed
fixed nodebad logic so that a node leaves the 'disabled' status when its boot_state changes.

monitor/bootman.py
monitor/database/dborm.py
monitor/database/info/findbad.py
monitor/database/info/interface.py
monitor/database/info/model.py
monitor/wrapper/emailTxt.py
nodebad.py
policy.py
web/MonitorWeb/monitorweb/controllers.py
web/MonitorWeb/monitorweb/templates/actionlist.kid
web/MonitorWeb/monitorweb/templates/actionsummary.kid

index 09be54f..52a8da2 100755 (executable)
@@ -435,6 +435,7 @@ class DebugInterface:
 
                                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
                                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
+                               "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-protoerror-debug-validate-done",
                                "bminit-cfg-auth-protoerror-exception-update-debug-validate-exception-done",
                                "bminit-cfg-auth-getplc-update-debug-done",
                                "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
@@ -518,23 +519,27 @@ class DebugInterface:
                                  "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail2-exception-noinstall-update-debug-validate-exception-fsckfail2-done",
                                  "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail2-exception-debug-validate-done",
                                  "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail2-exception-debug-validate-exception-fsckfail2-done",
+                                 "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail2-exception-debug-validate-exception-fsckfail-done",
+                                 "bminit-cfg-auth-getplc-update-installinit-validate-fsckabort-exception-fsckfail-exception-debug-validate-fsckabort-exception-fsckfail-done",
                                  "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail2-exception-debug-validate-exception-fsckfail2-done",
                                  "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-debug-validate-exception-fsckfail2-done",
                                  "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-debug-validate-exception-fsckfail-done",
                                  "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-debug-validate-done",
                                  "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail-exception-debug-validate-exception-fsckfail-done",
+                                 "bminit-cfg-auth-getplc-update-debug-validate-exception-fsckfail-done",
                                ]:
                        sequences.update({n : "fsck_repair"})
 
-               # update_node_config_email
+               # nodeconfig_notice
                for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
                                  "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
                                  "bminit-cfg-exception-update-bootupdatefail-nonode-debug-validate-exception-done",
                                  "bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-validate-exception-done",
                                  "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
+                                 "bminit-cfg-exception-noconfig-nonode-debug-validate-exception-done",
                                  "bminit-cfg-exception-noconfig-update-debug-validate-exception-done",
                                ]:
-                       sequences.update({n : "update_node_config_email"})
+                       sequences.update({n : "nodeconfig_notice"})
 
                for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", 
                                   "bminit-cfg-update-exception-nodehostname-update-debug-validate-exception-done",
@@ -543,32 +548,37 @@ class DebugInterface:
                                ]:
                        sequences.update({n : "nodenetwork_email"})
 
-               # update_bootcd_email
+               # noblockdevice_notice
                for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
+                               "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-validate-bmexceptvgscan-done",
                                "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
                                "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
                                "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
-                               "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
+                               ]:
+                       sequences.update({n : "noblockdevice_notice"})
+
+               # update_bootcd_email
+               for n in [ "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
                                ]:
                        sequences.update({n : "update_bootcd_email"})
 
                for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
                                ]:
-                       sequences.update({n: "suspect_error_email"})
+                       sequences.update({n: "unknownsequence_notice"})
 
-               # update_hardware_email
-               sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
-               sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
+               # minimalhardware_notice
+               sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "minimalhardware_notice"})
+               sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "minimalhardware_notice"})
 
-               # broken_hardware_email
-               sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"})
+               # baddisk_notice
+               sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "baddisk_notice"})
 
-               # bad_dns_email
+               # baddns_notice
                for n in [ 
                 "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
                        "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
                        ]:
-                       sequences.update( { n : "bad_dns_email"})
+                       sequences.update( { n : "baddns_notice"})
 
                return sequences
 
@@ -703,6 +713,8 @@ def restore_basic(sitehist, hostname, config=None, forced_action=None):
        # NOTE: Nothing works if the bootcd is REALLY old.
        #       So, this is the first step.
 
+       bootman_action = "unknown"
+
        fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict()
        recent_actions = sitehist.getRecentActions(hostname=hostname)
 
@@ -716,20 +728,20 @@ def restore_basic(sitehist, hostname, config=None, forced_action=None):
                        api.UpdateNode(hostname, {'boot_state' : 'disabled'})
 
                # NOTE: nothing else is possible.
-               return True
+               return "disabled"
 
        debugnode = DebugInterface(hostname)
        conn = debugnode.getConnection()
-       if type(conn) == type(False): return False
+       if type(conn) == type(False): return "error"
 
        boot_state = conn.get_boot_state()
        if boot_state != "debug":
                print "... %s in %s state: skipping..." % (hostname , boot_state)
-               return boot_state == "boot"
+               return "skipped" #boot_state == "boot"
 
        if conn.bootmanager_running():
                print "...BootManager is currently running.  Skipping host %s" %hostname 
-               return True
+               return "skipped" # True
 
        # Read persistent flags, tagged on one week intervals.
 
@@ -758,13 +770,13 @@ def restore_basic(sitehist, hostname, config=None, forced_action=None):
                                sitehist.sendMessage('baddisk_notice', hostname=hostname, log=log)
                                #conn.set_nodestate('disabled')
 
-                       return False
+                       return "skipping_baddisk"
 
        print "...Downloading bm.log from %s" %hostname 
        log = conn.get_bootmanager_log()
        child = fdpexpect.fdspawn(log)
 
-       if hasattr(config, 'collect') and config.collect: return True
+       if hasattr(config, 'collect') and config.collect: return "collect"
 
        if config and not config.quiet: print "...Scanning bm.log for errors"
 
@@ -800,11 +812,14 @@ def restore_basic(sitehist, hostname, config=None, forced_action=None):
 
                conn.restart_bootmanager('boot')
 
+               bootman_action = "restart_bootmanager"
+
                # NOTE: Do not set the pflags value for this sequence if it's unknown.
                # This way, we can check it again after we've fixed it.
                flag_set = False
 
        else:
+               bootman_action = sequences[s]
 
                if   sequences[s] == "restart_bootmanager_boot":
                        print "...Restarting BootManager.py on %s "%hostname 
@@ -828,7 +843,7 @@ def restore_basic(sitehist, hostname, config=None, forced_action=None):
                                # there was some failure to synchronize the keys.
                                print "...Unable to repair node keys on %s" %hostname 
 
-               elif sequences[s] == "suspect_error_email":
+               elif sequences[s] == "unknownsequence_notice":
                        args = {}
                        args['hostname'] = hostname
                        args['sequence'] = s
@@ -840,8 +855,7 @@ def restore_basic(sitehist, hostname, config=None, forced_action=None):
                        sitehist.sendMessage('unknownsequence_notice', **args)
                        conn.restart_bootmanager('boot')
 
-               # TODO: differentiate this and the 'nodenetwork_email' actions.
-               elif sequences[s] == "update_node_config_email":
+               elif sequences[s] == "nodeconfig_notice":
 
                        if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
                                args = {}
@@ -858,18 +872,16 @@ def restore_basic(sitehist, hostname, config=None, forced_action=None):
                                sitehist.sendMessage('nodeconfig_notice', **args)
                                conn.dump_plconf_file()
 
-               elif sequences[s] == "update_bootcd_email":
+               elif sequences[s] == "noblockdevice_notice":
 
-                       if not found_within(recent_actions, 'newalphacd_notice', 3.5):
+                       if not found_within(recent_actions, 'noblockdevice_notice', 3.5):
                                args = {}
-                               args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
+                               #args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
                                args['hostname'] = hostname
                        
-                               sitehist.sendMessage('newalphacd_notice', **args)
+                               sitehist.sendMessage('noblockdevice_notice', **args)
 
-                               print "\tDisabling %s due to out-of-date BOOTCD" % hostname
-
-               elif sequences[s] == "broken_hardware_email":
+               elif sequences[s] == "baddisk_notice":
                        # MAKE An ACTION record that this host has failed hardware.  May
                        # require either an exception "/minhw" or other manual intervention.
                        # Definitely need to send out some more EMAIL.
@@ -883,7 +895,7 @@ def restore_basic(sitehist, hostname, config=None, forced_action=None):
                                sitehist.sendMessage('baddisk_notice', **args)
                                #conn.set_nodestate('disabled')
 
-               elif sequences[s] == "update_hardware_email":
+               elif sequences[s] == "minimalhardware_notice":
                        if not found_within(recent_actions, 'minimalhardware_notice', 7):
                                print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
                                args = {}
@@ -891,7 +903,7 @@ def restore_basic(sitehist, hostname, config=None, forced_action=None):
                                args['bmlog'] = conn.get_bootmanager_log().read()
                                sitehist.sendMessage('minimalhardware_notice', **args)
 
-               elif sequences[s] == "bad_dns_email":
+               elif sequences[s] == "baddns_notice":
                        if not found_within(recent_actions, 'baddns_notice', 1):
                                print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
                                args = {}
@@ -903,7 +915,7 @@ def restore_basic(sitehist, hostname, config=None, forced_action=None):
                                        print traceback.print_exc()
                                        # TODO: api error. skip email, b/c all info is not available,
                                        # flag_set will not be recorded.
-                                       return False
+                                       return "exception"
                                nodenet_str = network_config_to_str(net)
 
                                args['hostname'] = hostname
@@ -912,7 +924,7 @@ def restore_basic(sitehist, hostname, config=None, forced_action=None):
 
                                sitehist.sendMessage('baddns_notice', **args)
 
-       return True
+       return bootman_action
        
 
 # MAIN -------------------------------------------------------------------
index 71749f6..bab1784 100644 (file)
@@ -5,6 +5,7 @@ import monitor.config as config
 mon_metadata = sqlalchemy.MetaData()
 mon_metadata.bind = sqlalchemy.create_engine(config.monitor_dburi, echo=config.echo)
 mon_session = sqlalchemy.orm.scoped_session(sqlalchemy.orm.sessionmaker(autoflush=False,autocommit=True))
+mon_session.bind = mon_metadata.bind
 
 if config.zabbix_enabled:
        zab_metadata = sqlalchemy.MetaData()
index 0cfb965..615e03f 100644 (file)
@@ -5,6 +5,7 @@ from datetime import datetime,timedelta
 import elixir
 import traceback
 from elixir.ext.versioned import *
+from pcucontrol import reboot
 
 from monitor.database.dborm import mon_metadata, mon_session
 __metadata__ = mon_metadata
@@ -77,6 +78,68 @@ class FindbadPCURecord(Entity):
        def get_latest_by(cls, **kwargs):
                return cls.query.filter_by(**kwargs).first()
 
+       def pcu_name(self):
+               if self.plc_pcu_stats['hostname'] is not None and self.plc_pcu_stats['hostname'] is not "":
+                       return self.plc_pcu_stats['hostname']
+               elif self.plc_pcu_stats['ip'] is not None and self.plc_pcu_stats['ip'] is not "":
+                       return self.plc_pcu_stats['ip']
+               else:
+                       return None
+
+       def format_ports(self):
+               retval = []
+               filtered_length=0
+
+               supported_ports=reboot.model_to_object(self.plc_pcu_stats['model']).supported_ports
+               data = self.port_status.copy()
+
+               if data and len(data.keys()) > 0 :
+                       for port in supported_ports:
+                               try:
+                                       state = data[str(port)]
+                               except:
+                                       state = "unknown"
+
+                               if state == "filtered":
+                                       filtered_length += 1
+                                       
+                               retval.append( (port, state) )
+
+               if retval == []: 
+                       retval = [( "Closed/Filtered", "" )]
+
+               if filtered_length == len(supported_ports):
+                       retval = [( "All Filtered", "" )]
+
+               return retval
+
+       def format_pcu_shortstatus(self):
+               status = "error"
+               if self.reboot_trial_status:
+                       if self.reboot_trial_status == str(0):
+                               status = "Ok"
+                       elif self.reboot_trial_status == "NetDown" or self.reboot_trial_status == "Not_Run":
+                               status = self.reboot_trial_status
+                       else:
+                               status = "error"
+
+               return status
+
+       def test_is_ok(self):
+               if self.reboot_trial_status == str(0):
+                       return True
+               else:
+                       return False
+
+       def pcu_errors(self):
+               message = "\n"
+               message += "\tModel: %s\n" % self.plc_pcu_stats['model']
+               message += "\tMissing Fields: %s\n" % ( self.entry_complete == "" and "None missing" or self.entry_complete )
+               message += "\tDNS Status: %s\n" % self.dns_status
+               message += "\tPort Status: %s\n" % self.format_ports()
+               message += "\tTest Results: %s\n" % self.format_pcu_shortstatus()
+               return message
+
 # ACCOUNTING
        date_checked = Field(DateTime)
        round = Field(Int,default=0)
index d37ab2e..ef7d510 100644 (file)
@@ -1,11 +1,13 @@
 
-from monitor import reboot
 from monitor.common import *
 from monitor.model import *
 from monitor.wrapper import plc
 from monitor.wrapper import plccache
 from monitor.wrapper.emailTxt import mailtxt
 from monitor.database.info.model import *
+# NOTE: must import this after monitor.database.info.model b/c that imports
+#      pcucontrol.reboot, which would shadow this version if it came last.
+from monitor import reboot
 
 class SiteInterface(HistorySiteRecord):
        @classmethod
@@ -183,7 +185,7 @@ class SiteInterface(HistorySiteRecord):
        def runBootManager(self, hostname):
                from monitor import bootman
                print "attempting BM reboot of %s" % hostname
-               ret = ""
+               ret = "error"
                try:
                        ret = bootman.restore(self, hostname)
                        err = ""
@@ -191,10 +193,18 @@ class SiteInterface(HistorySiteRecord):
                        err = traceback.format_exc()
                        print err
 
+               # TODO: keep this record so that the policy.py can identify all
+               #               bootmanager_* actions without explicitly listing every kind.
                act = ActionRecord(loginbase=self.db.loginbase,
                                                        hostname=hostname,
                                                        action='reboot',
                                                        action_type='bootmanager_restore',
+                                                       error_string="")
+
+               act = ActionRecord(loginbase=self.db.loginbase,
+                                                       hostname=hostname,
+                                                       action='reboot',
+                                                       action_type='bootmanager_' + ret,
                                                        error_string=err)
                return ret
 
index c538c66..acb79eb 100644 (file)
@@ -3,3 +3,4 @@ from monitor.database.info.findbad import *
 from monitor.database.info.history import *
 from monitor.database.info.plc import *
 from monitor.database.dborm import mon_session as session
+from monitor.database.dborm import mon_metadata
index 8cd5bdc..3381d44 100644 (file)
@@ -52,15 +52,33 @@ The registration is very quick.  All we need are: PCU hostname, IP, username,
 and password.  Then, choose which node to associate it with, and we will take 
 care of the rest.
 
+Thank you very much for your help,
+  -- %(plc_name)s (%(support_email)s)
+""")
+
+       pcuerror_notice=("""Please help us configure your PCU: %(pcu_name)s""",
+"""During our standard monitoring of your site we noticed that the following
+PCU is misconfigured:
+
+    %(pcu_name)s
+       %(pcu_errors)s
+You can learn more details about the problem by visiting the link below.
+
+    https://%(monitor_hostname)s/monitor/pcuview?loginbase=%(loginbase)s
+
+We would like to save you time by taking care of as many administrative situations for your site's machines as possible without disturbing you.  Errors like these prevent us from being able to remotely administer your machines, and so we must solicit your help using messages like these.
+
+So, any help and time that you can offer now to help us remotely administer your machines will pay off for you in the future.
+
 Thank you very much for your help,
   -- %(plc_name)s (%(support_email)s)
 """)
 
        pcufailed_notice =("""Could not use PCU to reboot %(hostname)s""",
 
-"""We tried to use the PCU registered for %(hostname)s, but for some reason 
-the host did not come back online.  You may be able to learn more by visiting
-this link:
+"""We tried to use the PCU registered for %(hostname)s, but for some reason the host did not come back online.  This may be for several reasons, and you can learn more by visiting this link:
+
+    %(pcu_name)s
 
     https://%(monitor_hostname)s/monitor/pcuview?loginbase=%(loginbase)s
 
@@ -219,9 +237,25 @@ Thank you for your help,
   -- %(plc_name)s (%(support_email)s)
 """)
 
+       noblockdevice_notice=("""Cannot Detect Disks on %(hostname)s""", 
+"""As part of PlanetLab node monitoring, we noticed that we were not able to detect any hard disks in your machine.  
+
+    %(hostname)s  
+
+This may be the case for a number of reasons:
+    * the hardware is very new and needs a new driver,
+    * the hardware is very old is no longer supported,
+    * the hard disk was physically removed, 
+    * the hard disk cable is loose or disconnected,
+
+Please help us investigate and let us know if there's anything that we can do to assist in getting your machine up and running again.
+
+Thank you for your help,
+  -- %(plc_name)s (%(support_email)s)
+""")
 
        newalphacd_notice=("""New Boot Images for %(hostname)s""", 
-"""As part of PlanetLab node monitoring, we noticed that we were not able to recognize all the hardware in your machine.  This means that it is so new that it needs a new BootCD, or that it is so old that it is no longer supported.
+"""As part of PlanetLab node monitoring, we noticed that we were not able to recognize all the hardware in your machine.  This means that either it is so new that it needs a new BootCD, or that it is so old that it is no longer supported.
 
     %(hostname)s  
 
index acd5007..e7fc819 100755 (executable)
@@ -79,15 +79,19 @@ def check_node_state(rec, node):
                        print "changed status from %s to offline" % node.status
                        node.status = 'offline'
                        node.last_changed = datetime.now()
-                       
-       if node_state == 'DEBUG' and node.status not in ['failboot', 'disabled', 'safeboot']:
+
+       if node_state == 'DEBUG':
                if boot_state != 'disabled' and boot_state != 'safeboot':
                        print "changed status from %s to failboot" % (node.status)
-                       node.status = "failboot"
-                       node.last_changed = datetime.now()
+                       current_status = "failboot"
                else:
                        print "changed status from %s to %s" % (node.status, boot_state)
-                       node.status = boot_state
+                       current_status = boot_state
+
+               if current_status != node.status and \
+                       current_status in ['failboot', 'disabled', 'safeboot']:
+
+                       node.status = current_status
                        node.last_changed = datetime.now()
 
        if node_state == 'BOOT' and node.status != 'online' and node.status != 'good':
index 77cf76e..cb5d93f 100755 (executable)
--- a/policy.py
+++ b/policy.py
@@ -94,8 +94,30 @@ def main(hostnames, sitenames):
                        not nodehist.firewall and \
                        not found_between(recent_actions, 'try_reboot', 3.5, 1):
 
+                               # TODO: there MUST be a better way to do this... 
+                               # get fb node record for pcuid
+                               fbpcu = None
+                               fbnode = FindbadNodeRecord.get_latest_by(hostname=host)
+                               if fbnode:
+                                       fbpcu = FindbadPCURecord.get_latest_by(plc_pcuid=fbnode.plc_pcuid)
+
                                sitehist.attemptReboot(host)
                                print "send message for host %s try_reboot" % host
+                               if not fbpcu.test_is_ok() and \
+                                       not found_within(recent_actions, 'pcuerror_notice', 3.0):
+
+                                       args = {}
+                                       if fbpcu:
+                                               args['pcu_name'] = fbpcu.pcu_name()
+                                               args['pcu_errors'] = fbpcu.pcu_errors()
+                                       else:
+                                               args['pcu_name'] = "error looking up pcu name"
+                                               args['pcu_errors'] = ""
+
+                                       args['hostname'] = host
+                                       sitehist.sendMessage('pcuerror_notice', **args)
+                                       print "send message for host %s PCU Failure" % host
+                                       
 
                # NOTE: non-intuitive is that found_between(try_reboot, 3.5, 1)
                #               will be false for a day after the above condition is satisfied
@@ -105,13 +127,24 @@ def main(hostnames, sitenames):
                        found_between(recent_actions, 'try_reboot', 3.5, 1) and \
                        not found_within(recent_actions, 'pcufailed_notice', 3.5):
                                
+                               # TODO: there MUST be a better way to do this... 
+                               # get fb node record for pcuid
+                               fbpcu = None
+                               fbnode = FindbadNodeRecord.get_latest_by(hostname=host)
+                               if fbnode:
+                                       fbpcu = FindbadPCURecord.get_latest_by(plc_pcuid=fbnode.plc_pcuid)
+                               if fbpcu:
+                                       pcu_name = fbpcu.pcu_name()
+                               else:
+                                       pcu_name = "error looking up pcu name"
+
+                               # get fb pcu record for pcuid
                                # send pcu failure message
-                               #act = ActionRecord(**kwargs)
-                               sitehist.sendMessage('pcufailed_notice', hostname=host)
+                               sitehist.sendMessage('pcufailed_notice', hostname=host, pcu_name=pcu_name)
                                print "send message for host %s PCU Failure" % host
 
                if nodehist.status == 'failboot' and \
-                       changed_greaterthan(nodehist.last_changed, 1) and \
+                       changed_greaterthan(nodehist.last_changed, 0.25) and \
                        not found_between(recent_actions, 'bootmanager_restore', 0.5, 0):
                                # send down node notice
                                # delay 0.5 days before retrying...
index 2821b76..4a74782 100644 (file)
@@ -9,8 +9,6 @@ import cherrypy
 import re
 from monitor.database.info.model import *
 #from monitor.database.zabbixapi.model import *
-#from monitor.database.dborm import zab_session as session
-#from monitor.database.dborm import zab_metadata as metadata
 from monitor_xmlrpc import MonitorXmlrpcServer
 
 from monitor import reboot
@@ -796,6 +794,14 @@ class Root(controllers.RootController, MonitorXmlrpcServer):
                types = filter(lambda x: 'notice' in x, dir(mailtxt))
                results = {}
 
+               print mon_metadata.bind
+               if session.bind is None:
+                       #TODO: figure out why this value gets cleared out...
+                       session.bind = mon_metadata.bind
+               result = session.execute("select distinct(action_type) from actionrecord;")
+
+               types = [r[0] for r in result]
+
                try: since = int(since)
                except: since = 7
 
index 16414e5..b1cd26b 100644 (file)
@@ -18,9 +18,9 @@ def zabbix_event_ack_link(eventid):
        <table width="100%">
                <thead>
                        <tr>
-                               <th><a href="${link('actionlist', action_type='online_notice', since=1)}">Last Day</a></th>
-                               <th><a href="${link('actionlist', action_type='online_notice', since=7)}">Last Week</a></th>
-                               <th><a href="${link('actionlist', action_type='online_notice', since=30)}">Last Month</a></th>
+                               <th><a href="${link('actionlist', action_type=action_type, since=1)}">Last Day</a></th>
+                               <th><a href="${link('actionlist', action_type=action_type, since=7)}">Last Week</a></th>
+                               <th><a href="${link('actionlist', action_type=action_type, since=30)}">Last Month</a></th>
                        </tr>
                </thead>
                <tbody>
@@ -30,20 +30,19 @@ def zabbix_event_ack_link(eventid):
                <p py:if="actions and len(actions) == 0">
                        There are no recent actions taken for this site.
                </p>
-               <table py:if="actions and len(actions) > 0" id="sortable_table" class="datagrid" border="1" width="100%">
+               <table id="actionlist" cellpadding="0" border="0" class="plekit_table sortable-onload-0 colstyle-alt no-arrow paginationcallback-actionlist_paginator max-pages-10 paginate-50" py:if="actions and len(actions) > 0">
+               <!--table py:if="actions and len(actions) > 0" id="sortable_table" class="datagrid" border="1" width="100%"-->
                        <thead>
                                <tr>
-                                       <th mochi:format="int"></th>
-                                       <th>Date</th>
-                                       <th>Action taken on</th>
-                                       <th>Action Type</th>
-                                       <th>Message ID</th>
-                                       <th>Errors</th>
+                                       <th class="sortable plekit_table">Date</th>
+                                       <th class="sortable plekit_table">Action taken on</th>
+                                       <th class="sortable plekit_table">Action Type</th>
+                                       <th class="sortable plekit_table">Message ID</th>
+                                       <th class="sortable plekit_table">Errors</th>
                                </tr>
                        </thead>
                        <tbody>
                                <tr py:for="i,act in enumerate(actions)" class="${i%2 and 'odd' or 'even'}" >
-                                       <td></td>
                                        <td py:content="act.date_created"></td>
                                        <td py:if="act.hostname is not None" nowrap="true" >
                                                <a class="ext-link" href="${plc_node_uri(act.hostname)}">
@@ -58,7 +57,7 @@ def zabbix_event_ack_link(eventid):
                                        <td><a class="ext-link" href="${plc_mail_uri(act.message_id)}">
                                                        <span py:if="act.message_id != 0" class="icon">${act.message_id}</span></a></td>
                                        <td py:if="'bootmanager' in act.action_type or 'unknown' in act.action_type">
-                                               <a href="/monitorlog/bm.${act.hostname}.log">latest bm log</a>
+                                               <a href="/monitorlog/bm.${act.hostname}.log">bm log before action</a>
                                        </td>
                                        <td py:if="'bootmanager' not in act.action_type">
                                                <pre py:content="act.error_string"></pre></td>
index d54e918..25621fb 100644 (file)
@@ -26,17 +26,17 @@ def zabbix_event_ack_link(eventid):
                <tbody>
                <tr>
                <td colspan="5">
-               <table id="sortable_table" class="datagrid" border="1" width="100%">
+               <table id="actionsummarylist" cellpadding="0" border="0" class="plekit_table sortable-onload-0 colstyle-alt no-arrow paginationcallback-actionsummarylist_paginator max-pages-10 paginate-50" >
                        <thead>
                                <tr>
-                                       <th mochi:format="int"></th>
-                                       <th>Notice Name</th>
-                                       <th>Count</th>
+                                       <th class="sortable plekit_table">Type</th>
+                                       <th class="sortable plekit_table">Notice Name</th>
+                                       <th class="sortable plekit_table">Count</th>
                                </tr>
                        </thead>
                        <tbody>
                                <tr py:for="key in results.keys()">
-                                       <td></td>
+                                       <td nowrap="true" py:content="'bootman' in key and 'bootmanager' or ( 'notice' in key and 'notice' or ( 'penalty' in key and 'penalty' or 'unknown' ) ) "></td>
                                        <td nowrap="true"><a href="actionlist?action_type=${key}" py:content="key"></a></td>
                                        <td nowrap='true' py:content="results[key]"></td>
                                </tr>