merge from 2.0 branch
author Stephen Soltesz <soltesz@cs.princeton.edu>
Thu, 16 Apr 2009 22:55:29 +0000 (22:55 +0000)
committer Stephen Soltesz <soltesz@cs.princeton.edu>
Thu, 16 Apr 2009 22:55:29 +0000 (22:55 +0000)
$ svn merge -r 13112:13116 https://svn.planet-lab.org/svn/Monitor/branches/2.0/

21 files changed:
Monitor.spec
bootman.py
findall.py
findbadpcu.py
monitor-server.init
monitor/database/info/findbad.py
monitor/database/info/history.py
monitor/database/info/interface.py
monitor/scanapi.py
monitor/wrapper/emailTxt.py
monitor/wrapper/plccache.py
nodebad.py
nodeconfig.py
pcubad.py
pcucontrol/models/IPAL.py
policy.py
showlatlon.py
sitebad.py
syncwithplc.py [new file with mode: 0755]
web/MonitorWeb/monitorweb/controllers.py
web/MonitorWeb/monitorweb/templates/nodehistory.kid

index 005e66a..5f08b25 100644 (file)
@@ -53,8 +53,8 @@ Summary: Monitor hooks for the PLC server.
 Group: Applications/System
 
 Requires: python
-Requires: python-sqlalchemy
-Requires: python-elixir
+#Requires: python-sqlalchemy
+#Requires: python-elixir
 
 Requires: openssh-clients
 Requires: perl-libwww-perl
@@ -65,9 +65,9 @@ Requires: nmap
 Requires: PLCWWW >= 4.2
 Requires: bootcd-planetlab-i386 >= 4.2
 
-Requires: zabbix-client
-Requires: zabbix-gui
-Requires: zabbix-server
+#Requires: zabbix-client
+#Requires: zabbix-gui
+#Requires: zabbix-server
 
 %description server
 The server side include all python modules and scripts needed to fully
@@ -202,7 +202,7 @@ rm -rf $RPM_BUILD_ROOT
 php /usr/share/%{name}/zabbix/getdefines.php > %{python_sitearch}/monitor/database/zabbixapi/defines.py
 
 # apply patches to zabbix
-patch -d /var/www/html/zabbix/ -p0 < /usr/share/%{name}/zabbix/zabbix-auto-login.diff
+#patch -d /var/www/html/zabbix/ -p0 < /usr/share/%{name}/zabbix/zabbix-auto-login.diff
 
 #chkconfig --add monitor-server
 #chkconfig monitor-server on
index 1a04ef0..4f8fb54 100755 (executable)
@@ -24,6 +24,7 @@ from monitor import const
 from monitor.model import *
 from monitor.common import email_exception, found_within
 from monitor.database.info.model import *
+from monitor.database.info.interface import *
 from monitor.wrapper import plc
 from monitor.wrapper import plccache
 from monitor.wrapper.emailTxt import mailtxt
@@ -59,6 +60,7 @@ class NodeConnection:
                        traceback.print_exc()
                        print self.c.modules.sys.path
                except:
+                       email_exception()
                        traceback.print_exc()
 
                return "unknown"
@@ -71,7 +73,8 @@ class NodeConnection:
 
        def get_bootmanager_log(self):
                download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node)
-               os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
+               #os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
+               os.system("cp log/bm.%s.log.gz log/bm.%s.log" % (self.node, self.node))
                log = open("log/bm.%s.log" % self.node, 'r')
                return log
 
@@ -863,7 +866,11 @@ def main():
                sys.exit(1)
 
        for node in nodes:
-               reboot(node, config)
+               # get sitehist
+               lb = plccache.plcdb_hn2lb[node]
+               sitehist = SiteInterface.get_or_make(loginbase=lb)
+               #reboot(node, config)
+               restore(sitehist, node, config=None, forced_action=None)
 
 if __name__ == "__main__":
        main()
index 64c4987..e96c1c4 100755 (executable)
@@ -7,6 +7,8 @@ from sitebad import main as sitebad_main
 from nodebad import main as nodebad_main
 from pcubad import main as pcubad_main
 from monitor.wrapper import plccache
+from monitor.database.info.model import  *
+from monitor.common import  *
 import sys
 
 if __name__ == '__main__':
@@ -29,20 +31,34 @@ if __name__ == '__main__':
        cfg = parsermodule.parse_args(parser)
 
        try:
-               print "sync with plc"
-               plccache.sync()
                print "findbad"
                findbad_main()
+               print "befor-len: ", len( [ i for i in session] )
+               session.flush(); session.clear()
+               print "after-len: ", len( [ i for i in session] )
                print "findbadpcu"
                findbadpcu_main()
+               print "befor-len: ", len( [ i for i in session] )
+               session.flush(); session.clear()
+               print "after-len: ", len( [ i for i in session] )
                print "nodebad"
                nodebad_main()
+               print "befor-len: ", len( [ i for i in session] )
+               session.flush(); session.clear()
+               print "after-len: ", len( [ i for i in session] )
                print "pcubad"
                pcubad_main()
+               print "befor-len: ", len( [ i for i in session] )
+               session.flush(); session.clear()
+               print "after-len: ", len( [ i for i in session] )
                print "sitebad"
                sitebad_main()
+               print "befor-len: ", len( [ i for i in session] )
+               session.flush(); session.clear()
+               print "after-len: ", len( [ i for i in session] )
        except Exception, err:
                import traceback
+               email_exception()
                print traceback.print_exc()
                print "Exception: %s" % err
                print "Saving data... exitting."
index ab4f5ff..9eb3be7 100755 (executable)
@@ -103,10 +103,9 @@ def main():
                l_pcus = [pcu for pcu in sets.Set(pcus)]
 
        elif config.node is not None:
-               l_nodes = plcacche.GetNodeByName(config.node)
-               pcus = []
-               for node in l_nodes:
-                       pcus += node['pcu_ids']
+               node = plccache.GetNodeByName(config.node)
+               print node
+               pcus = node['pcu_ids']
                # clear out dups.
                l_pcus = [pcu for pcu in sets.Set(pcus)]
 
index b627c17..12193da 100644 (file)
@@ -364,8 +364,8 @@ case "$1" in
                check_monitor_conf
                check_monitor_schema_and_data
 
-               check_pg_hba $ZABBIX_DB_NAME $ZABBIX_DB_USER
-               check_user_and_db $ZABBIX_DB_NAME $ZABBIX_DB_USER
+               #check_pg_hba $ZABBIX_DB_NAME $ZABBIX_DB_USER
+               #check_user_and_db $ZABBIX_DB_NAME $ZABBIX_DB_USER
 
                if [ -n "$WROTE_PG_CONFIG" ] ; then
                        # NOTE: restart db to enable access by users granted above.
@@ -375,8 +375,8 @@ case "$1" in
                        dialog "$MESSAGE"
                fi
 
-               check_zabbix_schema_and_data
-               check_zabbix_templates_and_import
+               #check_zabbix_schema_and_data
+               #check_zabbix_templates_and_import
 
 
                # create /etc/httpd/conf.d/monitorweb.conf
@@ -390,9 +390,9 @@ case "$1" in
                start_tg_server
 
                # START zabbix services.  SETUP default config files.
-               check_zab_server
-               check_zab_agentd
-               check_zab_webconfig
+               #check_zab_server
+               #check_zab_agentd
+               #check_zab_webconfig
 
                result "$MESSAGE"
        ;;
@@ -442,8 +442,8 @@ case "$1" in
                dialog "$MESSAGE"
 
                stop_tg_server
-               service zabbix_server stop
-               service zabbix_agentd stop
+               #service zabbix_server stop
+               #service zabbix_agentd stop
                # TODO: is there anything to stop?
                result "$MESSAGE"
        ;;
index a5139eb..5e38aca 100644 (file)
@@ -11,46 +11,18 @@ __metadata__ = mon_metadata
 __session__  = mon_session
 
 
-#class FindbadNodeRecordSync(Entity):
-#      hostname = Field(String(250),primary_key=True) #,alternateMethodName='by_hostname')
-#      round    = Field(Int,default=0)
-       
-#class FindbadPCURecordSync(Entity):
-#      plc_pcuid = Field(Int,primary_key=True) #,alternateMethodName='by_pcuid')
-#      round     = Field(Int,default=0)
-
 class FindbadNodeRecord(Entity):
        @classmethod
        def get_all_latest(cls):
                return cls.query.all()
-               #fbsync = FindbadNodeRecordSync.get_by(hostname="global")
-               #if fbsync:
-               #       return cls.query.filter_by(round=fbsync.round)
-               #else:
-               #       return []
 
        @classmethod
        def get_latest_by(cls, **kwargs):
                return cls.query.filter_by(**kwargs).first()
-               #fbsync = FindbadNodeRecordSync.get_by(hostname="global")
-               #if fbsync:
-               #       kwargs['round'] = fbsync.round
-               #       return cls.query.filter_by(**kwargs).order_by(FindbadNodeRecord.date_checked.desc())
-               #else:
-               #       return []
 
        @classmethod
        def get_latest_n_by(cls, n=3, **kwargs):
                return cls.query.filter_by(**kwargs)
-               #fbsync = FindbadNodeRecordSync.get_by(hostname="global")
-               #kwargs['round'] = fbsync.round
-               #ret = []
-               #for i in range(0,n):
-               #       kwargs['round'] = kwargs['round'] - i
-               #       f = cls.query.filter_by(**kwargs).first()
-               #       if f:
-               #               ret.append(f)
-               #return ret
 
 # ACCOUNTING
        date_checked = Field(DateTime,default=datetime.now)
@@ -99,7 +71,7 @@ class FindbadPCURecord(Entity):
 # ACCOUNTING
        date_checked = Field(DateTime)
        round = Field(Int,default=0)
-       plc_pcuid = Field(Int) #alternateID=True,alternateMethodName='by_pcuid')
+       plc_pcuid = Field(Int)
 
 # EXTERNAL
        plc_pcu_stats = Field(PickleType,default=None)
index 3c5842a..6d2ed83 100644 (file)
@@ -15,6 +15,7 @@ class HistoryNodeRecord(Entity):
        last_checked = Field(DateTime,default=datetime.now)
        last_changed = Field(DateTime,default=datetime.now)
        status = Field(String,default="unknown")
+       haspcu = Field(Boolean,default=False)
        acts_as_versioned(ignore=['last_changed', 'last_checked'])
 
        @classmethod
index 2e5064d..29b19be 100644 (file)
@@ -193,6 +193,6 @@ class SiteInterface(HistorySiteRecord):
                act = ActionRecord(loginbase=self.db.loginbase,
                                                        hostname=hostname,
                                                        action='reboot',
-                                                       action_type='first_try_reboot',
+                                                       action_type='try_reboot',
                                                        error_string=err)
 
index 963822d..f7939e6 100644 (file)
@@ -20,7 +20,7 @@ from monitor.sources import comon
 from monitor.wrapper import plc, plccache
 
 import traceback
-from monitor.common import nmap_port_status
+from monitor.common import nmap_port_status, email_exception
 
 COMON_COTOPURL= "http://summer.cs.princeton.edu/status/tabulator.cgi?" + \
                        "table=table_nodeview&" + \
@@ -157,6 +157,7 @@ class ScanInterface(object):
 
                except:
                        print "ERROR:"
+                       email_exception(nodename)
                        print traceback.print_exc()
                        pass
 
@@ -334,9 +335,10 @@ EOF                                """)
                        plc_lock.acquire()
                        d_node = None
                        try:
-                               d_node = plc.getNodes({'hostname': nodename}, ['pcu_ids', 'site_id', 
-                                                                               'date_created', 'last_updated', 
-                                                                               'last_contact', 'boot_state', 'nodegroup_ids'])[0]
+                               d_node = plccache.GetNodeByName(nodename)
+                               #d_node = plc.getNodes({'hostname': nodename}, ['pcu_ids', 'site_id', 
+                               #                                               'date_created', 'last_updated', 
+                               #                                               'last_contact', 'boot_state', 'nodegroup_ids'])[0]
                        except:
                                traceback.print_exc()
                        plc_lock.release()
@@ -363,8 +365,9 @@ EOF                         """)
                        d_site = None
                        values['loginbase'] = ""
                        try:
-                               d_site = plc.getSites({'site_id': site_id}, 
-                                                                       ['max_slices', 'slice_ids', 'node_ids', 'login_base'])[0]
+                               d_site = plccache.GetSitesById([ site_id ])[0]
+                               #d_site = plc.getSites({'site_id': site_id}, 
+                               #                                       ['max_slices', 'slice_ids', 'node_ids', 'login_base'])[0]
                                values['loginbase'] = d_site['login_base']
                        except:
                                traceback.print_exc()
index 220eb10..b50be5b 100644 (file)
@@ -207,6 +207,18 @@ ERROR-        This is an error state, where there is absolutely no contact
            with PlanetLab.
        """)
 
+       pcumissing_notice =("""MONTEST: No PCU available to reboot %(hostname)s""",
+"""As part of PlanetLab node monitoring and maintenance, we noticed that there is no PCU
+associated with %(hostname)s, so we could not reboot it ourselves.
+
+To save you time in the future, please take a moment to register the PCU functionality for
+your machines here:
+
+    http://www.planet-lab.org/db/sites/pcu.php
+
+Thank you very much for your help,
+  -- PlanetLab Central (support@planet-lab.org)
+""")
        pcufailed_notice =("""MONTEST: Could not use PCU to reboot %(hostname)s""",
 
 """As part of PlanetLab node monitoring and maintenance, we tried to use the PCU
@@ -244,7 +256,11 @@ If any action is needed from you, you will recieve additional notices.  Thank yo
 This notice is simply to let you know that:
     %(hostname)s
 
-is down, disconnected from the network and/or non-operational.  Please investigate, thank you very much for your help!
+is down, disconnected from the network and/or non-operational.  
+
+Please investigate, thank you very much for your help!
+
+       http://monitor.planet-lab.org:8082/pcuview?loginbase=%(loginbase)s
        """)
 
        clear_penalty=("""MONTEST: All penalties have been cleared from site %(loginbase)s""",
index 0645b18..75ca49b 100755 (executable)
@@ -3,6 +3,7 @@
 import sys
 from monitor.wrapper import plc
 from monitor.database.info.model import *
+import profile
 
 def dsites_from_lsites(l_sites):
        d_sites = {}
@@ -67,17 +68,22 @@ def init():
        global plcdb_hn2lb
        global plcdb_lb2hn
        global plcdb_id2lb
+       print "initing plccache"
 
        dbsites = PlcSite.query.all()
        l_sites = [ s.plc_site_stats for s in dbsites ]
 
+       print "plcnode"
        dbnodes = PlcNode.query.all()
        l_nodes = [ s.plc_node_stats for s in dbnodes ]
 
+       print "plcpcu"
        dbpcus = PlcPCU.query.all()
        l_pcus = [ s.plc_pcu_stats for s in dbpcus ]
 
+       print "dsites_from_lsites"
        (d_sites,id2lb) = dsites_from_lsites(l_sites)
+       print "dsn_from_dsln"
        (plcdb, hn2lb, lb2hn) = dsn_from_dsln(d_sites, id2lb, l_nodes)
 
        plcdb_hn2lb = hn2lb
@@ -108,14 +114,31 @@ def GetSitesByName(sitelist):
                ret.append(site.plc_site_stats)
        return ret
 
+def GetSitesById(idlist):
+       ret = []
+       for site_id in idlist:
+               site = PlcSite.get_by(site_id=site_id)
+               ret.append(site.plc_site_stats)
+       return ret
+
+def deleteExtra(l_plc, objectClass=PlcSite, dbKey='loginbase', plcKey='login_base'):
+       dbobjs = objectClass.query.all()
+       dbobj_key = [ getattr(s, dbKey) for s in dbobjs ]
+       plcobj_key = [ s[plcKey] for s in l_plc ]
+       extra_key = set(dbobj_key) - set(plcobj_key)
+       for obj in extra_key:
+               print "deleting %s" % obj
+               dbobj = objectClass.get_by(**{dbKey : obj})
+               dbobj.delete()
+
 def sync():
        l_sites = plc.api.GetSites({'peer_id':None}, 
                                                ['login_base', 'site_id', 'abbreviated_name', 'latitude', 
                                                'longitude', 'max_slices', 'slice_ids', 'node_ids', 
                                                'enabled', 'date_created' ])
        l_nodes = plc.api.GetNodes({'peer_id':None}, 
-                                               ['hostname', 'node_id', 'ports', 'site_id', 
-                                                'version', 'last_updated', 'date_created', 
+                                               ['hostname', 'node_id', 'ports', 'site_id', 'boot_state',
+                                                'version', 'last_updated', 'date_created', 'key',
                                                 'last_contact', 'pcu_ids', 'nodenetwork_ids'])
        l_pcus = plc.api.GetPCUs()
 
@@ -125,8 +148,17 @@ def sync():
                dbsite.loginbase = site['login_base']
                dbsite.date_checked = datetime.now()
                dbsite.plc_site_stats = site
-               #dbsite.flush()
-       # TODO: delete old records.
+       deleteExtra(l_sites, PlcSite, 'loginbase', 'login_base')
+       deleteExtra(l_sites, HistorySiteRecord, 'loginbase', 'login_base')
+       session.flush()
+
+       print "sync pcus"
+       for pcu in l_pcus:
+               dbpcu = PlcPCU.findby_or_create(pcu_id=pcu['pcu_id'])
+               dbpcu.date_checked = datetime.now()
+               dbpcu.plc_pcu_stats = pcu
+       deleteExtra(l_pcus, PlcPCU, 'pcu_id', 'pcu_id')
+       deleteExtra(l_pcus, HistoryPCURecord, 'plc_pcuid', 'pcu_id')
        session.flush()
 
        print "sync nodes"
@@ -135,17 +167,8 @@ def sync():
                dbnode.hostname = node['hostname']
                dbnode.date_checked = datetime.now()
                dbnode.plc_node_stats = node
-               #dbnode.flush()
-       # TODO: delete old records.
-       session.flush()
-
-       print "sync pcus"
-       for pcu in l_pcus:
-               dbpcu = PlcPCU.findby_or_create(pcu_id=pcu['pcu_id'])
-               dbpcu.date_checked = datetime.now()
-               dbpcu.plc_pcu_stats = pcu
-               #dbpcu.flush()
-       # TODO: delete old records.
+       deleteExtra(l_nodes, PlcNode, 'hostname', 'hostname')
+       deleteExtra(l_nodes, HistoryNodeRecord, 'hostname', 'hostname')
        session.flush()
 
        init()
@@ -153,6 +176,6 @@ def sync():
        return
 
 if __name__ == '__main__':
-       sync()
+       profile.run('sync()')
 else:
        init()
index 46ca879..c3aae39 100755 (executable)
@@ -38,6 +38,7 @@ def check_node_state(rec, node):
 
        node_state = rec.observed_status
        if rec.plc_node_stats:
+               print rec.plc_node_stats
                boot_state = rec.plc_node_stats['boot_state']
                last_contact = rec.plc_node_stats['last_contact']
        else:
@@ -47,6 +48,11 @@ def check_node_state(rec, node):
        if boot_state == 'disable': boot_state = 'disabled'
        if boot_state == 'diag':        boot_state = 'diagnose'
 
+       if len(rec.plc_node_stats['pcu_ids']) > 0:
+               node.haspcu = True
+       else:
+               node.haspcu = False
+
        # NOTE: 'DOWN' and 'DEBUG'  are temporary states, so only need
        #                       'translations' into the node.status state
        #               'BOOT' is a permanent state, but we want it to have a bit of
@@ -131,6 +137,7 @@ def checkAndRecordState(l_nodes, l_plcnodes):
                except:
                        print "COULD NOT FIND %s" % nodename
                        import traceback
+                       email_exception()
                        print traceback.print_exc()
                        continue
 
@@ -143,11 +150,8 @@ def checkAndRecordState(l_nodes, l_plcnodes):
                count += 1
                print "%d %35s %s since(%s)" % (count, nodename, nodehist.status, diff_time(time.mktime(nodehist.last_changed.timetuple())))
 
-       # NOTE: this commits all pending operations to the DB.  Do not remove, or
-       # replace with another operations that also commits all pending ops, such
-       # as session.commit() or flush() or something
+       # NOTE: this commits all pending operations to the DB.  Do not remove. 
        session.flush()
-       print HistoryNodeRecord.query.count()
 
        return True
 
index 788d7f8..3fe9a84 100755 (executable)
@@ -56,6 +56,7 @@ def main():
                        #       print k, "==" , net[k]
                except:
                        print "Error with %s" % node
+                       email_exception()
                        import traceback; print traceback.print_exc()
                        pass
 
index 9f0468c..59dfe7a 100755 (executable)
--- a/pcubad.py
+++ b/pcubad.py
@@ -40,10 +40,8 @@ def main2(config):
                l_pcus = [pcu for pcu in sets.Set(pcus)]
 
        elif config.node:
-               l_nodes = plccache.GetNodeByName(config.node)
-               pcus = []
-               for node in l_nodes:
-                       pcus += node['pcu_ids']
+               node = plccache.GetNodeByName(config.node)
+               pcus = node['pcu_ids']
                # clear out dups.
                l_pcus = [pcu for pcu in sets.Set(pcus)]
 
@@ -117,6 +115,7 @@ def checkAndRecordState(l_pcus, l_plcpcus):
                except:
                        print "COULD NOT FIND FB record for %s" % reboot.pcu_name(d_pcu)
                        import traceback
+                       email_exception()
                        print traceback.print_exc()
                        # don't have the info to create a new entry right now, so continue.
                        continue 
index 48394df..a2ea026 100644 (file)
@@ -17,7 +17,7 @@ class IPAL(PCUControl):
 
                try:
                        # TODO: make sleep backoff, before stopping.
-                       time.sleep(4)
+                       time.sleep(8)
                        ret = s.recv(count, socket.MSG_DONTWAIT)
                except socket.error, e:
                        if e[0] == errno.EAGAIN:
index 4befbd9..43b37ca 100755 (executable)
--- a/policy.py
+++ b/policy.py
@@ -47,6 +47,7 @@ def main(hostnames, sitenames):
                        lb = plccache.plcdb_hn2lb[host]
                except:
                        print "unknown host in plcdb_hn2lb %s" % host
+                       email_exception(host)
                        continue
 
                nodeblack = BlacklistRecord.get_by(hostname=host)
@@ -64,35 +65,46 @@ def main(hostnames, sitenames):
                print "%s %s %s" % (i, nodehist.hostname, nodehist.status)
                if nodehist.status == 'good' and \
                        changed_lessthan(nodehist.last_changed, 1.0) and \
+                       found_within(recent_actions, 'down_notice', 7.0) and \
                        not found_within(recent_actions, 'online_notice', 0.5):
+                           # NOTE: searching for down_notice proves that the node has
+                               #               gone through a 'down' state first, rather than just
+                               #               flapping through: good, offline, online, ...
+                               #       
                                # NOTE: there is a narrow window in which this command must be
-                               # evaluated, otherwise the notice will not go out.  this is not ideal.
+                               #               evaluated, otherwise the notice will not go out.  
+                               #               this is not ideal.
                                sitehist.sendMessage('online_notice', hostname=host, viart=False)
                                print "send message for host %s online" % host
 
-                               pass
 
-               if ( nodehist.status == 'offline' or nodehist.status == 'down' ) and \
+               # if a node is offline and doesn't have a PCU, remind the user that they should have one.
+               if not nodehist.haspcu and nodehist.status in ['offline', 'down'] and \
                        changed_greaterthan(nodehist.last_changed,1.0) and \
-                       not found_between(recent_actions, 'first_try_reboot', 3.5, 1):
+                       not found_within(recent_actions, 'pcumissing_notice', 7.0):
+
+                               sitehist.sendMessage('pcumissing_notice', hostname=host)
+                               print "send message for host %s pcumissing_notice" % host
+
+               # if it is offline and HAS a PCU, then try to use it.
+               if nodehist.haspcu and nodehist.status in ['offline', 'down'] and \
+                       changed_greaterthan(nodehist.last_changed,1.0) and \
+                       not found_between(recent_actions, 'try_reboot', 3.5, 1):
 
                                sitehist.attemptReboot(host)
-                               print "send message for host %s first_try_reboot" % host
-                               pass
+                               print "send message for host %s try_reboot" % host
 
-               # NOTE: non-intuitive is that found_between(first_try_reboot, 3.5, 1)
+               # NOTE: non-intuitive is that found_between(try_reboot, 3.5, 1)
                #               will be false for a day after the above condition is satisfied
-               if ( nodehist.status == 'offline' or nodehist.status == 'down' ) and \
+               if nodehist.haspcu and nodehist.status in ['offline', 'down'] and \
                        changed_greaterthan(nodehist.last_changed,1.5) and \
-                       found_between(recent_actions, 'first_try_reboot', 3.5, 1) and \
+                       found_between(recent_actions, 'try_reboot', 3.5, 1) and \
                        not found_within(recent_actions, 'pcufailed_notice', 3.5):
-                       # found_within(recent_actions, 'first_try_reboot', 3.5) and \
                                
                                # send pcu failure message
                                #act = ActionRecord(**kwargs)
                                sitehist.sendMessage('pcufailed_notice', hostname=host)
                                print "send message for host %s PCU Failure" % host
-                               pass
 
                if nodehist.status == 'monitordebug' and \
                        changed_greaterthan(nodehist.last_changed, 1) and \
@@ -111,9 +123,10 @@ def main(hostnames, sitenames):
 
                                sitehist.sendMessage('down_notice', hostname=host)
                                print "send message for host %s down" % host
-                               pass
 
                node_count = node_count + 1
+               print "time: ", time.strftime('%Y-%m-%d %H:%M:%S')
+               sys.stdout.flush()
                session.flush()
 
        for i,site in enumerate(sitenames):
@@ -158,13 +171,16 @@ def main(hostnames, sitenames):
                # find all ticket ids for site ( could be on the site record? )
                # determine if there are penalties within the last 30 days?
                # if so, add a 'pause_penalty' action.
-               if sitehist.db.message_id != 0 and sitehist.db.message_status == 'open' and sitehist.db.penalty_level > 0:
+               if sitehist.db.message_id != 0 and sitehist.db.message_status == 'open' and \
+                       sitehist.db.penalty_level > 0 and not found_within(recent_actions, 'pause_penalty', 30):
                        #       pause escalation
                        print "Pausing penalties for %s" % site
                        sitehist.pausePenalty()
 
                site_count = site_count + 1
 
+               print "time: ", time.strftime('%Y-%m-%d %H:%M:%S')
+               sys.stdout.flush()
                session.flush()
 
        session.flush()
@@ -227,6 +243,7 @@ if __name__ == "__main__":
 
        try:
                main(hostnames, sitenames)
+               session.flush()
        except KeyboardInterrupt:
                print "Killed by interrupt"
                session.flush()
index 951802a..2176462 100755 (executable)
@@ -9,7 +9,7 @@ from datetime import datetime, timedelta
 
 import database
 import comon
-from monitor.common import color_pcu_state, datetime_fromstr
+from monitor.common import color_pcu_state, datetime_fromstr, email_exception
 from nodehistory import get_filefromglob
 import time
 import traceback
@@ -211,3 +211,5 @@ if __name__ == "__main__":
                main()
        except IOError:
                pass
+       except:
+               email_exception()
index 4d9ee33..6c09c1c 100755 (executable)
@@ -44,6 +44,8 @@ def getnodesup(nodelist):
        up = 0
        for node in nodelist:
                try:
+                       # NOTE: adding a condition for nodehist.haspcu would include pcus
+                       #               in the calculation
                        nodehist = HistoryNodeRecord.findby_or_create(hostname=node['hostname'])
                        nodebl   = BlacklistRecord.get_by(hostname=node['hostname'])
                        if (nodehist is not None and nodehist.status != 'down') or \
@@ -51,6 +53,7 @@ def getnodesup(nodelist):
                                up = up + 1
                except:
                        import traceback
+                       email_exception(node['hostname'])
                        print traceback.print_exc()
        return up
 
diff --git a/syncwithplc.py b/syncwithplc.py
new file mode 100755 (executable)
index 0000000..af01841
--- /dev/null
@@ -0,0 +1,6 @@
+#!/usr/bin/python
+
+from monitor.wrapper import plccache
+
+if __name__ == "__main__":
+       plccache.sync()
index 1c4efe9..7cbaf4f 100644 (file)
@@ -315,7 +315,9 @@ class Root(controllers.RootController, MonitorXmlrpcServer):
        @exception_handler(nodeaction_handler,"isinstance(tg_exceptions,RuntimeError)")
        def pcuview(self, loginbase=None, pcuid=None, hostname=None, **data):
                print "PCUVIEW------------------"
-               session.clear()
+               print "befor-len: ", len( [ i for i in session] )
+               session.flush(); session.clear()
+               print "after-len: ", len( [ i for i in session] )
                sitequery=[]
                pcuquery=[]
                nodequery=[]
@@ -333,7 +335,7 @@ class Root(controllers.RootController, MonitorXmlrpcServer):
 
                if loginbase:
                        actions = ActionRecord.query.filter_by(loginbase=loginbase
-                                                       ).filter(ActionRecord.date_created >= datetime.now() - timedelta(7)
+                                                       ).filter(ActionRecord.date_created >= datetime.now() - timedelta(14)
                                                        ).order_by(ActionRecord.date_created.desc())
                        actions = [ a for a in actions ]
                        sitequery = [HistorySiteRecord.by_loginbase(loginbase)]
@@ -387,13 +389,21 @@ class Root(controllers.RootController, MonitorXmlrpcServer):
        def nodehistory(self, hostname=None):
                query = []
                if hostname:
-                       fbnode = FindbadNodeRecord.get_by(hostname=hostname)
-                       # TODO: add links for earlier history if desired.
+                       #fbnode = FindbadNodeRecord.get_by(hostname=hostname)
+                       ## TODO: add links for earlier history if desired.
+                       #l = fbnode.versions[-100:]
+                       #l.reverse()
+                       #for node in l:
+                       #       prep_node_for_display(node)
+                       #       query.append(node)
+
+                       fbnode = HistoryNodeRecord.get_by(hostname=hostname)
                        l = fbnode.versions[-100:]
                        l.reverse()
                        for node in l:
-                               prep_node_for_display(node)
+                               #prep_node_for_display(node)
                                query.append(node)
+
                return dict(query=query, hostname=hostname)
 
        @expose(template="monitorweb.templates.sitehistory")
index 8fa825b..a0ab370 100644 (file)
@@ -44,10 +44,14 @@ from links import *
                                                                py:content="node.pcu_short_status">Reboot Status</span>
                                                </div>
                                        </td-->
-                                       <td id="node-${node.observed_status}" nowrap="true">
+                                       <!--td id="node-${node.observed_status}" nowrap="true">
+                                               <a target="_top" href="${link('pcuview', hostname=node.hostname)}" py:content="node.hostname">your.host.org</a></td-->
+                                       <!--td nowrap="true" py:content="node.kernel"></td-->
+                                       <!--td py:content="node.date_checked"></td-->
+                                       <td py:content="node.last_checked"></td>
+                                       <td nowrap="true">
                                                <a target="_top" href="${link('pcuview', hostname=node.hostname)}" py:content="node.hostname">your.host.org</a></td>
-                                       <td nowrap="true" py:content="node.kernel"></td>
-                                       <td py:content="node.date_checked"></td>
+                                       <td py:content="node.status"></td>
                                </tr>
                        </tbody>
                </table>