updated the model for actions, site history
author Stephen Soltesz <soltesz@cs.princeton.edu>
Sat, 21 Mar 2009 00:13:35 +0000 (00:13 +0000)
committer Stephen Soltesz <soltesz@cs.princeton.edu>
Sat, 21 Mar 2009 00:13:35 +0000 (00:13 +0000)
added email messages for new sample policy.py
enhanced web view to show penalties
updated model in nodebad, pcubad, sitebad

15 files changed:
findall.py
monitor/common.py
monitor/database/info/action.py
monitor/database/info/findbad.py
monitor/database/info/history.py
monitor/reboot.py
monitor/wrapper/emailTxt.py
monitor/wrapper/plccache.py
nodebad.py
pcubad.py
sitebad.py
web/MonitorWeb/monitorweb/controllers.py
web/MonitorWeb/monitorweb/static/css/style.css
web/MonitorWeb/monitorweb/templates/pcuview.kid
web/MonitorWeb/monitorweb/templates/sitelist.kid

index 26335e0..41e23c6 100755 (executable)
@@ -4,6 +4,8 @@ from monitor import parser as parsermodule
 from findbad import main as findbad_main
 from findbadpcu import main as findbadpcu_main
 from sitebad import main as sitebad_main
 from findbad import main as findbad_main
 from findbadpcu import main as findbadpcu_main
 from sitebad import main as sitebad_main
+from nodebad import main as nodebad_main
+from pcubad import main as pcubad_main
 import sys
 
 if __name__ == '__main__':
 import sys
 
 if __name__ == '__main__':
@@ -11,7 +13,7 @@ if __name__ == '__main__':
        parser = parsermodule.getParser(['nodesets'])
 
        parser.set_defaults( increment=False, dbname="findbad", cachenodes=False, 
        parser = parsermodule.getParser(['nodesets'])
 
        parser.set_defaults( increment=False, dbname="findbad", cachenodes=False, 
-                                               force=False, pcuselect=None, pcuid=None)
+                                               force=False, pcuselect=None, pcuid=None, pcu=None)
        parser.add_option("", "--cachenodes", action="store_true",
                                                help="Cache node lookup from PLC")
        parser.add_option("", "--dbname", dest="dbname", metavar="FILE", 
        parser.add_option("", "--cachenodes", action="store_true",
                                                help="Cache node lookup from PLC")
        parser.add_option("", "--dbname", dest="dbname", metavar="FILE", 
@@ -26,8 +28,15 @@ if __name__ == '__main__':
        cfg = parsermodule.parse_args(parser)
 
        try:
        cfg = parsermodule.parse_args(parser)
 
        try:
+               print "findbad"
                findbad_main()
                findbad_main()
+               print "findbadpcu"
                findbadpcu_main()
                findbadpcu_main()
+               print "nodebad"
+               nodebad_main()
+               print "pcubad"
+               pcubad_main()
+               print "sitebad"
                sitebad_main()
        except Exception, err:
                import traceback
                sitebad_main()
        except Exception, err:
                import traceback
index be4a171..6f88051 100644 (file)
@@ -1,13 +1,12 @@
 
 import time
 import struct
 
 import time
 import struct
-from pcucontrol import reboot
-
+from monitor import reboot
 from monitor import util
 from monitor import database
 from monitor.wrapper import plc, plccache
 
 from monitor import util
 from monitor import database
 from monitor.wrapper import plc, plccache
 
-from datetime import datetime 
+from datetime import datetime, timedelta
 from monitor.model import PersistFlags, Message
 
 esc = struct.pack('i', 27)
 from monitor.model import PersistFlags, Message
 
 esc = struct.pack('i', 27)
@@ -222,3 +221,20 @@ def email_exception(content=None):
     m=Message("exception running monitor", msg, False)
     m.send([config.cc_email])
     return
     m=Message("exception running monitor", msg, False)
     m.send([config.cc_email])
     return
+
+def changed_lessthan(last_changed, days):
+       if datetime.now() - last_changed <= timedelta(days):
+               print "last changed less than %s" % timedelta(days)
+               return True
+       else:
+               print "last changed more than %s" % timedelta(days)
+               return False
+
+def changed_greaterthan(last_changed, days):
+       if datetime.now() - last_changed > timedelta(days):
+               print "last changed more than %s" % timedelta(days)
+               return True
+       else:
+               print "last changed less than %s" % timedelta(days)
+               return False
+       
index 2569e35..77e904c 100644 (file)
@@ -47,8 +47,27 @@ class ActionRecord(Entity):
 
 # ACCOUNTING
        date_created = Field(DateTime,default=datetime.now)
 
 # ACCOUNTING
        date_created = Field(DateTime,default=datetime.now)
+       loginbase = Field(String,default=None)
        hostname = Field(String,default=None)
        hostname = Field(String,default=None)
-       loginbase = Field(String)
+       # NOTE:
+       #       the expected kinds of actions are:
+       #               * reboot node
+       #               * open ticket, send notice 
+       #               * close ticket
+       #               * apply penalty to site
+       #               * backoff penalty to site
+       action = Field(String)
+
+       # NOTE: describes the kind of action.  i.e. online-notice, offline-notice,
+       # reboot-first-try, reboot-second-try, penalty-pause, penalty-warning, penalty-no-create,
+       # penalty-disable-slices, 
+       action_type = Field(String, default=None)
+
+       message_id = Field(Integer, default=0)
+       penalty_level = Field(Integer, default=0)
+
+       # NOTE: in case an exception is thrown while trying to perform an action.
+       error_string = Field(String, default=None)
 
        #issue = ManyToOne('IssueRecord')
        # NOTE: this is the parent relation to fb records.  first create the
 
        #issue = ManyToOne('IssueRecord')
        # NOTE: this is the parent relation to fb records.  first create the
@@ -61,15 +80,15 @@ class ActionRecord(Entity):
        #  OR
        #    - find fbnode records
        #    - create action record with fbnodes as argument
        #  OR
        #    - find fbnode records
        #    - create action record with fbnodes as argument
-       findbad_records = OneToMany('FindbadNodeRecord', order_by='-date_checked')
+       findbad_records = OneToMany('FindbadNodeRecord', order_by='-date_checked')
 
        # NOTE: can I move 'message_index, escellation_level, and penalty_level'
        #    into the same value?  Maybe not penalty level, since there are only two;
        #    and, there may be additional message and escellation levels.
 
        # NOTE: can I move 'message_index, escellation_level, and penalty_level'
        #    into the same value?  Maybe not penalty level, since there are only two;
        #    and, there may be additional message and escellation levels.
-       send_email_to = Field(PickleType, default=None)
-       action_description = Field(PickleType, default=None)
-       message_arguments = Field(PickleType, default=None)
+       #send_email_to = Field(PickleType, default=None)
+       #action_description = Field(PickleType, default=None)
+       #message_arguments = Field(PickleType, default=None)
 
        # NOTE: not sure this needs to be in the db.
 
        # NOTE: not sure this needs to be in the db.
-       escellation_level = Field(Integer, default=0)
-       stage = Field(String, default=None)
+       #escellation_level = Field(Integer, default=0)
+       #stage = Field(String, default=None)
index e58ef3a..66859b1 100644 (file)
@@ -80,7 +80,7 @@ class FindbadNodeRecord(Entity):
        observed_status = Field(String,default=None)
 
        # NOTE: this is the child relation
        observed_status = Field(String,default=None)
 
        # NOTE: this is the child relation
-       action = ManyToOne('ActionRecord', required=False)
+       #action = ManyToOne('ActionRecord', required=False)
 
 class FindbadPCURecord(Entity):
        @classmethod
 
 class FindbadPCURecord(Entity):
        @classmethod
index dc53860..e31be2e 100644 (file)
@@ -50,6 +50,13 @@ class HistorySiteRecord(Entity):
 
        status = Field(String,default="unknown")
 
 
        status = Field(String,default="unknown")
 
+       message_id = Field(Int, default=0)
+       message_status = Field(String, default=None)
+       message_queue = Field(String, default=None) 
+       message_created = Field(DateTime, default=None)
+
+       penalty_level = Field(Int, default=0)
+
        @classmethod
        def by_loginbase(cls, loginbase):
                return cls.query.filter_by(loginbase=loginbase).first()
        @classmethod
        def by_loginbase(cls, loginbase):
                return cls.query.filter_by(loginbase=loginbase).first()
index a34c16c..289fb47 100755 (executable)
@@ -67,13 +67,13 @@ def reboot_str(nodename):
        if not pcu:
                logger.debug("no pcu for %s" % nodename)
                print "no pcu for %s" % nodename
        if not pcu:
                logger.debug("no pcu for %s" % nodename)
                print "no pcu for %s" % nodename
-               return False # "%s has no pcu" % nodename
+               return "%s has no pcu" % nodename
 
        values = get_pcu_values(pcu['pcu_id'])
        if values == None:
                logger.debug("No values for pcu probe %s" % nodename)
                print "No values for pcu probe %s" % nodename
 
        values = get_pcu_values(pcu['pcu_id'])
        if values == None:
                logger.debug("No values for pcu probe %s" % nodename)
                print "No values for pcu probe %s" % nodename
-               return False #"no info for pcu_id %s" % pcu['pcu_id']
+               return "no info for pcu_id %s" % pcu['pcu_id']
        
        # Try the PCU first
        logger.debug("Trying PCU %s %s" % (pcu['hostname'], pcu['model']))
        
        # Try the PCU first
        logger.debug("Trying PCU %s %s" % (pcu['hostname'], pcu['model']))
index d1bccaa..675068a 100644 (file)
@@ -207,6 +207,65 @@ ERROR-        This is an error state, where there is absolutely no contact
            with PlanetLab.
        """)
 
            with PlanetLab.
        """)
 
+       pcufailed_notice =("""Could not use PCU to reboot %(hostname)s""",
+
+"""As part of PlanetLab node monitoring and maintenance, we tried to use the PCU
+registered for %(hostname)s, but could not for some reason.
+
+Please help.
+
+Thank you very much for your help,
+  -- PlanetLab Central (support@planet-lab.org)
+""")
+       online_notice=("""Host %(hostname)s is online""",
+       """
+This notice is simply to let you know that:
+    %(hostname)s
+
+is online and operational.  Thank you very much for your help!
+       """)
+       test_notice=("""Host %(hostname)s is testing""",
+       """
+This notice is simply to test whether notices work.
+    %(hostname)s
+
+Thank you very much for your help!
+       """)
+       offline_notice=("""Host %(hostname)s is offline""",
+       """
+This notice is simply to let you know that:
+    %(hostname)s
+
+is offline and or non-operational.  Please investigate, thank you very much for your help!
+       """)
+
+       clear_penalty=("""All penalties have been cleared from site %(loginbase)s""",
+       """
+This notice is to let you know that any penalties previously applied to your site have 
+been removed: %(penalty_level)s.
+
+All privileges have been restored.  If your slices were disabled, please allow
+up to 30 minutes for them to return to enabled.
+
+Legend:
+
+  0  - no penalties applied
+  1  - site is disabled.  no new slices can be created.
+  2+ - all existing slices will be disabled.
+       """)
+
+       increase_penalty=("""Penalty increased for site %(loginbase)s""",
+       """
+This notice is to let you know that the penalty applied to your site has
+increased: %(penalty_level)s.
+
+legend:
+
+  0  - no penalty applied
+  1  - site is disabled.  no new slices can be created.
+  2+ - all existing slices will be disabled.
+       """)
+
        nmreset =("""NM Reset at %(loginbase)s""",
        """
 Monitor restarted NM on the following machines:
        nmreset =("""NM Reset at %(loginbase)s""",
        """
 Monitor restarted NM on the following machines:
index 3efd791..db71b16 100755 (executable)
@@ -90,7 +90,7 @@ def init():
        api = plc.getCachedAuthAPI()
        l_sites = api.GetSites({'peer_id':None}, 
                                                        ['login_base', 'site_id', 'abbreviated_name', 'latitude', 
        api = plc.getCachedAuthAPI()
        l_sites = api.GetSites({'peer_id':None}, 
                                                        ['login_base', 'site_id', 'abbreviated_name', 'latitude', 
-                                                       'longitude', 'max_slices', 'slice_ids', 'node_ids', 'enabled' ])
+                                                       'longitude', 'max_slices', 'slice_ids', 'node_ids', 'enabled', 'date_created' ])
        l_nodes = api.GetNodes({'peer_id':None}, 
                                                        ['hostname', 'node_id', 'ports', 'site_id', 'version', 'last_updated', 
                                                         'date_created', 'last_contact', 'pcu_ids', 'nodenetwork_ids'])
        l_nodes = api.GetNodes({'peer_id':None}, 
                                                        ['hostname', 'node_id', 'ports', 'site_id', 'version', 'last_updated', 
                                                         'date_created', 'last_contact', 'pcu_ids', 'nodenetwork_ids'])
index 767a4fe..90c3be0 100755 (executable)
@@ -22,33 +22,77 @@ api = plc.getAuthAPI()
 
 round = 1
 count = 0
 
 round = 1
 count = 0
+def main():
+       main2(config)
 
 
-def main(config):
+def main2(config):
 
        l_plcnodes = plccache.l_nodes
        l_nodes = get_nodeset(config)
        
        checkAndRecordState(l_nodes, l_plcnodes)
 
 
        l_plcnodes = plccache.l_nodes
        l_nodes = get_nodeset(config)
        
        checkAndRecordState(l_nodes, l_plcnodes)
 
+# Node states:
+
+def check_node_state(rec, node):
+
+       node_state = rec.observed_status
+       if rec.plc_node_stats:
+               boot_state = rec.plc_node_stats['boot_state']
+               last_contact = rec.plc_node_stats['last_contact']
+       else:
+               boot_state = "unknown"
+               last_contact = None
+
+       if node_state == 'DOWN' and ( node.status == 'online' or node.status == 'good' ):
+               print "changed status from %s to offline" % node.status
+               node.status = 'offline'
+               node.last_changed = datetime.now()
+
+       if node_state == 'BOOT' and changed_lessthan(node.last_changed, 0.5) and node.status != 'online':
+               print "changed status from %s to online" % node.status
+               node.status = 'online'
+               node.last_changed = datetime.now()
+
+       if node.status == 'online' and changed_greaterthan(node.last_changed, 0.5):
+               #send thank you notice, or on-line notice.
+               print "changed status from %s to good" % node.status
+               node.status = 'good'
+               # NOTE: do not reset last_changed, or you lose how long it's been up.
+
+       #if node.status == 'offline' and changed_greaterthan(node.last_changed, 1): #  and pcu.status == 'good' 
+       #       # attempt reboots
+       #       pass
+       #if node.status == 'offline' and changed_greaterthan(node.last_changed, 1.5): # and node.has_pcu
+       #       # send PCU failure message
+       #       pass
+
+       if node.status == 'offline' and changed_greaterthan(node.last_changed, 2):
+               print "changed status from %s to down" % node.status
+               # send down node notice
+               node.status = 'down'
+               node.last_changed = datetime.now()
+
+       if ( boot_state == 'disabled' or last_contact == None ) and \
+                       changed_greaterthan(node.last_changed, 2*30) and \
+                       node.status != 'down':
+               print "changed status from %s to down" % node.status
+               node.status = 'down'
+               node.last_changed = datetime.now()
+
 def checkAndRecordState(l_nodes, l_plcnodes):
        global count
 
        for nodename in l_nodes:
 def checkAndRecordState(l_nodes, l_plcnodes):
        global count
 
        for nodename in l_nodes:
-               d_node = None
-               for node in l_plcnodes:
-                       if node['hostname'] == nodename:
-                               d_node = node
-                               break
-               if not d_node:
-                       continue
 
 
-               pf = HistoryNodeRecord.findby_or_create(hostname=nodename)
-               pf.last_checked = datetime.now()
+               nodehist = HistoryNodeRecord.findby_or_create(hostname=nodename, 
+                                                       if_new_set={'status' : 'offline', 
+                                                                               'last_changed' : datetime.now()})
+               nodehist.last_checked = datetime.now()
 
                try:
                        # Find the most recent record
                        noderec = FindbadNodeRecord.query.filter(FindbadNodeRecord.hostname==nodename).order_by(FindbadNodeRecord.date_checked.desc()).first()
 
                try:
                        # Find the most recent record
                        noderec = FindbadNodeRecord.query.filter(FindbadNodeRecord.hostname==nodename).order_by(FindbadNodeRecord.date_checked.desc()).first()
-                       #print "NODEREC: ", noderec.date_checked
                except:
                        print "COULD NOT FIND %s" % nodename
                        import traceback
                except:
                        print "COULD NOT FIND %s" % nodename
                        import traceback
@@ -59,33 +103,16 @@ def checkAndRecordState(l_nodes, l_plcnodes):
                        print "none object for %s"% nodename
                        continue
 
                        print "none object for %s"% nodename
                        continue
 
-               node_state = noderec.observed_status
-               if noderec.plc_node_stats:
-                       boot_state = noderec.plc_node_stats['boot_state']
-               else:
-                       boot_state = "unknown"
-
-               if node_state == "BOOT":
-                       if pf.status != "good": 
-                               pf.last_changed = datetime.now()
-                               pf.status = "good"
-               elif node_state == "DEBUG":
-                       if pf.status != boot_state: 
-                               pf.last_changed = datetime.now()
-                               pf.status = boot_state
-               else:
-                       if pf.status != "down": 
-                               pf.last_changed = datetime.now()
-                               pf.status = "down"
+               check_node_state(noderec, nodehist)
 
                count += 1
 
                count += 1
-               print "%d %35s %s since(%s)" % (count, nodename, pf.status, diff_time(time.mktime(pf.last_changed.timetuple())))
+               print "%d %35s %s since(%s)" % (count, nodename, nodehist.status, diff_time(time.mktime(nodehist.last_changed.timetuple())))
 
        # NOTE: this commits all pending operations to the DB.  Do not remove, or
        # replace with another operations that also commits all pending ops, such
        # as session.commit() or flush() or something
 
        # NOTE: this commits all pending operations to the DB.  Do not remove, or
        # replace with another operations that also commits all pending ops, such
        # as session.commit() or flush() or something
-       print HistoryNodeRecord.query.count()
        session.flush()
        session.flush()
+       print HistoryNodeRecord.query.count()
 
        return True
 
 
        return True
 
@@ -97,7 +124,7 @@ if __name__ == '__main__':
        config = parsermodule.parse_args(parser)
 
        try:
        config = parsermodule.parse_args(parser)
 
        try:
-               main(config)
+               main2(config)
        except Exception, err:
                import traceback
                print traceback.print_exc()
        except Exception, err:
                import traceback
                print traceback.print_exc()
index 13fce72..5d14475 100755 (executable)
--- a/pcubad.py
+++ b/pcubad.py
@@ -4,6 +4,7 @@ import os
 import sys
 import string
 import time
 import sys
 import string
 import time
+import sets
 from datetime import datetime,timedelta
 
 from monitor import database
 from datetime import datetime,timedelta
 
 from monitor import database
@@ -21,12 +22,23 @@ from monitor.model import *
 
 api = plc.getAuthAPI()
 
 
 api = plc.getAuthAPI()
 
-def main(config):
+def main():
+       main2(config)
+
+def main2(config):
 
        l_plcpcus = plccache.l_pcus 
 
        l_pcus = None
 
        l_plcpcus = plccache.l_pcus 
 
        l_pcus = None
-       if config.pcu:
+       if config.site is not None:
+               site = api.GetSites(config.site)
+               l_nodes = api.GetNodes(site[0]['node_ids'], ['pcu_ids'])
+               pcus = []
+               for node in l_nodes:
+                       pcus += node['pcu_ids']
+               # clear out dups.
+               l_pcus = [pcu for pcu in sets.Set(pcus)]
+       elif config.pcu:
                for pcu in l_plcpcus:
                        if ( pcu['hostname'] is not None and config.pcu in pcu['hostname'] ) or \
                           ( pcu['ip'] is not None and config.pcu in pcu['ip'] ):
                for pcu in l_plcpcus:
                        if ( pcu['hostname'] is not None and config.pcu in pcu['hostname'] ) or \
                           ( pcu['ip'] is not None and config.pcu in pcu['ip'] ):
@@ -41,6 +53,38 @@ def main(config):
 
 hn2lb = plccache.plcdb_hn2lb
 
 
 hn2lb = plccache.plcdb_hn2lb
 
+def check_pcu_state(rec, pcu):
+
+       pcu_state = rec.reboot_trial_status
+
+       if ( pcu_state == 'NetDown' or pcu_state == 'Not_Run' or not ( pcu_state == 0 or pcu_state == "0" ) ) and \
+                       ( pcu.status == 'online' or pcu.status == 'good' ):
+               print "changed status from %s to offline" % pcu.status
+               pcu.status = 'offline'
+               pcu.last_changed = datetime.now()
+
+       if ( pcu_state == 0 or pcu_state == "0" ) and changed_lessthan(pcu.last_changed, 0.5) and pcu.status != 'online':
+               print "changed status from %s to online" % pcu.status
+               pcu.status = 'online'
+               pcu.last_changed = datetime.now()
+
+       if pcu.status == 'online' and changed_greaterthan(pcu.last_changed, 0.5):
+               #send thank you notice, or on-line notice.
+               print "changed status from %s to good" % pcu.status
+               pcu.status = 'good'
+               # NOTE: do not reset last_changed, or you lose how long it's been up.
+
+       if pcu.status == 'offline' and changed_greaterthan(pcu.last_changed, 2):
+               # send down pcu notice
+               print "changed status from %s to down" % pcu.status
+               pcu.status = 'down'
+               pcu.last_changed = datetime.now()
+
+       if ( pcu.status == 'offline' or pcu.status == 'down' ) and changed_greaterthan(pcu.last_changed, 2*30):
+               print "changed status from %s to down" % pcu.status
+               pcu.status = 'down'
+               pcu.last_changed = datetime.now()
+
 def checkAndRecordState(l_pcus, l_plcpcus):
        count = 0
        for pcuname in l_pcus:
 def checkAndRecordState(l_pcus, l_plcpcus):
        count = 0
        for pcuname in l_pcus:
@@ -53,63 +97,52 @@ def checkAndRecordState(l_pcus, l_plcpcus):
                if not d_pcu:
                        continue
 
                if not d_pcu:
                        continue
 
-               pf = HistoryPCURecord.findby_or_create(plc_pcuid=d_pcu['pcu_id'])
-               pf.last_checked = datetime.now()
+               pcuhist = HistoryPCURecord.findby_or_create(plc_pcuid=d_pcu['pcu_id'], 
+                                                                       if_new_set={'status' : 'offline', 
+                                                                                               'last_changed' : datetime.now()})
+               pcuhist.last_checked = datetime.now()
 
                try:
                        # Find the most recent record
                        pcurec = FindbadPCURecord.query.filter(FindbadPCURecord.plc_pcuid==pcuname).order_by(FindbadPCURecord.date_checked.desc()).first()
 
                try:
                        # Find the most recent record
                        pcurec = FindbadPCURecord.query.filter(FindbadPCURecord.plc_pcuid==pcuname).order_by(FindbadPCURecord.date_checked.desc()).first()
-                       print "NODEREC: ", pcurec.date_checked
                except:
                except:
-                       print "COULD NOT FIND FB record for %s" % reboot.pcu_name(pcu)
+                       print "COULD NOT FIND FB record for %s" % reboot.pcu_name(d_pcu)
                        import traceback
                        print traceback.print_exc()
                        # don't have the info to create a new entry right now, so continue.
                        continue 
 
                        import traceback
                        print traceback.print_exc()
                        # don't have the info to create a new entry right now, so continue.
                        continue 
 
-               pcu_state      = pcurec.reboot_trial_status
-               current_state = pcu_state
-
-               if current_state == 0 or current_state == "0":
-                       if pf.status != "good": 
-                               pf.last_changed = datetime.now() 
-                               pf.status = "good"
-               elif current_state == 'NetDown':
-                       if pf.status != "netdown": 
-                               pf.last_changed = datetime.now()
-                               pf.status = "netdown"
-               elif current_state == 'Not_Run':
-                       if pf.status != "badconfig": 
-                               pf.last_changed = datetime.now()
-                               pf.status = "badconfig"
-               else:
-                       if pf.status != "error": 
-                               pf.last_changed = datetime.now()
-                               pf.status = "error"
+               if not pcurec:
+                       print "none object for pcu %s"% reboot.pcu_name(d_pcu)
+                       continue
+
+               check_pcu_state(pcurec, pcuhist)
 
                count += 1
 
                count += 1
-               print "%d %35s %s since(%s)" % (count, reboot.pcu_name(d_pcu), pf.status, diff_time(time.mktime(pf.last_changed.timetuple())))
+               print "%d %35s %s since(%s)" % (count, reboot.pcu_name(d_pcu), pcuhist.status, diff_time(time.mktime(pcuhist.last_changed.timetuple())))
 
        # NOTE: this commits all pending operations to the DB.  Do not remove, or
        # replace with another operations that also commits all pending ops, such
        # as session.commit() or flush() or something
 
        # NOTE: this commits all pending operations to the DB.  Do not remove, or
        # replace with another operations that also commits all pending ops, such
        # as session.commit() or flush() or something
-       print HistoryPCURecord.query.count()
        session.flush()
        session.flush()
+       print HistoryPCURecord.query.count()
 
        return True
 
 if __name__ == '__main__':
        parser = parsermodule.getParser()
 
        return True
 
 if __name__ == '__main__':
        parser = parsermodule.getParser()
-       parser.set_defaults(filename=None, pcu=None, pcuselect=False, pcugroup=None, cachepcus=False)
+       parser.set_defaults(filename=None, pcu=None, site=None, pcuselect=False, pcugroup=None, cachepcus=False)
        parser.add_option("", "--pcu", dest="pcu", metavar="hostname", 
                                                help="Provide a single pcu to operate on")
        parser.add_option("", "--pcu", dest="pcu", metavar="hostname", 
                                                help="Provide a single pcu to operate on")
+       parser.add_option("", "--site", dest="site", metavar="sitename", 
+                                               help="Provide a single sitename to operate on")
        parser.add_option("", "--pculist", dest="pculist", metavar="file.list", 
                                                help="Provide a list of files to operate on")
 
        config = parsermodule.parse_args(parser)
 
        try:
        parser.add_option("", "--pculist", dest="pculist", metavar="file.list", 
                                                help="Provide a list of files to operate on")
 
        config = parsermodule.parse_args(parser)
 
        try:
-               main(config)
+               main2(config)
        except Exception, err:
                import traceback
                print traceback.print_exc()
        except Exception, err:
                import traceback
                print traceback.print_exc()
index 15c8f1d..5a2f3be 100755 (executable)
@@ -9,7 +9,7 @@ from datetime import datetime,timedelta
 from monitor import database
 from monitor import parser as parsermodule
 from monitor import config
 from monitor import database
 from monitor import parser as parsermodule
 from monitor import config
-from monitor.database.info.model import HistorySiteRecord, FindbadNodeRecord, session
+from monitor.database.info.model import HistorySiteRecord, HistoryNodeRecord, session
 from monitor.wrapper import plc, plccache
 from monitor.const import MINUP
 
 from monitor.wrapper import plc, plccache
 from monitor.const import MINUP
 
@@ -36,33 +36,52 @@ def main2(config):
        
        checkAndRecordState(l_sites, l_plcsites)
 
-def getnewsite(nodelist):
-       new = True
-       for node in nodelist:
-               try:
-                       noderec = FindbadNodeRecord.query.filter(FindbadNodeRecord.hostname==node['hostname']).order_by(FindbadNodeRecord.date_checked.desc()).first()
-                       if noderec is not None and \
-                               noderec.plc_node_stats['last_contact'] != None:
-                               new = False
-               except:
-                       import traceback
-                       print traceback.print_exc()
-       return new
-
 def getnodesup(nodelist):
        up = 0
        for node in nodelist:
                try:
-                       noderec = FindbadNodeRecord.query.filter(FindbadNodeRecord.hostname==node['hostname']).order_by(FindbadNodeRecord.date_checked.desc()).first()
-                       #noderec = FindbadNodeRecord.select(FindbadNodeRecord.q.hostname==node['hostname'], 
-                       #                                                                  orderBy='date_checked').reversed()[0]
-                       if noderec is not None and noderec.observed_status == "BOOT":
+                       nodehist = HistoryNodeRecord.findby_or_create(hostname=node['hostname'])
+                       if nodehist is not None and nodehist.status == "good":
                                up = up + 1
                except:
                        import traceback
                        print traceback.print_exc()
        return up
 
+def check_site_state(rec, sitehist):
+
+       if sitehist.new and sitehist.status != 'new':
+               sitehist.status = 'new'
+               sitehist.last_changed = datetime.now()
+
+       if not sitehist.new:
+
+               if sitehist.nodes_up >= MINUP:
+
+                       if sitehist.status != 'online' and sitehist.status != 'good':
+                               sitehist.last_changed = datetime.now()
+
+                       if changed_lessthan(sitehist.last_changed, 0.5) and sitehist.status != 'online':
+                               print "changed status from %s to online" % sitehist.status
+                               sitehist.status = 'online'
+
+                       if changed_greaterthan(sitehist.last_changed, 0.5) and sitehist.status != 'good':
+                               print "changed status from %s to good" % sitehist.status
+                               sitehist.status = 'good'
+       
+               else: # sitehist.nodes_up < MINUP:
+
+                       if sitehist.status != 'offline' and sitehist.status != 'down':
+                               sitehist.last_changed = datetime.now()
+
+                       if changed_lessthan(sitehist.last_changed, 0.5) and sitehist.status != 'offline':
+                               print "changed status from %s to offline" % sitehist.status
+                               sitehist.status = 'offline'
+
+                       if changed_greaterthan(sitehist.last_changed, 0.5) and sitehist.status != 'down':
+                               print "changed status from %s to down" % sitehist.status
+                               sitehist.status = 'down'
+
 def checkAndRecordState(l_sites, l_plcsites):
        count = 0
        lb2hn = plccache.plcdb_lb2hn
@@ -76,27 +95,32 @@ def checkAndRecordState(l_sites, l_plcsites):
                        continue
 
                if sitename in lb2hn:
-                       pf = HistorySiteRecord.findby_or_create(loginbase=sitename)
-
-                       pf.last_checked = datetime.now()
-                       pf.slices_total = d_site['max_slices']
-                       pf.slices_used = len(d_site['slice_ids'])
-                       pf.nodes_total = len(lb2hn[sitename])
-                       pf.nodes_up = getnodesup(lb2hn[sitename])
-                       pf.new = getnewsite(lb2hn[sitename])
-                       pf.enabled = d_site['enabled']
-
-                       if pf.nodes_up >= MINUP:
-                               if pf.status != "good": pf.last_changed = datetime.now()
-                               pf.status = "good"
-                       else:
-                               if pf.status != "down": pf.last_changed = datetime.now()
-                               pf.status = "down"
+                       sitehist = HistorySiteRecord.findby_or_create(loginbase=sitename,
+                                                                                               if_new_set={'status' : 'unknown', 
+                                                                                                                       'last_changed' : datetime.now(),
+                                                                                                                       'message_id': 0,
+                                                                                                                       'penalty_level' : 0})
+                       sitehist.last_checked = datetime.now()
+
+                       sitehist.slices_total = d_site['max_slices']
+                       sitehist.slices_used = len(d_site['slice_ids'])
+                       sitehist.nodes_total = len(lb2hn[sitename])
+                       if sitehist.message_id != 0:
+                               rtstatus = mailer.getTicketStatus(sitehist.message_id)
+                               sitehist.message_status = rtstatus['Status']
+                               sitehist.message_queue = rtstatus['Queue']
+                               sitehist.message_created = datetime.fromtimestamp(rtstatus['Created'])
+
+                       sitehist.nodes_up = getnodesup(lb2hn[sitename])
+                       sitehist.new = changed_lessthan(datetime.fromtimestamp(d_site['date_created']), 30) # created < 30 days ago
+                       sitehist.enabled = d_site['enabled']
+
+                       check_site_state(d_site, sitehist)
 
                        count += 1
-                       print "%d %15s slices(%2s) nodes(%2s) up(%2s) %s" % (count, sitename, pf.slices_used, 
-                                                                                       pf.nodes_total, pf.nodes_up, pf.status)
-                       pf.flush()
+                       print "%d %15s slices(%2s) nodes(%2s) up(%2s) %s" % (count, sitename, sitehist.slices_used, 
+                                                                                       sitehist.nodes_total, sitehist.nodes_up, sitehist.status)
+                       sitehist.flush()
 
        print HistorySiteRecord.query.count()
        session.flush()
index a95f3d6..2112979 100644 (file)
@@ -389,7 +389,7 @@ class Root(controllers.RootController):
 
        @expose(template="monitorweb.templates.sitelist")
        def site(self, filter='all'):
-               filtercount = {'good' : 0, 'down': 0, 'new' : 0, 'pending' : 0, 'all' : 0}
+               filtercount = {'good' : 0, 'down': 0, 'online':0, 'offline' : 0, 'new' : 0, 'pending' : 0, 'all' : 0}
                fbquery = HistorySiteRecord.query.all()
                query = []
                for site in fbquery:
index df07184..473a4d9 100644 (file)
@@ -102,9 +102,16 @@ a.right { float: right; }
 #status-error  { background-color: indianred; }\r
 #status-none   { background-color: white; }\r
 \r
+#site-new { background-color: gold; }\r
 #site-good { background-color : darkseagreen; }\r
+#site-online { background-color : lightgreen; }\r
+#site-offline { background-color: red; }\r
 #site-down { background-color: indianred; }\r
 \r
+#site-0 { background-color : white; }\r
+#site-1 { background-color: gold; }\r
+#site-2 { background-color: indianred; }\r
+\r
 #node-BOOT { background-color: darkseagreen; }\r
 #node-DOWN { background-color: indianred; }\r
 #node-DEBUG { background-color: gold; }\r
index 9740ea0..c9e1bd4 100644 (file)
@@ -30,7 +30,7 @@ from links import *
                                                        <span class="icon">${site.loginbase}</span></a>
                                        </td>
                                        <td py:content="site.enabled"></td>
-                                       <td>n/a</td>
+                                       <td id="site-${site.penalty_level}">${site.penalty_level}</td>
                                        <td>${site.slices_used}/${site.slices_total}</td>
                                        <td>${site.nodes_up} / ${site.nodes_total}</td>
                                        <td id="site-${site.status}" py:content="diff_time(mktime(site.last_changed.timetuple()))"></td>
index a9b7685..a2bac31 100644 (file)
@@ -46,7 +46,7 @@ from links import *
                                                </div>
                                        </td>
                                        <td py:content="site.enabled"></td>
-                                       <td>n/a</td>
+                                       <td id="site-${site.penalty_level}">${site.penalty_level}</td>
                                        <td>${site.slices_used}/${site.slices_total}</td>
                                        <td>${site.nodes_up} / ${site.nodes_total}</td>
                                        <td id="site-${site.status}" py:content="diff_time(mktime(site.last_changed.timetuple()))"></td>