Group: Applications/System
Requires: python
-Requires: python-sqlalchemy
-Requires: python-elixir
+#Requires: python-sqlalchemy
+#Requires: python-elixir
Requires: openssh-clients
Requires: perl-libwww-perl
Requires: PLCWWW >= 4.2
Requires: bootcd-planetlab-i386 >= 4.2
-Requires: zabbix-client
-Requires: zabbix-gui
-Requires: zabbix-server
+#Requires: zabbix-client
+#Requires: zabbix-gui
+#Requires: zabbix-server
%description server
The server side include all python modules and scripts needed to fully
php /usr/share/%{name}/zabbix/getdefines.php > %{python_sitearch}/monitor/database/zabbixapi/defines.py
# apply patches to zabbix
-patch -d /var/www/html/zabbix/ -p0 < /usr/share/%{name}/zabbix/zabbix-auto-login.diff
+#patch -d /var/www/html/zabbix/ -p0 < /usr/share/%{name}/zabbix/zabbix-auto-login.diff
#chkconfig --add monitor-server
#chkconfig monitor-server on
from monitor.model import *
from monitor.common import email_exception, found_within
from monitor.database.info.model import *
+from monitor.database.info.interface import *
from monitor.wrapper import plc
from monitor.wrapper import plccache
from monitor.wrapper.emailTxt import mailtxt
traceback.print_exc()
print self.c.modules.sys.path
except:
+ email_exception()
traceback.print_exc()
return "unknown"
def get_bootmanager_log(self):
download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node)
- os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
+ #os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
+ os.system("cp log/bm.%s.log.gz log/bm.%s.log" % (self.node, self.node))
log = open("log/bm.%s.log" % self.node, 'r')
return log
sys.exit(1)
for node in nodes:
- reboot(node, config)
+ # get sitehist
+ lb = plccache.plcdb_hn2lb[node]
+ sitehist = SiteInterface.get_or_make(loginbase=lb)
+ #reboot(node, config)
+ restore(sitehist, node, config=None, forced_action=None)
if __name__ == "__main__":
main()
from nodebad import main as nodebad_main
from pcubad import main as pcubad_main
from monitor.wrapper import plccache
+from monitor.database.info.model import *
+from monitor.common import *
import sys
if __name__ == '__main__':
cfg = parsermodule.parse_args(parser)
try:
- print "sync with plc"
- plccache.sync()
print "findbad"
findbad_main()
+ print "before-len: ", len( [ i for i in session] )
+ session.flush(); session.clear()
+ print "after-len: ", len( [ i for i in session] )
print "findbadpcu"
findbadpcu_main()
+ print "before-len: ", len( [ i for i in session] )
+ session.flush(); session.clear()
+ print "after-len: ", len( [ i for i in session] )
print "nodebad"
nodebad_main()
+ print "before-len: ", len( [ i for i in session] )
+ session.flush(); session.clear()
+ print "after-len: ", len( [ i for i in session] )
print "pcubad"
pcubad_main()
+ print "before-len: ", len( [ i for i in session] )
+ session.flush(); session.clear()
+ print "after-len: ", len( [ i for i in session] )
print "sitebad"
sitebad_main()
+ print "before-len: ", len( [ i for i in session] )
+ session.flush(); session.clear()
+ print "after-len: ", len( [ i for i in session] )
except Exception, err:
import traceback
+ email_exception()
print traceback.print_exc()
print "Exception: %s" % err
print "Saving data... exitting."
l_pcus = [pcu for pcu in sets.Set(pcus)]
elif config.node is not None:
- l_nodes = plcacche.GetNodeByName(config.node)
- pcus = []
- for node in l_nodes:
- pcus += node['pcu_ids']
+ node = plccache.GetNodeByName(config.node)
+ print node
+ pcus = node['pcu_ids']
# clear out dups.
l_pcus = [pcu for pcu in sets.Set(pcus)]
check_monitor_conf
check_monitor_schema_and_data
- check_pg_hba $ZABBIX_DB_NAME $ZABBIX_DB_USER
- check_user_and_db $ZABBIX_DB_NAME $ZABBIX_DB_USER
+ #check_pg_hba $ZABBIX_DB_NAME $ZABBIX_DB_USER
+ #check_user_and_db $ZABBIX_DB_NAME $ZABBIX_DB_USER
if [ -n "$WROTE_PG_CONFIG" ] ; then
# NOTE: restart db to enable access by users granted above.
dialog "$MESSAGE"
fi
- check_zabbix_schema_and_data
- check_zabbix_templates_and_import
+ #check_zabbix_schema_and_data
+ #check_zabbix_templates_and_import
# create /etc/httpd/conf.d/monitorweb.conf
start_tg_server
# START zabbix services. SETUP default config files.
- check_zab_server
- check_zab_agentd
- check_zab_webconfig
+ #check_zab_server
+ #check_zab_agentd
+ #check_zab_webconfig
result "$MESSAGE"
;;
dialog "$MESSAGE"
stop_tg_server
- service zabbix_server stop
- service zabbix_agentd stop
+ #service zabbix_server stop
+ #service zabbix_agentd stop
# TODO: is there anything to stop?
result "$MESSAGE"
;;
__session__ = mon_session
-#class FindbadNodeRecordSync(Entity):
-# hostname = Field(String(250),primary_key=True) #,alternateMethodName='by_hostname')
-# round = Field(Int,default=0)
-
-#class FindbadPCURecordSync(Entity):
-# plc_pcuid = Field(Int,primary_key=True) #,alternateMethodName='by_pcuid')
-# round = Field(Int,default=0)
-
class FindbadNodeRecord(Entity):
@classmethod
def get_all_latest(cls):
return cls.query.all()
- #fbsync = FindbadNodeRecordSync.get_by(hostname="global")
- #if fbsync:
- # return cls.query.filter_by(round=fbsync.round)
- #else:
- # return []
@classmethod
def get_latest_by(cls, **kwargs):
return cls.query.filter_by(**kwargs).first()
- #fbsync = FindbadNodeRecordSync.get_by(hostname="global")
- #if fbsync:
- # kwargs['round'] = fbsync.round
- # return cls.query.filter_by(**kwargs).order_by(FindbadNodeRecord.date_checked.desc())
- #else:
- # return []
@classmethod
def get_latest_n_by(cls, n=3, **kwargs):
return cls.query.filter_by(**kwargs)
- #fbsync = FindbadNodeRecordSync.get_by(hostname="global")
- #kwargs['round'] = fbsync.round
- #ret = []
- #for i in range(0,n):
- # kwargs['round'] = kwargs['round'] - i
- # f = cls.query.filter_by(**kwargs).first()
- # if f:
- # ret.append(f)
- #return ret
# ACCOUNTING
date_checked = Field(DateTime,default=datetime.now)
# ACCOUNTING
date_checked = Field(DateTime)
round = Field(Int,default=0)
- plc_pcuid = Field(Int) #alternateID=True,alternateMethodName='by_pcuid')
+ plc_pcuid = Field(Int)
# EXTERNAL
plc_pcu_stats = Field(PickleType,default=None)
last_checked = Field(DateTime,default=datetime.now)
last_changed = Field(DateTime,default=datetime.now)
status = Field(String,default="unknown")
+ haspcu = Field(Boolean,default=False)
acts_as_versioned(ignore=['last_changed', 'last_checked'])
@classmethod
act = ActionRecord(loginbase=self.db.loginbase,
hostname=hostname,
action='reboot',
- action_type='first_try_reboot',
+ action_type='try_reboot',
error_string=err)
from monitor.wrapper import plc, plccache
import traceback
-from monitor.common import nmap_port_status
+from monitor.common import nmap_port_status, email_exception
COMON_COTOPURL= "http://summer.cs.princeton.edu/status/tabulator.cgi?" + \
"table=table_nodeview&" + \
except:
print "ERROR:"
+ email_exception(nodename)
print traceback.print_exc()
pass
plc_lock.acquire()
d_node = None
try:
- d_node = plc.getNodes({'hostname': nodename}, ['pcu_ids', 'site_id',
- 'date_created', 'last_updated',
- 'last_contact', 'boot_state', 'nodegroup_ids'])[0]
+ d_node = plccache.GetNodeByName(nodename)
+ #d_node = plc.getNodes({'hostname': nodename}, ['pcu_ids', 'site_id',
+ # 'date_created', 'last_updated',
+ # 'last_contact', 'boot_state', 'nodegroup_ids'])[0]
except:
traceback.print_exc()
plc_lock.release()
d_site = None
values['loginbase'] = ""
try:
- d_site = plc.getSites({'site_id': site_id},
- ['max_slices', 'slice_ids', 'node_ids', 'login_base'])[0]
+ d_site = plccache.GetSitesById([ site_id ])[0]
+ #d_site = plc.getSites({'site_id': site_id},
+ # ['max_slices', 'slice_ids', 'node_ids', 'login_base'])[0]
values['loginbase'] = d_site['login_base']
except:
traceback.print_exc()
with PlanetLab.
""")
+ pcumissing_notice =("""MONTEST: No PCU available to reboot %(hostname)s""",
+"""As part of PlanetLab node monitoring and maintenance, we noticed that there is no PCU
+associated with %(hostname)s, so we could not reboot it ourselves.
+
+To save you time in the future, please take a moment to register the PCU functionality for
+your machines here:
+
+ http://www.planet-lab.org/db/sites/pcu.php
+
+Thank you very much for your help,
+ -- PlanetLab Central (support@planet-lab.org)
+""")
pcufailed_notice =("""MONTEST: Could not use PCU to reboot %(hostname)s""",
"""As part of PlanetLab node monitoring and maintenance, we tried to use the PCU
This notice is simply to let you know that:
%(hostname)s
-is down, disconnected from the network and/or non-operational. Please investigate, thank you very much for your help!
+is down, disconnected from the network and/or non-operational.
+
+Please investigate, thank you very much for your help!
+
+ http://monitor.planet-lab.org:8082/pcuview?loginbase=%(loginbase)s
""")
clear_penalty=("""MONTEST: All penalties have been cleared from site %(loginbase)s""",
import sys
from monitor.wrapper import plc
from monitor.database.info.model import *
+import profile
def dsites_from_lsites(l_sites):
d_sites = {}
global plcdb_hn2lb
global plcdb_lb2hn
global plcdb_id2lb
+ print "initing plccache"
dbsites = PlcSite.query.all()
l_sites = [ s.plc_site_stats for s in dbsites ]
+ print "plcnode"
dbnodes = PlcNode.query.all()
l_nodes = [ s.plc_node_stats for s in dbnodes ]
+ print "plcpcu"
dbpcus = PlcPCU.query.all()
l_pcus = [ s.plc_pcu_stats for s in dbpcus ]
+ print "dsites_from_lsites"
(d_sites,id2lb) = dsites_from_lsites(l_sites)
+ print "dsn_from_dsln"
(plcdb, hn2lb, lb2hn) = dsn_from_dsln(d_sites, id2lb, l_nodes)
plcdb_hn2lb = hn2lb
ret.append(site.plc_site_stats)
return ret
+def GetSitesById(idlist):
+ ret = []
+ for site_id in idlist:
+ site = PlcSite.get_by(site_id=site_id)
+ ret.append(site.plc_site_stats)
+ return ret
+
+def deleteExtra(l_plc, objectClass=PlcSite, dbKey='loginbase', plcKey='login_base'):
+ dbobjs = objectClass.query.all()
+ dbobj_key = [ getattr(s, dbKey) for s in dbobjs ]
+ plcobj_key = [ s[plcKey] for s in l_plc ]
+ extra_key = set(dbobj_key) - set(plcobj_key)
+ for obj in extra_key:
+ print "deleting %s" % obj
+ dbobj = objectClass.get_by(**{dbKey : obj})
+ dbobj.delete()
+
def sync():
l_sites = plc.api.GetSites({'peer_id':None},
['login_base', 'site_id', 'abbreviated_name', 'latitude',
'longitude', 'max_slices', 'slice_ids', 'node_ids',
'enabled', 'date_created' ])
l_nodes = plc.api.GetNodes({'peer_id':None},
- ['hostname', 'node_id', 'ports', 'site_id',
- 'version', 'last_updated', 'date_created',
+ ['hostname', 'node_id', 'ports', 'site_id', 'boot_state',
+ 'version', 'last_updated', 'date_created', 'key',
'last_contact', 'pcu_ids', 'nodenetwork_ids'])
l_pcus = plc.api.GetPCUs()
dbsite.loginbase = site['login_base']
dbsite.date_checked = datetime.now()
dbsite.plc_site_stats = site
- #dbsite.flush()
- # TODO: delete old records.
+ deleteExtra(l_sites, PlcSite, 'loginbase', 'login_base')
+ deleteExtra(l_sites, HistorySiteRecord, 'loginbase', 'login_base')
+ session.flush()
+
+ print "sync pcus"
+ for pcu in l_pcus:
+ dbpcu = PlcPCU.findby_or_create(pcu_id=pcu['pcu_id'])
+ dbpcu.date_checked = datetime.now()
+ dbpcu.plc_pcu_stats = pcu
+ deleteExtra(l_pcus, PlcPCU, 'pcu_id', 'pcu_id')
+ deleteExtra(l_pcus, HistoryPCURecord, 'plc_pcuid', 'pcu_id')
session.flush()
print "sync nodes"
dbnode.hostname = node['hostname']
dbnode.date_checked = datetime.now()
dbnode.plc_node_stats = node
- #dbnode.flush()
- # TODO: delete old records.
- session.flush()
-
- print "sync pcus"
- for pcu in l_pcus:
- dbpcu = PlcPCU.findby_or_create(pcu_id=pcu['pcu_id'])
- dbpcu.date_checked = datetime.now()
- dbpcu.plc_pcu_stats = pcu
- #dbpcu.flush()
- # TODO: delete old records.
+ deleteExtra(l_nodes, PlcNode, 'hostname', 'hostname')
+ deleteExtra(l_nodes, HistoryNodeRecord, 'hostname', 'hostname')
session.flush()
init()
return
if __name__ == '__main__':
- sync()
+ profile.run('sync()')
else:
init()
node_state = rec.observed_status
if rec.plc_node_stats:
+ print rec.plc_node_stats
boot_state = rec.plc_node_stats['boot_state']
last_contact = rec.plc_node_stats['last_contact']
else:
if boot_state == 'disable': boot_state = 'disabled'
if boot_state == 'diag': boot_state = 'diagnose'
+ if len(rec.plc_node_stats['pcu_ids']) > 0:
+ node.haspcu = True
+ else:
+ node.haspcu = False
+
# NOTE: 'DOWN' and 'DEBUG' are temporary states, so only need
# 'translations' into the node.status state
# 'BOOT' is a permanent state, but we want it to have a bit of
except:
print "COULD NOT FIND %s" % nodename
import traceback
+ email_exception()
print traceback.print_exc()
continue
count += 1
print "%d %35s %s since(%s)" % (count, nodename, nodehist.status, diff_time(time.mktime(nodehist.last_changed.timetuple())))
- # NOTE: this commits all pending operations to the DB. Do not remove, or
- # replace with another operations that also commits all pending ops, such
- # as session.commit() or flush() or something
+ # NOTE: this commits all pending operations to the DB. Do not remove.
session.flush()
- print HistoryNodeRecord.query.count()
return True
# print k, "==" , net[k]
except:
print "Error with %s" % node
+ email_exception()
import traceback; print traceback.print_exc()
pass
l_pcus = [pcu for pcu in sets.Set(pcus)]
elif config.node:
- l_nodes = plccache.GetNodeByName(config.node)
- pcus = []
- for node in l_nodes:
- pcus += node['pcu_ids']
+ node = plccache.GetNodeByName(config.node)
+ pcus = node['pcu_ids']
# clear out dups.
l_pcus = [pcu for pcu in sets.Set(pcus)]
except:
print "COULD NOT FIND FB record for %s" % reboot.pcu_name(d_pcu)
import traceback
+ email_exception()
print traceback.print_exc()
# don't have the info to create a new entry right now, so continue.
continue
try:
# TODO: make sleep backoff, before stopping.
- time.sleep(4)
+ time.sleep(8)
ret = s.recv(count, socket.MSG_DONTWAIT)
except socket.error, e:
if e[0] == errno.EAGAIN:
lb = plccache.plcdb_hn2lb[host]
except:
print "unknown host in plcdb_hn2lb %s" % host
+ email_exception(host)
continue
nodeblack = BlacklistRecord.get_by(hostname=host)
print "%s %s %s" % (i, nodehist.hostname, nodehist.status)
if nodehist.status == 'good' and \
changed_lessthan(nodehist.last_changed, 1.0) and \
+ found_within(recent_actions, 'down_notice', 7.0) and \
not found_within(recent_actions, 'online_notice', 0.5):
+ # NOTE: searching for down_notice proves that the node has
+ # gone through a 'down' state first, rather than just
+ # flapping through: good, offline, online, ...
+ #
# NOTE: there is a narrow window in which this command must be
- # evaluated, otherwise the notice will not go out. this is not ideal.
+ # evaluated, otherwise the notice will not go out.
+ # this is not ideal.
sitehist.sendMessage('online_notice', hostname=host, viart=False)
print "send message for host %s online" % host
- pass
- if ( nodehist.status == 'offline' or nodehist.status == 'down' ) and \
+ # if a node is offline and doesn't have a PCU, remind the user that they should have one.
+ if not nodehist.haspcu and nodehist.status in ['offline', 'down'] and \
changed_greaterthan(nodehist.last_changed,1.0) and \
- not found_between(recent_actions, 'first_try_reboot', 3.5, 1):
+ not found_within(recent_actions, 'pcumissing_notice', 7.0):
+
+ sitehist.sendMessage('pcumissing_notice', hostname=host)
+ print "send message for host %s pcumissing_notice" % host
+
+ # if it is offline and HAS a PCU, then try to use it.
+ if nodehist.haspcu and nodehist.status in ['offline', 'down'] and \
+ changed_greaterthan(nodehist.last_changed,1.0) and \
+ not found_between(recent_actions, 'try_reboot', 3.5, 1):
sitehist.attemptReboot(host)
- print "send message for host %s first_try_reboot" % host
- pass
+ print "send message for host %s try_reboot" % host
- # NOTE: non-intuitive is that found_between(first_try_reboot, 3.5, 1)
+ # NOTE: non-intuitive is that found_between(try_reboot, 3.5, 1)
# will be false for a day after the above condition is satisfied
- if ( nodehist.status == 'offline' or nodehist.status == 'down' ) and \
+ if nodehist.haspcu and nodehist.status in ['offline', 'down'] and \
changed_greaterthan(nodehist.last_changed,1.5) and \
- found_between(recent_actions, 'first_try_reboot', 3.5, 1) and \
+ found_between(recent_actions, 'try_reboot', 3.5, 1) and \
not found_within(recent_actions, 'pcufailed_notice', 3.5):
- # found_within(recent_actions, 'first_try_reboot', 3.5) and \
# send pcu failure message
#act = ActionRecord(**kwargs)
sitehist.sendMessage('pcufailed_notice', hostname=host)
print "send message for host %s PCU Failure" % host
- pass
if nodehist.status == 'monitordebug' and \
changed_greaterthan(nodehist.last_changed, 1) and \
sitehist.sendMessage('down_notice', hostname=host)
print "send message for host %s down" % host
- pass
node_count = node_count + 1
+ print "time: ", time.strftime('%Y-%m-%d %H:%M:%S')
+ sys.stdout.flush()
session.flush()
for i,site in enumerate(sitenames):
# find all ticket ids for site ( could be on the site record? )
# determine if there are penalties within the last 30 days?
# if so, add a 'pause_penalty' action.
- if sitehist.db.message_id != 0 and sitehist.db.message_status == 'open' and sitehist.db.penalty_level > 0:
+ if sitehist.db.message_id != 0 and sitehist.db.message_status == 'open' and \
+ sitehist.db.penalty_level > 0 and not found_within(recent_actions, 'pause_penalty', 30):
# pause escalation
print "Pausing penalties for %s" % site
sitehist.pausePenalty()
site_count = site_count + 1
+ print "time: ", time.strftime('%Y-%m-%d %H:%M:%S')
+ sys.stdout.flush()
session.flush()
session.flush()
try:
main(hostnames, sitenames)
+ session.flush()
except KeyboardInterrupt:
print "Killed by interrupt"
session.flush()
import database
import comon
-from monitor.common import color_pcu_state, datetime_fromstr
+from monitor.common import color_pcu_state, datetime_fromstr, email_exception
from nodehistory import get_filefromglob
import time
import traceback
main()
except IOError:
pass
+ except:
+ email_exception()
up = 0
for node in nodelist:
try:
+ # NOTE: adding a condition for nodehist.haspcu would include pcus
+ # in the calculation
nodehist = HistoryNodeRecord.findby_or_create(hostname=node['hostname'])
nodebl = BlacklistRecord.get_by(hostname=node['hostname'])
if (nodehist is not None and nodehist.status != 'down') or \
up = up + 1
except:
import traceback
+ email_exception(node['hostname'])
print traceback.print_exc()
return up
--- /dev/null
+#!/usr/bin/python
+
+from monitor.wrapper import plccache
+
+if __name__ == "__main__":
+ plccache.sync()
@exception_handler(nodeaction_handler,"isinstance(tg_exceptions,RuntimeError)")
def pcuview(self, loginbase=None, pcuid=None, hostname=None, **data):
print "PCUVIEW------------------"
- session.clear()
+ print "before-len: ", len( [ i for i in session] )
+ session.flush(); session.clear()
+ print "after-len: ", len( [ i for i in session] )
sitequery=[]
pcuquery=[]
nodequery=[]
if loginbase:
actions = ActionRecord.query.filter_by(loginbase=loginbase
- ).filter(ActionRecord.date_created >= datetime.now() - timedelta(7)
+ ).filter(ActionRecord.date_created >= datetime.now() - timedelta(14)
).order_by(ActionRecord.date_created.desc())
actions = [ a for a in actions ]
sitequery = [HistorySiteRecord.by_loginbase(loginbase)]
def nodehistory(self, hostname=None):
query = []
if hostname:
- fbnode = FindbadNodeRecord.get_by(hostname=hostname)
- # TODO: add links for earlier history if desired.
+ #fbnode = FindbadNodeRecord.get_by(hostname=hostname)
+ ## TODO: add links for earlier history if desired.
+ #l = fbnode.versions[-100:]
+ #l.reverse()
+ #for node in l:
+ # prep_node_for_display(node)
+ # query.append(node)
+
+ fbnode = HistoryNodeRecord.get_by(hostname=hostname)
l = fbnode.versions[-100:]
l.reverse()
for node in l:
- prep_node_for_display(node)
+ #prep_node_for_display(node)
query.append(node)
+
return dict(query=query, hostname=hostname)
@expose(template="monitorweb.templates.sitehistory")
py:content="node.pcu_short_status">Reboot Status</span>
</div>
</td-->
- <td id="node-${node.observed_status}" nowrap="true">
+ <!--td id="node-${node.observed_status}" nowrap="true">
+ <a target="_top" href="${link('pcuview', hostname=node.hostname)}" py:content="node.hostname">your.host.org</a></td-->
+ <!--td nowrap="true" py:content="node.kernel"></td-->
+ <!--td py:content="node.date_checked"></td-->
+ <td py:content="node.last_checked"></td>
+ <td nowrap="true">
<a target="_top" href="${link('pcuview', hostname=node.hostname)}" py:content="node.hostname">your.host.org</a></td>
- <td nowrap="true" py:content="node.kernel"></td>
- <td py:content="node.date_checked"></td>
+ <td py:content="node.status"></td>
</tr>
</tbody>
</table>