From 6a452e8ece2ca8a47105c128eaebc38507bc76c5 Mon Sep 17 00:00:00 2001 From: Stephen Soltesz Date: Thu, 16 Apr 2009 22:55:29 +0000 Subject: [PATCH] merge from 2.0 branch $ svn merge -r 13112:13116 https://svn.planet-lab.org/svn/Monitor/branches/2.0/ --- Monitor.spec | 12 ++-- bootman.py | 11 +++- findall.py | 20 ++++++- findbadpcu.py | 7 +-- monitor-server.init | 18 +++--- monitor/database/info/findbad.py | 30 +--------- monitor/database/info/history.py | 1 + monitor/database/info/interface.py | 2 +- monitor/scanapi.py | 15 +++-- monitor/wrapper/emailTxt.py | 18 +++++- monitor/wrapper/plccache.py | 55 +++++++++++++------ nodebad.py | 12 ++-- nodeconfig.py | 1 + pcubad.py | 7 +-- pcucontrol/models/IPAL.py | 2 +- policy.py | 43 ++++++++++----- showlatlon.py | 4 +- sitebad.py | 3 + syncwithplc.py | 6 ++ web/MonitorWeb/monitorweb/controllers.py | 20 +++++-- .../monitorweb/templates/nodehistory.kid | 10 +++- 21 files changed, 190 insertions(+), 107 deletions(-) create mode 100755 syncwithplc.py diff --git a/Monitor.spec b/Monitor.spec index 005e66a..5f08b25 100644 --- a/Monitor.spec +++ b/Monitor.spec @@ -53,8 +53,8 @@ Summary: Monitor hooks for the PLC server. Group: Applications/System Requires: python -Requires: python-sqlalchemy -Requires: python-elixir +#Requires: python-sqlalchemy +#Requires: python-elixir Requires: openssh-clients Requires: perl-libwww-perl @@ -65,9 +65,9 @@ Requires: nmap Requires: PLCWWW >= 4.2 Requires: bootcd-planetlab-i386 >= 4.2 -Requires: zabbix-client -Requires: zabbix-gui -Requires: zabbix-server +#Requires: zabbix-client +#Requires: zabbix-gui +#Requires: zabbix-server %description server The server side include all python modules and scripts needed to fully @@ -202,7 +202,7 @@ rm -rf $RPM_BUILD_ROOT php /usr/share/%{name}/zabbix/getdefines.php > %{python_sitearch}/monitor/database/zabbixapi/defines.py # apply patches to zabbix -patch -d /var/www/html/zabbix/ -p0 < /usr/share/%{name}/zabbix/zabbix-auto-login.diff +#patch -d /var/www/html/zabbix/ -p0 < /usr/share/%{name}/zabbix/zabbix-auto-login.diff #chkconfig --add monitor-server #chkconfig monitor-server on diff --git a/bootman.py b/bootman.py index 1a04ef0..4f8fb54 100755 --- a/bootman.py +++ b/bootman.py @@ -24,6 +24,7 @@ from monitor import const from monitor.model import * from monitor.common import email_exception, found_within from monitor.database.info.model import * +from monitor.database.info.interface import * from monitor.wrapper import plc from monitor.wrapper import plccache from monitor.wrapper.emailTxt import mailtxt @@ -59,6 +60,7 @@ class NodeConnection: traceback.print_exc() print self.c.modules.sys.path except: + email_exception() traceback.print_exc() return "unknown" @@ -71,7 +73,8 @@ class NodeConnection: def get_bootmanager_log(self): download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node) - os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node)) + #os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node)) + os.system("cp log/bm.%s.log.gz log/bm.%s.log" % (self.node, self.node)) log = open("log/bm.%s.log" % self.node, 'r') return log @@ -863,7 +866,11 @@ def main(): sys.exit(1) for node in nodes: - reboot(node, config) + # get sitehist + lb = plccache.plcdb_hn2lb[node] + sitehist = SiteInterface.get_or_make(loginbase=lb) + #reboot(node, config) + restore(sitehist, node, config=None, forced_action=None) if __name__ == "__main__": main() diff --git a/findall.py b/findall.py index 64c4987..e96c1c4 100755 --- a/findall.py +++ b/findall.py @@ -7,6 +7,8 @@ from sitebad import main as sitebad_main from nodebad import main as nodebad_main from pcubad import main as pcubad_main from monitor.wrapper import plccache +from monitor.database.info.model import * +from monitor.common import * import sys if __name__ == '__main__': @@ -29,20 +31,34 @@ if __name__ == '__main__': cfg = parsermodule.parse_args(parser) try: - print "sync with plc" - plccache.sync() print "findbad" findbad_main() + print "befor-len: ", len( [ i for i in session] ) + session.flush(); session.clear() + print "after-len: ", len( [ i for i in session] ) print "findbadpcu" findbadpcu_main() + print "befor-len: ", len( [ i for i in session] ) + session.flush(); session.clear() + print "after-len: ", len( [ i for i in session] ) print "nodebad" nodebad_main() + print "befor-len: ", len( [ i for i in session] ) + session.flush(); session.clear() + print "after-len: ", len( [ i for i in session] ) print "pcubad" pcubad_main() + print "befor-len: ", len( [ i for i in session] ) + session.flush(); session.clear() + print "after-len: ", len( [ i for i in session] ) print "sitebad" sitebad_main() + print "befor-len: ", len( [ i for i in session] ) + session.flush(); session.clear() + print "after-len: ", len( [ i for i in session] ) except Exception, err: import traceback + email_exception() print traceback.print_exc() print "Exception: %s" % err print "Saving data... exitting." diff --git a/findbadpcu.py b/findbadpcu.py index ab4f5ff..9eb3be7 100755 --- a/findbadpcu.py +++ b/findbadpcu.py @@ -103,10 +103,9 @@ def main(): l_pcus = [pcu for pcu in sets.Set(pcus)] elif config.node is not None: - l_nodes = plcacche.GetNodeByName(config.node) - pcus = [] - for node in l_nodes: - pcus += node['pcu_ids'] + node = plccache.GetNodeByName(config.node) + print node + pcus = node['pcu_ids'] # clear out dups. l_pcus = [pcu for pcu in sets.Set(pcus)] diff --git a/monitor-server.init b/monitor-server.init index b627c17..12193da 100644 --- a/monitor-server.init +++ b/monitor-server.init @@ -364,8 +364,8 @@ case "$1" in check_monitor_conf check_monitor_schema_and_data - check_pg_hba $ZABBIX_DB_NAME $ZABBIX_DB_USER - check_user_and_db $ZABBIX_DB_NAME $ZABBIX_DB_USER + #check_pg_hba $ZABBIX_DB_NAME $ZABBIX_DB_USER + #check_user_and_db $ZABBIX_DB_NAME $ZABBIX_DB_USER if [ -n "$WROTE_PG_CONFIG" ] ; then # NOTE: restart db to enable access by users granted above. @@ -375,8 +375,8 @@ case "$1" in dialog "$MESSAGE" fi - check_zabbix_schema_and_data - check_zabbix_templates_and_import + #check_zabbix_schema_and_data + #check_zabbix_templates_and_import # create /etc/httpd/conf.d/monitorweb.conf @@ -390,9 +390,9 @@ case "$1" in start_tg_server # START zabbix services. SETUP default config files. - check_zab_server - check_zab_agentd - check_zab_webconfig + #check_zab_server + #check_zab_agentd + #check_zab_webconfig result "$MESSAGE" ;; @@ -442,8 +442,8 @@ case "$1" in dialog "$MESSAGE" stop_tg_server - service zabbix_server stop - service zabbix_agentd stop + #service zabbix_server stop + #service zabbix_agentd stop # TODO: is there anything to stop? result "$MESSAGE" ;; diff --git a/monitor/database/info/findbad.py b/monitor/database/info/findbad.py index a5139eb..5e38aca 100644 --- a/monitor/database/info/findbad.py +++ b/monitor/database/info/findbad.py @@ -11,46 +11,18 @@ __metadata__ = mon_metadata __session__ = mon_session -#class FindbadNodeRecordSync(Entity): -# hostname = Field(String(250),primary_key=True) #,alternateMethodName='by_hostname') -# round = Field(Int,default=0) - -#class FindbadPCURecordSync(Entity): -# plc_pcuid = Field(Int,primary_key=True) #,alternateMethodName='by_pcuid') -# round = Field(Int,default=0) - class FindbadNodeRecord(Entity): @classmethod def get_all_latest(cls): return cls.query.all() - #fbsync = FindbadNodeRecordSync.get_by(hostname="global") - #if fbsync: - # return cls.query.filter_by(round=fbsync.round) - #else: - # return [] @classmethod def get_latest_by(cls, **kwargs): return cls.query.filter_by(**kwargs).first() - #fbsync = FindbadNodeRecordSync.get_by(hostname="global") - #if fbsync: - # kwargs['round'] = fbsync.round - # return cls.query.filter_by(**kwargs).order_by(FindbadNodeRecord.date_checked.desc()) - #else: - # return [] @classmethod def get_latest_n_by(cls, n=3, **kwargs): return cls.query.filter_by(**kwargs) - #fbsync = FindbadNodeRecordSync.get_by(hostname="global") - #kwargs['round'] = fbsync.round - #ret = [] - #for i in range(0,n): - # kwargs['round'] = kwargs['round'] - i - # f = cls.query.filter_by(**kwargs).first() - # if f: - # ret.append(f) - #return ret # ACCOUNTING date_checked = Field(DateTime,default=datetime.now) @@ -99,7 +71,7 @@ class FindbadPCURecord(Entity): # ACCOUNTING date_checked = Field(DateTime) round = Field(Int,default=0) - plc_pcuid = Field(Int) #alternateID=True,alternateMethodName='by_pcuid') + plc_pcuid = Field(Int) # EXTERNAL plc_pcu_stats = Field(PickleType,default=None) diff --git a/monitor/database/info/history.py b/monitor/database/info/history.py index 3c5842a..6d2ed83 100644 --- a/monitor/database/info/history.py +++ b/monitor/database/info/history.py @@ -15,6 +15,7 @@ class HistoryNodeRecord(Entity): last_checked = Field(DateTime,default=datetime.now) last_changed = Field(DateTime,default=datetime.now) status = Field(String,default="unknown") + haspcu = Field(Boolean,default=False) acts_as_versioned(ignore=['last_changed', 'last_checked']) @classmethod diff --git a/monitor/database/info/interface.py b/monitor/database/info/interface.py index 2e5064d..29b19be 100644 --- a/monitor/database/info/interface.py +++ b/monitor/database/info/interface.py @@ -193,6 +193,6 @@ class SiteInterface(HistorySiteRecord): act = ActionRecord(loginbase=self.db.loginbase, hostname=hostname, action='reboot', - action_type='first_try_reboot', + action_type='try_reboot', error_string=err) diff --git a/monitor/scanapi.py b/monitor/scanapi.py index 963822d..f7939e6 100644 --- a/monitor/scanapi.py +++ b/monitor/scanapi.py @@ -20,7 +20,7 @@ from monitor.sources import comon from monitor.wrapper import plc, plccache import traceback -from monitor.common import nmap_port_status +from monitor.common import nmap_port_status, email_exception COMON_COTOPURL= "http://summer.cs.princeton.edu/status/tabulator.cgi?" + \ "table=table_nodeview&" + \ @@ -157,6 +157,7 @@ class ScanInterface(object): except: print "ERROR:" + email_exception(nodename) print traceback.print_exc() pass @@ -334,9 +335,10 @@ EOF """) plc_lock.acquire() d_node = None try: - d_node = plc.getNodes({'hostname': nodename}, ['pcu_ids', 'site_id', - 'date_created', 'last_updated', - 'last_contact', 'boot_state', 'nodegroup_ids'])[0] + d_node = plccache.GetNodeByName(nodename) + #d_node = plc.getNodes({'hostname': nodename}, ['pcu_ids', 'site_id', + # 'date_created', 'last_updated', + # 'last_contact', 'boot_state', 'nodegroup_ids'])[0] except: traceback.print_exc() plc_lock.release() @@ -363,8 +365,9 @@ EOF """) d_site = None values['loginbase'] = "" try: - d_site = plc.getSites({'site_id': site_id}, - ['max_slices', 'slice_ids', 'node_ids', 'login_base'])[0] + d_site = plccache.GetSitesById([ site_id ])[0] + #d_site = plc.getSites({'site_id': site_id}, + # ['max_slices', 'slice_ids', 'node_ids', 'login_base'])[0] values['loginbase'] = d_site['login_base'] except: traceback.print_exc() diff --git a/monitor/wrapper/emailTxt.py b/monitor/wrapper/emailTxt.py index 220eb10..b50be5b 100644 --- a/monitor/wrapper/emailTxt.py +++ b/monitor/wrapper/emailTxt.py @@ -207,6 +207,18 @@ ERROR- This is an error state, where there is absolutely no contact with PlanetLab. """) + pcumissing_notice =("""MONTEST: No PCU available to reboot %(hostname)s""", +"""As part of PlanetLab node monitoring and maintenance, we noticed that there is no PCU +associated with %(hostname)s, so we could not reboot it ourselves. + +To save you time in the future, please take a moment to register the PCU functionality for +your machines here: + + http://www.planet-lab.org/db/sites/pcu.php + +Thank you very much for your help, + -- PlanetLab Central (support@planet-lab.org) +""") pcufailed_notice =("""MONTEST: Could not use PCU to reboot %(hostname)s""", """As part of PlanetLab node monitoring and maintenance, we tried to use the PCU @@ -244,7 +256,11 @@ If any action is needed from you, you will recieve additional notices. Thank yo This notice is simply to let you know that: %(hostname)s -is down, disconnected from the network and/or non-operational. Please investigate, thank you very much for your help! +is down, disconnected from the network and/or non-operational. + +Please investigate, thank you very much for your help! + + http://monitor.planet-lab.org:8082/pcuview?loginbase=%(loginbase)s """) clear_penalty=("""MONTEST: All penalties have been cleared from site %(loginbase)s""", diff --git a/monitor/wrapper/plccache.py b/monitor/wrapper/plccache.py index 0645b18..75ca49b 100755 --- a/monitor/wrapper/plccache.py +++ b/monitor/wrapper/plccache.py @@ -3,6 +3,7 @@ import sys from monitor.wrapper import plc from monitor.database.info.model import * +import profile def dsites_from_lsites(l_sites): d_sites = {} @@ -67,17 +68,22 @@ def init(): global plcdb_hn2lb global plcdb_lb2hn global plcdb_id2lb + print "initing plccache" dbsites = PlcSite.query.all() l_sites = [ s.plc_site_stats for s in dbsites ] + print "plcnode" dbnodes = PlcNode.query.all() l_nodes = [ s.plc_node_stats for s in dbnodes ] + print "plcpcu" dbpcus = PlcPCU.query.all() l_pcus = [ s.plc_pcu_stats for s in dbpcus ] + print "dsites_from_lsites" (d_sites,id2lb) = dsites_from_lsites(l_sites) + print "dsn_from_dsln" (plcdb, hn2lb, lb2hn) = dsn_from_dsln(d_sites, id2lb, l_nodes) plcdb_hn2lb = hn2lb @@ -108,14 +114,31 @@ def GetSitesByName(sitelist): ret.append(site.plc_site_stats) return ret +def GetSitesById(idlist): + ret = [] + for site_id in idlist: + site = PlcSite.get_by(site_id=site_id) + ret.append(site.plc_site_stats) + return ret + +def deleteExtra(l_plc, objectClass=PlcSite, dbKey='loginbase', plcKey='login_base'): + dbobjs = objectClass.query.all() + dbobj_key = [ getattr(s, dbKey) for s in dbobjs ] + plcobj_key = [ s[plcKey] for s in l_plc ] + extra_key = set(dbobj_key) - set(plcobj_key) + for obj in extra_key: + print "deleting %s" % obj + dbobj = objectClass.get_by(**{dbKey : obj}) + dbobj.delete() + def sync(): l_sites = plc.api.GetSites({'peer_id':None}, ['login_base', 'site_id', 'abbreviated_name', 'latitude', 'longitude', 'max_slices', 'slice_ids', 'node_ids', 'enabled', 'date_created' ]) l_nodes = plc.api.GetNodes({'peer_id':None}, - ['hostname', 'node_id', 'ports', 'site_id', - 'version', 'last_updated', 'date_created', + ['hostname', 'node_id', 'ports', 'site_id', 'boot_state', + 'version', 'last_updated', 'date_created', 'key', 'last_contact', 'pcu_ids', 'nodenetwork_ids']) l_pcus = plc.api.GetPCUs() @@ -125,8 +148,17 @@ def sync(): dbsite.loginbase = site['login_base'] dbsite.date_checked = datetime.now() dbsite.plc_site_stats = site - #dbsite.flush() - # TODO: delete old records. + deleteExtra(l_sites, PlcSite, 'loginbase', 'login_base') + deleteExtra(l_sites, HistorySiteRecord, 'loginbase', 'login_base') + session.flush() + + print "sync pcus" + for pcu in l_pcus: + dbpcu = PlcPCU.findby_or_create(pcu_id=pcu['pcu_id']) + dbpcu.date_checked = datetime.now() + dbpcu.plc_pcu_stats = pcu + deleteExtra(l_pcus, PlcPCU, 'pcu_id', 'pcu_id') + deleteExtra(l_pcus, HistoryPCURecord, 'plc_pcuid', 'pcu_id') session.flush() print "sync nodes" @@ -135,17 +167,8 @@ def sync(): dbnode.hostname = node['hostname'] dbnode.date_checked = datetime.now() dbnode.plc_node_stats = node - #dbnode.flush() - # TODO: delete old records. - session.flush() - - print "sync pcus" - for pcu in l_pcus: - dbpcu = PlcPCU.findby_or_create(pcu_id=pcu['pcu_id']) - dbpcu.date_checked = datetime.now() - dbpcu.plc_pcu_stats = pcu - #dbpcu.flush() - # TODO: delete old records. + deleteExtra(l_nodes, PlcNode, 'hostname', 'hostname') + deleteExtra(l_nodes, HistoryNodeRecord, 'hostname', 'hostname') session.flush() init() @@ -153,6 +176,6 @@ def sync(): return if __name__ == '__main__': - sync() + profile.run('sync()') else: init() diff --git a/nodebad.py b/nodebad.py index 46ca879..c3aae39 100755 --- a/nodebad.py +++ b/nodebad.py @@ -38,6 +38,7 @@ def check_node_state(rec, node): node_state = rec.observed_status if rec.plc_node_stats: + print rec.plc_node_stats boot_state = rec.plc_node_stats['boot_state'] last_contact = rec.plc_node_stats['last_contact'] else: @@ -47,6 +48,11 @@ def check_node_state(rec, node): if boot_state == 'disable': boot_state = 'disabled' if boot_state == 'diag': boot_state = 'diagnose' + if len(rec.plc_node_stats['pcu_ids']) > 0: + node.haspcu = True + else: + node.haspcu = False + # NOTE: 'DOWN' and 'DEBUG' are temporary states, so only need # 'translations' into the node.status state # 'BOOT' is a permanent state, but we want it to have a bit of @@ -131,6 +137,7 @@ def checkAndRecordState(l_nodes, l_plcnodes): except: print "COULD NOT FIND %s" % nodename import traceback + email_exception() print traceback.print_exc() continue @@ -143,11 +150,8 @@ def checkAndRecordState(l_nodes, l_plcnodes): count += 1 print "%d %35s %s since(%s)" % (count, nodename, nodehist.status, diff_time(time.mktime(nodehist.last_changed.timetuple()))) - # NOTE: this commits all pending operations to the DB. Do not remove, or - # replace with another operations that also commits all pending ops, such - # as session.commit() or flush() or something + # NOTE: this commits all pending operations to the DB. Do not remove. session.flush() - print HistoryNodeRecord.query.count() return True diff --git a/nodeconfig.py b/nodeconfig.py index 788d7f8..3fe9a84 100755 --- a/nodeconfig.py +++ b/nodeconfig.py @@ -56,6 +56,7 @@ def main(): # print k, "==" , net[k] except: print "Error with %s" % node + email_exception() import traceback; print traceback.print_exc() pass diff --git a/pcubad.py b/pcubad.py index 9f0468c..59dfe7a 100755 --- a/pcubad.py +++ b/pcubad.py @@ -40,10 +40,8 @@ def main2(config): l_pcus = [pcu for pcu in sets.Set(pcus)] elif config.node: - l_nodes = plccache.GetNodeByName(config.node) - pcus = [] - for node in l_nodes: - pcus += node['pcu_ids'] + node = plccache.GetNodeByName(config.node) + pcus = node['pcu_ids'] # clear out dups. l_pcus = [pcu for pcu in sets.Set(pcus)] @@ -117,6 +115,7 @@ def checkAndRecordState(l_pcus, l_plcpcus): except: print "COULD NOT FIND FB record for %s" % reboot.pcu_name(d_pcu) import traceback + email_exception() print traceback.print_exc() # don't have the info to create a new entry right now, so continue. continue diff --git a/pcucontrol/models/IPAL.py b/pcucontrol/models/IPAL.py index 48394df..a2ea026 100644 --- a/pcucontrol/models/IPAL.py +++ b/pcucontrol/models/IPAL.py @@ -17,7 +17,7 @@ class IPAL(PCUControl): try: # TODO: make sleep backoff, before stopping. - time.sleep(4) + time.sleep(8) ret = s.recv(count, socket.MSG_DONTWAIT) except socket.error, e: if e[0] == errno.EAGAIN: diff --git a/policy.py b/policy.py index 4befbd9..43b37ca 100755 --- a/policy.py +++ b/policy.py @@ -47,6 +47,7 @@ def main(hostnames, sitenames): lb = plccache.plcdb_hn2lb[host] except: print "unknown host in plcdb_hn2lb %s" % host + email_exception(host) continue nodeblack = BlacklistRecord.get_by(hostname=host) @@ -64,35 +65,46 @@ def main(hostnames, sitenames): print "%s %s %s" % (i, nodehist.hostname, nodehist.status) if nodehist.status == 'good' and \ changed_lessthan(nodehist.last_changed, 1.0) and \ + found_within(recent_actions, 'down_notice', 7.0) and \ not found_within(recent_actions, 'online_notice', 0.5): + # NOTE: searching for down_notice proves that the node has + # gone through a 'down' state first, rather than just + # flapping through: good, offline, online, ... + # # NOTE: there is a narrow window in which this command must be - # evaluated, otherwise the notice will not go out. this is not ideal. + # evaluated, otherwise the notice will not go out. + # this is not ideal. sitehist.sendMessage('online_notice', hostname=host, viart=False) print "send message for host %s online" % host - pass - if ( nodehist.status == 'offline' or nodehist.status == 'down' ) and \ + # if a node is offline and doesn't have a PCU, remind the user that they should have one. + if not nodehist.haspcu and nodehist.status in ['offline', 'down'] and \ changed_greaterthan(nodehist.last_changed,1.0) and \ - not found_between(recent_actions, 'first_try_reboot', 3.5, 1): + not found_within(recent_actions, 'pcumissing_notice', 7.0): + + sitehist.sendMessage('pcumissing_notice', hostname=host) + print "send message for host %s pcumissing_notice" % host + + # if it is offline and HAS a PCU, then try to use it. + if nodehist.haspcu and nodehist.status in ['offline', 'down'] and \ + changed_greaterthan(nodehist.last_changed,1.0) and \ + not found_between(recent_actions, 'try_reboot', 3.5, 1): sitehist.attemptReboot(host) - print "send message for host %s first_try_reboot" % host - pass + print "send message for host %s try_reboot" % host - # NOTE: non-intuitive is that found_between(first_try_reboot, 3.5, 1) + # NOTE: non-intuitive is that found_between(try_reboot, 3.5, 1) # will be false for a day after the above condition is satisfied - if ( nodehist.status == 'offline' or nodehist.status == 'down' ) and \ + if nodehist.haspcu and nodehist.status in ['offline', 'down'] and \ changed_greaterthan(nodehist.last_changed,1.5) and \ - found_between(recent_actions, 'first_try_reboot', 3.5, 1) and \ + found_between(recent_actions, 'try_reboot', 3.5, 1) and \ not found_within(recent_actions, 'pcufailed_notice', 3.5): - # found_within(recent_actions, 'first_try_reboot', 3.5) and \ # send pcu failure message #act = ActionRecord(**kwargs) sitehist.sendMessage('pcufailed_notice', hostname=host) print "send message for host %s PCU Failure" % host - pass if nodehist.status == 'monitordebug' and \ changed_greaterthan(nodehist.last_changed, 1) and \ @@ -111,9 +123,10 @@ def main(hostnames, sitenames): sitehist.sendMessage('down_notice', hostname=host) print "send message for host %s down" % host - pass node_count = node_count + 1 + print "time: ", time.strftime('%Y-%m-%d %H:%M:%S') + sys.stdout.flush() session.flush() for i,site in enumerate(sitenames): @@ -158,13 +171,16 @@ def main(hostnames, sitenames): # find all ticket ids for site ( could be on the site record? ) # determine if there are penalties within the last 30 days? # if so, add a 'pause_penalty' action. - if sitehist.db.message_id != 0 and sitehist.db.message_status == 'open' and sitehist.db.penalty_level > 0: + if sitehist.db.message_id != 0 and sitehist.db.message_status == 'open' and \ + sitehist.db.penalty_level > 0 and not found_within(recent_actions, 'pause_penalty', 30): # pause escalation print "Pausing penalties for %s" % site sitehist.pausePenalty() site_count = site_count + 1 + print "time: ", time.strftime('%Y-%m-%d %H:%M:%S') + sys.stdout.flush() session.flush() session.flush() @@ -227,6 +243,7 @@ if __name__ == "__main__": try: main(hostnames, sitenames) + session.flush() except KeyboardInterrupt: print "Killed by interrupt" session.flush() diff --git a/showlatlon.py b/showlatlon.py index 951802a..2176462 100755 --- a/showlatlon.py +++ b/showlatlon.py @@ -9,7 +9,7 @@ from datetime import datetime, timedelta import database import comon -from monitor.common import color_pcu_state, datetime_fromstr +from monitor.common import color_pcu_state, datetime_fromstr, email_exception from nodehistory import get_filefromglob import time import traceback @@ -211,3 +211,5 @@ if __name__ == "__main__": main() except IOError: pass + except: + email_exception() diff --git a/sitebad.py b/sitebad.py index 4d9ee33..6c09c1c 100755 --- a/sitebad.py +++ b/sitebad.py @@ -44,6 +44,8 @@ def getnodesup(nodelist): up = 0 for node in nodelist: try: + # NOTE: adding a condition for nodehist.haspcu would include pcus + # in the calculation nodehist = HistoryNodeRecord.findby_or_create(hostname=node['hostname']) nodebl = BlacklistRecord.get_by(hostname=node['hostname']) if (nodehist is not None and nodehist.status != 'down') or \ @@ -51,6 +53,7 @@ def getnodesup(nodelist): up = up + 1 except: import traceback + email_exception(node['hostname']) print traceback.print_exc() return up diff --git a/syncwithplc.py b/syncwithplc.py new file mode 100755 index 0000000..af01841 --- /dev/null +++ b/syncwithplc.py @@ -0,0 +1,6 @@ +#!/usr/bin/python + +from monitor.wrapper import plccache + +if __name__ == "__main__": + plccache.sync() diff --git a/web/MonitorWeb/monitorweb/controllers.py b/web/MonitorWeb/monitorweb/controllers.py index 1c4efe9..7cbaf4f 100644 --- a/web/MonitorWeb/monitorweb/controllers.py +++ b/web/MonitorWeb/monitorweb/controllers.py @@ -315,7 +315,9 @@ class Root(controllers.RootController, MonitorXmlrpcServer): @exception_handler(nodeaction_handler,"isinstance(tg_exceptions,RuntimeError)") def pcuview(self, loginbase=None, pcuid=None, hostname=None, **data): print "PCUVIEW------------------" - session.clear() + print "befor-len: ", len( [ i for i in session] ) + session.flush(); session.clear() + print "after-len: ", len( [ i for i in session] ) sitequery=[] pcuquery=[] nodequery=[] @@ -333,7 +335,7 @@ class Root(controllers.RootController, MonitorXmlrpcServer): if loginbase: actions = ActionRecord.query.filter_by(loginbase=loginbase - ).filter(ActionRecord.date_created >= datetime.now() - timedelta(7) + ).filter(ActionRecord.date_created >= datetime.now() - timedelta(14) ).order_by(ActionRecord.date_created.desc()) actions = [ a for a in actions ] sitequery = [HistorySiteRecord.by_loginbase(loginbase)] @@ -387,13 +389,21 @@ class Root(controllers.RootController, MonitorXmlrpcServer): def nodehistory(self, hostname=None): query = [] if hostname: - fbnode = FindbadNodeRecord.get_by(hostname=hostname) - # TODO: add links for earlier history if desired. + #fbnode = FindbadNodeRecord.get_by(hostname=hostname) + ## TODO: add links for earlier history if desired. + #l = fbnode.versions[-100:] + #l.reverse() + #for node in l: + # prep_node_for_display(node) + # query.append(node) + + fbnode = HistoryNodeRecord.get_by(hostname=hostname) l = fbnode.versions[-100:] l.reverse() for node in l: - prep_node_for_display(node) + #prep_node_for_display(node) query.append(node) + return dict(query=query, hostname=hostname) @expose(template="monitorweb.templates.sitehistory") diff --git a/web/MonitorWeb/monitorweb/templates/nodehistory.kid b/web/MonitorWeb/monitorweb/templates/nodehistory.kid index 8fa825b..a0ab370 100644 --- a/web/MonitorWeb/monitorweb/templates/nodehistory.kid +++ b/web/MonitorWeb/monitorweb/templates/nodehistory.kid @@ -44,10 +44,14 @@ from links import * py:content="node.pcu_short_status">Reboot Status - + + + + + your.host.org - - + -- 2.43.0