Updated files that used the old '*Sync' classes and the old, messy Findbad*Records.
print " ERROR:", x
print " Possibly, unable to find valid configuration file"
- if bm_continue and self.config and not self.config.quiet:
+ if bm_continue:
for key in bm.VARS.keys():
print key, " == ", bm.VARS[key]
else:
- if self.config and not self.config.quiet: print " Unable to read Node Configuration"
+ print " Unable to read Node Configuration"
def compare_and_repair_nodekeys(self):
# NOTE: Nothing works if the bootcd is REALLY old.
# So, this is the first step.
fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict()
- if fbnode['category'] == "OLDBOOTCD":
+ print fbnode.keys()
+ if fbnode['observed_category'] == "OLDBOOTCD":
print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
args = {}
args['hostname_list'] = " %s" % hostname
# actual solution appears to involve removing the bad files, and
# continually trying to boot the node.
"bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
+ "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
]:
sequences.update({n : "restart_bootmanager_rins"})
from pcucontrol.util import command
from monitor import config
-from monitor.database.info.model import FindbadNodeRecordSync, FindbadNodeRecord, session
+from monitor.database.info.model import FindbadNodeRecord, session
from monitor.sources import comon
from monitor.wrapper import plc, plccache
# CREATE all the work requests
for nodename in l_nodes:
- fbnodesync = FindbadNodeRecordSync.findby_or_create(hostname=nodename, if_new_set={'round':0})
- node_round = fbnodesync.round
- fbnodesync.flush()
+ #fbnodesync = FindbadNodeRecordSync.findby_or_create(hostname=nodename, if_new_set={'round':0})
+ #node_round = fbnodesync.round
+ node_round = global_round - 1
+ #fbnodesync.flush()
if node_round < global_round or config.force:
# recreate node stats when refreshed
print "All results collected."
break
- print FindbadNodeRecordSync.query.count()
+ #print FindbadNodeRecordSync.query.count()
print FindbadNodeRecord.query.count()
session.flush()
def main():
global global_round
- fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global",
- if_new_set={'round' : global_round})
- global_round = fbsync.round
+ #fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global",
+ # if_new_set={'round' : global_round})
+ #global_round = fbsync.round
if config.increment:
# update global round number to force refreshes across all nodes
if config.increment:
# update global round number to force refreshes across all nodes
- fbsync.round = global_round
- fbsync.flush()
+ #fbsync.round = global_round
+ #fbsync.flush()
+ pass
return 0
import monitor
from monitor import config
-from monitor.database.info.model import FindbadPCURecordSync, FindbadPCURecord, session
+from monitor.database.info.model import FindbadPCURecord, session
from monitor import database
from monitor import util
from monitor.wrapper import plc, plccache
# CREATE all the work requests
for pcuname in l_pcus:
pcu_id = int(pcuname)
- fbnodesync = FindbadPCURecordSync.findby_or_create(plc_pcuid=pcu_id, if_new_set={'round' : 0})
- fbnodesync.flush()
+ #fbnodesync = FindbadPCURecordSync.findby_or_create(plc_pcuid=pcu_id, if_new_set={'round' : 0})
+ #fbnodesync.flush()
- node_round = fbnodesync.round
+ #node_round = fbnodesync.round
+ node_round = global_round - 1
if node_round < global_round or config.force:
# recreate node stats when refreshed
#print "%s" % nodename
print "All results collected."
break
- print FindbadPCURecordSync.query.count()
+ #print FindbadPCURecordSync.query.count()
print FindbadPCURecord.query.count()
session.flush()
l_pcus = plccache.l_pcus
cohash = {}
- fbsync = FindbadPCURecordSync.findby_or_create(plc_pcuid=0,
- if_new_set={'round' : global_round})
+ #fbsync = FindbadPCURecordSync.findby_or_create(plc_pcuid=0,
+ #if_new_set={'round' : global_round})
- global_round = fbsync.round
+ #global_round = fbsync.round
api = plc.getAuthAPI()
if config.site is not None:
if config.increment:
# update global round number to force refreshes across all nodes
- fbsync.round = global_round
- fbsync.flush()
+ #fbsync.round = global_round
+ #fbsync.flush()
session.flush()
return 0
def changed_lessthan(last_changed, days):
if datetime.now() - last_changed <= timedelta(days):
- print "last changed less than %s" % timedelta(days)
+ #print "last changed less than %s" % timedelta(days)
return True
else:
- print "last changed more than %s" % timedelta(days)
+ #print "last changed more than %s" % timedelta(days)
return False
def changed_greaterthan(last_changed, days):
if datetime.now() - last_changed > timedelta(days):
- print "last changed more than %s" % timedelta(days)
+ #print "last changed more than %s" % timedelta(days)
return True
else:
- print "last changed less than %s" % timedelta(days)
+ #print "last changed less than %s" % timedelta(days)
return False
from datetime import datetime,timedelta
import elixir
import traceback
+from elixir.ext.versioned import *
from monitor.database.dborm import mon_metadata, mon_session
__metadata__ = mon_metadata
__session__ = mon_session
-class FindbadNodeRecordSync(Entity):
- hostname = Field(String(250),primary_key=True) #,alternateMethodName='by_hostname')
- round = Field(Int,default=0)
+#class FindbadNodeRecordSync(Entity):
+# hostname = Field(String(250),primary_key=True) #,alternateMethodName='by_hostname')
+# round = Field(Int,default=0)
-class FindbadPCURecordSync(Entity):
- plc_pcuid = Field(Int,primary_key=True) #,alternateMethodName='by_pcuid')
- round = Field(Int,default=0)
+#class FindbadPCURecordSync(Entity):
+# plc_pcuid = Field(Int,primary_key=True) #,alternateMethodName='by_pcuid')
+# round = Field(Int,default=0)
class FindbadNodeRecord(Entity):
@classmethod
def get_all_latest(cls):
- fbsync = FindbadNodeRecordSync.get_by(hostname="global")
- if fbsync:
- return cls.query.filter_by(round=fbsync.round)
- else:
- return []
+ return cls.query.all()
+ #fbsync = FindbadNodeRecordSync.get_by(hostname="global")
+ #if fbsync:
+ # return cls.query.filter_by(round=fbsync.round)
+ #else:
+ # return []
@classmethod
def get_latest_by(cls, **kwargs):
- fbsync = FindbadNodeRecordSync.get_by(hostname="global")
- if fbsync:
- kwargs['round'] = fbsync.round
- return cls.query.filter_by(**kwargs).order_by(FindbadNodeRecord.date_checked.desc())
- else:
- return []
+ return cls.query.filter_by(**kwargs).first()
+ #fbsync = FindbadNodeRecordSync.get_by(hostname="global")
+ #if fbsync:
+ # kwargs['round'] = fbsync.round
+ # return cls.query.filter_by(**kwargs).order_by(FindbadNodeRecord.date_checked.desc())
+ #else:
+ # return []
@classmethod
def get_latest_n_by(cls, n=3, **kwargs):
- fbsync = FindbadNodeRecordSync.get_by(hostname="global")
- kwargs['round'] = fbsync.round
- ret = []
- for i in range(0,n):
- kwargs['round'] = kwargs['round'] - i
- f = cls.query.filter_by(**kwargs).first()
- if f:
- ret.append(f)
- return ret
+ return cls.query.filter_by(**kwargs)
+ #fbsync = FindbadNodeRecordSync.get_by(hostname="global")
+ #kwargs['round'] = fbsync.round
+ #ret = []
+ #for i in range(0,n):
+ # kwargs['round'] = kwargs['round'] - i
+ # f = cls.query.filter_by(**kwargs).first()
+ # if f:
+ # ret.append(f)
+ #return ret
# ACCOUNTING
date_checked = Field(DateTime,default=datetime.now)
round = Field(Int,default=0)
- hostname = Field(String,default=None)
+ hostname = Field(String,primary_key=True,default=None)
loginbase = Field(String)
# INTERNAL
observed_category = Field(String,default=None)
observed_status = Field(String,default=None)
+ acts_as_versioned(ignore=['date_checked'])
# NOTE: this is the child relation
#action = ManyToOne('ActionRecord', required=False)
class FindbadPCURecord(Entity):
@classmethod
def get_all_latest(cls):
- fbsync = FindbadPCURecordSync.get_by(plc_pcuid=0)
- if fbsync:
- return cls.query.filter_by(round=fbsync.round)
- else:
- return []
+ return cls.query.all()
@classmethod
def get_latest_by(cls, **kwargs):
- fbsync = FindbadPCURecordSync.get_by(plc_pcuid=0)
- kwargs['round'] = fbsync.round
- return cls.query.filter_by(**kwargs).order_by(FindbadPCURecord.date_checked.desc())
+ return cls.query.filter_by(**kwargs)
+
# ACCOUNTING
date_checked = Field(DateTime)
round = Field(Int,default=0)
# INTERNAL
# INFERRED
reboot_trial_status = Field(String)
+
+ acts_as_versioned(ignore=['date_checked'])
from elixir import Entity, Field, OneToMany, ManyToOne, ManyToMany
from elixir import options_defaults, using_options, setup_all
from elixir import String, Integer as Int, DateTime, Boolean
+from elixir.ext.versioned import *
+
from datetime import datetime,timedelta
from monitor.database.dborm import mon_metadata, mon_session
last_checked = Field(DateTime,default=datetime.now)
last_changed = Field(DateTime,default=datetime.now)
status = Field(String,default="unknown")
+ acts_as_versioned(ignore=['last_changed', 'last_checked'])
@classmethod
def by_hostname(cls, hostname):
last_valid = Field(DateTime,default=None)
valid = Field(String,default="unknown")
+ acts_as_versioned(ignore=['last_changed', 'last_checked'])
+
@classmethod
def by_pcuid(cls, pcuid):
return cls.query.filter_by(pcuid=pcuid).first()
+
class HistorySiteRecord(Entity):
loginbase = Field(String(250),primary_key=True)
penalty_level = Field(Int, default=0)
penalty_applied = Field(Boolean, default=False)
+ acts_as_versioned(ignore=['last_changed', 'last_checked'])
@classmethod
def by_loginbase(cls, loginbase):
from monitor.database.info.model import FindbadPCURecord
print "pcuid: %s" % pcu_id
try:
- pcurec = FindbadPCURecord.get_latest_by(plc_pcuid=pcu_id).first()
+ pcurec = FindbadPCURecord.get_latest_by(plc_pcuid=pcu_id)
if pcurec:
values = pcurec.to_dict()
else:
syncclass = None
primarykey = 'hostname'
- def __init__(self, round):
+ def __init__(self, round=1):
self.round = round
self.count = 1
try:
if values is None:
return
-
- fbnodesync = self.syncclass.findby_or_create(
- if_new_set={'round' : self.round},
+
+ if self.syncclass:
+ fbnodesync = self.syncclass.findby_or_create(
+ #if_new_set={'round' : self.round},
**{ self.primarykey : nodename})
# NOTE: This code will either add a new record for the new self.round,
# OR it will find the previous value, and update it with new information.
# The data that is 'lost' is not that important, b/c older
# history still exists.
fbrec = self.recordclass.findby_or_create(
- **{'round':self.round, self.primarykey:nodename})
+ **{ self.primarykey:nodename})
fbrec.set( **values )
fbrec.flush()
- fbnodesync.round = self.round
- fbnodesync.flush()
+ if self.syncclass:
+ fbnodesync.round = self.round
+ fbnodesync.flush()
print "%d %s %s" % (self.count, nodename, values)
self.count += 1
class ScanNodeInternal(ScanInterface):
recordclass = FindbadNodeRecord
- syncclass = FindbadNodeRecordSync
+ #syncclass = FindbadNodeRecordSync
+ syncclass = None
primarykey = 'hostname'
def collectNMAP(self, nodename, cohash):
return (nodename, values)
def internalprobe(hostname):
- fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global",
- if_new_set={'round' : 1})
- scannode = ScanNodeInternal(fbsync.round)
+ #fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global",
+ # if_new_set={'round' : 1})
+ scannode = ScanNodeInternal() # fbsync.round)
try:
(nodename, values) = scannode.collectInternal(hostname, {})
scannode.record(None, (nodename, values))
return False
def externalprobe(hostname):
- fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global",
- if_new_set={'round' : 1})
- scannode = ScanNodeInternal(fbsync.round)
+ #fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global",
+ # if_new_set={'round' : 1})
+ scannode = ScanNodeInternal() # fbsync.round)
try:
(nodename, values) = scannode.collectNMAP(hostname, {})
scannode.record(None, (nodename, values))
class ScanPCU(ScanInterface):
recordclass = FindbadPCURecord
- syncclass = FindbadPCURecordSync
+ syncclass = None
primarykey = 'plc_pcuid'
def collectInternal(self, pcuname, cohash):
Thank you very much for your help!
""")
- offline_notice=("""Host %(hostname)s is offline""",
+ retry_bootman=("""Running BootManager on %(hostname)s""",
"""
This notice is simply to let you know that:
%(hostname)s
-is offline and or non-operational. Please investigate, thank you very much for your help!
+appears stuck in a debug mode. To try to correct this, we're trying to rerun BootManager.py.
+If any action is needed from you, you will recieve additional notices. Thank you!
+ """)
+ down_notice=("""Host %(hostname)s is down""",
+ """
+This notice is simply to let you know that:
+ %(hostname)s
+
+is down, disconnected from the network and/or non-operational. Please investigate, thank you very much for your help!
""")
clear_penalty=("""All penalties have been cleared from site %(loginbase)s""",
boot_state = "unknown"
last_contact = None
- if node_state == 'DOWN' and ( node.status == 'online' or node.status == 'good' ):
+ # NOTE: 'DOWN' and 'DEBUG' are temporary states, so only need
+ # 'translations' into the node.status state
+ # 'BOOT' is a permanent state, but we want it to have a bit of
+ # hysteresis (less than 0.5 days)
+
+ #################################################################3
+ # "Translate" the findbad states into nodebad status.
+
+ if node_state == 'DOWN' and ( node.status != 'offline' and node.status != 'down' ) and boot_state != 'disable' :
print "changed status from %s to offline" % node.status
node.status = 'offline'
node.last_changed = datetime.now()
- if node_state == 'BOOT' and changed_lessthan(node.last_changed, 0.5) and node.status != 'online':
+ if node_state == 'DEBUG' and node.status != 'monitordebug':
+ print "changed status from %s to monitordebug" % (node.status)
+ node.status = "monitordebug"
+ node.last_changed = datetime.now()
+
+ if node_state == 'BOOT' and node.status != 'online' and node.status != 'good':
print "changed status from %s to online" % node.status
node.status = 'online'
node.last_changed = datetime.now()
+ #################################################################3
+ # Switch temporary hysteresis states into their 'firm' states.
+
if node.status == 'online' and changed_greaterthan(node.last_changed, 0.5):
- #send thank you notice, or on-line notice.
print "changed status from %s to good" % node.status
node.status = 'good'
# NOTE: do not reset last_changed, or you lose how long it's been up.
- #if node.status == 'offline' and changed_greaterthan(node.last_changed, 1): # and pcu.status == 'good'
- # # attempt reboots
- # pass
- #if node.status == 'offline' and changed_greaterthan(node.last_changed, 1.5): # and node.has_pcu
- # # send PCU failure message
- # pass
-
if node.status == 'offline' and changed_greaterthan(node.last_changed, 2):
print "changed status from %s to down" % node.status
- # send down node notice
node.status = 'down'
- node.last_changed = datetime.now()
+ # NOTE: do not reset last_changed, or you lose how long it's been down.
+
+ if node.status == 'monitordebug' and changed_greaterthan(node.last_changed, 14):
+ print "changed status from %s to down" % node.status
+ node.status = 'down'
+ # NOTE: do not reset last_changed, or you lose how long it's been down.
+ #node.last_changed = datetime.now()
+ # extreme cases of offline nodes
if ( boot_state == 'disabled' or last_contact == None ) and \
changed_greaterthan(node.last_changed, 2*30) and \
node.status != 'down':
try:
# Find the most recent record
- noderec = FindbadNodeRecord.query.filter(FindbadNodeRecord.hostname==nodename).order_by(FindbadNodeRecord.date_checked.desc()).first()
+ noderec = FindbadNodeRecord.get_latest_by(hostname=nodename)
except:
print "COULD NOT FIND %s" % nodename
import traceback
i = 1
for node in nodelist:
print "%-2d" % i,
- fbrec = FindbadNodeRecord.query.filter(FindbadNodeRecord.hostname==node['hostname']).order_by(FindbadNodeRecord.date_checked.desc()).first()
+ fbrec = FindbadNodeRecord.get_latest_by(hostname=node['hostname'])
fbdata = fbrec.to_dict()
print nodegroup_display(node, fbdata, config)
i += 1
from monitor import util
from monitor import parser as parsermodule
-from monitor import database
+from monitor.database.info.model import *
from monitor import reboot
import time
from monitor.wrapper import plc, plccache
api = plc.getAuthAPI()
-from monitor.database.info.model import FindbadNodeRecordSync, FindbadNodeRecord, FindbadPCURecord, session
+from monitor.database.info.model import FindbadNodeRecord, FindbadPCURecord, session
from monitor import util
from monitor import config
try:
# Find the most recent record
- fb_noderec = FindbadNodeRecord.query.filter(FindbadNodeRecord.hostname==node).order_by(FindbadNodeRecord.date_checked.desc()).first()
+ fb_noderec = FindbadNodeRecord.get_latest_by(hostname=node)
except:
print traceback.print_exc()
pass
try:
# Find the most recent record
- pcurec = FindbadPCURecord.query.filter(FindbadPCURecord.plc_pcuid==pcuname).order_by(FindbadPCURecord.date_checked.desc()).first()
+ pcurec = FindbadPCURecord.query.filter(FindbadPCURecord.plc_pcuid==pcuname).first()
except:
print "COULD NOT FIND FB record for %s" % reboot.pcu_name(d_pcu)
import traceback
exceptions = data['exceptions']
if loginbase:
- actions = ActionRecord.query.filter_by(loginbase=loginbase).order_by(ActionRecord.date_created.desc())
+ actions = ActionRecord.query.filter_by(loginbase=loginbase
+ ).filter(ActionRecord.date_created >= datetime.now() - timedelta(7)
+ ).order_by(ActionRecord.date_created.desc())
actions = [ a for a in actions ]
sitequery = [HistorySiteRecord.by_loginbase(loginbase)]
pcus = {}
for plcnode in site_lb2hn[loginbase]:
- for node in FindbadNodeRecord.get_latest_by(hostname=plcnode['hostname']):
+ node = FindbadNodeRecord.get_latest_by(hostname=plcnode['hostname'])
# NOTE: reformat some fields.
prep_node_for_display(node)
nodequery += [node]
if pcuid and hostname is None:
print "pcuid: %s" % pcuid
- for pcu in FindbadPCURecord.get_latest_by(plc_pcuid=pcuid):
- # NOTE: count filter
- prep_pcu_for_display(pcu)
- pcuquery += [pcu]
+ pcu = FindbadPCURecord.get_latest_by(plc_pcuid=pcuid)
+ # NOTE: count filter
+ prep_pcu_for_display(pcu)
+ pcuquery += [pcu]
if 'site_id' in pcu.plc_pcu_stats:
sitequery = [HistorySiteRecord.by_loginbase(pcu.loginbase)]
if 'nodenames' in pcu.plc_pcu_stats:
for nodename in pcu.plc_pcu_stats['nodenames']:
print "query for %s" % nodename
- q = FindbadNodeRecord.get_latest_by(hostname=nodename)
- node = q.first()
+ node = FindbadNodeRecord.get_latest_by(hostname=nodename)
print "%s" % node.port_status
print "%s" % node.to_dict()
print "%s" % len(q.all())
nodequery += [node]
if hostname and pcuid is None:
- for node in FindbadNodeRecord.get_latest_by(hostname=hostname):
+ node = FindbadNodeRecord.get_latest_by(hostname=hostname)
# NOTE: reformat some fields.
prep_node_for_display(node)
sitequery = [node.site]
nodequery += [node]
if node.plc_pcuid: # not None
- pcu = FindbadPCURecord.get_latest_by(plc_pcuid=node.plc_pcuid).first()
+ pcu = FindbadPCURecord.get_latest_by(plc_pcuid=node.plc_pcuid)
prep_pcu_for_display(pcu)
pcuquery += [pcu]
import turbogears as tg
import urllib
+def plc_mail_uri(ticketid):
+ return config.RT_WEB_SERVER + "/Ticket/Display.html?id=" + str(ticketid)
def plc_node_uri(hostname):
return "https://" + config.PLC_WWW_HOSTNAME + "/db/nodes/index.php?nodepattern=" + str(hostname)
def plc_site_uri(loginbase):
<div id="status_block" class="flash"
py:if="value_of('tg_flash', None)" py:content="tg_flash"></div>
- <h4>Recent Actions</h4>
+ <h4>Actions Over the Last Week</h4>
<p py:if="actions and len(actions) == 0">
There are no recent actions taken for this site.
</p>
</td>
<!--td py : content="diff_time(mktime(node.date_checked.timetuple()))"></td-->
<td py:content="act.action_type"></td>
- <td py:content="act.message_id"></td>
- <td py:content="act.error_string"></td>
+ <td><a class="ext-link" href="${plc_mail_uri(act.message_id)}">
+ <span class="icon">${act.message_id}</span></a></td>
+ <td><pre py:content="act.error_string"></pre></td>
</tr>
</tbody>
</table>