#print "last changed less than %s" % timedelta(days)
return False
+def found_within(recent_actions, action_type, within):
+ for action in recent_actions:
+ if action_type == action.action_type and \
+ datetime.now() - action.date_created < timedelta(within):
+ # recent action of given type.
+ #print "%s found_within %s in recent_actions from %s" % (action_type, timedelta(within), action.date_created)
+ return True
+
+ print "%s NOT found_within %s in recent_actions" % (action_type, timedelta(within) )
+ return False
+
@classmethod
def get_latest_by(cls, **kwargs):
- return cls.query.filter_by(**kwargs)
+ return cls.query.filter_by(**kwargs).first()
# ACCOUNTING
date_checked = Field(DateTime)
2+ - all existing slices will be disabled.
""")
+ newbootcd_notice=(""" Host %(hostname)s needs a new BootImage""", """
+As part of PlanetLab node monitoring, we noticed the following nodes have an out-dated BootCD:
+
+ %(hostname)s
+
+This usually implies that you need to update the BootCD and node configuration file stored on the read-only media (either the all-in-one ISO CD, floppy disk, or write-protected USB stick).
+
+Thank you for your help,
+ -- PlanetLab Central (support@planet-lab.org)
+""")
+
nmreset =("""NM Reset at %(loginbase)s""",
"""
Monitor restarted NM on the following machines:
-- PlanetLab Central (support@planet-lab.org)
""")
- newalphacd_one=(""" Planetlab nodes need a new BootCD: %(loginbase)s""",
-"""As part of PlanetLab node monitoring, we noticed that your machines needs a new BootCD to fully support your hardware:
+ newalphacd_notice=(""" New Boot Images for %(hostname)s""",
+"""As part of PlanetLab node monitoring, we noticed that your machine needs a new BootCD to fully support your hardware:
-%(hostname_list)s
+%(hostname)s
To make this process as simple as possible, we have created All-in-One boot images that include the node configuration file.
# TODO: need reminder versions for repeats...
newdown=[newdown_one, newdown_two, newdown_three]
newbootcd=[newbootcd_one, newbootcd_two, newbootcd_three]
- newalphacd=[newalphacd_one, newalphacd_one, newalphacd_one]
+ #newalphacd=[newalphacd_one, newalphacd_one, newalphacd_one]
newthankyou=[thankyou,thankyou,thankyou]
pcuthankyou=[pcuthankyou_one,pcuthankyou_one,pcuthankyou_one]
NMReset=[nmreset,nmreset,nmreset]
pcutonodemapping=[pcutonodemapping_one, pcutonodemapping_one, pcutonodemapping_one]
pcudown=[pcudown_one, pcudown_one, pcudown_one]
- unknownsequence = ("""Unrecognized Error on PlanetLab host %(hostname)s""",
+ unknownsequence_notice = ("""Unrecognized Error on PlanetLab host %(hostname)s""",
"""
While trying to automatically recover this machine:
donation_down = [ donation_down_one, donation_down_one, donation_down_one ]
- minimalhardware = ("""Hardware requirements not met on PlanetLab host %(hostname)s""",
+ minimalhardware_notice = ("""Hardware requirements not met on PlanetLab host %(hostname)s""",
"""
While trying to automatically recover this machine:
%(bmlog)s
""" )
- baddisk = ("""Bad Disk on PlanetLab node %(hostname)s""",
+ baddisk_notice = ("""Bad Disk on PlanetLab node %(hostname)s""",
"""As part of PlanetLab node monitoring, we noticed %(hostname)s has a number of disk or media related I/O errors, that prevent it from either booting or reliably running as a PlanetLab node.
Please verify the integrity of the disk, and order a replacement if needed. If you need to schedule downtime for the node, please let us know at support@planet-lab.org.
%(bmlog)s
""")
- plnode_cfg=(""" Please Update Configuration file for PlanetLab node %(hostname)s""",
+ nodeconfig_notice=(""" Please Update Configuration file for PlanetLab node %(hostname)s""",
"""As part of PlanetLab node monitoring, we noticed %(hostname)s has an out-dated plnode.txt file with no NODE_ID or a mis-matched HOSTNAME. This can happen either due to an initial configuration failure at your site, with information entered into our database, or after a software upgrade. To resolve the issue we require your assistance. All that is needed is to visit:
https://www.planet-lab.org/db/nodes/index.php?nodepattern=%(hostname)s
""")
- baddns=("""Planetlab node down: broken DNS configuration for %(hostname)s""",
+ baddns_notice=("""Planetlab node down: broken DNS configuration for %(hostname)s""",
"""As part of PlanetLab node monitoring, we noticed the DNS servers used by the following machine(s) are not responding to queries.
%(hostname)s
boot_state = "unknown"
last_contact = None
+ if boot_state == 'disable': boot_state = 'disabled'
+ if boot_state == 'diag': boot_state = 'diagnose'
+
# NOTE: 'DOWN' and 'DEBUG' are temporary states, so only need
# 'translations' into the node.status state
# 'BOOT' is a permanent state, but we want it to have a bit of
# hysteresis (less than 0.5 days)
- #################################################################3
- # "Translate" the findbad states into nodebad status.
+ #################################################################
+ # "Initialize" the findbad states into nodebad status if they are not already set
- if node_state == 'DOWN' and ( node.status != 'offline' and node.status != 'down' ) and boot_state != 'disable' :
+ if node_state == 'DOWN' and ( node.status != 'offline' and node.status != 'down' ) and boot_state != 'disabled' :
print "changed status from %s to offline" % node.status
node.status = 'offline'
node.last_changed = datetime.now()
- if node_state == 'DEBUG' and node.status != 'monitordebug':
- print "changed status from %s to monitordebug" % (node.status)
- node.status = "monitordebug"
- node.last_changed = datetime.now()
+ if node_state == 'DEBUG' and node.status != 'monitordebug' and \
+ node.status != 'disabled' and \
+ node.status != 'diagnose':
+ if boot_state != 'disabled' and boot_state != 'diagnose':
+
+ print "changed status from %s to monitordebug" % (node.status)
+ node.status = "monitordebug"
+ node.last_changed = datetime.now()
+ else:
+ print "changed status from %s to %s" % (node.status, boot_state)
+ node.status = boot_state
+ node.last_changed = datetime.now()
if node_state == 'BOOT' and node.status != 'online' and node.status != 'good':
print "changed status from %s to online" % node.status
node.status = 'online'
node.last_changed = datetime.now()
- #################################################################3
+ #################################################################
# Switch temporary hystersis states into their 'firm' states.
+ # online -> good after half a day
+ # offline -> down after two days
+ # monitordebug -> down after 30 days
+ # diagnose -> monitordebug after 60 days
+ # disabled -> down after 60 days
if node.status == 'online' and changed_greaterthan(node.last_changed, 0.5):
print "changed status from %s to good" % node.status
-		node.status = 'down'
+		node.status = 'good'
# NOTE: do not reset last_changed, or you lose how long it's been down.
- if node.status == 'monitordebug' and changed_greaterthan(node.last_changed, 14):
+ if node.status == 'monitordebug' and changed_greaterthan(node.last_changed, 30):
print "changed status from %s to down" % node.status
node.status = 'down'
# NOTE: do not reset last_changed, or you lose how long it's been down.
- #node.last_changed = datetime.now()
+
+ if node.status == 'diagnose' and changed_greaterthan(node.last_changed, 60):
+		print "changed status from %s to monitordebug" % node.status
+ # NOTE: change an admin mode back into monitordebug after two months.
+ node.status = 'monitordebug'
+ node.last_changed = datetime.now()
# extreme cases of offline nodes
if ( boot_state == 'disabled' or last_contact == None ) and \
def run_noexcept2(self, cmd, timeout=COMMAND_TIMEOUT*2):
cmd = "exec ssh -p %s %s %s@%s %s" % (self.port, self.__options_to_str(),
self.user, self.host, cmd)
+ #print cmd
r = CMD.run_noexcept(self, cmd, timeout)
self.ret = -1
from monitor.database.dborm import zab_session as session
from monitor.database.dborm import zab_metadata as metadata
-from pcucontrol import reboot
+from monitor import reboot
+from monitor import scanapi
+
from monitor.wrapper.plccache import plcdb_id2lb as site_id2lb
from monitor.wrapper.plccache import plcdb_hn2lb as site_hn2lb
from monitor.wrapper.plccache import plcdb_lb2hn as site_lb2hn
from monitorweb.templates.links import *
-from monitor import scanapi
def query_to_dict(query):
def prep_node_for_display(node):
if node.plc_pcuid:
- pcu = FindbadPCURecord.get_latest_by(plc_pcuid=node.plc_pcuid).first()
+ pcu = FindbadPCURecord.get_latest_by(plc_pcuid=node.plc_pcuid)
if pcu:
node.pcu_status = pcu.reboot_trial_status
node.pcu_short_status = format_pcu_shortstatus(pcu)
return self.pcuview(None, hostname) # dict(nodequery=nodequery)
@expose(template="monitorweb.templates.nodelist")
- def node(self, filter='BOOT'):
+ def node(self, filter='boot'):
import time
fbquery = FindbadNodeRecord.get_all_latest()
query = []
- filtercount = {'DOWN' : 0, 'BOOT': 0, 'DEBUG' : 0, 'neverboot' : 0, 'pending' : 0, 'all' : 0, None : 0}
+ filtercount = {'down' : 0, 'boot': 0, 'debug' : 0, 'diagnose' : 0, 'disabled': 0,
+ 'neverboot' : 0, 'pending' : 0, 'all' : 0, None : 0}
for node in fbquery:
# NOTE: reformat some fields.
prep_node_for_display(node)
- # NOTE: count filters
- if node.observed_status != 'DOWN':
- print node.hostname, node.observed_status
- filtercount[node.observed_status] += 1
- else:
+ node.history.status
+
+ if node.history.status in ['down', 'offline']:
if node.plc_node_stats and node.plc_node_stats['last_contact'] != None:
- filtercount[node.observed_status] += 1
+ filtercount['down'] += 1
else:
filtercount['neverboot'] += 1
+ elif node.history.status in ['good', 'online']:
+ filtercount['boot'] += 1
+ elif node.history.status in ['debug', 'monitordebug']:
+ filtercount['debug'] += 1
+ else:
+ filtercount[node.history.status] += 1
+
+ ## NOTE: count filters
+ #if node.observed_status != 'DOWN':
+ # print node.hostname, node.observed_status
+ # if node.observed_status == 'DEBUG':
+ # if node.plc_node_stats['boot_state'] in ['debug', 'diagnose', 'disabled']:
+ # filtercount[node.plc_node_stats['boot_state']] += 1
+ # else:
+ # filtercount['debug'] += 1
+ #
+ # else:
+ # filtercount[node.observed_status] += 1
+ #else:
+ # if node.plc_node_stats and node.plc_node_stats['last_contact'] != None:
+ # filtercount[node.observed_status] += 1
+ # else:
+ # filtercount['neverboot'] += 1
# NOTE: apply filter
- if filter == node.observed_status:
- if filter == "DOWN":
- if node.plc_node_stats['last_contact'] != None:
- query.append(node)
- else:
- query.append(node)
- elif filter == "neverboot":
+ if filter == "neverboot":
if not node.plc_node_stats or node.plc_node_stats['last_contact'] == None:
query.append(node)
- elif filter == "pending":
- # TODO: look in message logs...
- pass
elif filter == "all":
query.append(node)
+ elif filter == node.history.status:
+ query.append(node)
+
+ #if filter == node.observed_status:
+ # if filter == "DOWN":
+ # if node.plc_node_stats['last_contact'] != None:
+ # query.append(node)
+ # else:
+ # query.append(node)
+ #elif filter == "neverboot":
+ # if not node.plc_node_stats or node.plc_node_stats['last_contact'] == None:
+ # query.append(node)
+ #elif filter == "pending":
+ # # TODO: look in message logs...
+ # pass
+ #elif filter == node.plc_node_stats['boot_state']:
+ # query.append(node)
+ #elif filter == "all":
+ # query.append(node)
widget = NodeWidget(template='monitorweb.templates.node_template')
return dict(now=time.ctime(), query=query, fc=filtercount, nodewidget=widget)
if 'pcuid' in val:
pcuid = val['pcuid']
elif 'hostname' in val:
- pcuid = FindbadNodeRecord.get_latest_by(hostname=val['hostname']).first().plc_pcuid
+ pcuid = FindbadNodeRecord.get_latest_by(hostname=val['hostname']).plc_pcuid
else:
pcuid=None
else:
prep_node_for_display(node)
nodequery += [node]
if node.plc_pcuid: # not None
- pcu = FindbadPCURecord.get_latest_by(plc_pcuid=node.plc_pcuid).first()
+ pcu = FindbadPCURecord.get_latest_by(plc_pcuid=node.plc_pcuid)
prep_pcu_for_display(pcu)
pcus[node.plc_pcuid] = pcu
node = FindbadNodeRecord.get_latest_by(hostname=nodename)
print "%s" % node.port_status
print "%s" % node.to_dict()
- print "%s" % len(q.all())
if node:
prep_node_for_display(node)
nodequery += [node]
<table width="100%">
<thead>
<tr>
- <th><a href="${link('node', filter='BOOT')}">Production(${fc['BOOT']})</a></th>
- <th><a href="${link('node', filter='DEBUG')}">Debug(${fc['DEBUG']})</a></th>
- <th><a href="${link('node', filter='DOWN')}">Down(${fc['DOWN']})</a></th>
+ <th><a href="${link('node', filter='boot')}">Prod(${fc['boot']})</a></th>
+ <th><a href="${link('node', filter='down')}">Down(${fc['down']})</a></th>
+ <th><a href="${link('node', filter='monitordebug')}">Errors(${fc['debug']})</a></th>
+ <th><a href="${link('node', filter='diagnose')}">Diagnose (${fc['diagnose']})</a></th>
+ <th><a href="${link('node', filter='disabled')}">Disabled (${fc['disabled']})</a></th>
<th><a href="${link('node', filter='neverboot')}">Never Booted(${fc['neverboot']})</a></th>
- <th><a href="${link('node', filter='pending')}">Pending Reply(${fc['pending']})</a></th>
+ <!--th><a href="${link('node', filter='pending')}">Pending Reply(${fc['pending']})</a></th-->
<th><a href="${link('node', filter='all')}">All</a></th>
</tr>
</thead>
<tbody>
<tr>
- <td colspan="5">
+ <td colspan="7">
<table id="sortable_table" class="datagrid" border="1" width="100%">
<thead>
<tr>