"bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
"bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
+ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-protoerror-debug-validate-done",
"bminit-cfg-auth-protoerror-exception-update-debug-validate-exception-done",
"bminit-cfg-auth-getplc-update-debug-done",
"bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
"bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail2-exception-noinstall-update-debug-validate-exception-fsckfail2-done",
"bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail2-exception-debug-validate-done",
"bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail2-exception-debug-validate-exception-fsckfail2-done",
+ "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail2-exception-debug-validate-exception-fsckfail-done",
+ "bminit-cfg-auth-getplc-update-installinit-validate-fsckabort-exception-fsckfail-exception-debug-validate-fsckabort-exception-fsckfail-done",
"bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail2-exception-debug-validate-exception-fsckfail2-done",
"bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-debug-validate-exception-fsckfail2-done",
"bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-debug-validate-exception-fsckfail-done",
"bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-debug-validate-done",
"bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail-exception-debug-validate-exception-fsckfail-done",
+ "bminit-cfg-auth-getplc-update-debug-validate-exception-fsckfail-done",
]:
sequences.update({n : "fsck_repair"})
- # update_node_config_email
+ # nodeconfig_notice
for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
"bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
"bminit-cfg-exception-update-bootupdatefail-nonode-debug-validate-exception-done",
"bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-validate-exception-done",
"bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
+ "bminit-cfg-exception-noconfig-nonode-debug-validate-exception-done",
"bminit-cfg-exception-noconfig-update-debug-validate-exception-done",
]:
- sequences.update({n : "update_node_config_email"})
+ sequences.update({n : "nodeconfig_notice"})
for n in [ "bminit-cfg-exception-nodehostname-update-debug-done",
"bminit-cfg-update-exception-nodehostname-update-debug-validate-exception-done",
]:
sequences.update({n : "nodenetwork_email"})
- # update_bootcd_email
+ # noblockdevice_notice
for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
+ "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-validate-bmexceptvgscan-done",
"bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
"bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
"bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
- "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
+ ]:
+ sequences.update({n : "noblockdevice_notice"})
+
+ # update_bootcd_email
+ for n in [ "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
]:
sequences.update({n : "update_bootcd_email"})
for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
]:
- sequences.update({n: "suspect_error_email"})
+ sequences.update({n: "unknownsequence_notice"})
- # update_hardware_email
- sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
- sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
+ # minimalhardware_notice
+ sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "minimalhardware_notice"})
+ sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "minimalhardware_notice"})
- # broken_hardware_email
- sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"})
+ # baddisk_notice
+ sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "baddisk_notice"})
- # bad_dns_email
+ # baddns_notice
for n in [
"bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
"bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
]:
- sequences.update( { n : "bad_dns_email"})
+ sequences.update( { n : "baddns_notice"})
return sequences
# NOTE: Nothing works if the bootcd is REALLY old.
# So, this is the first step.
+ bootman_action = "unknown"
+
fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict()
recent_actions = sitehist.getRecentActions(hostname=hostname)
api.UpdateNode(hostname, {'boot_state' : 'disabled'})
# NOTE: nothing else is possible.
- return True
+ return "disabled"
debugnode = DebugInterface(hostname)
conn = debugnode.getConnection()
- if type(conn) == type(False): return False
+ if type(conn) == type(False): return "error"
boot_state = conn.get_boot_state()
if boot_state != "debug":
print "... %s in %s state: skipping..." % (hostname , boot_state)
- return boot_state == "boot"
+ return "skipped" #boot_state == "boot"
if conn.bootmanager_running():
print "...BootManager is currently running. Skipping host %s" %hostname
- return True
+ return "skipped" # True
# Read persistent flags, tagged on one week intervals.
sitehist.sendMessage('baddisk_notice', hostname=hostname, log=log)
#conn.set_nodestate('disabled')
- return False
+ return "skipping_baddisk"
print "...Downloading bm.log from %s" %hostname
log = conn.get_bootmanager_log()
child = fdpexpect.fdspawn(log)
- if hasattr(config, 'collect') and config.collect: return True
+ if hasattr(config, 'collect') and config.collect: return "collect"
if config and not config.quiet: print "...Scanning bm.log for errors"
conn.restart_bootmanager('boot')
+ bootman_action = "restart_bootmanager"
+
# NOTE: Do not set the pflags value for this sequence if it's unknown.
# This way, we can check it again after we've fixed it.
flag_set = False
else:
+ bootman_action = sequences[s]
if sequences[s] == "restart_bootmanager_boot":
print "...Restarting BootManager.py on %s "%hostname
# there was some failure to synchronize the keys.
print "...Unable to repair node keys on %s" %hostname
- elif sequences[s] == "suspect_error_email":
+ elif sequences[s] == "unknownsequence_notice":
args = {}
args['hostname'] = hostname
args['sequence'] = s
sitehist.sendMessage('unknownsequence_notice', **args)
conn.restart_bootmanager('boot')
- # TODO: differentiate this and the 'nodenetwork_email' actions.
- elif sequences[s] == "update_node_config_email":
+ elif sequences[s] == "nodeconfig_notice":
if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
args = {}
sitehist.sendMessage('nodeconfig_notice', **args)
conn.dump_plconf_file()
- elif sequences[s] == "update_bootcd_email":
+ elif sequences[s] == "noblockdevice_notice":
- if not found_within(recent_actions, 'newalphacd_notice', 3.5):
+ if not found_within(recent_actions, 'noblockdevice_notice', 3.5):
args = {}
- args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
+ #args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
args['hostname'] = hostname
- sitehist.sendMessage('newalphacd_notice', **args)
+ sitehist.sendMessage('noblockdevice_notice', **args)
- print "\tDisabling %s due to out-of-date BOOTCD" % hostname
-
- elif sequences[s] == "broken_hardware_email":
+ elif sequences[s] == "baddisk_notice":
# MAKE An ACTION record that this host has failed hardware. May
# require either an exception "/minhw" or other manual intervention.
# Definitely need to send out some more EMAIL.
sitehist.sendMessage('baddisk_notice', **args)
#conn.set_nodestate('disabled')
- elif sequences[s] == "update_hardware_email":
+ elif sequences[s] == "minimalhardware_notice":
if not found_within(recent_actions, 'minimalhardware_notice', 7):
print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
args = {}
args['bmlog'] = conn.get_bootmanager_log().read()
sitehist.sendMessage('minimalhardware_notice', **args)
- elif sequences[s] == "bad_dns_email":
+ elif sequences[s] == "baddns_notice":
if not found_within(recent_actions, 'baddns_notice', 1):
print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
args = {}
print traceback.print_exc()
# TODO: api error. skip email, b/c all info is not available,
# flag_set will not be recorded.
- return False
+ return "exception"
nodenet_str = network_config_to_str(net)
args['hostname'] = hostname
sitehist.sendMessage('baddns_notice', **args)
- return True
+ return bootman_action
# MAIN -------------------------------------------------------------------
mon_metadata = sqlalchemy.MetaData()
mon_metadata.bind = sqlalchemy.create_engine(config.monitor_dburi, echo=config.echo)
mon_session = sqlalchemy.orm.scoped_session(sqlalchemy.orm.sessionmaker(autoflush=False,autocommit=True))
+mon_session.bind = mon_metadata.bind
if config.zabbix_enabled:
zab_metadata = sqlalchemy.MetaData()
import elixir
import traceback
from elixir.ext.versioned import *
+from pcucontrol import reboot
from monitor.database.dborm import mon_metadata, mon_session
__metadata__ = mon_metadata
def get_latest_by(cls, **kwargs):
return cls.query.filter_by(**kwargs).first()
def pcu_name(self):
    """Return a display name for this PCU: its hostname, else its IP.

    Reads ``self.plc_pcu_stats`` and returns the 'hostname' entry when it
    is neither None nor the empty string; otherwise the 'ip' entry under
    the same condition; otherwise None.
    """
    # Compare by value.  The previous `is not ""` tested object identity,
    # which only gives the right answer when the interpreter happens to
    # reuse the same empty-string object.
    for key in ('hostname', 'ip'):
        value = self.plc_pcu_stats[key]
        if value is not None and value != "":
            return value
    return None
+
def format_ports(self):
    """Summarize the recorded state of each port this PCU model supports.

    The supported port list comes from
    ``reboot.model_to_object(self.plc_pcu_stats['model']).supported_ports``.
    Each supported port is paired with its state from ``self.port_status``
    (looked up by the port converted to a string); ports with no recorded
    state are reported as "unknown".

    Returns a list of ``(port, state)`` tuples, or one of two single-entry
    summaries: ``[("Closed/Filtered", "")]`` when no per-port entries were
    produced, or ``[("All Filtered", "")]`` when every supported port is
    in the "filtered" state.
    """
    supported_ports = reboot.model_to_object(self.plc_pcu_stats['model']).supported_ports
    # Tolerate a record with no stored port scan (None) instead of failing
    # on .copy(); the mapping is only read, so no copy is needed.
    data = self.port_status or {}

    retval = []
    filtered_length = 0
    if data:
        for port in supported_ports:
            # Only a missing key means "unknown"; other errors should not
            # be silently swallowed as the old bare `except:` did.
            state = data.get(str(port), "unknown")
            if state == "filtered":
                filtered_length += 1
            retval.append((port, state))

    if not retval:
        retval = [("Closed/Filtered", "")]

    # Require at least one filtered port so that an empty supported-port
    # list (0 == 0) is not misreported as "All Filtered".
    if filtered_length and filtered_length == len(supported_ports):
        retval = [("All Filtered", "")]

    return retval
+
def format_pcu_shortstatus(self):
    """Return a short status string derived from ``self.reboot_trial_status``.

    Returns "Ok" when the trial status is the string "0", the trial status
    itself when it is "NetDown" or "Not_Run", and "error" for anything
    else (including an empty or None status).
    """
    trial = self.reboot_trial_status
    if trial == "0":
        return "Ok"
    if trial in ("NetDown", "Not_Run"):
        return trial
    # Empty/None status and every unrecognized value collapse to "error".
    return "error"
+
+ def test_is_ok(self):
+ if self.reboot_trial_status == str(0):
+ return True
+ else:
+ return False
+
def pcu_errors(self):
    """Build a multi-line, tab-indented summary of this PCU's problems.

    The text starts with a newline and then lists, one per line: the model
    from ``self.plc_pcu_stats['model']``, the missing fields
    (``self.entry_complete``, shown as "None missing" when it is the empty
    string), ``self.dns_status``, the result of ``self.format_ports()`` and
    the result of ``self.format_pcu_shortstatus()``.

    Returns the assembled string.
    """
    # Explicit branch instead of the `cond and a or b` pseudo-ternary,
    # which silently breaks if the middle operand is ever falsy.
    if self.entry_complete == "":
        missing = "None missing"
    else:
        missing = self.entry_complete

    parts = [
        "\n",
        "\tModel: %s\n" % self.plc_pcu_stats['model'],
        "\tMissing Fields: %s\n" % missing,
        "\tDNS Status: %s\n" % self.dns_status,
        "\tPort Status: %s\n" % self.format_ports(),
        "\tTest Results: %s\n" % self.format_pcu_shortstatus(),
    ]
    return "".join(parts)
+
# ACCOUNTING
date_checked = Field(DateTime)
round = Field(Int,default=0)
-from monitor import reboot
from monitor.common import *
from monitor.model import *
from monitor.wrapper import plc
from monitor.wrapper import plccache
from monitor.wrapper.emailTxt import mailtxt
from monitor.database.info.model import *
+# NOTE: must import this after monitor.database.info.model b/c that imports
+# pcucontrol.reboot, which shadows this version if that import comes last.
+from monitor import reboot
class SiteInterface(HistorySiteRecord):
@classmethod
def runBootManager(self, hostname):
from monitor import bootman
print "attempting BM reboot of %s" % hostname
- ret = ""
+ ret = "error"
try:
ret = bootman.restore(self, hostname)
err = ""
err = traceback.format_exc()
print err
+ # TODO: keep this record so that the policy.py can identify all
+ # bootmanager_* actions without explicitly listing every kind.
act = ActionRecord(loginbase=self.db.loginbase,
hostname=hostname,
action='reboot',
action_type='bootmanager_restore',
+ error_string="")
+
+ act = ActionRecord(loginbase=self.db.loginbase,
+ hostname=hostname,
+ action='reboot',
+ action_type='bootmanager_' + ret,
error_string=err)
return ret
from monitor.database.info.history import *
from monitor.database.info.plc import *
from monitor.database.dborm import mon_session as session
+from monitor.database.dborm import mon_metadata
and password. Then, choose which node to associate it with, and we will take
care of the rest.
+Thank you very much for your help,
+ -- %(plc_name)s (%(support_email)s)
+""")
+
+ pcuerror_notice=("""Please help us configure your PCU: %(pcu_name)s""",
+"""During our standard monitoring of your site we noticed that the following
+PCU is misconfigured:
+
+ %(pcu_name)s
+ %(pcu_errors)s
+You can learn more details about the problem by visiting the link below.
+
+ https://%(monitor_hostname)s/monitor/pcuview?loginbase=%(loginbase)s
+
+We would like to save you time by taking care of as many administrative situations for your site's machines as possible without disturbing you. Errors like these prevent us from being able to remotely administer your machines, and so we must solicit your help using messages like these.
+
+So, any help and time that you can offer now to help us remotely administer your machines will pay off for you in the future.
+
Thank you very much for your help,
-- %(plc_name)s (%(support_email)s)
""")
pcufailed_notice =("""Could not use PCU to reboot %(hostname)s""",
-"""We tried to use the PCU registered for %(hostname)s, but for some reason
-the host did not come back online. You may be able to learn more by visiting
-this link:
+"""We tried to use the PCU registered for %(hostname)s, but for some reason the host did not come back online. This may be for several reasons, and you can learn more by visiting this link:
+
+ %(pcu_name)s
https://%(monitor_hostname)s/monitor/pcuview?loginbase=%(loginbase)s
-- %(plc_name)s (%(support_email)s)
""")
+ noblockdevice_notice=("""Cannot Detect Disks on %(hostname)s""",
+"""As part of PlanetLab node monitoring, we noticed that we were not able to detect any hard disks in your machine.
+
+ %(hostname)s
+
+This may be the case for a number of reasons:
+ * the hardware is very new and needs a new driver,
+ * the hardware is very old and is no longer supported,
+ * the hard disk was physically removed,
+ * the hard disk cable is loose or disconnected,
+
+Please help us investigate and let us know if there's anything that we can do to assist in getting your machine up and running again.
+
+Thank you for your help,
+ -- %(plc_name)s (%(support_email)s)
+""")
newalphacd_notice=("""New Boot Images for %(hostname)s""",
-"""As part of PlanetLab node monitoring, we noticed that we were not able to recognize all the hardware in your machine. This means that it is so new that it needs a new BootCD, or that it is so old that it is no longer supported.
+"""As part of PlanetLab node monitoring, we noticed that we were not able to recognize all the hardware in your machine. This means that either it is so new that it needs a new BootCD, or that it is so old that it is no longer supported.
%(hostname)s
print "changed status from %s to offline" % node.status
node.status = 'offline'
node.last_changed = datetime.now()
-
- if node_state == 'DEBUG' and node.status not in ['failboot', 'disabled', 'safeboot']:
+
+ if node_state == 'DEBUG':
if boot_state != 'disabled' and boot_state != 'safeboot':
print "changed status from %s to failboot" % (node.status)
- node.status = "failboot"
- node.last_changed = datetime.now()
+ current_status = "failboot"
else:
print "changed status from %s to %s" % (node.status, boot_state)
- node.status = boot_state
+ current_status = boot_state
+
+ if current_status != node.status and \
+ current_status in ['failboot', 'disabled', 'safeboot']:
+
+ node.status = current_status
node.last_changed = datetime.now()
if node_state == 'BOOT' and node.status != 'online' and node.status != 'good':
not nodehist.firewall and \
not found_between(recent_actions, 'try_reboot', 3.5, 1):
+ # TODO: there MUST be a better way to do this...
+ # get fb node record for pcuid
+ fbpcu = None
+ fbnode = FindbadNodeRecord.get_latest_by(hostname=host)
+ if fbnode:
+ fbpcu = FindbadPCURecord.get_latest_by(plc_pcuid=fbnode.plc_pcuid)
+
sitehist.attemptReboot(host)
print "send message for host %s try_reboot" % host
+ if not fbpcu.test_is_ok() and \
+ not found_within(recent_actions, 'pcuerror_notice', 3.0):
+
+ args = {}
+ if fbpcu:
+ args['pcu_name'] = fbpcu.pcu_name()
+ args['pcu_errors'] = fbpcu.pcu_errors()
+ else:
+ args['pcu_name'] = "error looking up pcu name"
+ args['pcu_errors'] = ""
+
+ args['hostname'] = host
+ sitehist.sendMessage('pcuerror_notice', **args)
+ print "send message for host %s PCU Failure" % host
+
# NOTE: non-intuitive is that found_between(try_reboot, 3.5, 1)
# will be false for a day after the above condition is satisfied
found_between(recent_actions, 'try_reboot', 3.5, 1) and \
not found_within(recent_actions, 'pcufailed_notice', 3.5):
+ # TODO: there MUST be a better way to do this...
+ # get fb node record for pcuid
+ fbpcu = None
+ fbnode = FindbadNodeRecord.get_latest_by(hostname=host)
+ if fbnode:
+ fbpcu = FindbadPCURecord.get_latest_by(plc_pcuid=fbnode.plc_pcuid)
+ if fbpcu:
+ pcu_name = fbpcu.pcu_name()
+ else:
+ pcu_name = "error looking up pcu name"
+
+ # get fb pcu record for pcuid
# send pcu failure message
- #act = ActionRecord(**kwargs)
- sitehist.sendMessage('pcufailed_notice', hostname=host)
+ sitehist.sendMessage('pcufailed_notice', hostname=host, pcu_name=pcu_name)
print "send message for host %s PCU Failure" % host
if nodehist.status == 'failboot' and \
- changed_greaterthan(nodehist.last_changed, 1) and \
+ changed_greaterthan(nodehist.last_changed, 0.25) and \
not found_between(recent_actions, 'bootmanager_restore', 0.5, 0):
# send down node notice
# delay 0.5 days before retrying...
import re
from monitor.database.info.model import *
#from monitor.database.zabbixapi.model import *
-#from monitor.database.dborm import zab_session as session
-#from monitor.database.dborm import zab_metadata as metadata
from monitor_xmlrpc import MonitorXmlrpcServer
from monitor import reboot
types = filter(lambda x: 'notice' in x, dir(mailtxt))
results = {}
+ print mon_metadata.bind
+ if session.bind is None:
+ #TODO: figure out why this value gets cleared out...
+ session.bind = mon_metadata.bind
+ result = session.execute("select distinct(action_type) from actionrecord;")
+
+ types = [r[0] for r in result]
+
try: since = int(since)
except: since = 7
<table width="100%">
<thead>
<tr>
- <th><a href="${link('actionlist', action_type='online_notice', since=1)}">Last Day</a></th>
- <th><a href="${link('actionlist', action_type='online_notice', since=7)}">Last Week</a></th>
- <th><a href="${link('actionlist', action_type='online_notice', since=30)}">Last Month</a></th>
+ <th><a href="${link('actionlist', action_type=action_type, since=1)}">Last Day</a></th>
+ <th><a href="${link('actionlist', action_type=action_type, since=7)}">Last Week</a></th>
+ <th><a href="${link('actionlist', action_type=action_type, since=30)}">Last Month</a></th>
</tr>
</thead>
<tbody>
<p py:if="actions and len(actions) == 0">
There are no recent actions taken for this site.
</p>
- <table py:if="actions and len(actions) > 0" id="sortable_table" class="datagrid" border="1" width="100%">
+ <table id="actionlist" cellpadding="0" border="0" class="plekit_table sortable-onload-0 colstyle-alt no-arrow paginationcallback-actionlist_paginator max-pages-10 paginate-50" py:if="actions and len(actions) > 0">
+ <!--table py:if="actions and len(actions) > 0" id="sortable_table" class="datagrid" border="1" width="100%"-->
<thead>
<tr>
- <th mochi:format="int"></th>
- <th>Date</th>
- <th>Action taken on</th>
- <th>Action Type</th>
- <th>Message ID</th>
- <th>Errors</th>
+ <th class="sortable plekit_table">Date</th>
+ <th class="sortable plekit_table">Action taken on</th>
+ <th class="sortable plekit_table">Action Type</th>
+ <th class="sortable plekit_table">Message ID</th>
+ <th class="sortable plekit_table">Errors</th>
</tr>
</thead>
<tbody>
<tr py:for="i,act in enumerate(actions)" class="${i%2 and 'odd' or 'even'}" >
- <td></td>
<td py:content="act.date_created"></td>
<td py:if="act.hostname is not None" nowrap="true" >
<a class="ext-link" href="${plc_node_uri(act.hostname)}">
<td><a class="ext-link" href="${plc_mail_uri(act.message_id)}">
<span py:if="act.message_id != 0" class="icon">${act.message_id}</span></a></td>
<td py:if="'bootmanager' in act.action_type or 'unknown' in act.action_type">
- <a href="/monitorlog/bm.${act.hostname}.log">latest bm log</a>
+ <a href="/monitorlog/bm.${act.hostname}.log">bm log before action</a>
</td>
<td py:if="'bootmanager' not in act.action_type">
<pre py:content="act.error_string"></pre></td>
<tbody>
<tr>
<td colspan="5">
- <table id="sortable_table" class="datagrid" border="1" width="100%">
+ <table id="actionsummarylist" cellpadding="0" border="0" class="plekit_table sortable-onload-0 colstyle-alt no-arrow paginationcallback-actionsummarylist_paginator max-pages-10 paginate-50" >
<thead>
<tr>
- <th mochi:format="int"></th>
- <th>Notice Name</th>
- <th>Count</th>
+ <th class="sortable plekit_table">Type</th>
+ <th class="sortable plekit_table">Notice Name</th>
+ <th class="sortable plekit_table">Count</th>
</tr>
</thead>
<tbody>
<tr py:for="key in results.keys()">
- <td></td>
+ <td nowrap="true" py:content="'bootman' in key and 'bootmanager' or ( 'notice' in key and 'notice' or ( 'penalty' in key and 'penalty' or 'unknown' ) ) "></td>
<td nowrap="true"><a href="actionlist?action_type=${key}" py:content="key"></a></td>
<td nowrap='true' py:content="results[key]"></td>
</tr>