# Attempt to reboot a node in debug state.
-
-
import os
import sys
import time
from sets import Set
from monitor.getsshkeys import SSHKnownHosts
-
from monitor.Rpyc import SocketConnection, Async
from monitor.Rpyc.Utils import *
from pcucontrol.transports.ssh import fdpexpect as fdpexpect
from pcucontrol.transports.ssh import pexpect as pexpect
-
-
api = plc.getAuthAPI()
fb = None
def bootmanager_log_name(hostname):
    """Return the relative path under which a node's boot manager log is kept.

    The result has the form ``history/<timestamp>-bm.<hostname>.log``, where
    the timestamp is the current local time formatted as ``%Y-%m-%d-%H:%M``.
    Callers join it onto their own log directory.

    Only the final path component of *hostname* is used, so a value that
    contains directory separators (for example one taken from an upload
    request) cannot place the log outside the ``history`` directory.
    """
    t_stamp = time.strftime("%Y-%m-%d-%H:%M")
    # Drop any directory part so "../" sequences cannot escape 'history'.
    safe_hostname = os.path.basename("%s" % (hostname,))
    base_filename = "%s-bm.%s.log" % (t_stamp, safe_hostname)
    return os.path.join('history', base_filename)
+
def bootmanager_log_action(hostname, short_log_path, logtype="bm.log"):
    """Create a 'log' ActionRecord that points at a stored boot manager log.

    hostname       -- node the log belongs to.
    short_log_path -- value stored in the record's log_path field.
    logtype        -- value stored as the record's action_type.

    The site's login_base is looked up through the latest findbad record for
    the node.  If that lookup fails, the record is still created, with
    loginbase "unknown" and the traceback of the failure in error_string.
    Returns None.
    """
    # Imported locally so the error path never depends on module-level imports.
    import traceback

    try:
        node = FindbadNodeRecord.get_latest_by(hostname=hostname)
        site = PlcSite.query.get(node.plc_node_stats['site_id'])
        loginbase = site.plc_site_stats['login_base']
        err = ""
    except Exception:
        # Best effort: a failed lookup must not prevent the log from being
        # recorded.  KeyboardInterrupt and SystemExit are left to propagate.
        loginbase = "unknown"
        err = traceback.format_exc()

    # The reference is kept until after the flush; the record is not used
    # for anything else here.
    act = ActionRecord(loginbase=loginbase,
                       hostname=hostname,
                       action='log',
                       action_type=logtype,
                       log_path=short_log_path,
                       error_string=err)
    session.flush()
    session.clear()
    return
+
class ExceptionDoubleSSHError(Exception):
    """Plain Exception subclass; it adds no attributes or behaviour."""
return log
def get_bootmanager_log(self):
    """Collect the node's boot manager log and return it opened for reading.

    download() is asked to fetch /tmp/bm.log through self.c into a
    timestamped file under config.MONITOR_BOOTMANAGER_LOG, a
    "collected_bm.log" action is recorded for that file, and the file is then
    copied to bm.<node>.log in the same directory.  The returned file object
    is open on that copy; closing it is left to the caller.

    Raises an I/O error if the copy or the open fails.
    """
    import shutil

    log_dir = config.MONITOR_BOOTMANAGER_LOG
    bm_name = bootmanager_log_name(self.node)
    collected_path = "%s/%s" % (log_dir, bm_name)
    latest_path = "%s/bm.%s.log" % (log_dir, self.node)

    download(self.c, "/tmp/bm.log", collected_path)
    bootmanager_log_action(self.node, bm_name, "collected_bm.log")

    # Copy without going through a shell: node names are never interpreted
    # as shell syntax, and a failed copy raises instead of silently leaving
    # an older bm.<node>.log to be returned.
    shutil.copy(collected_path, latest_path)
    return open(latest_path, 'r')
self.setup_host()
def get_connection(self, config):
    """Return a NodeConnection for self.node over localhost:self.port.

    config is passed through to NodeConnection unchanged.  The first attempt
    can sometimes fail, so it is retried exactly once; an error raised by the
    second attempt propagates to the caller.
    """
    try:
        conn = NodeConnection(SocketConnection("localhost", self.port), self.node, config)
    except Exception:
        # NOTE: try twice since this can sometimes fail the first time. If
        #       it fails again, let it go.  KeyboardInterrupt/SystemExit are
        #       not retried.
        conn = NodeConnection(SocketConnection("localhost", self.port), self.node, config)
    return conn
def setup_host(self):
"bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-protoerror-debug-validate-done",
"bminit-cfg-auth-protoerror-exception-update-debug-validate-exception-done",
"bminit-cfg-auth-getplc-update-debug-done",
+ "bminit-cfg-auth-protoerror2-debug-done",
"bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
"bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
"bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
"bminit-cfg-auth-protoerror-exception-update-debug-done",
"bminit-cfg-auth-getplc-exception-protoerror-update-debug-done",
"bminit-cfg-auth-getplc-implementerror-update-debug-done",
+ "bminit-cfg-auth-authfail2-protoerror2-debug-done",
]:
sequences.update({n : "restart_bootmanager_boot"})
"bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-validate-done",
"bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-validate-bmexceptvgscan-done",
"bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-debug-validate-bmexceptvgscan-done",
+ "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-debug-validate-bmexceptvgscan-done",
]:
sequences.update({n : "restart_bootmanager_rins"})
"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done",
"bminit-cfg-auth-bootcheckfail-authfail-exception-update-debug-validate-exception-done",
"bminit-cfg-auth-bootcheckfail-authfail-exception-authfail-debug-validate-exception-done",
+ "bminit-cfg-auth-authfail-debug-done",
+ "bminit-cfg-auth-authfail2-authfail-debug-done",
]:
sequences.update({n: "repair_node_keys"})
"bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
"bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
"bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
+ "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-debug-validate-bmexceptvgscan-done",
+ "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-debug-validate-bmexceptvgscan-done",
]:
sequences.update({n : "noblockdevice_notice"})
steps = [
('scsierror' , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
('ioerror' , 'end_request: I/O error, dev sd\w+, sector \d+'),
- ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION byte \w+ = \w+'),
+ ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION'),
('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),
('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'),
('exception' , 'Exception'),
('nocfg' , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'),
+ ('protoerror2' , '500 Internal Server Error'),
('protoerror' , 'XML RPC protocol error'),
('nodehostname' , 'Configured node hostname does not resolve'),
('implementerror', 'Implementation Error'),
('nospace' , "No space left on device"),
('nonode' , 'Failed to authenticate call: No such node'),
('authfail' , 'Failed to authenticate call: Call could not be authenticated'),
- ('bootcheckfail' , 'BootCheckAuthentication'),
- ('bootupdatefail' , 'BootUpdateNode'),
+ ('authfail2' , 'Authentication Failed'),
+ ('bootcheckfail' , 'BootCheckAuthentication'),
+ ('bootupdatefail' , 'BootUpdateNode'),
]
return steps
debugnode = DebugInterface(hostname)
conn = debugnode.getConnection()
- if type(conn) == type(False): return "error"
+ if type(conn) == type(False): return "connect_failed"
boot_state = conn.get_boot_state()
if boot_state != "debug":
# the keys either are in sync or were forced in sync.
# so try to start BM again.
conn.restart_bootmanager(conn.get_nodestate())
- pass
else:
# there was some failure to synchronize the keys.
print "...Unable to repair node keys on %s" %hostname
+ if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
+ args = {}
+ args['hostname'] = hostname
+ sitehist.sendMessage('nodeconfig_notice', **args)
+ conn.dump_plconf_file()
+ else:
+ # NOTE: do not add a new action record
+ return ""
elif sequences[s] == "unknownsequence_notice":
args = {}
args['hostname'] = hostname
sitehist.sendMessage('nodeconfig_notice', **args)
conn.dump_plconf_file()
+ else:
+ # NOTE: do not add a new action record
+ return ""
elif sequences[s] == "nodenetwork_email":
args['bmlog'] = conn.get_bootmanager_log().read()
sitehist.sendMessage('nodeconfig_notice', **args)
conn.dump_plconf_file()
+ else:
+ # NOTE: do not add a new action record
+ return ""
elif sequences[s] == "noblockdevice_notice":
args['hostname'] = hostname
sitehist.sendMessage('noblockdevice_notice', **args)
+ else:
+ # NOTE: do not add a new action record
+ return ""
elif sequences[s] == "baddisk_notice":
# MAKE An ACTION record that this host has failed hardware. May
sitehist.sendMessage('baddisk_notice', **args)
#conn.set_nodestate('disabled')
+ else:
+ # NOTE: do not add a new action record
+ return ""
elif sequences[s] == "minimalhardware_notice":
if not found_within(recent_actions, 'minimalhardware_notice', 7):
args['hostname'] = hostname
args['bmlog'] = conn.get_bootmanager_log().read()
sitehist.sendMessage('minimalhardware_notice', **args)
+ else:
+ # NOTE: do not add a new action record
+ return ""
elif sequences[s] == "baddns_notice":
if not found_within(recent_actions, 'baddns_notice', 1):
args['interface_id'] = net['interface_id']
sitehist.sendMessage('baddns_notice', **args)
+ else:
+ # NOTE: do not add a new action record
+ return ""
return bootman_action
import struct
from monitor import reboot
from monitor import util
-from monitor import database
from monitor.wrapper import plc
from datetime import datetime, timedelta
# NOTE: in case an exception is thrown while trying to perform an action.
error_string = Field(String, default=None)
+ log_path = Field(String, default=None)
+
#issue = ManyToOne('IssueRecord')
# NOTE: this is the parent relation to fb records. first create the
# action record, then append to this value all of the findbad records we
# those errors.
args = {'loginbase' : self.db.loginbase,
- 'penalty_level' : self.db.penalty_level,
+ 'penalty_level' : -self.db.penalty_level,
'monitor_hostname' : config.MONITOR_HOSTNAME,
'support_email' : config.support_email,
'plc_name' : config.PLC_NAME,
action_type='bootmanager_restore',
error_string="")
- act = ActionRecord(loginbase=self.db.loginbase,
+ if ret:
+ act = ActionRecord(loginbase=self.db.loginbase,
hostname=hostname,
action='reboot',
action_type='bootmanager_' + ret,
continue_slice_check = True
oval = values['princeton_comon_dir']
- if "princeton_comon_dir" in oval:
+ if "princeton_comon" in oval:
values['princeton_comon_dir'] = True
else:
values['princeton_comon_dir'] = False
import time
import math
+# import local file.py
+import file
+
def diff_time(timestamp, abstime=True):
now = time.time()
if timestamp == None:
return buf
def dumpFile(file, buf):
    """Write buf to the path given by file, replacing any existing content.

    The file is opened in binary mode ('wb'), so buf is written as-is.  The
    handle is closed even when the write raises.  Returns None.
    """
    f = open(file, 'wb')
    try:
        f.write(buf)
    finally:
        f.close()
    return
https://%(monitor_hostname)s/monitor/pcuview?loginbase=%(loginbase)s
+If you need to change the PCU configuration in the PLC database:
+
+ https://%(plc_hostname)s/db/sites/pcu.php?id=%(plc_pcuid)s
+
We would like to save you time by taking care of as many administrative situations for your site's machines as possible without disturbing you. Errors like these prevent us from being able to remotely administer your machines, and so we must solicit your help using messages like these.
So, any help and time that you can offer now to help us remotely administer your machines will pay off for you in the future.
Legend:
- 0 - no penalties applied
- 1 - site is disabled. no new slices can be created.
- 2+ - all existing slices will be disabled.
+ 0 - no penalties applied
+ -1 - site is disabled. no new slices can be created.
+ -2 - all existing slices will be disabled.
""")
increase_penalty=("""Privilege reduced for site %(loginbase)s""",
Legend:
- 0 - no penalty applied
- 1 - site is disabled. no new slices can be created.
- 2+ - all existing slices will be disabled.
- """)
+ 0 - no penalties applied
+ -1 - site is disabled. no new slices can be created.
+ -2 - all existing slices will be disabled.
+""")
newbootcd_notice=("""Host %(hostname)s needs a new BootImage""", """
We noticed the following node has an out-dated BootImage:
* the hard disk was physically removed,
* the hard disk cable is loose or disconnected,
-Please help us investigate and let us know if there's anything that we can do to assist in getting your machine up and running again.
+Please help us investigate and let us know if there is anything that we can do to assist in getting your machine up and running again.
Thank you for your help,
-- %(plc_name)s (%(support_email)s)
%(url_list)s
-Instructions to burn or copy these All-in-One images to the appropriate media are available in the Technical Contact's Guide.
+Instructions to burn or copy these All-in-One images to the appropriate media are available in the Technical Contacts Guide.
https://%(plc_hostname)s/doc/guides/bootcdsetup
-If your node returns to normal operation after following these directions, then there's no need to respond to this message. However, if there are any console messages relating to the node's failure, please report them to PlanetLab support (%(support_email)s) so we can help resolve the issue. Including this message in your reply will help us coordinate our records with the actions you've taken.
+If your node returns to normal operation after following these directions, then there is no need to respond to this message. However, if there are any console messages relating to the nodes failure, please report them to PlanetLab support (%(support_email)s) so we can help resolve the issue. Including this message in your reply will help us coordinate our records with the actions you have taken.
Thank you for your help,
-- %(plc_name)s (%(support_email)s)
nodeconfig_notice=(""" Please Update Configuration file for PlanetLab node %(hostname)s""",
-"""As part of PlanetLab node monitoring, we noticed %(hostname)s has an out-dated plnode.txt file.
+"""As part of PlanetLab node monitoring, we noticed %(hostname)s has an out-dated configuration.
Either our boot scripts cannot find it because the boot media is corrupted, or it has no NODE_ID or a mis-matched HOSTNAME. This can happen either due to a configuration mistake at your site, with bad information entered into our database, or after a necessary software upgrade. To resolve the issue we require your assistance. All that is needed is to visit:
if fbpcu:
args['pcu_name'] = fbpcu.pcu_name()
args['pcu_errors'] = fbpcu.pcu_errors()
+ args['plc_pcuid'] = fbpcu.plc_pcuid
else:
args['pcu_name'] = "error looking up pcu name"
args['pcu_errors'] = ""
+ args['plc_pcuid'] = 0
args['hostname'] = host
sitehist.sendMessage('pcuerror_notice', **args)
--- /dev/null
+-- If there's an existing database, these commands will upgrade it to the
+-- current version
+ALTER TABLE actionrecord ADD COLUMN log_path varchar DEFAULT NULL;
#from monitor.database.zabbixapi.model import *
from monitor_xmlrpc import MonitorXmlrpcServer
+from monitor import util
from monitor import reboot
+from monitor import bootman
from monitor import scanapi
+from monitor import config
import time
from monitor.wrapper.plccache import plcdb_hn2lb as site_hn2lb
agg.dns_short_status = 'Mismatch'
return agg
class ActionListWidget(widgets.Widget):
    """Widget for the action list; adds nothing to widgets.Widget itself.

    The template it renders with is supplied when the widget is constructed.
    """
+
class NodeWidget(widgets.Widget):
    """Widget subclass that adds nothing to widgets.Widget itself."""
for pcuid_key in pcus:
pcuquery += [pcus[pcuid_key]]
- return dict(sitequery=sitequery, pcuquery=pcuquery, nodequery=nodequery, actions=actions_list, since=since, exceptions=exceptions)
+ actionlist_widget = ActionListWidget(template='monitorweb.templates.actionlist_template')
+ return dict(sitequery=sitequery, pcuquery=pcuquery, nodequery=nodequery, actions=actions_list, actionlist_widget=actionlist_widget, since=since, exceptions=exceptions)
# TODO: add form validation
return dict(results=results)
@expose(template="monitorweb.templates.actionlist")
def actionlist(self, since=7, action_type=None, loginbase=None):
    """List ActionRecords created in the last `since` days, newest first.

    since       -- number of days to look back; anything that cannot be
                   converted with int() falls back to 7.
    action_type -- when given, only records with this action_type are kept.
    loginbase   -- when given, only records for this loginbase are kept.

    Both filters are optional and combine when both are supplied.  Returns
    the dict handed to the template: actions (a list), action_type, since.
    """
    try:
        since = int(since)
    except (TypeError, ValueError):
        # Request parameters arrive as text; ignore unusable values.
        since = 7

    acts_query = ActionRecord.query.filter(
        ActionRecord.date_created >= datetime.now() - timedelta(since)
    )
    if loginbase:
        acts_query = acts_query.filter_by(loginbase=loginbase)

    if action_type:
        acts_query = acts_query.filter(ActionRecord.action_type == action_type)

    acts = acts_query.order_by(ActionRecord.date_created.desc())

    return dict(actions=list(acts), action_type=action_type, since=since)
@cherrypy.expose()
def upload(self, log, **keywords):
    """Store an uploaded boot manager log and record an action for it.

    log      -- uploaded file field; its contents are read from log.file.
    keywords -- request parameters.  'hostname' names the node and is
                required; 'type' must be one of the accepted log types
                (currently only 'bm.log').

    Returns "" when no usable hostname is supplied, an explanatory string
    when the log type is missing or not accepted, and an empty dict once the
    log has been written under config.MONITOR_BOOTMANAGER_LOG and recorded.
    """
    logtype_list = ['bm.log', ]

    hostname = keywords.get('hostname')
    requested_type = keywords.get('type')
    logtype = None
    if requested_type in logtype_list:
        logtype = requested_type

    if not hostname:
        return ""

    # The hostname comes straight from the request and ends up in a file
    # name, so refuse anything that carries a directory component.
    hostname_text = "%s" % (hostname,)
    if os.path.basename(hostname_text) != hostname_text:
        return ""

    if not logtype:
        # List what is accepted rather than echoing request data back.
        return "unknown logtype: expected one of %s" % ", ".join(logtype_list)

    short_target_filename = bootman.bootmanager_log_name(hostname)
    abs_target_filename = os.path.join(config.MONITOR_BOOTMANAGER_LOG, short_target_filename)
    print("write data: %s" % abs_target_filename)
    util.file.dumpFile(abs_target_filename, log.file.read())
    bootman.bootmanager_log_action(hostname, short_target_filename, logtype)

    print("redirecting 3")

    return dict()
--- /dev/null
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<?python
# Widget template for the table of recent actions.
#
# Variables used: 'since' (number of days, shown in the heading) and
# 'actions' (a sequence of action records, or None).
#
# The "no recent actions" paragraph tests 'actions is not None' rather than
# the truth value of 'actions': an empty list is falsy, so a plain truth test
# would hide the message in exactly the case it is meant for.
#
# Every row emits the Errors cell, empty when there is no error string, so
# each row has the same number of cells as the header.
from monitor.util import diff_time
from time import mktime
from links import *
?>
<span xmlns:py="http://purl.org/kid/ns#"
      xmlns:mochi="http://www.mochi.org">
  <h4>Actions Over the Last ${since} Days</h4>
  <p py:if="actions is not None and len(actions) == 0">
    There are no recent actions taken for this site.
  </p>
  <table py:if="actions and len(actions) > 0" id="sortable_table" class="datagrid" border="1" width="100%">
    <thead>
      <tr>
        <th mochi:format="int"></th>
        <th>At</th>
        <th>MyOps acted on</th>
        <th>Using</th>
        <th>Message/Log</th>
        <th>Errors</th>
      </tr>
    </thead>
    <tbody>
      <tr py:for="i,act in enumerate(actions)" class="${i%2 and 'odd' or 'even'}" >
        <td></td>
        <td py:content="act.date_created"></td>
        <td py:if="act.hostname is not None" nowrap="true" >
          <a class="ext-link" href="${plc_node_uri(act.hostname)}">
            <span class="icon">${act.hostname}</span></a>
        </td>
        <td py:if="act.hostname is None" nowrap="true">
          <a class="ext-link" href="${plc_site_uri(act.loginbase)}">
            <span class="icon">${act.loginbase}</span></a>
        </td>
        <td py:content="act.action_type"></td>

        <td>
          <span py:if="act.message_id != 0">
            <a class="ext-link" href="${plc_mail_uri(act.message_id)}">
              <span class="icon">${act.message_id}</span></a>
          </span>
          <span py:if="act.message_id == 0 and act.log_path is not None">
            <a class="ext-link" href="/monitorlog/${act.log_path}">
              <span class="icon">orig bm log</span></a>
          </span>
        </td>
        <!--td py:if="'bootmanager' in act.action_type or 'unknown' in act.action_type">
          <a href="/monitorlog/bm.${act.hostname}.log">latest bm log</a>
        </td-->

        <td>
          <div py:if="act.error_string" id="links">
            <a class="info" href="#">Stack Trace<span>
              <pre>${act.error_string}</pre>
            </span>
            </a>
          </div>
        </td>
      </tr>
    </tbody>
  </table>
</span>
<div id="status_block" class="flash"
py:if="value_of('tg_flash', None)" py:content="tg_flash"></div>
- <h4>Actions Over the Last ${since} Days</h4>
- <p py:if="actions and len(actions) == 0">
- There are no recent actions taken for this site.
- </p>
- <table py:if="actions and len(actions) > 0" id="sortable_table" class="datagrid" border="1" width="100%">
- <thead>
- <tr>
- <th mochi:format="int"></th>
- <th>Date</th>
- <th>Action taken on</th>
- <th>Action Type</th>
- <th>Message ID</th>
- <th>Errors</th>
- </tr>
- </thead>
- <tbody>
- <tr py:for="i,act in enumerate(actions)" class="${i%2 and 'odd' or 'even'}" >
- <td></td>
- <td py:content="act.date_created"></td>
- <td py:if="act.hostname is not None" nowrap="true" >
- <a class="ext-link" href="${plc_node_uri(act.hostname)}">
- <span class="icon">${act.hostname}</span></a>
- </td>
- <td py:if="act.hostname is None" nowrap="true">
- <a class="ext-link" href="${plc_site_uri(act.loginbase)}">
- <span class="icon">${act.loginbase}</span></a>
- </td>
- <!--td py : content="diff_time(mktime(node.date_checked.timetuple()))"></td-->
- <td py:content="act.action_type"></td>
- <td>
- <span py:if="act.message_id != 0">
- <a class="ext-link" href="${plc_mail_uri(act.message_id)}"><span class="icon">${act.message_id}</span></a>
- </span>
- <span py:if="act.message_id == 0">
- <a py:if="'bootmanager' in act.action_type or 'unknown' in act.action_type" href="/monitorlog/bm.${act.hostname}.log">latest bm log</a>
- </span>
- </td>
- <td><pre py:content="act.error_string"></pre></td>
- </tr>
- </tbody>
- </table>
+ ${actionlist_widget.display(since=since, actions=actions)}
<!-- TODO: figure out how to make this conditional by model rather than port;
it is convenient to have links to ilo, drac, amt, etc.
<div id="status_block" class="flash"
py:if="value_of('tg_flash', None)" py:content="tg_flash"></div>
- <h4>Actions Over the Last ${since} Days</h4>
- <p py:if="actions and len(actions) == 0">
- There are no recent actions taken for this site.
- </p>
- <table py:if="actions and len(actions) > 0" id="sortable_table" class="datagrid" border="1" width="100%">
- <thead>
- <tr>
- <th mochi:format="int"></th>
- <th>Date</th>
- <th>Action taken on</th>
- <th>Action Type</th>
- <th>Message ID</th>
- <th>Errors</th>
- </tr>
- </thead>
- <tbody>
- <tr py:for="i,act in enumerate(actions)" class="${i%2 and 'odd' or 'even'}" >
- <td></td>
- <td py:content="act.date_created"></td>
- <td py:if="act.hostname is not None" nowrap="true" >
- <a class="ext-link" href="${plc_node_uri(act.hostname)}">
- <span class="icon">${act.hostname}</span></a>
- </td>
- <td py:if="act.hostname is None" nowrap="true">
- <a class="ext-link" href="${plc_site_uri(act.loginbase)}">
- <span class="icon">${act.loginbase}</span></a>
- </td>
- <!--td py : content="diff_time(mktime(node.date_checked.timetuple()))"></td-->
- <td py:content="act.action_type"></td>
- <td><a class="ext-link" href="${plc_mail_uri(act.message_id)}">
- <span py:if="act.message_id != 0" class="icon">${act.message_id}</span></a></td>
- <td py:if="'bootmanager' in act.action_type or 'unknown' in act.action_type">
- <a href="/monitorlog/bm.${act.hostname}.log">latest bm log</a>
- </td>
- <td py:if="'bootmanager' not in act.action_type">
- <pre py:content="act.error_string"></pre></td>
- </tr>
- </tbody>
- </table>
-
+ ${actionlist_widget.display(since=since, actions=actions)}
<!-- TODO: figure out how to make this conditional by model rather than port;
it is convenient to have links to ilo, drac, amt, etc.
regardless of whether the last PCU scan was successful. -->