From 0e3cb254ed858745809d57de80437d73aedc6eba Mon Sep 17 00:00:00 2001 From: Stephen Soltesz Date: Thu, 24 Sep 2009 21:37:46 +0000 Subject: [PATCH] added BootmanSequenceRecord to allow bootman sequence to be modified at runtime without source changes added init-bootman-sequence to initscripts for pre-defined sequences (from source) added get/setBootmanSequence(s) to xmlrpc API added traceroute to selectable values to return from query() deprecated pcuview() in favor of detailview() and simpleview() updated site links on actionlist to internal references added 'creator' to returned fields in rtsurvey and harvest_rt scripts refined node config directions in emailTxt added extra command to kill old processes before setting up bootman tunnel on node moved old sequence definitions to init-bootman-sequence.py. use DB sequences log files were failing to be copied, so make all scripts in automate-default end with "|| :" ; should figure out what's really going on. --- automate-default.sh | 9 +- init-bootman-sequence.py | 192 ++++++++++++++++++ monitor-server.init | 1 + monitor/bootman.py | 185 ++--------------- monitor/database/info/action.py | 5 + monitor/wrapper/emailTxt.py | 2 +- statistics/harvest_rt.py | 46 +++++ statistics/rtsurvey.py | 18 +- test-myops-xmlrpc.py | 8 + web/MonitorWeb/monitorweb/controllers.py | 10 +- web/MonitorWeb/monitorweb/monitor_xmlrpc.py | 49 +++++ .../monitorweb/templates/actionlist.kid | 4 +- 12 files changed, 349 insertions(+), 180 deletions(-) create mode 100755 init-bootman-sequence.py create mode 100755 statistics/harvest_rt.py create mode 100755 test-myops-xmlrpc.py diff --git a/automate-default.sh b/automate-default.sh index 8d300a7..9f0f9d1 100755 --- a/automate-default.sh +++ b/automate-default.sh @@ -74,10 +74,11 @@ ps ax | grep BatchMode | grep -v grep | awk '{print $1}' | xargs -r kill || : # clean up stray 'locfg' processes that hang around inappropriately... 
ps ax | grep locfg | grep -v grep | awk '{print $1}' | xargs -r kill || : -${MONITOR_SCRIPT_ROOT}/policy.py $DATE -#${MONITOR_SCRIPT_ROOT}/checksync.py $DATE || : -service plc restart monitor -curl -s 'http://summer.cs.princeton.edu/status/tabulator.cgi?table=table_nodeview&formatcsv' > /var/lib/monitor/comon/$DATE.comon.csv + +${MONITOR_SCRIPT_ROOT}/policy.py $DATE || : +${MONITOR_SCRIPT_ROOT}/statistics/add-record.py || : +curl -s 'http://summer.cs.princeton.edu/status/tabulator.cgi?table=table_nodeview&formatcsv' > /var/lib/monitor/comon/$DATE.comon.csv || : cp ${MONITOR_SCRIPT_ROOT}/monitor.log ${MONITOR_ARCHIVE_ROOT}/`date +%F-%H:%M`.monitor.log +service plc restart monitor || : rm -f $MONITOR_PID diff --git a/init-bootman-sequence.py b/init-bootman-sequence.py new file mode 100755 index 0000000..2432f19 --- /dev/null +++ b/init-bootman-sequence.py @@ -0,0 +1,192 @@ +#!/usr/bin/python +from monitor.database.info.model import * + +def getSequences(): + + # TODO: This can be replaced with a DB definition at a future time. + # This would make it possible for an admin to introduce new + # patterns without touching code. 
+ + sequences = {} + # restart_bootmanager_boot + for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done", + + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done", + + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-exception-protoerror-protoerror2-protoerror-protoerror2-debug-validate-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-protoerror-debug-validate-done", + "bminit-cfg-auth-protoerror-exception-update-debug-validate-exception-done", + "bminit-cfg-auth-getplc-update-debug-done", + "bminit-cfg-auth-protoerror2-debug-done", + "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done", + "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done", + "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done", + "bminit-cfg-auth-protoerror-exception-update-debug-done", + "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done", + "bminit-cfg-auth-getplc-implementerror-update-debug-done", + "bminit-cfg-auth-authfail2-protoerror2-debug-done", + ]: + sequences.update({n : "restart_bootmanager_boot"}) + + # conn.restart_bootmanager('reinstall') + for n in [ 
"bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done", + "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done", + "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done", + "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done", + "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done", + "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done", + "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done", + 
"bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done", + # actual solution appears to involve removing the bad files, and + # continually trying to boot the node. + "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-validate-bmexceptvgscan-done", + "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-validate-done", + "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-validate-bmexceptvgscan-done", + "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-debug-validate-bmexceptvgscan-done", + "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-debug-validate-bmexceptvgscan-done", + "bminit-cfg-auth-getplc-update-installinit-validate-exception-missingkernel-debug-validate-done", + ]: + sequences.update({n : "restart_bootmanager_rins"}) + + # repair_node_keys + for n in ["bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-validate-exception-done", + "bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done", + "bminit-cfg-auth-bootcheckfail-authfail-exception-update-debug-validate-exception-done", + "bminit-cfg-auth-bootcheckfail-authfail-exception-authfail-debug-validate-exception-done", + "bminit-cfg-auth-authfail-debug-done", + "bminit-cfg-auth-authfail2-authfail-debug-done", + ]: + sequences.update({n: "repair_node_keys"}) + + # conn.restart_node('reinstall') + for n in 
["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done", + "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done", + "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done", + "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done", + "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done", + ]: + sequences.update({n : "restart_node_rins"}) + + # restart_node_boot + for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done", + "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done", + "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done", + "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done", + "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done", + 
"bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done", + "bminit-cfg-auth-getplc-exception-update-bootupdatefail-debug-done", + ]: + sequences.update({n: "restart_node_boot"}) + + # fsck_repair + for n in ["bminit-cfg-auth-getplc-update-installinit-validate-fsckabort-exception-fsckfail-bmexceptmount-exception-noinstall-update-debug-validate-fsckabort-exception-fsckfail-bmexceptmount-done", + "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-noinstall-update-debug-validate-exception-fsckfail-done", + "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail-exception-noinstall-update-debug-validate-exception-fsckfail-done", + "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail2-exception-noinstall-update-debug-validate-exception-fsckfail2-done", + "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail2-exception-debug-validate-done", + "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail2-exception-debug-validate-exception-fsckfail2-done", + "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail2-exception-debug-validate-exception-fsckfail-done", + "bminit-cfg-auth-getplc-update-installinit-validate-fsckabort-exception-fsckfail-exception-debug-validate-fsckabort-exception-fsckfail-done", + "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail2-exception-debug-validate-exception-fsckfail2-done", + "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-debug-validate-exception-fsckfail2-done", + "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-debug-validate-exception-fsckfail-done", + "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-debug-validate-done", + "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail-exception-debug-validate-exception-fsckfail-done", + "bminit-cfg-auth-getplc-update-debug-validate-exception-fsckfail-done", + ]: + 
sequences.update({n : "fsck_repair"}) + + # nodeconfig_notice + for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done", + "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done", + "bminit-cfg-exception-update-bootupdatefail-nonode-debug-validate-exception-done", + "bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-validate-exception-done", + "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done", + "bminit-cfg-exception-noconfig-nonode-debug-validate-exception-done", + "bminit-cfg-exception-noconfig-update-debug-validate-exception-done", + "bminit-cfg-exception-noparseconfig-debug-validate-exception-done", + "bminit-cfg-exception-noconfig-debug-validate-exception-done", + "bminit-cfg-auth-authfail2-nonode-debug-done", + ]: + sequences.update({n : "nodeconfig_notice"}) + + for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", + "bminit-cfg-update-exception-nodehostname-update-debug-validate-exception-done", + "bminit-cfg-update-exception-nodehostname-update-debug-done", + "bminit-cfg-exception-nodehostname-debug-validate-bmexceptvgscan-done", + "bminit-cfg-exception-nodehostname-debug-validate-exception-done", + ]: + sequences.update({n : "nodenetwork_email"}) + + # noblockdevice_notice + for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done", + "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-validate-bmexceptvgscan-done", + "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done", + "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done", + "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done", + "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-debug-validate-bmexceptvgscan-done", + 
"bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-debug-validate-bmexceptvgscan-done", + ]: + sequences.update({n : "noblockdevice_notice"}) + + # update_bootcd_email + for n in [ "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done", + ]: + sequences.update({n : "update_bootcd_email"}) + + for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done", + ]: + sequences.update({n: "unknownsequence_notice"}) + + # minimalhardware_notice + for n in [ "bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done", + "bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done", + "bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-debug-validate-bmexceptvgscan-done", + "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-debug-validate-bmexceptvgscan-done", + ]: + sequences.update({n: "minimalhardware_notice"}) + + # baddisk_notice + sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "baddisk_notice"}) + + # baddns_notice + for n in [ + "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done", + "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done", + ]: + sequences.update( { n : "baddns_notice"}) + + return sequences + +sequences = getSequences() + +for s in sequences: + bms = BootmanSequenceRecord.get_by(sequence=s) + if not bms: + bms = BootmanSequenceRecord(sequence=s, action=sequences[s]) + bms.flush() + +session.flush() diff --git a/monitor-server.init b/monitor-server.init index 265a803..8c26416 100644 --- a/monitor-server.init +++ b/monitor-server.init @@ -44,6 +44,7 @@ function check_monitor_schema_and_data() { # NOTE: call create_all() to setup the database from the info model. 
python -c "from monitor.database.info.model import *; from elixir import create_all; create_all()" + $MONITORPATH/init-bootman-sequence.py } function check_monitor_conf () diff --git a/monitor/bootman.py b/monitor/bootman.py index 3053636..b7ec58c 100755 --- a/monitor/bootman.py +++ b/monitor/bootman.py @@ -64,11 +64,13 @@ class ExceptionDoubleSSHError(Exception): pass class NodeConnection: def __init__(self, connection, node, config): + print "init nodeconnection" self.node = node self.c = connection self.config = config def get_boot_state(self): + print "get_boot_state(self)" try: if self.c.modules.os.path.exists('/tmp/source'): return "debug" @@ -262,7 +264,10 @@ class PlanetLabSession: def get_connection(self, config): try: - conn = NodeConnection(SocketConnection("localhost", self.port), self.node, config) + print "SocketConnection(localhost, %s" % self.port + sc = SocketConnection("localhost", self.port) + print "NodeConnection(%s, %s)" % (sc, self.node) + conn = NodeConnection(sc, self.node, config) except: # NOTE: try twice since this can sometimes fail the first time. If # it fails again, let it go. @@ -314,6 +319,7 @@ class PlanetLabSession: (ov,ev) = ssh.run_noexcept2("""<<\EOF rm -f out.log echo "kill server" >> out.log + netstat -ap | grep python | grep 18812 | awk '{print $7}' | awk -F / '{print $1}' | xargs kill ps ax | grep Rpyc | grep -v grep | awk '{print $1}' | xargs kill 2> /dev/null ; echo "export" >> out.log export PYTHONPATH=$HOME ; @@ -400,22 +406,27 @@ class DebugInterface: email_exception(msg) return False + print "Getting connection: 1st try" try: conn = self.session.get_connection(config) except EOFError: # NOTE: sometimes the wait in setup_host() is not long enough. # So, here we try to wait a little longer before giving up entirely. try: + print "Getting connection: 2nd try" time.sleep(self.session.timeout*5) conn = self.session.get_connection(config) except EOFError: # failed twice... 
no need to report this really, it's just in a # weird state... + print "Getting connection: failed" + email_exception(self.hostname, "failed twice to get connection") return False except: traceback.print_exc() email_exception(self.hostname) return False + print "Getting connection: ok" #print "trying to use conn before returning it." #print conn.c.modules.sys.path #print conn.c.modules.os.path.exists('/tmp/source') @@ -426,172 +437,14 @@ class DebugInterface: def getSequences(self): - # TODO: This can be replaced with a DB definition at a future time. - # This would make it possible for an admin to introduce new - # patterns without touching code. - + # NOTE: The DB is now the authoritative record for all BM sequences. + # An admin can introduce new patterns and actions without touching code. sequences = {} - # restart_bootmanager_boot - for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done", - - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done", - - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-protoerror-debug-validate-done", - "bminit-cfg-auth-protoerror-exception-update-debug-validate-exception-done", - "bminit-cfg-auth-getplc-update-debug-done", - "bminit-cfg-auth-protoerror2-debug-done", - 
"bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done", - "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done", - "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done", - "bminit-cfg-auth-protoerror-exception-update-debug-done", - "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done", - "bminit-cfg-auth-getplc-implementerror-update-debug-done", - "bminit-cfg-auth-authfail2-protoerror2-debug-done", - ]: - sequences.update({n : "restart_bootmanager_boot"}) - - # conn.restart_bootmanager('reinstall') - for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done", - "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done", - "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done", - "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done", - "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done", - 
"bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done", - "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done", - "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done", - # actual solution appears to involve removing the bad files, and - # continually trying to boot the node. - "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-validate-bmexceptvgscan-done", - "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-validate-done", - "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-validate-bmexceptvgscan-done", - "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-debug-validate-bmexceptvgscan-done", - "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-debug-validate-bmexceptvgscan-done", - ]: - sequences.update({n : "restart_bootmanager_rins"}) - - # repair_node_keys - for n in ["bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-validate-exception-done", - 
"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done", - "bminit-cfg-auth-bootcheckfail-authfail-exception-update-debug-validate-exception-done", - "bminit-cfg-auth-bootcheckfail-authfail-exception-authfail-debug-validate-exception-done", - "bminit-cfg-auth-authfail-debug-done", - "bminit-cfg-auth-authfail2-authfail-debug-done", - ]: - sequences.update({n: "repair_node_keys"}) - - # conn.restart_node('reinstall') - for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done", - "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done", - "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done", - "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done", - "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done", - ]: - sequences.update({n : "restart_node_rins"}) - - # restart_node_boot - for n in 
["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done", - "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done", - "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done", - "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done", - "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done", - "bminit-cfg-auth-getplc-exception-update-bootupdatefail-debug-done", - ]: - sequences.update({n: "restart_node_boot"}) - - # fsck_repair - for n in ["bminit-cfg-auth-getplc-update-installinit-validate-fsckabort-exception-fsckfail-bmexceptmount-exception-noinstall-update-debug-validate-fsckabort-exception-fsckfail-bmexceptmount-done", - "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-noinstall-update-debug-validate-exception-fsckfail-done", - "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail-exception-noinstall-update-debug-validate-exception-fsckfail-done", - "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail2-exception-noinstall-update-debug-validate-exception-fsckfail2-done", - "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail2-exception-debug-validate-done", - "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail2-exception-debug-validate-exception-fsckfail2-done", - "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail2-exception-debug-validate-exception-fsckfail-done", - "bminit-cfg-auth-getplc-update-installinit-validate-fsckabort-exception-fsckfail-exception-debug-validate-fsckabort-exception-fsckfail-done", - "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail2-exception-debug-validate-exception-fsckfail2-done", - 
"bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-debug-validate-exception-fsckfail2-done", - "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-debug-validate-exception-fsckfail-done", - "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-debug-validate-done", - "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail-exception-debug-validate-exception-fsckfail-done", - "bminit-cfg-auth-getplc-update-debug-validate-exception-fsckfail-done", - ]: - sequences.update({n : "fsck_repair"}) - - # nodeconfig_notice - for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done", - "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done", - "bminit-cfg-exception-update-bootupdatefail-nonode-debug-validate-exception-done", - "bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-validate-exception-done", - "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done", - "bminit-cfg-exception-noconfig-nonode-debug-validate-exception-done", - "bminit-cfg-exception-noconfig-update-debug-validate-exception-done", - ]: - sequences.update({n : "nodeconfig_notice"}) - - for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", - "bminit-cfg-update-exception-nodehostname-update-debug-validate-exception-done", - "bminit-cfg-update-exception-nodehostname-update-debug-done", - "bminit-cfg-exception-nodehostname-debug-validate-exception-done", - ]: - sequences.update({n : "nodenetwork_email"}) - - # noblockdevice_notice - for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done", - "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-validate-bmexceptvgscan-done", - "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done", - 
"bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done", - "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done", - "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-debug-validate-bmexceptvgscan-done", - "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-debug-validate-bmexceptvgscan-done", - ]: - sequences.update({n : "noblockdevice_notice"}) - - # update_bootcd_email - for n in [ "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done", - ]: - sequences.update({n : "update_bootcd_email"}) - - for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done", - ]: - sequences.update({n: "unknownsequence_notice"}) - - # minimalhardware_notice - sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "minimalhardware_notice"}) - sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "minimalhardware_notice"}) - - # baddisk_notice - sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "baddisk_notice"}) - - # baddns_notice - for n in [ - "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done", - "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done", - ]: - sequences.update( { n : "baddns_notice"}) + bms = BootmanSequenceRecord.query.all() + for s in bms: + sequences[s.sequence] = s.action + return sequences def getDiskSteps(self): @@ -681,7 +534,9 @@ class DebugInterface: ('noinstall' , 'notinstalled'), ('bziperror' , 'bzip2: Data integrity error when decompressing.'), ('noblockdev' , "No block devices detected."), + ('missingkernel', "missingkernel"), 
('dnserror' , 'Name or service not known'), + ('noparseconfig', "Found configuration file plnode.txt on floppy, but was unable to parse it"), ('noconfig' , "Unable to find and read a node configuration file"), ('downloadfail' , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'), ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'), diff --git a/monitor/database/info/action.py b/monitor/database/info/action.py index 6554de8..704f733 100644 --- a/monitor/database/info/action.py +++ b/monitor/database/info/action.py @@ -77,6 +77,11 @@ class BlacklistRecord(Entity): else: return self.date_created + timedelta(0, self.expires) +class BootmanSequenceRecord(Entity): + sequence = Field(String, primary_key=True, default=None) + action = Field(String, default=None) + date_created = Field(DateTime,default=datetime.now) + class ActionRecord(Entity): @classmethod def get_latest_by(cls, **kwargs): diff --git a/monitor/wrapper/emailTxt.py b/monitor/wrapper/emailTxt.py index 6cd1907..7f0f86f 100644 --- a/monitor/wrapper/emailTxt.py +++ b/monitor/wrapper/emailTxt.py @@ -353,7 +353,7 @@ Either our boot scripts cannot find it because the boot media is corrupted, or i Then double check the network settings for your host. -Then, select, "Download -> Download ISO image for %(hostname)s" menu. This will generate a new All-in-one BootImage file for your node. Copy this file to the appropriate read-only media, and reboot the machine. +If the network settings are correct, then, select, "Download -> Download image for %(hostname)s" menu. This will generate a new All-in-one BootImage file for your node. Copy this file to the appropriate read-only media, and reboot the machine. There is no need to respond to this message. If you're able to update the boot image without difficulty and your node returns to normal operation, please accept our thanks. 
However, if there are any console messages relating to the node's failure, please report them to PlanetLab support (%(support_email)s) so we can help resolve the issue. diff --git a/statistics/harvest_rt.py b/statistics/harvest_rt.py new file mode 100755 index 0000000..f3940e0 --- /dev/null +++ b/statistics/harvest_rt.py @@ -0,0 +1,46 @@ +#!/usr/bin/python + +import os +import time +from datetime import datetime, timedelta +import sys + +def popen(cmdstr): + f = os.popen(cmdstr) + ret = f.read() + return ret + +def datetime_fromstr(str): + if '-' in str: + try: + tup = time.strptime(str, "%Y-%m-%d") + except: + tup = time.strptime(str, "%Y-%m-%d-%H:%M") + elif '/' in str: + tup = time.strptime(str, "%m/%d/%Y") + else: + tup = time.strptime(str, "%m/%d/%Y") + ret = datetime.fromtimestamp(time.mktime(tup)) + return ret + + +def main(): + queue = sys.argv[1] + d1 = datetime_fromstr(sys.argv[2]) + iterations = int(sys.argv[3]) + i = 0 + while i < iterations: + d1_s = d1.strftime("%Y-%m-%d") + d2 = d1 + timedelta(30) + d2_s = d2.strftime("%Y-%m-%d") + query = "Queue='%s' and " % queue + query = query + "Told > '%s' and Told < '%s'" % (d1_s, d2_s) + cmd = """rt ls -t ticket "%s" | grep -v "No matching" | wc -l """ % query + print cmd + ret = popen(cmd) + print d1_s, ",", ret[:-1] + d1=d2 + i += 1 + +if __name__ == "__main__": + main() diff --git a/statistics/rtsurvey.py b/statistics/rtsurvey.py index 2f2babd..c89af56 100755 --- a/statistics/rtsurvey.py +++ b/statistics/rtsurvey.py @@ -59,10 +59,14 @@ def get_rt_tickets(): return "" sql = """SELECT tk.id, tk.Queue, tr.Type, tr.Field, tr.OldValue, tr.NewValue, - tr.Created, at.id, at.Subject, at.Content - FROM Tickets as tk, Transactions as tr + tr.Created, at.id, at.Subject, at.Content, us.Name + FROM Tickets as tk, Transactions as tr, Users as us LEFT OUTER JOIN Attachments as at ON tr.id=at.TransactionId - WHERE (tk.Queue=3 OR tk.Queue=22) AND tk.id=tr.ObjectId AND tk.id>10000 """ + WHERE (tk.Queue=22) AND 
tk.id=tr.ObjectId AND tk.id>40800 AND + us.id=tr.Creator""" + #WHERE (tk.Queue=22) AND tk.id=tr.ObjectId AND tk.id>39896 AND tk.id<42241 AND ## (oct15th2008) + #WHERE (tk.Queue=22) AND tk.id=tr.ObjectId AND tk.id>40800 AND ## (1st3months) + #WHERE (tk.Queue=3 OR tk.Queue=22) AND tk.id=tr.ObjectId AND tk.id>10000 """ print "run query" raw = fetch_from_db(db, sql) @@ -82,6 +86,7 @@ def get_rt_tickets(): attachmentid = x[7] subject = x[8] content = x[9] + creator = x[10] if ticket_id not in tickets: print "found new ticket_id", ticket_id @@ -104,6 +109,7 @@ def get_rt_tickets(): 'newvalue' : newvalue, 'datecreated' : datecreated, 'attachmentid' : attachmentid, + 'creator' : creator, 'subject' : subject, 'content' : content, } @@ -113,8 +119,6 @@ def get_rt_tickets(): print "sort data" list = map(parse_ticket, raw) - # map(lambda x: { "email":str(x[4]), "lastupdated":str(x[5]), "owner":str(x[7]), }, raw) - db.close() @@ -212,9 +216,9 @@ def main(): else: print "loading" tickets = database.dbLoad("survey_tickets") - print tickets[42171]['transactions'][0] + #print tickets[42171]['transactions'][0] - sort_tickets(tickets, re_map) + #sort_tickets(tickets, re_map) # for each ticket id # scan for known keywords and sort into classes diff --git a/test-myops-xmlrpc.py b/test-myops-xmlrpc.py new file mode 100755 index 0000000..ed9ad3c --- /dev/null +++ b/test-myops-xmlrpc.py @@ -0,0 +1,8 @@ +#!/usr/bin/python + +from monitor.wrapper import plc + +api = plc.getAPI("https://monitor.planet-lab.org/monitor/XMLRPC") +print api.upAndRunning() +print api.setBootmanSequence(plc.api.auth, "test-sequence", "value3") +print api.getBootmanSequences(plc.api.auth) diff --git a/web/MonitorWeb/monitorweb/controllers.py b/web/MonitorWeb/monitorweb/controllers.py index c46dc42..5ac368b 100644 --- a/web/MonitorWeb/monitorweb/controllers.py +++ b/web/MonitorWeb/monitorweb/controllers.py @@ -55,6 +55,7 @@ class NodeQueryFields(widgets.WidgetsList): bootcd_version = widgets.CheckBox(label="BootCD") 
observed_status = widgets.CheckBox(label="Observed Status") uptime = widgets.CheckBox(label="Uptime") + traceroute = widgets.CheckBox(label="Traceroute") port_status = widgets.CheckBox(label="Port Status") rpms = widgets.CheckBox(label="RPM") rpmvalue = widgets.TextField(label="RPM Pattern") @@ -339,6 +340,8 @@ class Root(controllers.RootController, MonitorXmlrpcServer): agg.update(agg['plc_node_stats']) if agg['kernel_version']: agg['kernel_version'] = agg['kernel_version'].split()[2] + if 'traceroute' in data and agg['traceroute']: + agg['traceroute'] = "
" + agg['traceroute'] + "
" if 'rpmvalue' in data and 'rpms' in data: if agg['rpms']: rpm_list = agg['rpms'].split() @@ -489,10 +492,15 @@ class Root(controllers.RootController, MonitorXmlrpcServer): def simpleview(self, **data): return self.pre_view(**data) + @expose(template="monitorweb.templates.simpleview") + def pcuview(self, **data): + return self.pre_view(**data) + @expose(template="monitorweb.templates.detailview") def detailview(self, **data): return self.pre_view(**data) + def pre_view(self, **data): session.flush(); session.clear() @@ -594,7 +602,7 @@ class Root(controllers.RootController, MonitorXmlrpcServer): # TODO: add form validation @expose(template="monitorweb.templates.pcuview") @exception_handler(nodeaction_handler,"isinstance(tg_exceptions,RuntimeError)") - def pcuview(self, loginbase=None, pcuid=None, hostname=None, since=20, **data): + def pcuviewold(self, loginbase=None, pcuid=None, hostname=None, since=20, **data): session.flush(); session.clear() sitequery=[] pcuquery=[] diff --git a/web/MonitorWeb/monitorweb/monitor_xmlrpc.py b/web/MonitorWeb/monitorweb/monitor_xmlrpc.py index 08f71bc..2f46879 100644 --- a/web/MonitorWeb/monitorweb/monitor_xmlrpc.py +++ b/web/MonitorWeb/monitorweb/monitor_xmlrpc.py @@ -4,6 +4,7 @@ import cherrypy import turbogears from datetime import datetime, timedelta import time +from monitor.wrapper import plc try: from monitor.database.info.model import * @@ -127,6 +128,54 @@ class MonitorXmlrpcServer(object): and running before trying any more sophisticated operations. 
""" return True + # BOOTMAN SEQUENCE ------------------------------------------------------------ + + @cherrypy.expose + @export_to_docbook(roles=['admin'], + accepts=[Parameter(dict, "Auth struct"), + Parameter(str, "The bootman sequence returned by MyOps"), + Parameter(str, "The action string that identifies what to do when this sequence occurs")], + returns=Parameter(bool, 'True on success.')) + def setBootmanSequence(self, auth, sequence, action): + """ Using this call, you can set a new sequence to identify an Unknown + Error sequence returned by MyOps and associate it with a pre-defined + action, (i.e. reboot, reinstall, or others). Please see the + documentation for automated actions to see a list of supported + actions. """ + api = plc.getAuthAPI() + api.auth = auth + if api.AuthCheck(): + bms = BootmanSequenceRecord.get_by(sequence=sequence) + if not bms: + bms = BootmanSequenceRecord(sequence=sequence, action=action) + else: + bms.action = action + + bms.flush() + return True + else: + return False + + @cherrypy.expose + @export_to_docbook(roles=['admin'], + accepts=[Parameter(dict, "Auth struct")], + returns=Parameter(list, 'Array of bootman sequences')) + def getBootmanSequences(self, auth): + """ Using this call, you can learn all currently defined bootman + sequences and their associated actions. 
""" + api = plc.getAuthAPI() + api.auth = auth + if api.AuthCheck(): + ret_list = [] + bms = BootmanSequenceRecord.query.all() + for q in bms: + d = q.to_dict() + d = convert_datetime(d, ['date_created']) + ret_list.append(d) + return ret_list + else: + return [] + # SITES ------------------------------------------------------------ @cherrypy.expose diff --git a/web/MonitorWeb/monitorweb/templates/actionlist.kid b/web/MonitorWeb/monitorweb/templates/actionlist.kid index caf993a..2d2aaa9 100644 --- a/web/MonitorWeb/monitorweb/templates/actionlist.kid +++ b/web/MonitorWeb/monitorweb/templates/actionlist.kid @@ -50,8 +50,8 @@ def zabbix_event_ack_link(eventid): ${act.hostname} - - ${act.loginbase} + + ${act.loginbase} -- 2.43.0