# clean up stray 'locfg' processes that hang around inappropriately...
ps ax | grep locfg | grep -v grep | awk '{print $1}' | xargs -r kill || :
-${MONITOR_SCRIPT_ROOT}/policy.py $DATE
-#${MONITOR_SCRIPT_ROOT}/checksync.py $DATE || :
-service plc restart monitor
-curl -s 'http://summer.cs.princeton.edu/status/tabulator.cgi?table=table_nodeview&formatcsv' > /var/lib/monitor/comon/$DATE.comon.csv
+
+${MONITOR_SCRIPT_ROOT}/policy.py $DATE || :
+${MONITOR_SCRIPT_ROOT}/statistics/add-record.py || :
+curl -s 'http://summer.cs.princeton.edu/status/tabulator.cgi?table=table_nodeview&formatcsv' > /var/lib/monitor/comon/$DATE.comon.csv || :
cp ${MONITOR_SCRIPT_ROOT}/monitor.log ${MONITOR_ARCHIVE_ROOT}/`date +%F-%H:%M`.monitor.log
+service plc restart monitor || :
rm -f $MONITOR_PID
--- /dev/null
+#!/usr/bin/python
+from monitor.database.info.model import *
+
+def getSequences():
+ """Return the built-in table of known boot-manager sequences.
+
+ The result is a dict. Each key is a dash-joined string of step names
+ (for example "bminit-cfg-auth-getplc-update-debug-done") and each value
+ is the name of the action assigned to that sequence (for example
+ "restart_bootmanager_boot", "fsck_repair" or "baddns_notice").
+
+ The table is filled group by group with dict.update(), so a sequence
+ listed in more than one group ends up with the action of the last
+ group that lists it.
+ """
+
+ # TODO: This can be replaced with a DB definition at a future time.
+ # This would make it possible for an admin to introduce new
+ # patterns without touching code.
+
+ sequences = {}
+ # restart_bootmanager_boot
+ for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
+ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
+ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done",
+
+ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done",
+
+ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-exception-protoerror-protoerror2-protoerror-protoerror2-debug-validate-done",
+ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
+ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
+ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-protoerror-debug-validate-done",
+ "bminit-cfg-auth-protoerror-exception-update-debug-validate-exception-done",
+ "bminit-cfg-auth-getplc-update-debug-done",
+ "bminit-cfg-auth-protoerror2-debug-done",
+ "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
+ "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
+ "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
+ "bminit-cfg-auth-protoerror-exception-update-debug-done",
+ "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done",
+ "bminit-cfg-auth-getplc-implementerror-update-debug-done",
+ "bminit-cfg-auth-authfail2-protoerror2-debug-done",
+ ]:
+ sequences.update({n : "restart_bootmanager_boot"})
+
+ # conn.restart_bootmanager('reinstall')
+ for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
+ "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
+ "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
+ "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
+ "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
+ "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done",
+ "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done",
+ "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
+ "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
+ "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
+ "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
+ "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
+ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
+ "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done",
+ "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
+ "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
+ "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
+ # actual solution appears to involve removing the bad files, and
+ # continually trying to boot the node.
+ "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
+ "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
+ "bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
+ "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-validate-bmexceptvgscan-done",
+ "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-validate-done",
+ "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-validate-bmexceptvgscan-done",
+ "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-debug-validate-bmexceptvgscan-done",
+ "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-debug-validate-bmexceptvgscan-done",
+ "bminit-cfg-auth-getplc-update-installinit-validate-exception-missingkernel-debug-validate-done",
+ ]:
+ sequences.update({n : "restart_bootmanager_rins"})
+
+ # repair_node_keys
+ for n in ["bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-validate-exception-done",
+ "bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done",
+ "bminit-cfg-auth-bootcheckfail-authfail-exception-update-debug-validate-exception-done",
+ "bminit-cfg-auth-bootcheckfail-authfail-exception-authfail-debug-validate-exception-done",
+ "bminit-cfg-auth-authfail-debug-done",
+ "bminit-cfg-auth-authfail2-authfail-debug-done",
+ ]:
+ sequences.update({n: "repair_node_keys"})
+
+ # conn.restart_node('reinstall')
+ for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
+ "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
+ "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
+ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done",
+ "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
+ "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
+ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done",
+ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done",
+ "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done",
+ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
+ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done",
+ "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
+ ]:
+ sequences.update({n : "restart_node_rins"})
+
+ # restart_node_boot
+ for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done",
+ "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done",
+ "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
+ "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
+ "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
+ "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done",
+ "bminit-cfg-auth-getplc-exception-update-bootupdatefail-debug-done",
+ ]:
+ sequences.update({n: "restart_node_boot"})
+
+ # fsck_repair
+ for n in ["bminit-cfg-auth-getplc-update-installinit-validate-fsckabort-exception-fsckfail-bmexceptmount-exception-noinstall-update-debug-validate-fsckabort-exception-fsckfail-bmexceptmount-done",
+ "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-noinstall-update-debug-validate-exception-fsckfail-done",
+ "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail-exception-noinstall-update-debug-validate-exception-fsckfail-done",
+ "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail2-exception-noinstall-update-debug-validate-exception-fsckfail2-done",
+ "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail2-exception-debug-validate-done",
+ "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail2-exception-debug-validate-exception-fsckfail2-done",
+ "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail2-exception-debug-validate-exception-fsckfail-done",
+ "bminit-cfg-auth-getplc-update-installinit-validate-fsckabort-exception-fsckfail-exception-debug-validate-fsckabort-exception-fsckfail-done",
+ "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail2-exception-debug-validate-exception-fsckfail2-done",
+ "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-debug-validate-exception-fsckfail2-done",
+ "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-debug-validate-exception-fsckfail-done",
+ "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-debug-validate-done",
+ "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail-exception-debug-validate-exception-fsckfail-done",
+ "bminit-cfg-auth-getplc-update-debug-validate-exception-fsckfail-done",
+ ]:
+ sequences.update({n : "fsck_repair"})
+
+ # nodeconfig_notice
+ for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
+ "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
+ "bminit-cfg-exception-update-bootupdatefail-nonode-debug-validate-exception-done",
+ "bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-validate-exception-done",
+ "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
+ "bminit-cfg-exception-noconfig-nonode-debug-validate-exception-done",
+ "bminit-cfg-exception-noconfig-update-debug-validate-exception-done",
+ "bminit-cfg-exception-noparseconfig-debug-validate-exception-done",
+ "bminit-cfg-exception-noconfig-debug-validate-exception-done",
+ "bminit-cfg-auth-authfail2-nonode-debug-done",
+ ]:
+ sequences.update({n : "nodeconfig_notice"})
+
+ for n in [ "bminit-cfg-exception-nodehostname-update-debug-done",
+ "bminit-cfg-update-exception-nodehostname-update-debug-validate-exception-done",
+ "bminit-cfg-update-exception-nodehostname-update-debug-done",
+ "bminit-cfg-exception-nodehostname-debug-validate-bmexceptvgscan-done",
+ "bminit-cfg-exception-nodehostname-debug-validate-exception-done",
+ ]:
+ sequences.update({n : "nodenetwork_email"})
+
+ # noblockdevice_notice
+ for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
+ "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-validate-bmexceptvgscan-done",
+ "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
+ "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
+ "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
+ "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-debug-validate-bmexceptvgscan-done",
+ "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-debug-validate-bmexceptvgscan-done",
+ ]:
+ sequences.update({n : "noblockdevice_notice"})
+
+ # update_bootcd_email
+ for n in [ "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
+ ]:
+ sequences.update({n : "update_bootcd_email"})
+
+ for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
+ ]:
+ sequences.update({n: "unknownsequence_notice"})
+
+ # minimalhardware_notice
+ for n in [ "bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done",
+ "bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done",
+ "bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-debug-validate-bmexceptvgscan-done",
+ "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-debug-validate-bmexceptvgscan-done",
+ ]:
+ sequences.update({n: "minimalhardware_notice"})
+
+ # baddisk_notice
+ sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "baddisk_notice"})
+
+ # baddns_notice
+ for n in [
+ "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
+ "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
+ ]:
+ sequences.update( { n : "baddns_notice"})
+
+ return sequences
+
+# Seed the BootmanSequenceRecord table from the built-in table returned by
+# getSequences().
+sequences = getSequences()
+
+for s in sequences:
+ # Look the sequence up first: a new record is only created when none exists,
+ # so an action already stored for a sequence is not overwritten here.
+ bms = BootmanSequenceRecord.get_by(sequence=s)
+ if not bms:
+ bms = BootmanSequenceRecord(sequence=s, action=sequences[s])
+ bms.flush()
+
+# Flush the session once more after the loop.
+session.flush()
{
# NOTE: call create_all() to setup the database from the info model.
python -c "from monitor.database.info.model import *; from elixir import create_all; create_all()"
+ $MONITORPATH/init-bootman-sequence.py
}
function check_monitor_conf ()
class NodeConnection:
def __init__(self, connection, node, config):
+ print "init nodeconnection"
self.node = node
self.c = connection
self.config = config
def get_boot_state(self):
+ print "get_boot_state(self)"
try:
if self.c.modules.os.path.exists('/tmp/source'):
return "debug"
def get_connection(self, config):
try:
- conn = NodeConnection(SocketConnection("localhost", self.port), self.node, config)
+ print "SocketConnection(localhost, %s" % self.port
+ sc = SocketConnection("localhost", self.port)
+ print "NodeConnection(%s, %s)" % (sc, self.node)
+ conn = NodeConnection(sc, self.node, config)
except:
# NOTE: try twice since this can sometimes fail the first time. If
# it fails again, let it go.
(ov,ev) = ssh.run_noexcept2("""<<\EOF
rm -f out.log
echo "kill server" >> out.log
+ netstat -ap | grep python | grep 18812 | awk '{print $7}' | awk -F / '{print $1}' | xargs kill
ps ax | grep Rpyc | grep -v grep | awk '{print $1}' | xargs kill 2> /dev/null ;
echo "export" >> out.log
export PYTHONPATH=$HOME ;
email_exception(msg)
return False
+ print "Getting connection: 1st try"
try:
conn = self.session.get_connection(config)
except EOFError:
# NOTE: sometimes the wait in setup_host() is not long enough.
# So, here we try to wait a little longer before giving up entirely.
try:
+ print "Getting connection: 2nd try"
time.sleep(self.session.timeout*5)
conn = self.session.get_connection(config)
except EOFError:
# failed twice... no need to report this really, it's just in a
# weird state...
+ print "Getting connection: failed"
+ email_exception(self.hostname, "failed twice to get connection")
return False
except:
traceback.print_exc()
email_exception(self.hostname)
return False
+ print "Getting connection: ok"
#print "trying to use conn before returning it."
#print conn.c.modules.sys.path
#print conn.c.modules.os.path.exists('/tmp/source')
def getSequences(self):
- # TODO: This can be replaced with a DB definition at a future time.
- # This would make it possible for an admin to introduce new
- # patterns without touching code.
-
+ # NOTE: The DB is now the authoritative record for all BM sequences.
+ # An admin can introduce new patterns and actions without touching code.
sequences = {}
- # restart_bootmanager_boot
- for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
- "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
- "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done",
-
- "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done",
-
- "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
- "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
- "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-protoerror-debug-validate-done",
- "bminit-cfg-auth-protoerror-exception-update-debug-validate-exception-done",
- "bminit-cfg-auth-getplc-update-debug-done",
- "bminit-cfg-auth-protoerror2-debug-done",
- "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
- "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
- "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
- "bminit-cfg-auth-protoerror-exception-update-debug-done",
- "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done",
- "bminit-cfg-auth-getplc-implementerror-update-debug-done",
- "bminit-cfg-auth-authfail2-protoerror2-debug-done",
- ]:
- sequences.update({n : "restart_bootmanager_boot"})
-
- # conn.restart_bootmanager('reinstall')
- for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
- "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
- "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
- "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
- "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
- "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done",
- "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done",
- "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
- "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
- "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
- "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
- "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
- "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
- "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done",
- "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
- "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
- "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
- # actual solution appears to involve removing the bad files, and
- # continually trying to boot the node.
- "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
- "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
- "bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
- "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-validate-bmexceptvgscan-done",
- "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-validate-done",
- "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-validate-bmexceptvgscan-done",
- "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-debug-validate-bmexceptvgscan-done",
- "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-debug-validate-bmexceptvgscan-done",
- ]:
- sequences.update({n : "restart_bootmanager_rins"})
-
- # repair_node_keys
- for n in ["bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-validate-exception-done",
- "bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done",
- "bminit-cfg-auth-bootcheckfail-authfail-exception-update-debug-validate-exception-done",
- "bminit-cfg-auth-bootcheckfail-authfail-exception-authfail-debug-validate-exception-done",
- "bminit-cfg-auth-authfail-debug-done",
- "bminit-cfg-auth-authfail2-authfail-debug-done",
- ]:
- sequences.update({n: "repair_node_keys"})
-
- # conn.restart_node('reinstall')
- for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
- "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
- "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
- "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done",
- "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
- "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
- "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done",
- "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done",
- "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done",
- "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
- "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done",
- "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
- ]:
- sequences.update({n : "restart_node_rins"})
-
- # restart_node_boot
- for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done",
- "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done",
- "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
- "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
- "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
- "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done",
- "bminit-cfg-auth-getplc-exception-update-bootupdatefail-debug-done",
- ]:
- sequences.update({n: "restart_node_boot"})
-
- # fsck_repair
- for n in ["bminit-cfg-auth-getplc-update-installinit-validate-fsckabort-exception-fsckfail-bmexceptmount-exception-noinstall-update-debug-validate-fsckabort-exception-fsckfail-bmexceptmount-done",
- "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-noinstall-update-debug-validate-exception-fsckfail-done",
- "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail-exception-noinstall-update-debug-validate-exception-fsckfail-done",
- "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail2-exception-noinstall-update-debug-validate-exception-fsckfail2-done",
- "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail2-exception-debug-validate-done",
- "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail2-exception-debug-validate-exception-fsckfail2-done",
- "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail2-exception-debug-validate-exception-fsckfail-done",
- "bminit-cfg-auth-getplc-update-installinit-validate-fsckabort-exception-fsckfail-exception-debug-validate-fsckabort-exception-fsckfail-done",
- "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail2-exception-debug-validate-exception-fsckfail2-done",
- "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-debug-validate-exception-fsckfail2-done",
- "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-debug-validate-exception-fsckfail-done",
- "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-debug-validate-done",
- "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail-exception-debug-validate-exception-fsckfail-done",
- "bminit-cfg-auth-getplc-update-debug-validate-exception-fsckfail-done",
- ]:
- sequences.update({n : "fsck_repair"})
-
- # nodeconfig_notice
- for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
- "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
- "bminit-cfg-exception-update-bootupdatefail-nonode-debug-validate-exception-done",
- "bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-validate-exception-done",
- "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
- "bminit-cfg-exception-noconfig-nonode-debug-validate-exception-done",
- "bminit-cfg-exception-noconfig-update-debug-validate-exception-done",
- ]:
- sequences.update({n : "nodeconfig_notice"})
-
- for n in [ "bminit-cfg-exception-nodehostname-update-debug-done",
- "bminit-cfg-update-exception-nodehostname-update-debug-validate-exception-done",
- "bminit-cfg-update-exception-nodehostname-update-debug-done",
- "bminit-cfg-exception-nodehostname-debug-validate-exception-done",
- ]:
- sequences.update({n : "nodenetwork_email"})
-
- # noblockdevice_notice
- for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
- "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-validate-bmexceptvgscan-done",
- "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
- "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
- "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
- "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-debug-validate-bmexceptvgscan-done",
- "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-debug-validate-bmexceptvgscan-done",
- ]:
- sequences.update({n : "noblockdevice_notice"})
-
- # update_bootcd_email
- for n in [ "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
- ]:
- sequences.update({n : "update_bootcd_email"})
-
- for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
- ]:
- sequences.update({n: "unknownsequence_notice"})
-
- # minimalhardware_notice
- sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "minimalhardware_notice"})
- sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "minimalhardware_notice"})
-
- # baddisk_notice
- sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "baddisk_notice"})
-
- # baddns_notice
- for n in [
- "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
- "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
- ]:
- sequences.update( { n : "baddns_notice"})
+ bms = BootmanSequenceRecord.query.all()
+ for s in bms:
+ sequences[s.sequence] = s.action
+
return sequences
def getDiskSteps(self):
('noinstall' , 'notinstalled'),
('bziperror' , 'bzip2: Data integrity error when decompressing.'),
('noblockdev' , "No block devices detected."),
+ ('missingkernel', "missingkernel"),
('dnserror' , 'Name or service not known'),
+ ('noparseconfig', "Found configuration file plnode.txt on floppy, but was unable to parse it"),
('noconfig' , "Unable to find and read a node configuration file"),
('downloadfail' , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
else:
return self.date_created + timedelta(0, self.expires)
+class BootmanSequenceRecord(Entity):
+ """Stored pairing of one sequence string with the action recorded for it.
+
+ `sequence` is the primary key, so a given sequence string is stored at
+ most once; `action` is the string stored alongside it.
+ """
+ sequence = Field(String, primary_key=True, default=None)
+ action = Field(String, default=None)
+ # date_created uses datetime.now as its column default.
+ date_created = Field(DateTime,default=datetime.now)
+
class ActionRecord(Entity):
@classmethod
def get_latest_by(cls, **kwargs):
Then double check the network settings for your host.
-Then, select, "Download -> Download ISO image for %(hostname)s" menu. This will generate a new All-in-one BootImage file for your node. Copy this file to the appropriate read-only media, and reboot the machine.
+If the network settings are correct, then, select, "Download -> Download image for %(hostname)s" menu. This will generate a new All-in-one BootImage file for your node. Copy this file to the appropriate read-only media, and reboot the machine.
There is no need to respond to this message. If you're able to update the boot image without difficulty and your node returns to normal operation, please accept our thanks. However, if there are any console messages relating to the node's failure, please report them to PlanetLab support (%(support_email)s) so we can help resolve the issue.
--- /dev/null
+#!/usr/bin/python
+
+import os
+import time
+from datetime import datetime, timedelta
+import sys
+
+def popen(cmdstr):
+    """Run ``cmdstr`` through the shell and return everything it wrote to stdout.
+
+    cmdstr -- the complete shell command line, passed verbatim to os.popen().
+
+    Returns the captured output as one string (trailing newline included).
+    """
+    f = os.popen(cmdstr)
+    try:
+        return f.read()
+    finally:
+        # Always close the pipe so the child is reaped and the file
+        # descriptor is released, even if read() fails.
+        f.close()
+
+def datetime_fromstr(str):
+    """Parse a date string into a datetime object.
+
+    Accepted layouts:
+      "YYYY-MM-DD"        e.g. "2009-03-04"
+      "YYYY-MM-DD-HH:MM"  e.g. "2009-03-04-13:30"
+      "MM/DD/YYYY"        e.g. "03/04/2009"
+
+    str -- the text to parse.  (The name shadows the builtin; it is kept so
+           that existing keyword callers keep working.)
+
+    Raises ValueError when the text matches none of the layouts.
+    """
+    if '-' in str:
+        try:
+            tup = time.strptime(str, "%Y-%m-%d")
+        except ValueError:
+            # Not a plain date: retry with the hour:minute suffix.  Only a
+            # format mismatch is caught, so unrelated errors still surface.
+            tup = time.strptime(str, "%Y-%m-%d-%H:%M")
+    else:
+        # Slash-separated dates.  Input without '-' or '/' also lands here
+        # and fails with ValueError, exactly as before.
+        tup = time.strptime(str, "%m/%d/%Y")
+    return datetime.fromtimestamp(time.mktime(tup))
+
+
+def main():
+    """Count tickets in an RT queue over consecutive 30-day windows.
+
+    Command-line arguments:
+      argv[1] -- queue name
+      argv[2] -- start date, in any layout datetime_fromstr() accepts
+      argv[3] -- number of 30-day windows to report
+
+    For every window the generated ``rt`` command is printed, followed by a
+    "<window start> , <count>" line.
+    """
+    queue = sys.argv[1]
+    window_start = datetime_fromstr(sys.argv[2])
+    iterations = int(sys.argv[3])
+    for _ in range(iterations):
+        window_end = window_start + timedelta(30)
+        start_s = window_start.strftime("%Y-%m-%d")
+        end_s = window_end.strftime("%Y-%m-%d")
+        query = ("Queue='%s' and " % queue) + \
+                ("Told > '%s' and Told < '%s'" % (start_s, end_s))
+        cmd = """rt ls -t ticket "%s" | grep -v "No matching" | wc -l """ % query
+        print cmd
+        count = popen(cmd)
+        # Drop the final character of the output (the newline from wc).
+        print start_s, ",", count[:-1]
+        # The next window begins where this one ended.
+        window_start = window_end
+
+# Run main() only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+ main()
return ""
sql = """SELECT tk.id, tk.Queue, tr.Type, tr.Field, tr.OldValue, tr.NewValue,
- tr.Created, at.id, at.Subject, at.Content
- FROM Tickets as tk, Transactions as tr
+ tr.Created, at.id, at.Subject, at.Content, us.Name
+ FROM Tickets as tk, Transactions as tr, Users as us
LEFT OUTER JOIN Attachments as at ON tr.id=at.TransactionId
- WHERE (tk.Queue=3 OR tk.Queue=22) AND tk.id=tr.ObjectId AND tk.id>10000 """
+ WHERE (tk.Queue=22) AND tk.id=tr.ObjectId AND tk.id>40800 AND
+ us.id=tr.Creator"""
+ #WHERE (tk.Queue=22) AND tk.id=tr.ObjectId AND tk.id>39896 AND tk.id<42241 AND ## (oct15th2008)
+ #WHERE (tk.Queue=22) AND tk.id=tr.ObjectId AND tk.id>40800 AND ## (1st3months)
+ #WHERE (tk.Queue=3 OR tk.Queue=22) AND tk.id=tr.ObjectId AND tk.id>10000 """
print "run query"
raw = fetch_from_db(db, sql)
attachmentid = x[7]
subject = x[8]
content = x[9]
+ creator = x[10]
if ticket_id not in tickets:
print "found new ticket_id", ticket_id
'newvalue' : newvalue,
'datecreated' : datecreated,
'attachmentid' : attachmentid,
+ 'creator' : creator,
'subject' : subject,
'content' : content,
}
print "sort data"
list = map(parse_ticket, raw)
- # map(lambda x: { "email":str(x[4]), "lastupdated":str(x[5]), "owner":str(x[7]), }, raw)
-
db.close()
else:
print "loading"
tickets = database.dbLoad("survey_tickets")
- print tickets[42171]['transactions'][0]
+ #print tickets[42171]['transactions'][0]
- sort_tickets(tickets, re_map)
+ #sort_tickets(tickets, re_map)
# for each ticket id
# scan for known keywords and sort into classes
--- /dev/null
+#!/usr/bin/python
+
+from monitor.wrapper import plc
+
+# Manual smoke test of the bootman-sequence XML-RPC calls: print the result of
+# a liveness check, of storing one test sequence, and of listing all sequences.
+monitor_url = "https://monitor.planet-lab.org/monitor/XMLRPC"
+api = plc.getAPI(monitor_url)
+alive = api.upAndRunning()
+print alive
+auth = plc.api.auth
+stored = api.setBootmanSequence(auth, "test-sequence", "value3")
+print stored
+known = api.getBootmanSequences(auth)
+print known
bootcd_version = widgets.CheckBox(label="BootCD")
observed_status = widgets.CheckBox(label="Observed Status")
uptime = widgets.CheckBox(label="Uptime")
+ traceroute = widgets.CheckBox(label="Traceroute")
port_status = widgets.CheckBox(label="Port Status")
rpms = widgets.CheckBox(label="RPM")
rpmvalue = widgets.TextField(label="RPM Pattern")
agg.update(agg['plc_node_stats'])
if agg['kernel_version']:
agg['kernel_version'] = agg['kernel_version'].split()[2]
+ if 'traceroute' in data and agg['traceroute']:
+ agg['traceroute'] = "<pre>" + agg['traceroute'] + "</pre>"
if 'rpmvalue' in data and 'rpms' in data:
if agg['rpms']:
rpm_list = agg['rpms'].split()
def simpleview(self, **data):
return self.pre_view(**data)
+ # Handler named pcuview that forwards every keyword argument unchanged to
+ # pre_view() and returns its result.
+ # NOTE(review): the template given here is simpleview, not a pcuview-specific
+ # one -- confirm this is intended.
+ @expose(template="monitorweb.templates.simpleview")
+ def pcuview(self, **data):
+ return self.pre_view(**data)
+
@expose(template="monitorweb.templates.detailview")
def detailview(self, **data):
return self.pre_view(**data)
+
def pre_view(self, **data):
session.flush(); session.clear()
# TODO: add form validation
@expose(template="monitorweb.templates.pcuview")
@exception_handler(nodeaction_handler,"isinstance(tg_exceptions,RuntimeError)")
- def pcuview(self, loginbase=None, pcuid=None, hostname=None, since=20, **data):
+ def pcuviewold(self, loginbase=None, pcuid=None, hostname=None, since=20, **data):
session.flush(); session.clear()
sitequery=[]
pcuquery=[]
import turbogears
from datetime import datetime, timedelta
import time
+from monitor.wrapper import plc
try:
from monitor.database.info.model import *
and running before trying any more sophisticated operations. """
return True
+ # BOOTMAN SEQUENCE ------------------------------------------------------------
+
+ @cherrypy.expose
+ @export_to_docbook(roles=['admin'],
+ accepts=[Parameter(dict, "Auth struct"),
+ Parameter(str, "The bootman sequence returned by MyOps"),
+ Parameter(str, "The action string that identifies what to do when this sequence occurs")],
+ returns=Parameter(bool, 'True on success.'))
+ def setBootmanSequence(self, auth, sequence, action):
+ """ Using this call, you can set a new sequence to identify an Unknown
+ Error sequence returned by MyOps and associate it with a pre-defined
+ action, (i.e. reboot, reinstall, or others). Please see the
+ documentation for automated actions to see a list of supported
+ actions. """
+ # Validate the caller-supplied auth struct through the PLC API wrapper.
+ # NOTE(review): only AuthCheck() is consulted in this method; whether the
+ # 'admin' role named in the decorator is enforced elsewhere is not visible
+ # here -- confirm.
+ api = plc.getAuthAPI()
+ api.auth = auth
+ if api.AuthCheck():
+ # Create a record for a sequence seen for the first time, otherwise
+ # overwrite the action stored on the existing record.
+ bms = BootmanSequenceRecord.get_by(sequence=sequence)
+ if not bms:
+ bms = BootmanSequenceRecord(sequence=sequence, action=action)
+ else:
+ bms.action = action
+
+ bms.flush()
+ return True
+ else:
+ # AuthCheck() returned a false value: report failure to the caller.
+ return False
+
+ @cherrypy.expose
+ @export_to_docbook(roles=['admin'],
+ accepts=[Parameter(dict, "Auth struct")],
+ returns=Parameter(list, 'Array of bootman sequences'))
+ def getBootmanSequences(self, auth):
+ """ Using this call, you can learn all currently defined bootman
+ sequences and their associated actions. """
+ # Validate the caller-supplied auth struct through the PLC API wrapper.
+ api = plc.getAuthAPI()
+ api.auth = auth
+ if api.AuthCheck():
+ # Build one dict per stored BootmanSequenceRecord.
+ ret_list = []
+ bms = BootmanSequenceRecord.query.all()
+ for q in bms:
+ d = q.to_dict()
+ # presumably makes 'date_created' serializable for the reply -- TODO
+ # confirm against convert_datetime()
+ d = convert_datetime(d, ['date_created'])
+ ret_list.append(d)
+ return ret_list
+ else:
+ # AuthCheck() returned a false value: reply with an empty list.
+ return []
+
# SITES ------------------------------------------------------------
@cherrypy.expose
<a href="${link('detailview', hostname=act.hostname)}">${act.hostname}</a>
</td>
<td py:if="act.hostname is None" nowrap="true">
- <a class="ext-link" href="${plc_site_uri(act.loginbase)}">
- <span class="icon">${act.loginbase}</span></a>
+ <a class="ext-link" href="${link('detailview', loginbase=act.loginbase)}">
+ ${act.loginbase}</a>
</td>
<!--td py : content="diff_time(mktime(node.date_checked.timetuple()))"></td-->
<td py:content="act.action_type"></td>