# Attempt to reboot a node in debug state.
-import plc
-import auth
-api = plc.PLC(auth.auth, auth.plc)
+from monitor import const
+from monitor.database.info.model import *
+from monitor.wrapper import plc
+api = plc.getAuthAPI()
import sys
import os
-import policy
from getsshkeys import SSHKnownHosts
import subprocess
import time
-import soltesz
+from monitor.util import command as moncommands
from sets import Set
-import ssh.pxssh as pxssh
-import ssh.fdpexpect as fdpexpect
-import ssh.pexpect as pexpect
-from unified_model import *
-from emailTxt import mailtxt
+from pcucontrol.transports.ssh import pxssh as pxssh
+from pcucontrol.transports.ssh import fdpexpect as fdpexpect
+from pcucontrol.transports.ssh import pexpect as pexpect
+from monitor.model import *
+from monitor.wrapper.emailTxt import mailtxt
+from nodeconfig import network_config_to_str
+import traceback
+from monitor import config
import signal
class Sopen(subprocess.Popen):
#from Rpyc import SocketConnection, Async
from Rpyc import SocketConnection, Async
from Rpyc.Utils import *
-
-def get_fbnode(node):
- fb = soltesz.dbLoad("findbad")
- fbnode = fb['nodes'][node]['values']
- return fbnode
+fb = None
class NodeConnection:
def __init__(self, connection, node, config):
def dump_plconf_file(self):
c = self.c
- c.modules.sys.path.append("/tmp/source/")
- c.modules.os.chdir('/tmp/source')
+ self.c.modules.sys.path.append("/tmp/source/")
+ self.c.modules.os.chdir('/tmp/source')
log = c.modules.BootManager.log('/tmp/new.log')
bm = c.modules.BootManager.BootManager(log,'boot')
def compare_and_repair_nodekeys(self):
c = self.c
- c.modules.sys.path.append("/tmp/source/")
- c.modules.os.chdir('/tmp/source')
+ self.c.modules.sys.path.append("/tmp/source/")
+ self.c.modules.os.chdir('/tmp/source')
log = c.modules.BootManager.log('/tmp/new.log')
bm = c.modules.BootManager.BootManager(log,'boot')
args['port'] = self.port
args['user'] = 'root'
args['hostname'] = self.node
- args['monitordir'] = "/home/soltesz/monitor"
+ args['monitordir'] = config.MONITOR_SCRIPT_ROOT
ssh_port = 22
if self.nosetup:
return
# COPY Rpyc files to host
- cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc-2.45-2.3/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
+ cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
if self.verbose: print cmd
# TODO: Add timeout
timeout = 120
- localos = soltesz.CMD()
+ localos = moncommands.CMD()
ret = localos.system(cmd, timeout)
print ret
t1 = time.time()
# KILL any already running servers.
- ssh = soltesz.SSH(args['user'], args['hostname'], ssh_port)
+ ssh = moncommands.SSH(args['user'], args['hostname'], ssh_port)
(ov,ev) = ssh.run_noexcept2("""<<\EOF
rm -f out.log
echo "kill server" >> out.log
# TODO: the read() here may block indefinitely. Need a better
# approach therefore, that includes a timeout.
#ret = self.command.stdout.read(5)
- ret = soltesz.read_t(self.command.stdout, 5)
+ ret = moncommands.read_t(self.command.stdout, 5)
t2 = time.time()
if 'READY' in ret:
# NOTE: Nothing works if the bootcd is REALLY old.
# So, this is the first step.
- fbnode = get_fbnode(hostname)
+ fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict()
if fbnode['category'] == "OLDBOOTCD":
print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
args = {}
mailtxt.newbootcd_one[1] % args, True, db='bootcd_persistmessages')
loginbase = plc.siteId(hostname)
- m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+ emails = plc.getTechEmails(loginbase)
+ m.send(emails)
print "\tDisabling %s due to out-of-date BOOTCD" % hostname
api.UpdateNode(hostname, {'boot_state' : 'disable'})
try:
k = SSHKnownHosts(); k.update(node); k.write(); del k
except:
- import traceback; print traceback.print_exc()
+ print traceback.print_exc()
return False
try:
session = PlanetLabSession(node, config.nosetup, config.verbose)
except Exception, e:
print "ERROR setting up session for %s" % hostname
- import traceback; print traceback.print_exc()
+ print traceback.print_exc()
print e
return False
time.sleep(session.timeout*4)
conn = session.get_connection(config)
except:
- import traceback; print traceback.print_exc()
+ print traceback.print_exc()
return False
-
if forced_action == "reboot":
conn.restart_node('rins')
('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION byte \w+ = \w+'),
('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),
+
+ ('hdaseekerror', 'hda: dma_intr: status=0x\d+ { DriveReady SeekComplete Error }'),
+ ('hdacorrecterror', 'hda: dma_intr: error=0x\d+ { UncorrectableError }, LBAsect=\d+, sector=\d+'),
+
('atareadyerror' , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),
+
('sdXerror' , 'sd\w: Current: sense key: Medium Error'),
('ext3error' , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'),
+
('floppytimeout','floppy0: floppy timeout called'),
('floppyerror', 'end_request: I/O error, dev fd\w+, sector \d+'),
+ # hda: dma_intr: status=0x51 { DriveReady SeekComplete Error }
+ # hda: dma_intr: error=0x40 { UncorrectableError }, LBAsect=23331263, sector=23331263
+
# floppy0: floppy timeout called
# end_request: I/O error, dev fd0, sector 0
- #Buffer I/O error on device dm-2, logical block 8888896
- #ata1: status=0x51 { DriveReady SeekComplete Error }
- #ata1: error=0x40 { UncorrectableError }
- #SCSI error : <0 0 0 0> return code = 0x8000002
- #sda: Current: sense key: Medium Error
+ # Buffer I/O error on device dm-2, logical block 8888896
+ # ata1: status=0x51 { DriveReady SeekComplete Error }
+ # ata1: error=0x40 { UncorrectableError }
+ # SCSI error : <0 0 0 0> return code = 0x8000002
+ # sda: Current: sense key: Medium Error
# Additional sense: Unrecovered read error - auto reallocate failed
- #SCSI error : <0 2 0 0> return code = 0x40001
- #end_request: I/O error, dev sda, sector 572489600
+ # SCSI error : <0 2 0 0> return code = 0x40001
+ # end_request: I/O error, dev sda, sector 572489600
]
id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
sequence.append(id)
mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
loginbase = plc.siteId(hostname)
- m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
- conn.set_nodestate('diag')
+ emails = plc.getTechEmails(loginbase)
+ m.send(emails)
+ conn.set_nodestate('disable')
return False
print "...Downloading bm.log from %s" % node
('nodehostname' , 'Configured node hostname does not resolve'),
('implementerror', 'Implementation Error'),
('readonlyfs' , '[Errno 30] Read-only file system'),
+ ('baddisk' , "IOError: [Errno 13] Permission denied: '/tmp/mnt/sysimg//vservers/\w+/etc/hosts'"),
('noinstall' , 'notinstalled'),
('bziperror' , 'bzip2: Data integrity error when decompressing.'),
('noblockdev' , "No block devices detected."),
+ ('dnserror' , 'Name or service not known'),
('downloadfail' , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
('hardwarerequirefail' , 'Hardware requirements not met'),
('mkfsfail' , 'while running: Running mkfs.ext2 -q -m 0 -j /dev/planetlab/vservers failed'),
('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
+ ('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"),
('chrootfail' , 'Running chroot /tmp/mnt/sysimg'),
('modulefail' , 'Unable to get list of system modules'),
('writeerror' , 'write error: No space left on device'),
# By using the sequence identifier, we guarantee that there will be no
# frequent loops. I'm guessing there is a better way to track loops,
# though.
- if not config.force and pflags.getRecentFlag(s):
- pflags.setRecentFlag(s)
- pflags.save()
- print "... flag is set or it has already run recently. Skipping %s" % node
- return True
+ #if not config.force and pflags.getRecentFlag(s):
+ # pflags.setRecentFlag(s)
+ # pflags.save()
+ # print "... flag is set or it has already run recently. Skipping %s" % node
+ # return True
sequences = {}
for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
"bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
"bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done",
+
+ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done",
+
"bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
"bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
"bminit-cfg-auth-getplc-update-debug-done",
"bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
"bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
"bminit-cfg-auth-protoerror-exception-update-debug-done",
+ "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done",
"bminit-cfg-auth-getplc-implementerror-update-debug-done",
]:
sequences.update({n : "restart_bootmanager_boot"})
"bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
"bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
"bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
+ "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done",
+ "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
+ "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
"bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
+ # The actual fix appears to be removing the bad files and then
+ # repeatedly re-attempting the node boot.
+ "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
]:
sequences.update({n : "restart_bootmanager_rins"})
"bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
"bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
"bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
+ "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done",
]:
sequences.update({n: "restart_node_boot"})
# update_node_config_email
for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
- "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
+ "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
+ "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
]:
sequences.update({n : "update_node_config_email"})
- for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", ]:
+ for n in [ "bminit-cfg-exception-nodehostname-update-debug-done",
+ "bminit-cfg-update-exception-nodehostname-update-debug-done",
+ ]:
sequences.update({n : "nodenetwork_email"})
# update_bootcd_email
# broken_hardware_email
sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"})
+ # bad_dns_email
+ for n in [
+ "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
+ "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
+ ]:
+ sequences.update( { n : "bad_dns_email"})
+
flag_set = True
m = PersistMessage(hostname, mailtxt.unknownsequence[0] % args,
mailtxt.unknownsequence[1] % args, False, db='unknown_persistmessages')
m.reset()
- m.send(['monitor-list@lists.planet-lab.org'])
+ m.send([config.cc_email])
conn.restart_bootmanager('boot')
m = PersistMessage(hostname, "Suspicous error from BootManager on %s" % args,
mailtxt.unknownsequence[1] % args, False, db='suspect_persistmessages')
m.reset()
- m.send(['monitor-list@lists.planet-lab.org'])
+ m.send([config.cc_email])
conn.restart_bootmanager('boot')
m = PersistMessage(hostname, mailtxt.plnode_cfg[0] % args, mailtxt.plnode_cfg[1] % args,
True, db='nodeid_persistmessages')
loginbase = plc.siteId(hostname)
- m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+ emails = plc.getTechEmails(loginbase)
+ m.send(emails)
conn.dump_plconf_file()
- conn.set_nodestate('diag')
+ conn.set_nodestate('disable')
elif sequences[s] == "nodenetwork_email":
print "...Sending message to LOOK AT NODE NETWORK"
m = PersistMessage(hostname, mailtxt.plnode_network[0] % args, mailtxt.plnode_cfg[1] % args,
True, db='nodenet_persistmessages')
loginbase = plc.siteId(hostname)
- m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+ emails = plc.getTechEmails(loginbase)
+ m.send(emails)
conn.dump_plconf_file()
- conn.set_nodestate('diag')
+ conn.set_nodestate('disable')
elif sequences[s] == "update_bootcd_email":
print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
mailtxt.newalphacd_one[1] % args, True, db='bootcd_persistmessages')
loginbase = plc.siteId(hostname)
- m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+ emails = plc.getTechEmails(loginbase)
+ m.send(emails)
print "\tDisabling %s due to out-of-date BOOTCD" % hostname
conn.set_nodestate('disable')
mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
loginbase = plc.siteId(hostname)
- m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+ emails = plc.getTechEmails(loginbase)
+ m.send(emails)
conn.set_nodestate('disable')
elif sequences[s] == "update_hardware_email":
mailtxt.minimalhardware[1] % args, True, db='minhardware_persistmessages')
loginbase = plc.siteId(hostname)
- m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+ emails = plc.getTechEmails(loginbase)
+ m.send(emails)
+ conn.set_nodestate('disable')
+
+ elif sequences[s] == "bad_dns_email":
+ print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
+ args = {}
+ try:
+ node = api.GetNodes(hostname)[0]
+ net = api.GetNodeNetworks(node['nodenetwork_ids'])[0]
+ except:
+ print traceback.print_exc()
+ # TODO: API error — skip the email because not all info is available;
+ # flag_set will not be recorded.
+ return False
+ nodenet_str = network_config_to_str(net)
+
+ args['hostname'] = hostname
+ args['network_config'] = nodenet_str
+ args['nodenetwork_id'] = net['nodenetwork_id']
+ m = PersistMessage(hostname, mailtxt.baddns[0] % args,
+ mailtxt.baddns[1] % args, True, db='baddns_persistmessages')
+
+ loginbase = plc.siteId(hostname)
+ emails = plc.getTechEmails(loginbase)
+ m.send(emails)
conn.set_nodestate('disable')
if flag_set:
# MAIN -------------------------------------------------------------------
def main():
- from config import config
- from optparse import OptionParser
- parser = OptionParser()
- parser.set_defaults(node=None, nodelist=None, child=False, collect=False, nosetup=False, verbose=False, force=None, quiet=False)
+ from monitor import parser as parsermodule
+ parser = parsermodule.getParser()
+
+ parser.set_defaults(child=False, collect=False, nosetup=False, verbose=False,
+ force=None, quiet=False)
parser.add_option("", "--child", dest="child", action="store_true",
help="This is the child mode of this process.")
parser.add_option("", "--force", dest="force", metavar="boot_state",
help="Extra quiet output messages.")
parser.add_option("", "--verbose", dest="verbose", action="store_true",
help="Extra debug output messages.")
+ parser.add_option("", "--nonet", dest="nonet", action="store_true",
+ help="Do not setup the network, use existing log files to re-run a test pass.")
parser.add_option("", "--collect", dest="collect", action="store_true",
help="No action, just collect dmesg, and bm.log")
parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
help="Do not perform the orginary setup phase.")
- parser.add_option("", "--node", dest="node", metavar="nodename.edu",
- help="A single node name to try to bring out of debug mode.")
- parser.add_option("", "--nodelist", dest="nodelist", metavar="nodelist.txt",
- help="A list of nodes to bring out of debug mode.")
- config = config(parser)
- config.parse_args()
+
+ parser = parsermodule.getParser(['nodesets', 'defaults'], parser)
+ config = parsermodule.parse_args(parser)
if config.nodelist:
nodes = config.getListFromFile(config.nodelist)