X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=monitor%2Fbootman.py;h=975421871593f2677ff2317fe24916c268eda8a8;hb=32e47491837a321e684ea167ab6aa430145562f1;hp=b7ec58cf7aeae507b2410b665ce90ba33d5e14d0;hpb=0e3cb254ed858745809d57de80437d73aedc6eba;p=monitor.git diff --git a/monitor/bootman.py b/monitor/bootman.py index b7ec58c..9754218 100755 --- a/monitor/bootman.py +++ b/monitor/bootman.py @@ -11,7 +11,7 @@ import traceback import subprocess from sets import Set -from monitor.getsshkeys import SSHKnownHosts +from monitor.util.sshknownhosts import SSHKnownHosts from monitor.Rpyc import SocketConnection, Async from monitor.Rpyc.Utils import * @@ -129,6 +129,21 @@ class NodeConnection: print key, " == ", bm.VARS[key] else: print " Unable to read Node Configuration" + + def fprobe_repair_node(self): + # When fprobe data gets too much, it fills the root partition and + # fails to boot + c = self.c + self.c.modules.sys.path.append("/tmp/source/") + + # NOTE: assume that the root fs is already mounted... + if self.c.modules.os.path.exists('/tmp/mnt/sysimg/var/local/fprobe'): + print "CLEARING FPROBE DATA on %s" % self.node + self.c.modules.os.chdir('/tmp/mnt/sysimg/var/local/fprobe') + cmd = """ ls -lrt . | awk '{if (i return code = 0x\d+'), ('ioerror' , 'end_request: I/O error, dev sd\w+, sector \d+'), ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION'), @@ -681,10 +697,13 @@ def restore_basic(sitehist, hostname, config=None, forced_action=None): args['saveact'] = True args['ccemail'] = True - sitehist.sendMessage('unknownsequence_notice', **args) + if 'nospace' in s: + # NOTE: sequence is unknown and contains nospace, so try the + # fprobe repair trick first. + conn.fprobe_repair_node() + sitehist.sendMessage('unknownsequence_notice', **args) conn.restart_bootmanager('boot') - bootman_action = "restart_bootmanager" # NOTE: Do not set the pflags value for this sequence if it's unknown. @@ -825,46 +844,5 @@ def restore_basic(sitehist, hostname, config=None, forced_action=None): return bootman_action -# MAIN ------------------------------------------------------------------- - -def main(): - from monitor import parser as parsermodule - parser = parsermodule.getParser() - - parser.set_defaults(child=False, collect=False, nosetup=False, verbose=False, - force=None, quiet=False) - parser.add_option("", "--child", dest="child", action="store_true", - help="This is the child mode of this process.") - parser.add_option("", "--force", dest="force", metavar="boot_state", - help="Force a boot state passed to BootManager.py.") - parser.add_option("", "--quiet", dest="quiet", action="store_true", - help="Extra quiet output messages.") - parser.add_option("", "--verbose", dest="verbose", action="store_true", - help="Extra debug output messages.") - parser.add_option("", "--nonet", dest="nonet", action="store_true", - help="Do not setup the network, use existing log files to re-run a test pass.") - parser.add_option("", "--collect", dest="collect", action="store_true", - help="No action, just collect dmesg, and bm.log") - parser.add_option("", "--nosetup", dest="nosetup", action="store_true", - help="Do not perform the orginary setup phase.") - - parser = parsermodule.getParser(['nodesets', 'defaults'], parser) - config = parsermodule.parse_args(parser) - - if config.nodelist: - nodes = config.getListFromFile(config.nodelist) - elif config.node: - nodes = [ config.node ] - else: - parser.print_help() - sys.exit(1) - - for node in nodes: - # get sitehist - lb = plccache.plcdb_hn2lb[node] - sitehist = SiteInterface.get_or_make(loginbase=lb) - #reboot(node, config) - restore(sitehist, node, config=None, forced_action=None) - if __name__ == "__main__": - main() + print "ERROR: Can not execute module as a command! Please use commands/%s.py" % os.path.splitext(__file__)[0]