clearer names for actions, and infer actions better
[monitor.git] / monitor / bootman.py
index b7ec58c..9754218 100755 (executable)
@@ -11,7 +11,7 @@ import traceback
 import subprocess
 from sets import Set
 
-from monitor.getsshkeys import SSHKnownHosts
+from monitor.util.sshknownhosts import SSHKnownHosts
 from monitor.Rpyc import SocketConnection, Async
 from monitor.Rpyc.Utils import *
 
@@ -129,6 +129,21 @@ class NodeConnection:
                                print key, " == ", bm.VARS[key]
                else:
                        print "   Unable to read Node Configuration"
+
+       def fprobe_repair_node(self):
+               # When fprobe data grows too large, it fills the root partition and
+               # the node then fails to boot.
+               c = self.c
+               self.c.modules.sys.path.append("/tmp/source/")
+
+               # NOTE: assumes the node's root fs is already mounted under /tmp/mnt/sysimg.
+               if self.c.modules.os.path.exists('/tmp/mnt/sysimg/var/local/fprobe'):
+                       print "CLEARING FPROBE DATA on %s" % self.node
+                       self.c.modules.os.chdir('/tmp/mnt/sysimg/var/local/fprobe')
+                       cmd = """ ls -lrt . | awk '{if (i<NR/2 && $9) {print "rm "$9;i=i+1;}}' | sh """
+                       self.c.modules.os.system(cmd)
+               else:
+                       print "COULD NOT CLEAR FPROBE DATA on %s" % self.node
                
        def fsck_repair_node(self):
                c = self.c
@@ -291,7 +306,7 @@ class PlanetLabSession:
 
                # COPY Rpyc files to host
                #cmd = "rsync -vvv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
-               cmd = """rsync -vvv -az -e "ssh -o BatchMode=yes" %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc""" % args
+               cmd = """rsync -vvv -az -e "ssh -o BatchMode=yes" %(monitordir)s/monitor/Rpyc/ %(user)s@%(hostname)s:Rpyc""" % args
                if self.verbose: print cmd
                print cmd
                # TODO: Add timeout
@@ -449,6 +464,7 @@ class DebugInterface:
 
        def getDiskSteps(self):
                steps = [
+                       ('scsierror2' , 'sd \d:\d:\d:\d: ioctl_internal_command return code = \d+'),
                        ('scsierror'  , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
                        ('ioerror'    , 'end_request: I/O error, dev sd\w+, sector \d+'),
                        ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION'),
@@ -681,10 +697,13 @@ def restore_basic(sitehist, hostname, config=None, forced_action=None):
                args['saveact'] = True
                args['ccemail'] = True
 
-               sitehist.sendMessage('unknownsequence_notice', **args)
+               if 'nospace' in s:
+                       # NOTE: the sequence is unknown but contains 'nospace', so try
+                       # clearing fprobe data first, before restarting the boot manager.
+                       conn.fprobe_repair_node()
 
+               sitehist.sendMessage('unknownsequence_notice', **args)
                conn.restart_bootmanager('boot')
-
                bootman_action = "restart_bootmanager"
 
                # NOTE: Do not set the pflags value for this sequence if it's unknown.
@@ -825,46 +844,5 @@ def restore_basic(sitehist, hostname, config=None, forced_action=None):
        return bootman_action
        
 
-# MAIN -------------------------------------------------------------------
-
-def main():
-       from monitor import parser as parsermodule
-       parser = parsermodule.getParser()
-
-       parser.set_defaults(child=False, collect=False, nosetup=False, verbose=False, 
-                                               force=None, quiet=False)
-       parser.add_option("", "--child", dest="child", action="store_true", 
-                                               help="This is the child mode of this process.")
-       parser.add_option("", "--force", dest="force", metavar="boot_state",
-                                               help="Force a boot state passed to BootManager.py.")
-       parser.add_option("", "--quiet", dest="quiet", action="store_true", 
-                                               help="Extra quiet output messages.")
-       parser.add_option("", "--verbose", dest="verbose", action="store_true", 
-                                               help="Extra debug output messages.")
-       parser.add_option("", "--nonet", dest="nonet", action="store_true", 
-                                               help="Do not setup the network, use existing log files to re-run a test pass.")
-       parser.add_option("", "--collect", dest="collect", action="store_true", 
-                                               help="No action, just collect dmesg, and bm.log")
-       parser.add_option("", "--nosetup", dest="nosetup", action="store_true", 
-                                               help="Do not perform the orginary setup phase.")
-
-       parser = parsermodule.getParser(['nodesets', 'defaults'], parser)
-       config = parsermodule.parse_args(parser)
-
-       if config.nodelist:
-               nodes = config.getListFromFile(config.nodelist)
-       elif config.node:
-               nodes = [ config.node ]
-       else:
-               parser.print_help()
-               sys.exit(1)
-
-       for node in nodes:
-               # get sitehist
-               lb = plccache.plcdb_hn2lb[node]
-               sitehist = SiteInterface.get_or_make(loginbase=lb)
-               #reboot(node, config)
-               restore(sitehist, node, config=None, forced_action=None)
-
 if __name__ == "__main__":
-       main()
+       print "ERROR: Can not execute module as a command! Please use commands/%s.py" % os.path.splitext(__file__)[0]