clearer names for actions, and infer actions better
[monitor.git] / monitor / bootman.py
index eac2761..9754218 100755 (executable)
@@ -129,6 +129,21 @@ class NodeConnection:
                                print key, " == ", bm.VARS[key]
                else:
                        print "   Unable to read Node Configuration"
+
+       def fprobe_repair_node(self):
+               # Free space by deleting part of the accumulated fprobe data: when it
+               # grows too large it fills the root partition and the node fails to boot.
+               c = self.c  # NOTE(review): c is never used below; every call goes through self.c
+               self.c.modules.sys.path.append("/tmp/source/")
+
+               # NOTE: assumes the root fs is already mounted (presumably at /tmp/mnt/sysimg).
+               if self.c.modules.os.path.exists('/tmp/mnt/sysimg/var/local/fprobe'):
+                       print "CLEARING FPROBE DATA on %s" % self.node
+                       self.c.modules.os.chdir('/tmp/mnt/sysimg/var/local/fprobe')
+                       cmd = """ ls -lrt . | awk '{if (i<NR/2 && $9) {print "rm "$9;i=i+1;}}' | sh """  # listing is oldest-first; i<NR/2 uses the running line number, so roughly half the entries get an rm
+                       self.c.modules.os.system(cmd)
+               else:
+                       print "COULD NOT CLEAR FPROBE DATA on %s" % self.node
                
        def fsck_repair_node(self):
                c = self.c
@@ -291,7 +306,7 @@ class PlanetLabSession:
 
                # COPY Rpyc files to host
                #cmd = "rsync -vvv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
-               cmd = """rsync -vvv -az -e "ssh -o BatchMode=yes" %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc""" % args
+               cmd = """rsync -vvv -az -e "ssh -o BatchMode=yes" %(monitordir)s/monitor/Rpyc/ %(user)s@%(hostname)s:Rpyc""" % args
                if self.verbose: print cmd
                print cmd
                # TODO: Add timeout
@@ -449,6 +464,7 @@ class DebugInterface:
 
        def getDiskSteps(self):
                steps = [
+                       ('scsierror2' , 'sd \d:\d:\d:\d: ioctl_internal_command return code = \d+'),
                        ('scsierror'  , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
                        ('ioerror'    , 'end_request: I/O error, dev sd\w+, sector \d+'),
                        ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION'),
@@ -681,10 +697,13 @@ def restore_basic(sitehist, hostname, config=None, forced_action=None):
                args['saveact'] = True
                args['ccemail'] = True
 
-               sitehist.sendMessage('unknownsequence_notice', **args)
+               if 'nospace' in s:
+                       # NOTE: sequence is unknown and contains nospace, so try the
+                       # fprobe repair trick first.
+                       conn.fprobe_repair_node()
 
+               sitehist.sendMessage('unknownsequence_notice', **args)
                conn.restart_bootmanager('boot')
-
                bootman_action = "restart_bootmanager"
 
                # NOTE: Do not set the pflags value for this sequence if it's unknown.