script for spotting and trashing zombie containers
author Thierry Parmentelat <thierry.parmentelat@inria.fr>
Thu, 29 Oct 2015 21:34:29 +0000 (22:34 +0100)
committer Thierry Parmentelat <thierry.parmentelat@inria.fr>
Thu, 29 Oct 2015 21:34:29 +0000 (22:34 +0100)
support-scripts/cleanup-zombies.py [new file with mode: 0755]

diff --git a/support-scripts/cleanup-zombies.py b/support-scripts/cleanup-zombies.py
new file mode 100755 (executable)
index 0000000..c3df0ac
--- /dev/null
@@ -0,0 +1,108 @@
+#!/usr/bin/env python3
+
+# node manager has a few working assumptions, like
+# if a domain d does not exist, there is no /vservers/d 
+
+# this utility tries to detect and assess potentially
+# conflicting situations, that could prevent nodemanager
+# from recovering properly
+#
+# the logic is simply to find zombie containers, i.e.
+# VMs that do have a workdir in /vservers/<zombie>
+# but that are not reported as running by 'virsh list',
+# which suggests they have been improperly trashed
+###
+#
+# then we trash them but for that some subdirs must be
+# btrfs-subvolume-delete'd and not rm-rf'ed
+# 
+
+import subprocess
+import glob
+import os, os.path
+from argparse import ArgumentParser
+
+def running_domains():
+    """
+    Return, as a list of strings, the domain names printed by
+    'virsh -c lxc:/// list --name'
+
+    empty lines in the output are ignored
+    subprocess.check_output raises CalledProcessError if virsh
+    exits with a non-zero status
+    """
+    virsh_output = subprocess.check_output(
+        ['virsh', '-c', 'lxc:///', 'list', '--name'],
+        universal_newlines=True,
+        # virsh must not read from our own stdin
+        stdin=subprocess.DEVNULL,
+    )
+    domains = []
+    for line in virsh_output.strip().split("\n"):
+        if line:
+            domains.append(line)
+    return domains
+
+def existing_vservers():
+    """
+    Return a generator over the names of the directories found
+    right under /vservers/, i.e. their path with '/vservers/' removed
+
+    entries that are not directories are skipped
+    """
+    root = "/vservers/"
+    candidates = glob.glob(root + "*")
+    return (
+        candidate.replace(root, "")
+        for candidate in candidates
+        if os.path.isdir(candidate)
+    )
+
+def display_or_run_commands(commands, run):
+    """
+    commands is a list of commands, each one being a list of strings
+
+    when run is false, the commands are only printed, under a
+    'You should run' banner
+    when run is true, each command is printed and then executed;
+    a non-zero return code only triggers a warning, the remaining
+    commands are still run
+
+    nothing at all is printed on an empty list
+    """
+    if not commands:
+        return
+    if run:
+        for argv in commands:
+            print("Running {}".format(" ".join(argv)))
+            status = subprocess.call(argv)
+            if status != 0:
+                print("Warning: failed with retcod = {}".format(status))
+        return
+    # cowardly mode: just show what would be done
+    print("========== You should run")
+    for argv in commands:
+        print(" ".join(argv))
+
+def main():
+    """
+    Compare the domains reported by running_domains() with the
+    directories reported by existing_vservers()
+
+    zombies - a directory but no such domain - get their subvolumes
+    and their directory 'btrfs subvolume delete'-d, and then 'rm -rf'-ed
+    if the directory is still there
+    weirdos - a domain but no directory - are only reported
+
+    commands are only displayed unless -r/--run is given
+    """
+    # the default is to cowardly show commands to run
+    # use --run to actually do it
+    parser = ArgumentParser()
+    parser.add_argument("-r", "--run", action='store_true', default=False)
+    args = parser.parse_args()
+
+    running = set(running_domains())
+    existing = set(existing_vservers())
+
+    # the prefix used to locate subvolumes
+    flavour_prefixes = ['onelab-', 'lxc-', 'omf-']
+
+    zombies = existing - running
+    if zombies:
+        zombie_dirs = ["/vservers/" + zombie for zombie in zombies]
+        print("-------- Found {} existing, but not running, containers".format(len(zombies)))
+        print("zombie_dirs='{}'".format(" ".join(zombie_dirs)))
+        subvolumes = []
+        for zombie in zombies:
+            for prefix in flavour_prefixes:
+                pattern = "/vservers/{z}/{prefix}*".format(z=zombie, prefix=prefix)
+                subvolumes.extend(glob.glob(pattern))
+        if subvolumes:
+            print("zombie_subvolumes='{}'".format(" ".join(subvolumes)))
+        # we need to call 'btrfs subvolume delete' on these
+        # instead of just 'rm': inner subvolumes first, then
+        # the container directories themselves
+        btrfs_commands = [
+            ['btrfs', 'subvolume', 'delete', target]
+            for target in subvolumes + zombie_dirs
+        ]
+        display_or_run_commands(btrfs_commands, args.run)
+        # the containers dirs that might still exist get rm -rf'ed
+        rm_commands = [
+            ['rm', '-rf', leftover]
+            for leftover in zombie_dirs
+            if os.path.isdir(leftover)
+        ]
+        display_or_run_commands(rm_commands, args.run)
+
+    #### should happen much less frequently
+    weirdos = running - existing
+    if weirdos:
+        print("-------- Found {} running but non existing".format(len(weirdos)))
+        for weirdo in weirdos:
+            print("/vservers/{}".format(weirdo))
+
+# script entry point - runs unconditionally, there is no __main__ guard
+main()