From: Thierry Parmentelat Date: Wed, 21 May 2014 08:21:38 +0000 (+0200) Subject: more aggressively try to cleanup sliver rootfs upon slice teardown X-Git-Tag: nodemanager-5.2-14~17 X-Git-Url: http://git.onelab.eu/?p=nodemanager.git;a=commitdiff_plain;h=1cb176d4c1d89f39b6683cb53c8e1e5e4917a902 more aggressively try to cleanup sliver rootfs upon slice teardown --- diff --git a/plugins/vsys.py b/plugins/vsys.py index f631d95..02161eb 100644 --- a/plugins/vsys.py +++ b/plugins/vsys.py @@ -46,13 +46,17 @@ def GetSlivers(data, config=None, plc=None): restartService() # check for systemctl, use it if present -def restartService (): +# keyword being 'start', 'stop' or 'restart' +def handleService (keyword): if tools.has_systemctl(): - logger.log("vsys: restarting vsys service through systemctl") - logger.log_call(["systemctl", "restart", "vsys"]) + logger.log("vsys: %s'ing vsys service through systemctl"%keyword) + return logger.log_call(["systemctl", keyword, "vsys"]) else: - logger.log("vsys: restarting vsys service through /etc/init.d/vsys") - logger.log_call(["/etc/init.d/vsys", "restart", ]) + logger.log("vsys: %s'ing vsys service through /etc/init.d/vsys"%keyword) + return logger.log_call(["/etc/init.d/vsys", keyword]) +def startService(): return handleService ('start') +def stopService(): return handleService ('stop') +def restartService(): return handleService ('restart') def createVsysDir(sliver): '''Create /vsys directory in slice. 
Update vsys conf file.''' @@ -160,16 +164,17 @@ def parseConf(): # before shutting down slivers, it is safe to first remove them from vsys's scope # so that we are sure that no dangling open file remains -# this will also restart vsys if needed +# this will also stop vsys if needed (in which case it returns True to tell caller to restart vsys once done) def removeSliverFromVsys (sliver): current_slivers=parseConf() new_slivers= [ s for s in current_slivers if s != sliver ] if writeConf (current_slivers, new_slivers): - restartService() + stopService() trashVsysHandleInSliver (sliver) + return True else: logger.log("vsys.removeSliverFromConf: no need to remove %s"%sliver) - + return False def trashVsysHandleInSliver (sliver): slice_vsys_area = "/vservers/%s/vsys"%sliver diff --git a/sliver_lxc.py b/sliver_lxc.py index a08b0ce..c3b098e 100644 --- a/sliver_lxc.py +++ b/sliver_lxc.py @@ -15,7 +15,7 @@ from string import Template # if slivers get created by doing a,b,c # then they sohuld be delted by doing c,b,a # the current ordering model for vsys plugins completely fails to capture that -from plugins.vsys import removeSliverFromVsys +from plugins.vsys import removeSliverFromVsys, startService as vsysStartService import libvirt @@ -282,11 +282,6 @@ unset pathmunge containerDir = Sliver_LXC.CON_BASE_DIR + '/%s'%(name) - # Slivers with vsys running will fail the subvolume delete - # A more permanent solution may be to ensure that the vsys module - # is called before the sliver is destroyed. - removeSliverFromVsys (name) - try: # Destroy libvirt domain dom = conn.lookupByName(name) @@ -294,6 +289,10 @@ unset pathmunge logger.verbose('sliver_lxc.destroy: Domain %s does not exist!' 
% name) return + # Slivers with vsys running will fail the subvolume delete + # removeSliverFromVsys returns True if it stops vsys, telling us to start it again later + vsys_stopped = removeSliverFromVsys (name) + try: logger.log("sliver_lxc.destroy: destroying domain %s"%name) dom.destroy() @@ -310,23 +309,35 @@ unset pathmunge command = ['/usr/sbin/userdel', '-f', '-r', name] logger.log_call(command, timeout=15*60) + # clean up rootfs as userdel will only take care of /home/ + command = ['rm','-rf', containerDir] + logger.log_call(command, timeout=60) + # at this point we sometimes see one subvolume left in /vservers//vrefname + command = ['btrfs', 'subvolume', 'delete', "%s/*"%containerDir ] + logger.log_call(command, timeout=10) # Remove rootfs of destroyed domain command = ['btrfs', 'subvolume', 'delete', containerDir] - logger.log_call(command, timeout=60) + logger.log_call(command, timeout=10) if not os.path.exists(containerDir): logger.log('sliver_lxc.destroy: %s cleanly destroyed.'%name) else: # oh no, it's still here... 
- logger.log("sliver_lxc.destroy: 1st warning: could not delete %s" % containerDir) - # this is for debugging but does not seem to be of much use - logger.log_call (['lsof']) - # what I can see on running nodes is that a second subvolume delete seems to do the trick here - # so let's check if that could be a workaround - logger.log("sliver_lxc.destroy: 2nd attempt at btrfs subvolume delete %s" % containerDir) - command = ['btrfs', 'subvolume', 'delete', containerDir] - logger.log_call(command, timeout=60) - if not os.path.exists(containerDir): - logger.log("sliver_lxc.destroy: WARNING: failed to delete %s after 2 attempts"%containerDir) - - + # this is more of a way to try and understand what is going on here + # than a real solution to anything + pass_no=1 + max_passes=2 + while pass_no <= max_passes: + command = ['rm', '-rf', containerDir] + logger.log("sliver_lxc.destroy: cleanup pass %d - command %s"%(pass_no,command)) + logger.log_call(command, timeout=5) + command = ['btrfs', 'subvolume', 'delete', containerDir] + logger.log("sliver_lxc.destroy: cleanup pass %d - command %s"%(pass_no,command)) + logger.log_call(command, timeout=5) + import time + time.sleep(1) + pass_no += 1 + if os.path.exists(containerDir): + logger.log('sliver_lxc.destroy: could not cleanly destroy %s - giving up'%name) + + if vsys_stopped: vsysStartService()