X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=kernel%2Fvserver%2Fcontext.c;h=d38b23cb0ba334acdff78dbe324588dec4e906bd;hb=97bf2856c6014879bd04983a3e9dfcdac1e7fe85;hp=8b3cee7cc0630da8b2bd621668d6ca0396da1a6c;hpb=d46bc780027c5439db9f72d42c0732775b53925a;p=linux-2.6.git

diff --git a/kernel/vserver/context.c b/kernel/vserver/context.c
index 8b3cee7cc..d38b23cb0 100644
--- a/kernel/vserver/context.c
+++ b/kernel/vserver/context.c
@@ -3,7 +3,7 @@
  *
  * Virtual Server: Context Support
  *
- * Copyright (C) 2003-2004 Herbert Pötzl
+ * Copyright (C) 2003-2007 Herbert Pötzl
  *
  * V0.01 context helper
  * V0.02 vx_ctx_kill syscall command
@@ -13,20 +13,54 @@
  * V0.06 task_xid and info commands
  * V0.07 context flags and caps
  * V0.08 switch to RCU based hash
+ * V0.09 revert to non RCU for now
+ * V0.10 and back to working RCU hash
+ * V0.11 and back to locking again
+ * V0.12 referenced context store
+ * V0.13 separate per cpu data
+ * V0.14 changed vcmds to vxi arg
+ * V0.15 added context stat
+ * V0.16 have __create claim() the vxi
  *
  */

-#include
 #include
+#include
+#include
+#include
+
+#include
 #include
+#include
 #include
-#include
-#include
-#include
-#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
 #include
+#include "cvirt_init.h"
+#include "cacct_init.h"
+#include "limit_init.h"
+#include "sched_init.h"
+
+
+atomic_t vx_global_ctotal = ATOMIC_INIT(0);
+atomic_t vx_global_cactive = ATOMIC_INIT(0);
+
+
+/* now inactive context structures */
+
+static struct hlist_head vx_info_inactive = HLIST_HEAD_INIT;
+
+static spinlock_t vx_info_inactive_lock = SPIN_LOCK_UNLOCKED;
+

 /*  __alloc_vx_info()

@@ -36,8 +70,9 @@ static struct vx_info *__alloc_vx_info(xid_t xid)
 {
     struct vx_info *new = NULL;
-
-    vxdprintk("alloc_vx_info(%d)\n", xid);
+    int cpu;
+
+    vxdprintk(VXD_CBIT(xid, 0), "alloc_vx_info(%d)*", xid);

     /* would this benefit from a slab cache? */
     new = kmalloc(sizeof(struct vx_info), GFP_KERNEL);
@@ -45,11 +80,22 @@ static struct vx_info *__alloc_vx_info(xid_t xid)
         return 0;
     memset (new, 0, sizeof(struct vx_info));
+#ifdef CONFIG_SMP
+    new->ptr_pc = alloc_percpu(struct _vx_info_pc);
+    if (!new->ptr_pc)
+        goto error;
+#endif
     new->vx_id = xid;
-    INIT_RCU_HEAD(&new->vx_rcu);
     INIT_HLIST_NODE(&new->vx_hlist);
-    atomic_set(&new->vx_refcnt, 0);
     atomic_set(&new->vx_usecnt, 0);
+    atomic_set(&new->vx_tasks, 0);
+    new->vx_parent = NULL;
+    new->vx_state = 0;
+    init_waitqueue_head(&new->vx_wait);
+
+    /* prepare reaper */
+    get_task_struct(init_pid_ns.child_reaper);
+    new->vx_reaper = init_pid_ns.child_reaper;

     /* rest of init goes here */
     vx_info_init_limit(&new->limit);
@@ -57,12 +103,32 @@ static struct vx_info *__alloc_vx_info(xid_t xid)
     vx_info_init_cvirt(&new->cvirt);
     vx_info_init_cacct(&new->cacct);

-    new->vx_flags = VXF_STATE_SETUP|VXF_STATE_INIT;
+    /* per cpu data structures */
+    for_each_possible_cpu(cpu) {
+        vx_info_init_sched_pc(
+            &vx_per_cpu(new, sched_pc, cpu), cpu);
+        vx_info_init_cvirt_pc(
+            &vx_per_cpu(new, cvirt_pc, cpu), cpu);
+    }
+
+    new->vx_flags = VXF_INIT_SET;
     new->vx_bcaps = CAP_INIT_EFF_SET;
     new->vx_ccaps = 0;
+    new->vx_cap_bset = cap_bset;
+
+    new->reboot_cmd = 0;
+    new->exit_code = 0;

-    vxdprintk("alloc_vx_info(%d) = %p\n", xid, new);
+    vxdprintk(VXD_CBIT(xid, 0),
+        "alloc_vx_info(%d) = %p", xid, new);
+    vxh_alloc_vx_info(new);
+    atomic_inc(&vx_global_ctotal);
     return new;
+#ifdef CONFIG_SMP
+error:
+    kfree(new);
+    return 0;
+#endif
 }

 /*  __dealloc_vx_info()

@@ -71,33 +137,85 @@ static struct vx_info *__alloc_vx_info(xid_t xid)

 static void __dealloc_vx_info(struct vx_info *vxi)
 {
-    vxdprintk("dealloc_vx_info(%p)\n", vxi);
+    int cpu;
+
+    vxdprintk(VXD_CBIT(xid, 0),
+        "dealloc_vx_info(%p)", vxi);
+    vxh_dealloc_vx_info(vxi);

-    vxi->vx_hlist.next = LIST_POISON1;
     vxi->vx_id = -1;

-    if (vxi->vx_namespace)
-        put_namespace(vxi->vx_namespace);
-    if (vxi->vx_fs)
-        put_fs_struct(vxi->vx_fs);
-
     vx_info_exit_limit(&vxi->limit);
     vx_info_exit_sched(&vxi->sched);
     vx_info_exit_cvirt(&vxi->cvirt);
     vx_info_exit_cacct(&vxi->cacct);
-
-    BUG_ON(atomic_read(&vxi->vx_usecnt));
-    BUG_ON(atomic_read(&vxi->vx_refcnt));
+    for_each_possible_cpu(cpu) {
+        vx_info_exit_sched_pc(
+            &vx_per_cpu(vxi, sched_pc, cpu), cpu);
+        vx_info_exit_cvirt_pc(
+            &vx_per_cpu(vxi, cvirt_pc, cpu), cpu);
+    }
+
+    vxi->vx_state |= VXS_RELEASED;
+
+#ifdef CONFIG_SMP
+    free_percpu(vxi->ptr_pc);
+#endif
     kfree(vxi);
+    atomic_dec(&vx_global_ctotal);
+}
+
+static void __shutdown_vx_info(struct vx_info *vxi)
+{
+    struct nsproxy *nsproxy;
+    struct fs_struct *fs;
+
+    might_sleep();
+
+    vxi->vx_state |= VXS_SHUTDOWN;
+    vs_state_change(vxi, VSC_SHUTDOWN);
+
+    nsproxy = xchg(&vxi->vx_nsproxy, NULL);
+    fs = xchg(&vxi->vx_fs, NULL);
+
+    if (nsproxy)
+        put_nsproxy(nsproxy);
+    if (fs)
+        put_fs_struct(fs);
+}
+
+/* exported stuff */
+
+void free_vx_info(struct vx_info *vxi)
+{
+    unsigned long flags;
+
+    /* context shutdown is mandatory */
+    BUG_ON(!vx_info_state(vxi, VXS_SHUTDOWN));
+
+    BUG_ON(atomic_read(&vxi->vx_usecnt));
+    BUG_ON(atomic_read(&vxi->vx_tasks));
+
+    BUG_ON(vx_info_state(vxi, VXS_HASHED));
+
+    BUG_ON(vxi->vx_nsproxy);
+    BUG_ON(vxi->vx_fs);
+
+    spin_lock_irqsave(&vx_info_inactive_lock, flags);
+    hlist_del(&vxi->vx_hlist);
+    spin_unlock_irqrestore(&vx_info_inactive_lock, flags);
+
+    __dealloc_vx_info(vxi);
 }


 /*  hash table for vx_info hash */

-#define VX_HASH_SIZE    13
+#define VX_HASH_SIZE    13

-struct hlist_head vx_info_hash[VX_HASH_SIZE];
+static struct hlist_head vx_info_hash[VX_HASH_SIZE] =
+    { [0 ... VX_HASH_SIZE-1] = HLIST_HEAD_INIT };

 static spinlock_t vx_info_hash_lock = SPIN_LOCK_UNLOCKED;

@@ -117,11 +235,19 @@ static inline unsigned int __hashval(xid_t xid)
 static inline void __hash_vx_info(struct vx_info *vxi)
 {
     struct hlist_head *head;
-
-    vxdprintk("__hash_vx_info: %p[#%d]\n", vxi, vxi->vx_id);
-    get_vx_info(vxi);
+
+    vxd_assert_lock(&vx_info_hash_lock);
+    vxdprintk(VXD_CBIT(xid, 4),
+        "__hash_vx_info: %p[#%d]", vxi, vxi->vx_id);
+    vxh_hash_vx_info(vxi);
+
+    /* context must not be hashed */
+    BUG_ON(vx_info_state(vxi, VXS_HASHED));
+
+    vxi->vx_state |= VXS_HASHED;
     head = &vx_info_hash[__hashval(vxi->vx_id)];
-    hlist_add_head_rcu(&vxi->vx_hlist, head);
+    hlist_add_head(&vxi->vx_hlist, head);
+    atomic_inc(&vx_global_cactive);
 }

 /*  __unhash_vx_info()

@@ -131,31 +257,53 @@ static inline void __hash_vx_info(struct vx_info *vxi)
 static inline void __unhash_vx_info(struct vx_info *vxi)
 {
-    vxdprintk("__unhash_vx_info: %p[#%d]\n", vxi, vxi->vx_id);
-    hlist_del_rcu(&vxi->vx_hlist);
-    put_vx_info(vxi);
+    unsigned long flags;
+
+    vxd_assert_lock(&vx_info_hash_lock);
+    vxdprintk(VXD_CBIT(xid, 4),
+        "__unhash_vx_info: %p[#%d.%d.%d]", vxi, vxi->vx_id,
+        atomic_read(&vxi->vx_usecnt), atomic_read(&vxi->vx_tasks));
+    vxh_unhash_vx_info(vxi);
+
+    /* context must be hashed */
+    BUG_ON(!vx_info_state(vxi, VXS_HASHED));
+    /* but without tasks */
+    BUG_ON(atomic_read(&vxi->vx_tasks));
+
+    vxi->vx_state &= ~VXS_HASHED;
+    hlist_del_init(&vxi->vx_hlist);
+    spin_lock_irqsave(&vx_info_inactive_lock, flags);
+    hlist_add_head(&vxi->vx_hlist, &vx_info_inactive);
+    spin_unlock_irqrestore(&vx_info_inactive_lock, flags);
+    atomic_dec(&vx_global_cactive);
 }

 /*  __lookup_vx_info()

- * requires the rcu_read_lock()
+ * requires the hash_lock to be held
  * doesn't increment the vx_refcnt */

 static inline struct vx_info *__lookup_vx_info(xid_t xid)
 {
     struct hlist_head *head = &vx_info_hash[__hashval(xid)];
     struct hlist_node *pos;
+    struct vx_info *vxi;

+    vxd_assert_lock(&vx_info_hash_lock);
     hlist_for_each(pos, head) {
-        struct vx_info *vxi =
-            hlist_entry(pos, struct vx_info, vx_hlist);
+        vxi = hlist_entry(pos, struct vx_info, vx_hlist);

-        if (vxi->vx_id == xid) {
-            return vxi;
-        }
+        if (vxi->vx_id == xid)
+            goto found;
     }
-    return NULL;
+    vxi = NULL;
+found:
+    vxdprintk(VXD_CBIT(xid, 0),
+        "__lookup_vx_info(#%u): %p[#%u]",
+        xid, vxi, vxi?vxi->vx_id:0);
+    vxh_lookup_vx_info(vxi, xid);
+    return vxi;
 }

@@ -168,16 +316,22 @@ static inline xid_t __vx_dynamic_id(void)
 {
     static xid_t seq = MAX_S_CONTEXT;
     xid_t barrier = seq;
-
+
+    vxd_assert_lock(&vx_info_hash_lock);
     do {
         if (++seq > MAX_S_CONTEXT)
             seq = MIN_D_CONTEXT;
-        if (!__lookup_vx_info(seq))
+        if (!__lookup_vx_info(seq)) {
+            vxdprintk(VXD_CBIT(xid, 4),
+                "__vx_dynamic_id: [#%d]", seq);
             return seq;
+        }
     } while (barrier != seq);
     return 0;
 }

+#ifdef CONFIG_VSERVER_LEGACY
+
 /*  __loc_vx_info()

  * locate or create the requested context
@@ -186,34 +340,42 @@ static inline xid_t __vx_dynamic_id(void)
 static struct vx_info * __loc_vx_info(int id, int *err)
 {
     struct vx_info *new, *vxi = NULL;
-
-    vxdprintk("loc_vx_info(%d)\n", id);
+
+    vxdprintk(VXD_CBIT(xid, 1), "loc_vx_info(%d)*", id);

     if (!(new = __alloc_vx_info(id))) {
         *err = -ENOMEM;
         return NULL;
     }

+    /* required to make dynamic xids unique */
     spin_lock(&vx_info_hash_lock);

     /* dynamic context requested */
     if (id == VX_DYNAMIC_ID) {
+#ifdef CONFIG_VSERVER_DYNAMIC_IDS
         id = __vx_dynamic_id();
         if (!id) {
             printk(KERN_ERR "no dynamic context available.\n");
             goto out_unlock;
         }
         new->vx_id = id;
+#else
+        printk(KERN_ERR "dynamic contexts disabled.\n");
+        goto out_unlock;
+#endif
     }
     /* existing context requested */
     else if ((vxi = __lookup_vx_info(id))) {
         /* context in setup is not available */
         if (vxi->vx_flags & VXF_STATE_SETUP) {
-            vxdprintk("loc_vx_info(%d) = %p (not available)\n", id, vxi);
+            vxdprintk(VXD_CBIT(xid, 0),
+                "loc_vx_info(%d) = %p (not available)", id, vxi);
             vxi = NULL;
             *err = -EBUSY;
         } else {
-            vxdprintk("loc_vx_info(%d) = %p (found)\n", id, vxi);
+            vxdprintk(VXD_CBIT(xid, 0),
+                "loc_vx_info(%d) = %p (found)", id, vxi);
             get_vx_info(vxi);
             *err = 0;
         }
@@ -221,91 +383,140 @@ static struct vx_info * __loc_vx_info(int id, int *err)
     }

     /* new context requested */
-    vxdprintk("loc_vx_info(%d) = %p (new)\n", id, new);
+    vxdprintk(VXD_CBIT(xid, 0),
+        "loc_vx_info(%d) = %p (new)", id, new);
     __hash_vx_info(get_vx_info(new));
     vxi = new, new = NULL;
     *err = 1;

 out_unlock:
     spin_unlock(&vx_info_hash_lock);
+    vxh_loc_vx_info(vxi, id);
     if (new)
         __dealloc_vx_info(new);
     return vxi;
 }
+#endif

+/*  __create_vx_info()

-/* exported stuff */
+    * create the requested context
+    * get(), claim() and hash it */

+static struct vx_info * __create_vx_info(int id)
+{
+    struct vx_info *new, *vxi = NULL;

+    vxdprintk(VXD_CBIT(xid, 1), "create_vx_info(%d)*", id);

-void rcu_free_vx_info(void *obj)
-{
-    struct vx_info *vxi = obj;
-    int usecnt, refcnt;
+    if (!(new = __alloc_vx_info(id)))
+        return ERR_PTR(-ENOMEM);

-    usecnt = atomic_read(&vxi->vx_usecnt);
-    BUG_ON(usecnt < 0);
+    /* required to make dynamic xids unique */
+    spin_lock(&vx_info_hash_lock);
+
+    /* dynamic context requested */
+    if (id == VX_DYNAMIC_ID) {
+#ifdef CONFIG_VSERVER_DYNAMIC_IDS
+        id = __vx_dynamic_id();
+        if (!id) {
+            printk(KERN_ERR "no dynamic context available.\n");
+            vxi = ERR_PTR(-EAGAIN);
+            goto out_unlock;
+        }
+        new->vx_id = id;
+#else
+        printk(KERN_ERR "dynamic contexts disabled.\n");
+        vxi = ERR_PTR(-EINVAL);
+        goto out_unlock;
+#endif
+    }
+    /* static context requested */
+    else if ((vxi = __lookup_vx_info(id))) {
+        vxdprintk(VXD_CBIT(xid, 0),
+            "create_vx_info(%d) = %p (already there)", id, vxi);
+        if (vx_info_flags(vxi, VXF_STATE_SETUP, 0))
+            vxi = ERR_PTR(-EBUSY);
+        else
+            vxi = ERR_PTR(-EEXIST);
+        goto out_unlock;
+    }
+#ifdef CONFIG_VSERVER_DYNAMIC_IDS
+    /* dynamic xid creation blocker */
+    else if (id >= MIN_D_CONTEXT) {
+        vxdprintk(VXD_CBIT(xid, 0),
+            "create_vx_info(%d) (dynamic rejected)", id);
+        vxi = ERR_PTR(-EINVAL);
+        goto out_unlock;
+    }
+#endif

-    refcnt = atomic_read(&vxi->vx_refcnt);
-    BUG_ON(refcnt < 0);
+    /* new context */
+    vxdprintk(VXD_CBIT(xid, 0),
+        "create_vx_info(%d) = %p (new)", id, new);
+    claim_vx_info(new, NULL);
+    __hash_vx_info(get_vx_info(new));
+    vxi = new, new = NULL;

-    if (!usecnt)
-        __dealloc_vx_info(vxi);
-    else
-        printk("!!! rcu didn't free\n");
+out_unlock:
+    spin_unlock(&vx_info_hash_lock);
+    vxh_create_vx_info(IS_ERR(vxi)?NULL:vxi, id);
+    if (new)
+        __dealloc_vx_info(new);
+    return vxi;
 }
+
+/* exported stuff */
+
+
 void unhash_vx_info(struct vx_info *vxi)
 {
+    __shutdown_vx_info(vxi);
     spin_lock(&vx_info_hash_lock);
     __unhash_vx_info(vxi);
     spin_unlock(&vx_info_hash_lock);
+    __wakeup_vx_info(vxi);
 }

-/*  locate_vx_info()
-    * search for a vx_info and get() it
+/*  lookup_vx_info()
+
+    * search for a vx_info and get() it
     * negative id means current */

-struct vx_info *locate_vx_info(int id)
+struct vx_info *lookup_vx_info(int id)
 {
-    struct vx_info *vxi;
-
+    struct vx_info *vxi = NULL;
+
     if (id < 0) {
         vxi = get_vx_info(current->vx_info);
-    } else {
-        rcu_read_lock();
+    } else if (id > 1) {
+        spin_lock(&vx_info_hash_lock);
         vxi = get_vx_info(__lookup_vx_info(id));
-        rcu_read_unlock();
+        spin_unlock(&vx_info_hash_lock);
     }
     return vxi;
 }

-/*  vx_info_is_hashed()
+/*  xid_is_hashed()
     * verify that xid is still hashed */

-int vx_info_is_hashed(xid_t xid)
+int xid_is_hashed(xid_t xid)
 {
     int hashed;

-    rcu_read_lock();
+    spin_lock(&vx_info_hash_lock);
     hashed = (__lookup_vx_info(xid) != NULL);
-    rcu_read_unlock();
+    spin_unlock(&vx_info_hash_lock);
     return hashed;
 }

 #ifdef CONFIG_VSERVER_LEGACY

-#if 0
-struct vx_info *alloc_vx_info(xid_t xid)
-{
-    return __alloc_vx_info(xid);
-}
-#endif
-
-struct vx_info *locate_or_create_vx_info(int id)
+struct vx_info *lookup_or_create_vx_info(int id)
 {
     int err;

@@ -316,43 +527,75 @@

 #ifdef CONFIG_PROC_FS

-#define hlist_for_each_rcu(pos, head) \
-    for (pos = (head)->first; pos && ({ prefetch(pos->next); 1;}); \
-        pos = pos->next, ({ smp_read_barrier_depends(); 0;}))
+/*  get_xid_list()
+
+    * get a subset of hashed xids for proc
+    * assumes size is at least one */

 int get_xid_list(int index, unsigned int *xids, int size)
 {
     int hindex, nr_xids = 0;

-    rcu_read_lock();
+    /* only show current and children */
+    if (!vx_check(0, VS_ADMIN|VS_WATCH)) {
+        if (index > 0)
+            return 0;
+        xids[nr_xids] = vx_current_xid();
+        return 1;
+    }
+
     for (hindex = 0; hindex < VX_HASH_SIZE; hindex++) {
         struct hlist_head *head = &vx_info_hash[hindex];
         struct hlist_node *pos;

-        hlist_for_each_rcu(pos, head) {
+        spin_lock(&vx_info_hash_lock);
+        hlist_for_each(pos, head) {
             struct vx_info *vxi;

             if (--index > 0)
                 continue;

             vxi = hlist_entry(pos, struct vx_info, vx_hlist);
-            xids[nr_xids] = vxi->vx_id;
-            if (++nr_xids >= size)
+            xids[nr_xids] = vxi->vx_id;
+            if (++nr_xids >= size) {
+                spin_unlock(&vx_info_hash_lock);
                 goto out;
+            }
         }
+        /* keep the lock time short */
+        spin_unlock(&vx_info_hash_lock);
     }
 out:
-    rcu_read_unlock();
     return nr_xids;
 }
 #endif

+#ifdef CONFIG_VSERVER_DEBUG
+
+void dump_vx_info_inactive(int level)
+{
+    struct hlist_node *entry, *next;
+
+    hlist_for_each_safe(entry, next, &vx_info_inactive) {
+        struct vx_info *vxi =
+            list_entry(entry, struct vx_info, vx_hlist);
+
+        dump_vx_info(vxi, level);
+    }
+}
+
+#endif
+
 int vx_migrate_user(struct task_struct *p, struct vx_info *vxi)
 {
     struct user_struct *new_user, *old_user;
-
+
     if (!p || !vxi)
         BUG();
+
+    if (vx_info_flags(vxi, VXF_INFO_PRIVATE, 0))
+        return -EACCES;
+
     new_user = alloc_uid(vxi->vx_id, p->uid);
     if (!new_user)
         return -ENOMEM;
@@ -367,49 +610,28 @@ int vx_migrate_user(struct task_struct *p, struct vx_info *vxi)
     return 0;
 }

-void vx_mask_bcaps(struct task_struct *p)
+void vx_mask_cap_bset(struct vx_info *vxi, struct task_struct *p)
 {
-    struct vx_info *vxi = p->vx_info;
-
-    p->cap_effective &= vxi->vx_bcaps;
-    p->cap_inheritable &= vxi->vx_bcaps;
-    p->cap_permitted &= vxi->vx_bcaps;
+    p->cap_effective &= vxi->vx_cap_bset;
+    p->cap_inheritable &= vxi->vx_cap_bset;
+    p->cap_permitted &= vxi->vx_cap_bset;
 }

 #include

-static inline int vx_nofiles_task(struct task_struct *tsk)
-{
-    struct files_struct *files = tsk->files;
-    const unsigned long *obptr, *cbptr;
-    int count, total;
-
-    spin_lock(&files->file_lock);
-    obptr = files->open_fds->fds_bits;
-    cbptr = files->close_on_exec->fds_bits;
-    count = files->max_fds / (sizeof(unsigned long) * 8);
-    for (total = 0; count > 0; count--) {
-        if (*obptr)
-            total += hweight_long(*obptr);
-        obptr++;
-        /* if (*cbptr)
-            total += hweight_long(*cbptr);
-        cbptr++; */
-    }
-    spin_unlock(&files->file_lock);
-    return total;
-}
-
-static inline int vx_openfd_task(struct task_struct *tsk)
+static int vx_openfd_task(struct task_struct *tsk)
 {
     struct files_struct *files = tsk->files;
+    struct fdtable *fdt;
     const unsigned long *bptr;
     int count, total;

+    /* no rcu_read_lock() because of spin_lock() */
     spin_lock(&files->file_lock);
-    bptr = files->open_fds->fds_bits;
-    count = files->max_fds / (sizeof(unsigned long) * 8);
+    fdt = files_fdtable(files);
+    bptr = fdt->open_fds->fds_bits;
+    count = fdt->max_fds / (sizeof(unsigned long) * 8);
     for (total = 0; count > 0; count--) {
         if (*bptr)
             total += hweight_long(*bptr);
@@ -419,62 +641,190 @@ static inline int vx_openfd_task(struct task_struct *tsk)
     return total;
 }

+
+/* for *space compatibility */
+
+asmlinkage long sys_unshare(unsigned long);
+
 /*
  * migrate task to new context
  * gets vxi, puts old_vxi on change
+ * optionally unshares namespaces (hack)
  */

-int vx_migrate_task(struct task_struct *p, struct vx_info *vxi)
+int vx_migrate_task(struct task_struct *p, struct vx_info *vxi, int unshare)
 {
     struct vx_info *old_vxi;
     int ret = 0;
-
+
     if (!p || !vxi)
         BUG();

+    vxdprintk(VXD_CBIT(xid, 5),
+        "vx_migrate_task(%p,%p[#%d.%d])", p, vxi,
+        vxi->vx_id, atomic_read(&vxi->vx_usecnt));
+
+    if (vx_info_flags(vxi, VXF_INFO_PRIVATE, 0) &&
+        !vx_info_flags(vxi, VXF_STATE_SETUP, 0))
+        return -EACCES;
+
+    if (vx_info_state(vxi, VXS_SHUTDOWN))
+        return -EFAULT;
+
     old_vxi = task_get_vx_info(p);
     if (old_vxi == vxi)
         goto out;

-    vxdprintk("vx_migrate_task(%p,%p[#%d.%d)\n", p, vxi,
-        vxi->vx_id, atomic_read(&vxi->vx_usecnt));
-
     if (!(ret = vx_migrate_user(p, vxi))) {
+        int openfd;
+        task_lock(p);
+        openfd = vx_openfd_task(p);
+
         if (old_vxi) {
-            atomic_dec(&old_vxi->cacct.nr_threads);
-            atomic_dec(&old_vxi->limit.res[RLIMIT_NPROC]);
-        }
-        atomic_inc(&vxi->cacct.nr_threads);
-        atomic_inc(&vxi->limit.res[RLIMIT_NPROC]);
-        atomic_add(vx_nofiles_task(p), &vxi->limit.res[RLIMIT_NOFILE]);
-        atomic_add(vx_openfd_task(p), &vxi->limit.res[RLIMIT_OPENFD]);
-        /* should be handled in set_vx_info !! */
-        if (old_vxi)
+            atomic_dec(&old_vxi->cvirt.nr_threads);
+            atomic_dec(&old_vxi->cvirt.nr_running);
+            __rlim_dec(&old_vxi->limit, RLIMIT_NPROC);
+            /* FIXME: what about the struct files here? */
+            __rlim_sub(&old_vxi->limit, VLIMIT_OPENFD, openfd);
+            /* account for the executable */
+            __rlim_dec(&old_vxi->limit, VLIMIT_DENTRY);
+        }
+        atomic_inc(&vxi->cvirt.nr_threads);
+        atomic_inc(&vxi->cvirt.nr_running);
+        __rlim_inc(&vxi->limit, RLIMIT_NPROC);
+        /* FIXME: what about the struct files here? */
+        __rlim_add(&vxi->limit, VLIMIT_OPENFD, openfd);
+        /* account for the executable */
+        __rlim_inc(&vxi->limit, VLIMIT_DENTRY);
+
+        if (old_vxi) {
+            release_vx_info(old_vxi, p);
             clr_vx_info(&p->vx_info);
+        }
+        claim_vx_info(vxi, p);
         set_vx_info(&p->vx_info, vxi);
         p->xid = vxi->vx_id;
-        vx_mask_bcaps(p);
+
+        vxdprintk(VXD_CBIT(xid, 5),
+            "moved task %p into vxi:%p[#%d]",
+            p, vxi, vxi->vx_id);
+
+        vx_mask_cap_bset(vxi, p);
         task_unlock(p);
-        put_vx_info(old_vxi);
+        /* hack for *spaces to provide compatibility */
+        if (unshare) {
+            ret = sys_unshare(CLONE_NEWUTS|CLONE_NEWIPC);
+            vx_set_space(vxi, CLONE_NEWUTS|CLONE_NEWIPC);
+        }
     }
 out:
     put_vx_info(old_vxi);
     return ret;
 }

+int vx_set_reaper(struct vx_info *vxi, struct task_struct *p)
+{
+    struct task_struct *old_reaper;
+
+    if (!vxi)
+        return -EINVAL;
+
+    vxdprintk(VXD_CBIT(xid, 6),
+        "vx_set_reaper(%p[#%d],%p[#%d,%d])",
+        vxi, vxi->vx_id, p, p->xid, p->pid);
+
+    old_reaper = vxi->vx_reaper;
+    if (old_reaper == p)
+        return 0;
+
+    /* set new child reaper */
+    get_task_struct(p);
+    vxi->vx_reaper = p;
+    put_task_struct(old_reaper);
+    return 0;
+}
+
 int vx_set_init(struct vx_info *vxi, struct task_struct *p)
 {
     if (!vxi)
         return -EINVAL;
-    if (vxi->vx_initpid)
-        return -EPERM;
-    vxi->vx_initpid = p->tgid;
+
+    vxdprintk(VXD_CBIT(xid, 6),
+        "vx_set_init(%p[#%d],%p[#%d,%d,%d])",
+        vxi, vxi->vx_id, p, p->xid, p->pid, p->tgid);
+
+    vxi->vx_flags &= ~VXF_STATE_INIT;
+    vxi->vx_initpid = p->tgid;
     return 0;
 }

+void vx_exit_init(struct vx_info *vxi, struct task_struct *p, int code)
+{
+    vxdprintk(VXD_CBIT(xid, 6),
+        "vx_exit_init(%p[#%d],%p[#%d,%d,%d])",
+        vxi, vxi->vx_id, p, p->xid, p->pid, p->tgid);
+
+    vxi->exit_code = code;
+    vxi->vx_initpid = 0;
+}
+
+
+void vx_set_persistent(struct vx_info *vxi)
+{
+    vxdprintk(VXD_CBIT(xid, 6),
+        "vx_set_persistent(%p[#%d])", vxi, vxi->vx_id);
+
+    get_vx_info(vxi);
+    claim_vx_info(vxi, NULL);
+}
+
+void vx_clear_persistent(struct vx_info *vxi)
+{
+    vxdprintk(VXD_CBIT(xid, 6),
+        "vx_clear_persistent(%p[#%d])", vxi, vxi->vx_id);
+
+    release_vx_info(vxi, NULL);
+    put_vx_info(vxi);
+}
+
+void vx_update_persistent(struct vx_info *vxi)
+{
+    if (vx_info_flags(vxi, VXF_PERSISTENT, 0))
+        vx_set_persistent(vxi);
+    else
+        vx_clear_persistent(vxi);
+}
+
+
+/* task must be current or locked */
+
+void exit_vx_info(struct task_struct *p, int code)
+{
+    struct vx_info *vxi = p->vx_info;
+
+    if (vxi) {
+        atomic_dec(&vxi->cvirt.nr_threads);
+        vx_nproc_dec(p);
+
+        vxi->exit_code = code;
+        release_vx_info(vxi, p);
+    }
+}
+
+void exit_vx_info_early(struct task_struct *p, int code)
+{
+    struct vx_info *vxi = p->vx_info;
+
+    if (vxi) {
+        if (vxi->vx_initpid == p->tgid)
+            vx_exit_init(vxi, p, code);
+        if (vxi->vx_reaper == p)
+            vx_set_reaper(vxi, init_pid_ns.child_reaper);
+    }
+}
+

 /* vserver syscall commands below here */

@@ -485,42 +835,44 @@ int vx_set_init(struct vx_info *vxi, struct task_struct *p)

 int vc_task_xid(uint32_t id, void __user *data)
 {
-    xid_t xid;
+    xid_t xid;

-    if (id) {
-        struct task_struct *tsk;
+    if (id) {
+        struct task_struct *tsk;

-        if (!vx_check(0, VX_ADMIN|VX_WATCH))
-            return -EPERM;
+        if (!vx_check(0, VS_ADMIN|VS_WATCH))
+            return -EPERM;

-        read_lock(&tasklist_lock);
-        tsk = find_task_by_pid(id);
-        xid = (tsk) ? tsk->xid : -ESRCH;
-        read_unlock(&tasklist_lock);
-    }
-    else
-        xid = current->xid;
-    return xid;
+        read_lock(&tasklist_lock);
+        tsk = find_task_by_real_pid(id);
+        xid = (tsk) ? tsk->xid : -ESRCH;
+        read_unlock(&tasklist_lock);
+    }
+    else
+        xid = vx_current_xid();
+    return xid;
 }

-int vc_vx_info(uint32_t id, void __user *data)
+int vc_vx_info(struct vx_info *vxi, void __user *data)
 {
-    struct vx_info *vxi;
     struct vcmd_vx_info_v0 vc_data;

-    if (!vx_check(0, VX_ADMIN))
-        return -ENOSYS;
-    if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_RESOURCE))
-        return -EPERM;
-
-    vxi = locate_vx_info(id);
-    if (!vxi)
-        return -ESRCH;
-
     vc_data.xid = vxi->vx_id;
     vc_data.initpid = vxi->vx_initpid;
-    put_vx_info(vxi);
+
+    if (copy_to_user (data, &vc_data, sizeof(vc_data)))
+        return -EFAULT;
+    return 0;
+}
+
+
+int vc_ctx_stat(struct vx_info *vxi, void __user *data)
+{
+    struct vcmd_ctx_stat_v0 vc_data;
+
+    vc_data.usecnt = atomic_read(&vxi->vx_usecnt);
+    vc_data.tasks = atomic_read(&vxi->vx_tasks);

     if (copy_to_user (data, &vc_data, sizeof(vc_data)))
         return -EFAULT;
@@ -532,157 +884,213 @@ int vc_vx_info(struct vx_info *vxi, void __user *data)

 int vc_ctx_create(uint32_t xid, void __user *data)
 {
+    struct vcmd_ctx_create vc_data = { .flagword = VXF_INIT_SET };
     struct vx_info *new_vxi;
     int ret;

-    if (!capable(CAP_SYS_ADMIN))
-        return -EPERM;
+    if (data && copy_from_user (&vc_data, data, sizeof(vc_data)))
+        return -EFAULT;

-    if ((xid >= MIN_D_CONTEXT) && (xid != VX_DYNAMIC_ID))
+    if ((xid > MAX_S_CONTEXT) && (xid != VX_DYNAMIC_ID))
         return -EINVAL;
-
-    if (xid < 1)
+    if (xid < 2)
         return -EINVAL;

-    new_vxi = __loc_vx_info(xid, &ret);
-    if (!new_vxi)
-        return ret;
-    if (!(new_vxi->vx_flags & VXF_STATE_SETUP)) {
-        ret = -EEXIST;
-        goto out_put;
-    }
+    new_vxi = __create_vx_info(xid);
+    if (IS_ERR(new_vxi))
+        return PTR_ERR(new_vxi);
+
+    /* initial flags */
+    new_vxi->vx_flags = vc_data.flagword;
+    ret = -ENOEXEC;
+    if (vs_state_change(new_vxi, VSC_STARTUP))
+        goto out;
+
+    ret = vx_migrate_task(current, new_vxi, (!data));
+    if (ret)
+        goto out;
+
+    /* return context id on success */
     ret = new_vxi->vx_id;
-    vx_migrate_task(current, new_vxi);
-    /* if this fails, we might end up with a hashed vx_info */
-out_put:
+
+    /* get a reference for persistent contexts */
+    if ((vc_data.flagword & VXF_PERSISTENT))
+        vx_set_persistent(new_vxi);
+out:
+    release_vx_info(new_vxi, NULL);
     put_vx_info(new_vxi);
     return ret;
 }

-int vc_ctx_migrate(uint32_t id, void __user *data)
+int vc_ctx_migrate(struct vx_info *vxi, void __user *data)
 {
-    struct vx_info *vxi;
-
-    if (!capable(CAP_SYS_ADMIN))
-        return -EPERM;
+    struct vcmd_ctx_migrate vc_data = { .flagword = 0 };
+    int ret;

-    /* dirty hack until Spectator becomes a cap */
-    if (id == 1) {
-        current->xid = 1;
-        return 0;
-    }
+    if (data && copy_from_user (&vc_data, data, sizeof(vc_data)))
+        return -EFAULT;

-    vxi = locate_vx_info(id);
-    if (!vxi)
-        return -ESRCH;
-    vx_migrate_task(current, vxi);
-    put_vx_info(vxi);
-    return 0;
+    ret = vx_migrate_task(current, vxi, 0);
+    if (ret)
+        return ret;
+    if (vc_data.flagword & VXM_SET_INIT)
+        ret = vx_set_init(vxi, current);
+    if (ret)
+        return ret;
+    if (vc_data.flagword & VXM_SET_REAPER)
+        ret = vx_set_reaper(vxi, current);
+    return ret;
 }

-int vc_get_cflags(uint32_t id, void __user *data)
+int vc_get_cflags(struct vx_info *vxi, void __user *data)
 {
-    struct vx_info *vxi;
     struct vcmd_ctx_flags_v0 vc_data;

-    if (!capable(CAP_SYS_ADMIN))
-        return -EPERM;
-
-    vxi = locate_vx_info(id);
-    if (!vxi)
-        return -ESRCH;
-
     vc_data.flagword = vxi->vx_flags;

     /* special STATE flag handling */
-    vc_data.mask = vx_mask_flags(~0UL, vxi->vx_flags, VXF_ONE_TIME);
-
-    put_vx_info(vxi);
+    vc_data.mask = vs_mask_flags(~0UL, vxi->vx_flags, VXF_ONE_TIME);

     if (copy_to_user (data, &vc_data, sizeof(vc_data)))
         return -EFAULT;
     return 0;
 }

-int vc_set_cflags(uint32_t id, void __user *data)
+int vc_set_cflags(struct vx_info *vxi, void __user *data)
 {
-    struct vx_info *vxi;
     struct vcmd_ctx_flags_v0 vc_data;
     uint64_t mask, trigger;

-    if (!capable(CAP_SYS_ADMIN))
-        return -EPERM;
     if (copy_from_user (&vc_data, data, sizeof(vc_data)))
         return -EFAULT;

-    vxi = locate_vx_info(id);
-    if (!vxi)
-        return -ESRCH;
-
     /* special STATE flag handling */
-    mask = vx_mask_mask(vc_data.mask, vxi->vx_flags, VXF_ONE_TIME);
+    mask = vs_mask_mask(vc_data.mask, vxi->vx_flags, VXF_ONE_TIME);
     trigger = (mask & vxi->vx_flags) ^ (mask & vc_data.flagword);

-    if (trigger & VXF_STATE_SETUP)
-        vx_mask_bcaps(current);
-    if (trigger & VXF_STATE_INIT)
-        if (vxi == current->vx_info)
-            vx_set_init(vxi, current);
+    if (vxi == current->vx_info) {
+        if (trigger & VXF_STATE_SETUP)
+            vx_mask_cap_bset(vxi, current);
+        if (trigger & VXF_STATE_INIT) {
+            int ret;
+
+            ret = vx_set_init(vxi, current);
+            if (ret)
+                return ret;
+            ret = vx_set_reaper(vxi, current);
+            if (ret)
+                return ret;
+        }
+    }

-    vxi->vx_flags = vx_mask_flags(vxi->vx_flags,
+    vxi->vx_flags = vs_mask_flags(vxi->vx_flags,
         vc_data.flagword, mask);
-    put_vx_info(vxi);
+    if (trigger & VXF_PERSISTENT)
+        vx_update_persistent(vxi);
+
     return 0;
 }

-int vc_get_ccaps(uint32_t id, void __user *data)
+static int do_get_caps(struct vx_info *vxi, uint64_t *bcaps, uint64_t *ccaps)
+{
+    if (bcaps)
+        *bcaps = vxi->vx_bcaps;
+    if (ccaps)
+        *ccaps = vxi->vx_ccaps;
+
+    return 0;
+}
+
+int vc_get_ccaps_v0(struct vx_info *vxi, void __user *data)
 {
-    struct vx_info *vxi;
     struct vcmd_ctx_caps_v0 vc_data;
+    int ret;

-    if (!capable(CAP_SYS_ADMIN))
-        return -EPERM;
+    ret = do_get_caps(vxi, &vc_data.bcaps, &vc_data.ccaps);
+    if (ret)
+        return ret;
+    vc_data.cmask = ~0UL;

-    vxi = locate_vx_info(id);
-    if (!vxi)
-        return -ESRCH;
+    if (copy_to_user (data, &vc_data, sizeof(vc_data)))
+        return -EFAULT;
+    return 0;
+}

-    vc_data.bcaps = vxi->vx_bcaps;
-    vc_data.ccaps = vxi->vx_ccaps;
+int vc_get_ccaps(struct vx_info *vxi, void __user *data)
+{
+    struct vcmd_ctx_caps_v1 vc_data;
+    int ret;
+
+    ret = do_get_caps(vxi, NULL, &vc_data.ccaps);
+    if (ret)
+        return ret;
     vc_data.cmask = ~0UL;
-    put_vx_info(vxi);

     if (copy_to_user (data, &vc_data, sizeof(vc_data)))
         return -EFAULT;
     return 0;
 }

-int vc_set_ccaps(uint32_t id, void __user *data)
+static int do_set_caps(struct vx_info *vxi,
+    uint64_t bcaps, uint64_t bmask, uint64_t ccaps, uint64_t cmask)
+{
+    vxi->vx_bcaps = vs_mask_flags(vxi->vx_bcaps, bcaps, bmask);
+    vxi->vx_ccaps = vs_mask_flags(vxi->vx_ccaps, ccaps, cmask);
+
+    return 0;
+}
+
+int vc_set_ccaps_v0(struct vx_info *vxi, void __user *data)
 {
-    struct vx_info *vxi;
     struct vcmd_ctx_caps_v0 vc_data;

-    if (!capable(CAP_SYS_ADMIN))
-        return -EPERM;
     if (copy_from_user (&vc_data, data, sizeof(vc_data)))
         return -EFAULT;

-    vxi = locate_vx_info(id);
-    if (!vxi)
-        return -ESRCH;
-
-    vxi->vx_bcaps &= vc_data.bcaps;
-    vxi->vx_ccaps = vx_mask_flags(vxi->vx_ccaps,
+    /* simulate old &= behaviour for bcaps */
+    return do_set_caps(vxi, 0, ~vc_data.bcaps,
         vc_data.ccaps, vc_data.cmask);
-    put_vx_info(vxi);
+}
+
+int vc_set_ccaps(struct vx_info *vxi, void __user *data)
+{
+    struct vcmd_ctx_caps_v1 vc_data;
+
+    if (copy_from_user (&vc_data, data, sizeof(vc_data)))
+        return -EFAULT;
+
+    return do_set_caps(vxi, 0, 0, vc_data.ccaps, vc_data.cmask);
+}
+
+int vc_get_bcaps(struct vx_info *vxi, void __user *data)
+{
+    struct vcmd_bcaps vc_data;
+    int ret;
+
+    ret = do_get_caps(vxi, &vc_data.bcaps, NULL);
+    if (ret)
+        return ret;
+    vc_data.bmask = ~0UL;
+
+    if (copy_to_user (data, &vc_data, sizeof(vc_data)))
+        return -EFAULT;
     return 0;
 }

+int vc_set_bcaps(struct vx_info *vxi, void __user *data)
+{
+    struct vcmd_bcaps vc_data;
+
+    if (copy_from_user (&vc_data, data, sizeof(vc_data)))
+        return -EFAULT;
+
+    return do_set_caps(vxi, vc_data.bcaps, vc_data.bmask, 0, 0);
+}
+

 #include

-EXPORT_SYMBOL_GPL(rcu_free_vx_info);
-EXPORT_SYMBOL_GPL(vx_info_hash_lock);
+EXPORT_SYMBOL_GPL(free_vx_info);