* V0.05 rlimit basic implementation
* V0.06 task_xid and info commands
* V0.07 context flags and caps
+ * V0.08 switch to RCU based hash
*
*/
#include <linux/config.h>
#include <linux/slab.h>
-#include <linux/vserver/context.h>
+#include <linux/vserver.h>
#include <linux/vserver/legacy.h>
-#include <linux/vinline.h>
+#include <linux/vs_base.h>
+#include <linux/vs_context.h>
#include <linux/kernel_stat.h>
#include <linux/namespace.h>
+#include <linux/rcupdate.h>
#include <asm/errno.h>
-/* system functions */
+/* __alloc_vx_info()
+ * allocate an initialized vx_info struct
+ * does not insert it into the xid hash table */
-LIST_HEAD(vx_infos);
-
-spinlock_t vxlist_lock
- __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
-
-
-/*
- * struct vx_info allocation and deallocation
- */
-
-static struct vx_info *alloc_vx_info(int id)
+static struct vx_info *__alloc_vx_info(xid_t xid)
{
struct vx_info *new = NULL;
-
- vxdprintk("alloc_vx_info(%d)\n", id);
+
+ vxdprintk(VXD_CBIT(xid, 0), "alloc_vx_info(%d)*", xid);
+
/* would this benefit from a slab cache? */
new = kmalloc(sizeof(struct vx_info), GFP_KERNEL);
if (!new)
return 0;
memset (new, 0, sizeof(struct vx_info));
- new->vx_id = id;
- INIT_LIST_HEAD(&new->vx_list);
+ new->vx_id = xid;
+ INIT_RCU_HEAD(&new->vx_rcu);
+ INIT_HLIST_NODE(&new->vx_hlist);
+ atomic_set(&new->vx_refcnt, 0);
+ atomic_set(&new->vx_usecnt, 0);
+ new->vx_parent = NULL;
+ new->vx_state = 0;
+ new->vx_lock = SPIN_LOCK_UNLOCKED;
+ init_waitqueue_head(&new->vx_exit);
+
/* rest of init goes here */
-
vx_info_init_limit(&new->limit);
vx_info_init_sched(&new->sched);
vx_info_init_cvirt(&new->cvirt);
vx_info_init_cacct(&new->cacct);
+
new->vx_flags = VXF_STATE_SETUP|VXF_STATE_INIT;
new->vx_bcaps = CAP_INIT_EFF_SET;
new->vx_ccaps = 0;
- vxdprintk("alloc_vx_info(%d) = %p\n", id, new);
+ vxdprintk(VXD_CBIT(xid, 0),
+ "alloc_vx_info(%d) = %p", xid, new);
return new;
}
-void free_vx_info(struct vx_info *vxi)
+/* __dealloc_vx_info()
+
+ * final disposal of a vx_info; must be unhashed and unreferenced */
+
+static void __dealloc_vx_info(struct vx_info *vxi)
{
- vxdprintk("free_vx_info(%p)\n", vxi);
- if (vxi->vx_namespace)
- put_namespace(vxi->vx_namespace);
- if (vxi->vx_fs)
- put_fs_struct(vxi->vx_fs);
-
+ vxdprintk(VXD_CBIT(xid, 0),
+ "dealloc_vx_info(%p)", vxi);
+
+ vxi->vx_hlist.next = LIST_POISON1;
+ vxi->vx_id = -1;
+
vx_info_exit_limit(&vxi->limit);
vx_info_exit_sched(&vxi->sched);
vx_info_exit_cvirt(&vxi->cvirt);
vx_info_exit_cacct(&vxi->cacct);
-
- BUG_ON(atomic_read(&vxi->vx_refcount));
- vxi->vx_id = -1;
+
+ BUG_ON(atomic_read(&vxi->vx_usecnt));
+ BUG_ON(atomic_read(&vxi->vx_refcnt));
+
+ BUG_ON(vx_info_state(vxi, VXS_HASHED));
+ // BUG_ON(!vx_state(vxi, VXS_DEFUNCT));
+
+ vxi->vx_state |= VXS_RELEASED;
kfree(vxi);
}
+static inline int __free_vx_info(struct vx_info *vxi)
+{
+ int usecnt, refcnt;
-/*
- * struct vx_info search by id
- * assumes vxlist_lock is held
- */
+ BUG_ON(!vxi);
+
+ usecnt = atomic_read(&vxi->vx_usecnt);
+ BUG_ON(usecnt < 0);
+
+ refcnt = atomic_read(&vxi->vx_refcnt);
+ BUG_ON(refcnt < 0);
+
+ if (!usecnt)
+ __dealloc_vx_info(vxi);
+ return usecnt;
+}
-static __inline__ struct vx_info *__find_vx_info(int id)
+#if 0
+
+static void __rcu_free_vx_info(struct rcu_head *head)
{
- struct vx_info *vxi;
+ struct vx_info *vxi = container_of(head, struct vx_info, vx_rcu);
- list_for_each_entry(vxi, &vx_infos, vx_list)
- if (vxi->vx_id == id)
- return vxi;
- return 0;
+ BUG_ON(!head);
+ vxdprintk(VXD_CBIT(xid, 3),
+ "rcu_free_vx_info(%p): uc=%d", vxi,
+ atomic_read(&vxi->vx_usecnt));
+
+ __free_vx_info(vxi);
}
+#endif
-/*
- * struct vx_info ref stuff
- */
+void free_vx_info(struct vx_info *vxi)
+{
+ struct namespace *namespace;
+ struct fs_struct *fs;
+
+ /* context shutdown is mandatory */
+ // BUG_ON(vxi->vx_state != VXS_SHUTDOWN);
+
+ namespace = xchg(&vxi->vx_namespace, NULL);
+ fs = xchg(&vxi->vx_fs, NULL);
+
+ if (namespace)
+ put_namespace(namespace);
+ if (fs)
+ put_fs_struct(fs);
+
+ BUG_ON(__free_vx_info(vxi));
+ // call_rcu(&i->vx_rcu, __rcu_free_vx_info);
+}
+
+
+/* hash table for vx_info hash */
+
+#define VX_HASH_SIZE 13
-struct vx_info *find_vx_info(int id)
+struct hlist_head vx_info_hash[VX_HASH_SIZE];
+
+static spinlock_t vx_info_hash_lock = SPIN_LOCK_UNLOCKED;
+
+
+static inline unsigned int __hashval(xid_t xid)
{
- struct vx_info *vxi;
-
- if (id < 0) {
- vxi = current->vx_info;
- get_vx_info(vxi);
- } else {
- spin_lock(&vxlist_lock);
- if ((vxi = __find_vx_info(id)))
- get_vx_info(vxi);
- spin_unlock(&vxlist_lock);
- }
- return vxi;
+ return (xid % VX_HASH_SIZE);
}
-/*
- * verify that id is a valid xid
- */
-int vx_info_id_valid(int id)
+
+/* __hash_vx_info()
+
+ * add the vxi to the global hash table
+ * requires the hash_lock to be held */
+
+static inline void __hash_vx_info(struct vx_info *vxi)
{
- int valid;
+ struct hlist_head *head;
+
+ vxdprintk(VXD_CBIT(xid, 4),
+ "__hash_vx_info: %p[#%d]", vxi, vxi->vx_id);
+ get_vx_info(vxi);
+ vxi->vx_state |= VXS_HASHED;
+ head = &vx_info_hash[__hashval(vxi->vx_id)];
+ hlist_add_head_rcu(&vxi->vx_hlist, head);
+}
+
+/* __unhash_vx_info()
- spin_lock(&vxlist_lock);
- valid = (__find_vx_info(id) != NULL);
- spin_unlock(&vxlist_lock);
- return valid;
+ * remove the vxi from the global hash table
+ * requires the hash_lock to be held */
+
+static inline void __unhash_vx_info(struct vx_info *vxi)
+{
+ vxdprintk(VXD_CBIT(xid, 4),
+ "__unhash_vx_info: %p[#%d]", vxi, vxi->vx_id);
+ vxi->vx_state &= ~VXS_HASHED;
+ hlist_del_rcu(&vxi->vx_hlist);
+ put_vx_info(vxi);
}
-/*
- * dynamic context id ...
- */
+/* __lookup_vx_info()
+
+ * requires the rcu_read_lock()
+ * doesn't increment the vx_refcnt */
-static __inline__ xid_t __vx_dynamic_id(void)
+static inline struct vx_info *__lookup_vx_info(xid_t xid)
+{
+ struct hlist_head *head = &vx_info_hash[__hashval(xid)];
+ struct hlist_node *pos;
+
+ hlist_for_each_rcu(pos, head) {
+ struct vx_info *vxi =
+ hlist_entry(pos, struct vx_info, vx_hlist);
+
+ if ((vxi->vx_id == xid) &&
+ vx_info_state(vxi, VXS_HASHED))
+ return vxi;
+ }
+ return NULL;
+}
+
+
+/* __vx_dynamic_id()
+
+ * find unused dynamic xid
+ * requires the hash_lock to be held */
+
+static inline xid_t __vx_dynamic_id(void)
{
static xid_t seq = MAX_S_CONTEXT;
xid_t barrier = seq;
-
+
do {
if (++seq > MAX_S_CONTEXT)
seq = MIN_D_CONTEXT;
- if (!__find_vx_info(seq))
+ if (!__lookup_vx_info(seq)) {
+ vxdprintk(VXD_CBIT(xid, 4),
+ "__vx_dynamic_id: [#%d]", seq);
return seq;
+ }
} while (barrier != seq);
return 0;
}
-static struct vx_info * __foc_vx_info(int id, int *err)
+/* __loc_vx_info()
+
+ * locate or create the requested context
+ * get() it and, if newly created, hash it */
+
+static struct vx_info * __loc_vx_info(int id, int *err)
{
struct vx_info *new, *vxi = NULL;
-
- vxdprintk("foc_vx_info(%d)\n", id);
- if (!(new = alloc_vx_info(id))) {
- *err = -ENOMEM;
- return NULL;
- }
- /* dirty hack until Spectator becomes a cap */
- if (id == 0 || id == 1) {
- *err = -EBUSY;
+ vxdprintk(VXD_CBIT(xid, 1), "loc_vx_info(%d)*", id);
+
+ if (!(new = __alloc_vx_info(id))) {
+ *err = -ENOMEM;
return NULL;
}
- spin_lock(&vxlist_lock);
+ spin_lock(&vx_info_hash_lock);
/* dynamic context requested */
if (id == VX_DYNAMIC_ID) {
new->vx_id = id;
}
/* existing context requested */
- else if ((vxi = __find_vx_info(id))) {
+ else if ((vxi = __lookup_vx_info(id))) {
/* context in setup is not available */
if (vxi->vx_flags & VXF_STATE_SETUP) {
- vxdprintk("foc_vx_info(%d) = %p (not available)\n", id, vxi);
+ vxdprintk(VXD_CBIT(xid, 0),
+ "loc_vx_info(%d) = %p (not available)", id, vxi);
vxi = NULL;
*err = -EBUSY;
} else {
- vxdprintk("foc_vx_info(%d) = %p (found)\n", id, vxi);
+ vxdprintk(VXD_CBIT(xid, 0),
+ "loc_vx_info(%d) = %p (found)", id, vxi);
get_vx_info(vxi);
*err = 0;
}
}
/* new context requested */
- vxdprintk("foc_vx_info(%d) = %p (new)\n", id, new);
- atomic_set(&new->vx_refcount, 1);
- list_add(&new->vx_list, &vx_infos);
+ vxdprintk(VXD_CBIT(xid, 0),
+ "loc_vx_info(%d) = %p (new)", id, new);
+ __hash_vx_info(get_vx_info(new));
vxi = new, new = NULL;
*err = 1;
out_unlock:
- spin_unlock(&vxlist_lock);
+ spin_unlock(&vx_info_hash_lock);
if (new)
- free_vx_info(new);
+ __dealloc_vx_info(new);
return vxi;
}
-struct vx_info *find_or_create_vx_info(int id)
+
+/* exported stuff */
+
+
+void unhash_vx_info(struct vx_info *vxi)
+{
+ spin_lock(&vx_info_hash_lock);
+ __unhash_vx_info(vxi);
+ spin_unlock(&vx_info_hash_lock);
+}
+
+/* locate_vx_info()
+
+ * search for a vx_info and get() it
+ * negative id means current */
+
+struct vx_info *locate_vx_info(int id)
+{
+ struct vx_info *vxi;
+
+ if (id < 0) {
+ vxi = get_vx_info(current->vx_info);
+ } else {
+ rcu_read_lock();
+ vxi = get_vx_info(__lookup_vx_info(id));
+ rcu_read_unlock();
+ }
+ return vxi;
+}
+
+/* vx_info_is_hashed()
+
+ * verify that xid is still hashed */
+
+int vx_info_is_hashed(xid_t xid)
+{
+ int hashed;
+
+ rcu_read_lock();
+ hashed = (__lookup_vx_info(xid) != NULL);
+ rcu_read_unlock();
+ return hashed;
+}
+
+#ifdef CONFIG_VSERVER_LEGACY
+
+#if 0
+struct vx_info *alloc_vx_info(xid_t xid)
+{
+ return __alloc_vx_info(xid);
+}
+#endif
+
+struct vx_info *locate_or_create_vx_info(int id)
{
int err;
- return __foc_vx_info(id, &err);
+ return __loc_vx_info(id, &err);
}
+#endif
+
+#ifdef CONFIG_PROC_FS
+
+int get_xid_list(int index, unsigned int *xids, int size)
+{
+ int hindex, nr_xids = 0;
+
+ rcu_read_lock();
+ for (hindex = 0; hindex < VX_HASH_SIZE; hindex++) {
+ struct hlist_head *head = &vx_info_hash[hindex];
+ struct hlist_node *pos;
+
+ hlist_for_each_rcu(pos, head) {
+ struct vx_info *vxi;
+
+ if (--index > 0)
+ continue;
+
+ vxi = hlist_entry(pos, struct vx_info, vx_hlist);
+ xids[nr_xids] = vxi->vx_id;
+ if (++nr_xids >= size)
+ goto out;
+ }
+ }
+out:
+ rcu_read_unlock();
+ return nr_xids;
+}
+#endif
int vx_migrate_user(struct task_struct *p, struct vx_info *vxi)
{
struct user_struct *new_user, *old_user;
-
+
if (!p || !vxi)
BUG();
new_user = alloc_uid(vxi->vx_id, p->uid);
static inline int vx_nofiles_task(struct task_struct *tsk)
{
struct files_struct *files = tsk->files;
- const unsigned long *obptr, *cbptr;
+ unsigned long *obptr;
int count, total;
spin_lock(&files->file_lock);
obptr = files->open_fds->fds_bits;
- cbptr = files->close_on_exec->fds_bits;
count = files->max_fds / (sizeof(unsigned long) * 8);
for (total = 0; count > 0; count--) {
if (*obptr)
total += hweight_long(*obptr);
obptr++;
- /* if (*cbptr)
- total += hweight_long(*cbptr);
- cbptr++; */
}
spin_unlock(&files->file_lock);
return total;
}
+#if 0
+
static inline int vx_openfd_task(struct task_struct *tsk)
{
struct files_struct *files = tsk->files;
return total;
}
+#endif
+
/*
* migrate task to new context
* gets vxi, puts old_vxi on change
int vx_migrate_task(struct task_struct *p, struct vx_info *vxi)
{
- struct vx_info *old_vxi = task_get_vx_info(p);
+ struct vx_info *old_vxi;
int ret = 0;
-
+
if (!p || !vxi)
BUG();
- vxdprintk("vx_migrate_task(%p,%p[#%d.%d)\n", p, vxi,
- vxi->vx_id, atomic_read(&vxi->vx_refcount));
+ old_vxi = task_get_vx_info(p);
if (old_vxi == vxi)
goto out;
+ vxdprintk(VXD_CBIT(xid, 5),
+ "vx_migrate_task(%p,%p[#%d.%d])", p, vxi,
+ vxi->vx_id, atomic_read(&vxi->vx_usecnt));
+
if (!(ret = vx_migrate_user(p, vxi))) {
+ int nofiles;
+
task_lock(p);
+ // openfd = vx_openfd_task(p);
+ nofiles = vx_nofiles_task(p);
+
if (old_vxi) {
- atomic_dec(&old_vxi->cacct.nr_threads);
- atomic_dec(&old_vxi->limit.res[RLIMIT_NPROC]);
- }
- atomic_inc(&vxi->cacct.nr_threads);
- atomic_inc(&vxi->limit.res[RLIMIT_NPROC]);
- atomic_add(vx_nofiles_task(p), &vxi->limit.res[RLIMIT_NOFILE]);
- atomic_add(vx_openfd_task(p), &vxi->limit.res[RLIMIT_OPENFD]);
+ atomic_dec(&old_vxi->cvirt.nr_threads);
+ atomic_dec(&old_vxi->cvirt.nr_running);
+ atomic_dec(&old_vxi->limit.rcur[RLIMIT_NPROC]);
+ /* FIXME: what about the struct files here? */
+ // atomic_sub(nofiles, &old_vxi->limit.rcur[RLIMIT_NOFILE]);
+ // atomic_sub(openfd, &old_vxi->limit.rcur[RLIMIT_OPENFD]);
+ }
+ atomic_inc(&vxi->cvirt.nr_threads);
+ atomic_inc(&vxi->cvirt.nr_running);
+ atomic_inc(&vxi->limit.rcur[RLIMIT_NPROC]);
+ /* FIXME: what about the struct files here? */
+ // atomic_add(nofiles, &vxi->limit.rcur[RLIMIT_NOFILE]);
+ // atomic_add(openfd, &vxi->limit.rcur[RLIMIT_OPENFD]);
+
+ vxdprintk(VXD_CBIT(xid, 5),
+ "moved task %p into vxi:%p[#%d]",
+ p, vxi, vxi->vx_id);
+
+ /* should be handled in set_vx_info !! */
+ if (old_vxi)
+ clr_vx_info(&p->vx_info);
set_vx_info(&p->vx_info, vxi);
p->xid = vxi->vx_id;
vx_mask_bcaps(p);
task_unlock(p);
- put_vx_info(old_vxi);
+ /* obsoleted by clr/set */
+ // put_vx_info(old_vxi);
}
out:
put_vx_info(old_vxi);
{
if (!vxi)
return -EINVAL;
- if (vxi->vx_initpid)
- return -EPERM;
+ if (vxi->vx_initpid)
+ return -EPERM;
- vxi->vx_initpid = p->tgid;
+ vxdprintk(VXD_CBIT(xid, 6),
+ "vx_set_init(%p[#%d],%p[#%d,%d,%d])",
+ vxi, vxi->vx_id, p, p->xid, p->pid, p->tgid);
+
+ vxi->vx_initpid = p->tgid;
return 0;
}
int vc_task_xid(uint32_t id, void __user *data)
{
- xid_t xid;
-
- if (id) {
- struct task_struct *tsk;
-
- if (!vx_check(0, VX_ADMIN|VX_WATCH))
- return -EPERM;
-
- read_lock(&tasklist_lock);
- tsk = find_task_by_pid(id);
- xid = (tsk) ? tsk->xid : -ESRCH;
- read_unlock(&tasklist_lock);
- }
- else
- xid = current->xid;
- return xid;
+ xid_t xid;
+
+ if (id) {
+ struct task_struct *tsk;
+
+ if (!vx_check(0, VX_ADMIN|VX_WATCH))
+ return -EPERM;
+
+ read_lock(&tasklist_lock);
+ tsk = find_task_by_real_pid(id);
+ xid = (tsk) ? tsk->xid : -ESRCH;
+ read_unlock(&tasklist_lock);
+ }
+ else
+ xid = current->xid;
+ return xid;
}
if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_RESOURCE))
return -EPERM;
- vxi = find_vx_info(id);
+ vxi = locate_vx_info(id);
if (!vxi)
return -ESRCH;
int vc_ctx_create(uint32_t xid, void __user *data)
{
- // int ret = -ENOMEM;
struct vx_info *new_vxi;
int ret;
if (xid < 1)
return -EINVAL;
- new_vxi = __foc_vx_info(xid, &ret);
+ new_vxi = __loc_vx_info(xid, &ret);
if (!new_vxi)
return ret;
if (!(new_vxi->vx_flags & VXF_STATE_SETUP)) {
ret = new_vxi->vx_id;
vx_migrate_task(current, new_vxi);
+ /* if this fails, we might end up with a hashed vx_info */
out_put:
put_vx_info(new_vxi);
return ret;
int vc_ctx_migrate(uint32_t id, void __user *data)
{
struct vx_info *vxi;
-
+
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
return 0;
}
- vxi = find_vx_info(id);
+ vxi = locate_vx_info(id);
if (!vxi)
return -ESRCH;
vx_migrate_task(current, vxi);
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- vxi = find_vx_info(id);
+ vxi = locate_vx_info(id);
if (!vxi)
return -ESRCH;
vc_data.flagword = vxi->vx_flags;
- // vc_data.mask = ~0UL;
/* special STATE flag handling */
vc_data.mask = vx_mask_flags(~0UL, vxi->vx_flags, VXF_ONE_TIME);
if (copy_from_user (&vc_data, data, sizeof(vc_data)))
return -EFAULT;
- vxi = find_vx_info(id);
+ vxi = locate_vx_info(id);
if (!vxi)
return -ESRCH;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- vxi = find_vx_info(id);
+ vxi = locate_vx_info(id);
if (!vxi)
return -ESRCH;
if (copy_from_user (&vc_data, data, sizeof(vc_data)))
return -EFAULT;
- vxi = find_vx_info(id);
+ vxi = locate_vx_info(id);
if (!vxi)
return -ESRCH;
#include <linux/module.h>
+// EXPORT_SYMBOL_GPL(rcu_free_vx_info);
EXPORT_SYMBOL_GPL(free_vx_info);
-EXPORT_SYMBOL_GPL(vxlist_lock);
+EXPORT_SYMBOL_GPL(vx_info_hash_lock);
+EXPORT_SYMBOL_GPL(unhash_vx_info);