X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=kernel%2Fvserver%2Fcontext.c;h=d38b23cb0ba334acdff78dbe324588dec4e906bd;hb=97bf2856c6014879bd04983a3e9dfcdac1e7fe85;hp=8b3cee7cc0630da8b2bd621668d6ca0396da1a6c;hpb=d46bc780027c5439db9f72d42c0732775b53925a;p=linux-2.6.git

diff --git a/kernel/vserver/context.c b/kernel/vserver/context.c
index 8b3cee7cc..d38b23cb0 100644
--- a/kernel/vserver/context.c
+++ b/kernel/vserver/context.c
@@ -3,7 +3,7 @@
  *
  * Virtual Server: Context Support
  *
- * Copyright (C) 2003-2004 Herbert Pötzl
+ * Copyright (C) 2003-2007 Herbert Pötzl
  *
  * V0.01 context helper
  * V0.02 vx_ctx_kill syscall command
@@ -13,20 +13,54 @@
  * V0.06 task_xid and info commands
  * V0.07 context flags and caps
  * V0.08 switch to RCU based hash
+ * V0.09 revert to non RCU for now
+ * V0.10 and back to working RCU hash
+ * V0.11 and back to locking again
+ * V0.12 referenced context store
+ * V0.13 separate per cpu data
+ * V0.14 changed vcmds to vxi arg
+ * V0.15 added context stat
+ * V0.16 have __create claim() the vxi
  *
  */

-#include
 #include
+#include
+#include
+#include
+
+#include
 #include
+#include
 #include
-#include
-#include
-#include
-#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
 #include
+#include "cvirt_init.h"
+#include "cacct_init.h"
+#include "limit_init.h"
+#include "sched_init.h"
+
+
+atomic_t vx_global_ctotal = ATOMIC_INIT(0);
+atomic_t vx_global_cactive = ATOMIC_INIT(0);
+
+
+/* now inactive context structures */
+
+static struct hlist_head vx_info_inactive = HLIST_HEAD_INIT;
+
+static spinlock_t vx_info_inactive_lock = SPIN_LOCK_UNLOCKED;
+

 /*  __alloc_vx_info()

@@ -36,8 +70,9 @@ static struct vx_info *__alloc_vx_info(xid_t xid)
 {
     struct vx_info *new = NULL;
-
-    vxdprintk("alloc_vx_info(%d)\n", xid);
+    int cpu;
+
+    vxdprintk(VXD_CBIT(xid, 0), "alloc_vx_info(%d)*", xid);

     /* would this benefit from a slab cache? */
     new = kmalloc(sizeof(struct vx_info), GFP_KERNEL);
@@ -45,11 +80,22 @@ static struct vx_info *__alloc_vx_info(xid_t xid)
         return 0;
     memset (new, 0, sizeof(struct vx_info));
+#ifdef CONFIG_SMP
+    new->ptr_pc = alloc_percpu(struct _vx_info_pc);
+    if (!new->ptr_pc)
+        goto error;
+#endif
     new->vx_id = xid;
-    INIT_RCU_HEAD(&new->vx_rcu);
     INIT_HLIST_NODE(&new->vx_hlist);
-    atomic_set(&new->vx_refcnt, 0);
     atomic_set(&new->vx_usecnt, 0);
+    atomic_set(&new->vx_tasks, 0);
+    new->vx_parent = NULL;
+    new->vx_state = 0;
+    init_waitqueue_head(&new->vx_wait);
+
+    /* prepare reaper */
+    get_task_struct(init_pid_ns.child_reaper);
+    new->vx_reaper = init_pid_ns.child_reaper;

     /* rest of init goes here */
     vx_info_init_limit(&new->limit);
@@ -57,12 +103,32 @@ static struct vx_info *__alloc_vx_info(xid_t xid)
     vx_info_init_cvirt(&new->cvirt);
     vx_info_init_cacct(&new->cacct);

-    new->vx_flags = VXF_STATE_SETUP|VXF_STATE_INIT;
+    /* per cpu data structures */
+    for_each_possible_cpu(cpu) {
+        vx_info_init_sched_pc(
+            &vx_per_cpu(new, sched_pc, cpu), cpu);
+        vx_info_init_cvirt_pc(
+            &vx_per_cpu(new, cvirt_pc, cpu), cpu);
+    }
+
+    new->vx_flags = VXF_INIT_SET;
     new->vx_bcaps = CAP_INIT_EFF_SET;
     new->vx_ccaps = 0;
+    new->vx_cap_bset = cap_bset;
+
+    new->reboot_cmd = 0;
+    new->exit_code = 0;

-    vxdprintk("alloc_vx_info(%d) = %p\n", xid, new);
+    vxdprintk(VXD_CBIT(xid, 0),
+        "alloc_vx_info(%d) = %p", xid, new);
+    vxh_alloc_vx_info(new);
+    atomic_inc(&vx_global_ctotal);
     return new;
+#ifdef CONFIG_SMP
+error:
+    kfree(new);
+    return 0;
+#endif
 }

 /*  __dealloc_vx_info()

@@ -71,33 +137,85 @@ static struct vx_info *__alloc_vx_info(xid_t xid)

 static void __dealloc_vx_info(struct vx_info *vxi)
 {
-    vxdprintk("dealloc_vx_info(%p)\n", vxi);
+    int cpu;
+
+    vxdprintk(VXD_CBIT(xid, 0),
+        "dealloc_vx_info(%p)", vxi);
+    vxh_dealloc_vx_info(vxi);

-    vxi->vx_hlist.next = LIST_POISON1;
     vxi->vx_id = -1;

-    if (vxi->vx_namespace)
-        put_namespace(vxi->vx_namespace);
-    if (vxi->vx_fs)
-        put_fs_struct(vxi->vx_fs);
-
     vx_info_exit_limit(&vxi->limit);
     vx_info_exit_sched(&vxi->sched);
     vx_info_exit_cvirt(&vxi->cvirt);
     vx_info_exit_cacct(&vxi->cacct);
-
-    BUG_ON(atomic_read(&vxi->vx_usecnt));
-    BUG_ON(atomic_read(&vxi->vx_refcnt));
+    for_each_possible_cpu(cpu) {
+        vx_info_exit_sched_pc(
+            &vx_per_cpu(vxi, sched_pc, cpu), cpu);
+        vx_info_exit_cvirt_pc(
+            &vx_per_cpu(vxi, cvirt_pc, cpu), cpu);
+    }
+
+    vxi->vx_state |= VXS_RELEASED;
+
+#ifdef CONFIG_SMP
+    free_percpu(vxi->ptr_pc);
+#endif
     kfree(vxi);
+    atomic_dec(&vx_global_ctotal);
+}
+
+static void __shutdown_vx_info(struct vx_info *vxi)
+{
+    struct nsproxy *nsproxy;
+    struct fs_struct *fs;
+
+    might_sleep();
+
+    vxi->vx_state |= VXS_SHUTDOWN;
+    vs_state_change(vxi, VSC_SHUTDOWN);
+
+    nsproxy = xchg(&vxi->vx_nsproxy, NULL);
+    fs = xchg(&vxi->vx_fs, NULL);
+
+    if (nsproxy)
+        put_nsproxy(nsproxy);
+    if (fs)
+        put_fs_struct(fs);
+}
+
+/* exported stuff */
+
+void free_vx_info(struct vx_info *vxi)
+{
+    unsigned long flags;
+
+    /* context shutdown is mandatory */
+    BUG_ON(!vx_info_state(vxi, VXS_SHUTDOWN));
+
+    BUG_ON(atomic_read(&vxi->vx_usecnt));
+    BUG_ON(atomic_read(&vxi->vx_tasks));
+
+    BUG_ON(vx_info_state(vxi, VXS_HASHED));
+
+    BUG_ON(vxi->vx_nsproxy);
+    BUG_ON(vxi->vx_fs);
+
+    spin_lock_irqsave(&vx_info_inactive_lock, flags);
+    hlist_del(&vxi->vx_hlist);
+    spin_unlock_irqrestore(&vx_info_inactive_lock, flags);
+
+    __dealloc_vx_info(vxi);
 }


 /*  hash table for vx_info hash */

-#define VX_HASH_SIZE    13
+#define VX_HASH_SIZE    13

-struct hlist_head vx_info_hash[VX_HASH_SIZE];
+static struct hlist_head vx_info_hash[VX_HASH_SIZE] =
+    { [0 ... VX_HASH_SIZE-1] = HLIST_HEAD_INIT };

 static spinlock_t vx_info_hash_lock = SPIN_LOCK_UNLOCKED;

@@ -117,11 +235,19 @@ static inline unsigned int __hashval(xid_t xid)
 static inline void __hash_vx_info(struct vx_info *vxi)
 {
     struct hlist_head *head;
-
-    vxdprintk("__hash_vx_info: %p[#%d]\n", vxi, vxi->vx_id);
-    get_vx_info(vxi);
+
+    vxd_assert_lock(&vx_info_hash_lock);
+    vxdprintk(VXD_CBIT(xid, 4),
+        "__hash_vx_info: %p[#%d]", vxi, vxi->vx_id);
+    vxh_hash_vx_info(vxi);
+
+    /* context must not be hashed */
+    BUG_ON(vx_info_state(vxi, VXS_HASHED));
+
+    vxi->vx_state |= VXS_HASHED;
     head = &vx_info_hash[__hashval(vxi->vx_id)];
-    hlist_add_head_rcu(&vxi->vx_hlist, head);
+    hlist_add_head(&vxi->vx_hlist, head);
+    atomic_inc(&vx_global_cactive);
 }

 /*  __unhash_vx_info()

@@ -131,31 +257,53 @@ static inline void __hash_vx_info(struct vx_info *vxi)
 static inline void __unhash_vx_info(struct vx_info *vxi)
 {
-    vxdprintk("__unhash_vx_info: %p[#%d]\n", vxi, vxi->vx_id);
-    hlist_del_rcu(&vxi->vx_hlist);
-    put_vx_info(vxi);
+    unsigned long flags;
+
+    vxd_assert_lock(&vx_info_hash_lock);
+    vxdprintk(VXD_CBIT(xid, 4),
+        "__unhash_vx_info: %p[#%d.%d.%d]", vxi, vxi->vx_id,
+        atomic_read(&vxi->vx_usecnt), atomic_read(&vxi->vx_tasks));
+    vxh_unhash_vx_info(vxi);
+
+    /* context must be hashed */
+    BUG_ON(!vx_info_state(vxi, VXS_HASHED));
+    /* but without tasks */
+    BUG_ON(atomic_read(&vxi->vx_tasks));
+
+    vxi->vx_state &= ~VXS_HASHED;
+    hlist_del_init(&vxi->vx_hlist);
+    spin_lock_irqsave(&vx_info_inactive_lock, flags);
+    hlist_add_head(&vxi->vx_hlist, &vx_info_inactive);
+    spin_unlock_irqrestore(&vx_info_inactive_lock, flags);
+    atomic_dec(&vx_global_cactive);
 }

 /*  __lookup_vx_info()

- * requires the rcu_read_lock()
+ * requires the hash_lock to be held
  * doesn't increment the vx_refcnt */

 static inline struct vx_info *__lookup_vx_info(xid_t xid)
 {
     struct hlist_head *head = &vx_info_hash[__hashval(xid)];
     struct hlist_node *pos;
+    struct vx_info *vxi;

+    vxd_assert_lock(&vx_info_hash_lock);
     hlist_for_each(pos, head) {
-        struct vx_info *vxi =
-            hlist_entry(pos, struct vx_info, vx_hlist);
+        vxi = hlist_entry(pos, struct vx_info, vx_hlist);

-        if (vxi->vx_id == xid) {
-            return vxi;
-        }
+        if (vxi->vx_id == xid)
+            goto found;
     }
-    return NULL;
+    vxi = NULL;
+found:
+    vxdprintk(VXD_CBIT(xid, 0),
+        "__lookup_vx_info(#%u): %p[#%u]",
+        xid, vxi, vxi?vxi->vx_id:0);
+    vxh_lookup_vx_info(vxi, xid);
+    return vxi;
 }

@@ -168,16 +316,22 @@ static inline xid_t __vx_dynamic_id(void)
 {
     static xid_t seq = MAX_S_CONTEXT;
     xid_t barrier = seq;
-
+
+    vxd_assert_lock(&vx_info_hash_lock);
     do {
         if (++seq > MAX_S_CONTEXT)
             seq = MIN_D_CONTEXT;
-        if (!__lookup_vx_info(seq))
+        if (!__lookup_vx_info(seq)) {
+            vxdprintk(VXD_CBIT(xid, 4),
+                "__vx_dynamic_id: [#%d]", seq);
             return seq;
+        }
     } while (barrier != seq);
     return 0;
 }

+#ifdef CONFIG_VSERVER_LEGACY
+
 /*  __loc_vx_info()

  * locate or create the requested context
@@ -186,34 +340,42 @@ static inline xid_t __vx_dynamic_id(void)
 static struct vx_info * __loc_vx_info(int id, int *err)
 {
     struct vx_info *new, *vxi = NULL;
-
-    vxdprintk("loc_vx_info(%d)\n", id);
+
+    vxdprintk(VXD_CBIT(xid, 1), "loc_vx_info(%d)*", id);

     if (!(new = __alloc_vx_info(id))) {
         *err = -ENOMEM;
         return NULL;
     }

+    /* required to make dynamic xids unique */
     spin_lock(&vx_info_hash_lock);

     /* dynamic context requested */
     if (id == VX_DYNAMIC_ID) {
+#ifdef CONFIG_VSERVER_DYNAMIC_IDS
         id = __vx_dynamic_id();
         if (!id) {
             printk(KERN_ERR "no dynamic context available.\n");
             goto out_unlock;
         }
         new->vx_id = id;
+#else
+        printk(KERN_ERR "dynamic contexts disabled.\n");
+        goto out_unlock;
+#endif
     }
     /* existing context requested */
     else if ((vxi = __lookup_vx_info(id))) {
         /* context in setup is not available */
         if (vxi->vx_flags & VXF_STATE_SETUP) {
-            vxdprintk("loc_vx_info(%d) = %p (not available)\n", id, vxi);
+            vxdprintk(VXD_CBIT(xid, 0),
+                "loc_vx_info(%d) = %p (not available)", id, vxi);
             vxi = NULL;
             *err = -EBUSY;
         } else {
-            vxdprintk("loc_vx_info(%d) = %p (found)\n", id, vxi);
+            vxdprintk(VXD_CBIT(xid, 0),
+                "loc_vx_info(%d) = %p (found)", id, vxi);
             get_vx_info(vxi);
             *err = 0;
         }
@@ -221,91 +383,140 @@ static struct vx_info * __loc_vx_info(int id, int *err)
     }

     /* new context requested */
-    vxdprintk("loc_vx_info(%d) = %p (new)\n", id, new);
+    vxdprintk(VXD_CBIT(xid, 0),
+        "loc_vx_info(%d) = %p (new)", id, new);
     __hash_vx_info(get_vx_info(new));
     vxi = new, new = NULL;
     *err = 1;

 out_unlock:
     spin_unlock(&vx_info_hash_lock);
+    vxh_loc_vx_info(vxi, id);
     if (new)
         __dealloc_vx_info(new);
     return vxi;
 }
+#endif

+/*  __create_vx_info()

-/* exported stuff */
+    * create the requested context
+    * get(), claim() and hash it */

+static struct vx_info * __create_vx_info(int id)
+{
+    struct vx_info *new, *vxi = NULL;

+    vxdprintk(VXD_CBIT(xid, 1), "create_vx_info(%d)*", id);

-void rcu_free_vx_info(void *obj)
-{
-    struct vx_info *vxi = obj;
-    int usecnt, refcnt;
+    if (!(new = __alloc_vx_info(id)))
+        return ERR_PTR(-ENOMEM);

-    usecnt = atomic_read(&vxi->vx_usecnt);
-    BUG_ON(usecnt < 0);
+    /* required to make dynamic xids unique */
+    spin_lock(&vx_info_hash_lock);
+
+    /* dynamic context requested */
+    if (id == VX_DYNAMIC_ID) {
+#ifdef CONFIG_VSERVER_DYNAMIC_IDS
+        id = __vx_dynamic_id();
+        if (!id) {
+            printk(KERN_ERR "no dynamic context available.\n");
+            vxi = ERR_PTR(-EAGAIN);
+            goto out_unlock;
+        }
+        new->vx_id = id;
+#else
+        printk(KERN_ERR "dynamic contexts disabled.\n");
+        vxi = ERR_PTR(-EINVAL);
+        goto out_unlock;
+#endif
+    }
+    /* static context requested */
+    else if ((vxi = __lookup_vx_info(id))) {
+        vxdprintk(VXD_CBIT(xid, 0),
+            "create_vx_info(%d) = %p (already there)", id, vxi);
+        if (vx_info_flags(vxi, VXF_STATE_SETUP, 0))
+            vxi = ERR_PTR(-EBUSY);
+        else
+            vxi = ERR_PTR(-EEXIST);
+        goto out_unlock;
+    }
+#ifdef CONFIG_VSERVER_DYNAMIC_IDS
+    /* dynamic xid creation blocker */
+    else if (id >= MIN_D_CONTEXT) {
+        vxdprintk(VXD_CBIT(xid, 0),
+            "create_vx_info(%d) (dynamic rejected)", id);
+        vxi = ERR_PTR(-EINVAL);
+        goto out_unlock;
+    }
+#endif

-    refcnt = atomic_read(&vxi->vx_refcnt);
-    BUG_ON(refcnt < 0);
+    /* new context */
+    vxdprintk(VXD_CBIT(xid, 0),
+        "create_vx_info(%d) = %p (new)", id, new);
+    claim_vx_info(new, NULL);
+    __hash_vx_info(get_vx_info(new));
+    vxi = new, new = NULL;

-    if (!usecnt)
-        __dealloc_vx_info(vxi);
-    else
-        printk("!!! rcu didn't free\n");
+out_unlock:
+    spin_unlock(&vx_info_hash_lock);
+    vxh_create_vx_info(IS_ERR(vxi)?NULL:vxi, id);
+    if (new)
+        __dealloc_vx_info(new);
+    return vxi;
 }
+
+/* exported stuff */
+
+
 void unhash_vx_info(struct vx_info *vxi)
 {
+    __shutdown_vx_info(vxi);
     spin_lock(&vx_info_hash_lock);
     __unhash_vx_info(vxi);
     spin_unlock(&vx_info_hash_lock);
+    __wakeup_vx_info(vxi);
 }

-/*  locate_vx_info()
-    * search for a vx_info and get() it
+/*  lookup_vx_info()
+
+    * search for a vx_info and get() it
     * negative id means current */

-struct vx_info *locate_vx_info(int id)
+struct vx_info *lookup_vx_info(int id)
 {
-    struct vx_info *vxi;
-
+    struct vx_info *vxi = NULL;
+
     if (id < 0) {
         vxi = get_vx_info(current->vx_info);
-    } else {
-        rcu_read_lock();
+    } else if (id > 1) {
+        spin_lock(&vx_info_hash_lock);
         vxi = get_vx_info(__lookup_vx_info(id));
-        rcu_read_unlock();
+        spin_unlock(&vx_info_hash_lock);
     }
     return vxi;
 }

-/*  vx_info_is_hashed()
+/*  xid_is_hashed()
     * verify that xid is still hashed */

-int vx_info_is_hashed(xid_t xid)
+int xid_is_hashed(xid_t xid)
 {
     int hashed;

-    rcu_read_lock();
+    spin_lock(&vx_info_hash_lock);
     hashed = (__lookup_vx_info(xid) != NULL);
-    rcu_read_unlock();
+    spin_unlock(&vx_info_hash_lock);
     return hashed;
 }

 #ifdef CONFIG_VSERVER_LEGACY

-#if 0
-struct vx_info *alloc_vx_info(xid_t xid)
-{
-    return __alloc_vx_info(xid);
-}
-#endif
-
-struct vx_info *locate_or_create_vx_info(int id)
+struct vx_info *lookup_or_create_vx_info(int id)
 {
     int err;

@@ -316,43 +527,75 @@

 #ifdef CONFIG_PROC_FS

-#define hlist_for_each_rcu(pos, head) \
-    for (pos = (head)->first; pos && ({ prefetch(pos->next); 1;}); \
-        pos = pos->next, ({ smp_read_barrier_depends(); 0;}))
+/*  get_xid_list()
+
+    * get a subset of hashed xids for proc
+    * assumes size is at least one */

 int get_xid_list(int index, unsigned int *xids, int size)
 {
     int hindex, nr_xids = 0;

-    rcu_read_lock();
+    /* only show current and children */
+    if (!vx_check(0, VS_ADMIN|VS_WATCH)) {
+        if (index > 0)
+            return 0;
+        xids[nr_xids] = vx_current_xid();
+        return 1;
+    }
+
     for (hindex = 0; hindex < VX_HASH_SIZE; hindex++) {
         struct hlist_head *head = &vx_info_hash[hindex];
         struct hlist_node *pos;

-        hlist_for_each_rcu(pos, head) {
+        spin_lock(&vx_info_hash_lock);
+        hlist_for_each(pos, head) {
             struct vx_info *vxi;

             if (--index > 0)
                 continue;

             vxi = hlist_entry(pos, struct vx_info, vx_hlist);
-            xids[nr_xids] = vxi->vx_id;
-            if (++nr_xids >= size)
+            xids[nr_xids] = vxi->vx_id;
+            if (++nr_xids >= size) {
+                spin_unlock(&vx_info_hash_lock);
                 goto out;
+            }
         }
+        /* keep the lock time short */
+        spin_unlock(&vx_info_hash_lock);
     }
 out:
-    rcu_read_unlock();
     return nr_xids;
 }
 #endif

+#ifdef CONFIG_VSERVER_DEBUG
+
+void dump_vx_info_inactive(int level)
+{
+    struct hlist_node *entry, *next;
+
+    hlist_for_each_safe(entry, next, &vx_info_inactive) {
+        struct vx_info *vxi =
+            list_entry(entry, struct vx_info, vx_hlist);
+
+        dump_vx_info(vxi, level);
+    }
+}
+
+#endif
+
 int vx_migrate_user(struct task_struct *p, struct vx_info *vxi)
 {
     struct user_struct *new_user, *old_user;
-
+
     if (!p || !vxi)
         BUG();
+
+    if (vx_info_flags(vxi, VXF_INFO_PRIVATE, 0))
+        return -EACCES;
+
     new_user = alloc_uid(vxi->vx_id, p->uid);
     if (!new_user)
         return -ENOMEM;
@@ -367,49 +610,28 @@ int vx_migrate_user(struct task_struct *p, struct vx_info *vxi)
     return 0;
 }

-void vx_mask_bcaps(struct task_struct *p)
+void vx_mask_cap_bset(struct vx_info *vxi, struct task_struct *p)
 {
-    struct vx_info *vxi = p->vx_info;
-
-    p->cap_effective &= vxi->vx_bcaps;
-    p->cap_inheritable &= vxi->vx_bcaps;
-    p->cap_permitted &= vxi->vx_bcaps;
+    p->cap_effective &= vxi->vx_cap_bset;
+    p->cap_inheritable &= vxi->vx_cap_bset;
+    p->cap_permitted &= vxi->vx_cap_bset;
 }

 #include

-static inline int vx_nofiles_task(struct task_struct *tsk)
-{
-    struct files_struct *files = tsk->files;
-    const unsigned long *obptr, *cbptr;
-    int count, total;
-
-    spin_lock(&files->file_lock);
-    obptr = files->open_fds->fds_bits;
-    cbptr = files->close_on_exec->fds_bits;
-    count = files->max_fds / (sizeof(unsigned long) * 8);
-    for (total = 0; count > 0; count--) {
-        if (*obptr)
-            total += hweight_long(*obptr);
-        obptr++;
-        /* if (*cbptr)
-            total += hweight_long(*cbptr);
-        cbptr++; */
-    }
-    spin_unlock(&files->file_lock);
-    return total;
-}
-
-static inline int vx_openfd_task(struct task_struct *tsk)
+static int vx_openfd_task(struct task_struct *tsk)
 {
     struct files_struct *files = tsk->files;
+    struct fdtable *fdt;
     const unsigned long *bptr;
     int count, total;

+    /* no rcu_read_lock() because of spin_lock() */
     spin_lock(&files->file_lock);
-    bptr = files->open_fds->fds_bits;
-    count = files->max_fds / (sizeof(unsigned long) * 8);
+    fdt = files_fdtable(files);
+    bptr = fdt->open_fds->fds_bits;
+    count = fdt->max_fds / (sizeof(unsigned long) * 8);
     for (total = 0; count > 0; count--) {
         if (*bptr)
             total += hweight_long(*bptr);
@@ -419,62 +641,190 @@ static inline int vx_openfd_task(struct task_struct *tsk)
     return total;
 }

+
+/* for *space compatibility */
+
+asmlinkage long sys_unshare(unsigned long);
+
 /*
  * migrate task to new context
  * gets vxi, puts old_vxi on change
+ * optionally unshares namespaces (hack)
  */

-int vx_migrate_task(struct task_struct *p, struct vx_info *vxi)
+int vx_migrate_task(struct task_struct *p, struct vx_info *vxi, int unshare)
 {
     struct vx_info *old_vxi;
     int ret = 0;
-
+
     if (!p || !vxi)
         BUG();

+    vxdprintk(VXD_CBIT(xid, 5),
+        "vx_migrate_task(%p,%p[#%d.%d])", p, vxi,
+        vxi->vx_id, atomic_read(&vxi->vx_usecnt));
+
+    if (vx_info_flags(vxi, VXF_INFO_PRIVATE, 0) &&
+        !vx_info_flags(vxi, VXF_STATE_SETUP, 0))
+        return -EACCES;
+
+    if (vx_info_state(vxi, VXS_SHUTDOWN))
+        return -EFAULT;
+
     old_vxi = task_get_vx_info(p);
     if (old_vxi == vxi)
         goto out;

-    vxdprintk("vx_migrate_task(%p,%p[#%d.%d)\n", p, vxi,
-        vxi->vx_id, atomic_read(&vxi->vx_usecnt));
-
     if (!(ret = vx_migrate_user(p, vxi))) {
+        int openfd;
+        task_lock(p);
+        openfd = vx_openfd_task(p);
+
         if (old_vxi) {
-            atomic_dec(&old_vxi->cacct.nr_threads);
-            atomic_dec(&old_vxi->limit.res[RLIMIT_NPROC]);
-        }
-        atomic_inc(&vxi->cacct.nr_threads);
-        atomic_inc(&vxi->limit.res[RLIMIT_NPROC]);
-        atomic_add(vx_nofiles_task(p), &vxi->limit.res[RLIMIT_NOFILE]);
-        atomic_add(vx_openfd_task(p), &vxi->limit.res[RLIMIT_OPENFD]);
-        /* should be handled in set_vx_info !! */
-        if (old_vxi)
+            atomic_dec(&old_vxi->cvirt.nr_threads);
+            atomic_dec(&old_vxi->cvirt.nr_running);
+            __rlim_dec(&old_vxi->limit, RLIMIT_NPROC);
+            /* FIXME: what about the struct files here? */
+            __rlim_sub(&old_vxi->limit, VLIMIT_OPENFD, openfd);
+            /* account for the executable */
+            __rlim_dec(&old_vxi->limit, VLIMIT_DENTRY);
+        }
+        atomic_inc(&vxi->cvirt.nr_threads);
+        atomic_inc(&vxi->cvirt.nr_running);
+        __rlim_inc(&vxi->limit, RLIMIT_NPROC);
+        /* FIXME: what about the struct files here? */
+        __rlim_add(&vxi->limit, VLIMIT_OPENFD, openfd);
+        /* account for the executable */
+        __rlim_inc(&vxi->limit, VLIMIT_DENTRY);
+
+        if (old_vxi) {
+            release_vx_info(old_vxi, p);
             clr_vx_info(&p->vx_info);
+        }
+        claim_vx_info(vxi, p);
         set_vx_info(&p->vx_info, vxi);
         p->xid = vxi->vx_id;
-        vx_mask_bcaps(p);
+
+        vxdprintk(VXD_CBIT(xid, 5),
+            "moved task %p into vxi:%p[#%d]",
+            p, vxi, vxi->vx_id);
+
+        vx_mask_cap_bset(vxi, p);
         task_unlock(p);
-        put_vx_info(old_vxi);
+        /* hack for *spaces to provide compatibility */
+        if (unshare) {
+            ret = sys_unshare(CLONE_NEWUTS|CLONE_NEWIPC);
+            vx_set_space(vxi, CLONE_NEWUTS|CLONE_NEWIPC);
+        }
     }
 out:
     put_vx_info(old_vxi);
     return ret;
 }

+int vx_set_reaper(struct vx_info *vxi, struct task_struct *p)
+{
+    struct task_struct *old_reaper;
+
+    if (!vxi)
+        return -EINVAL;
+
+    vxdprintk(VXD_CBIT(xid, 6),
+        "vx_set_reaper(%p[#%d],%p[#%d,%d])",
+        vxi, vxi->vx_id, p, p->xid, p->pid);
+
+    old_reaper = vxi->vx_reaper;
+    if (old_reaper == p)
+        return 0;
+
+    /* set new child reaper */
+    get_task_struct(p);
+    vxi->vx_reaper = p;
+    put_task_struct(old_reaper);
+    return 0;
+}
+
 int vx_set_init(struct vx_info *vxi, struct task_struct *p)
 {
     if (!vxi)
         return -EINVAL;
-    if (vxi->vx_initpid)
-        return -EPERM;
-    vxi->vx_initpid = p->tgid;
+
+    vxdprintk(VXD_CBIT(xid, 6),
+        "vx_set_init(%p[#%d],%p[#%d,%d,%d])",
+        vxi, vxi->vx_id, p, p->xid, p->pid, p->tgid);
+
+    vxi->vx_flags &= ~VXF_STATE_INIT;
+    vxi->vx_initpid = p->tgid;
     return 0;
 }

+void vx_exit_init(struct vx_info *vxi, struct task_struct *p, int code)
+{
+    vxdprintk(VXD_CBIT(xid, 6),
+        "vx_exit_init(%p[#%d],%p[#%d,%d,%d])",
+        vxi, vxi->vx_id, p, p->xid, p->pid, p->tgid);
+
+    vxi->exit_code = code;
+    vxi->vx_initpid = 0;
+}
+
+
+void vx_set_persistent(struct vx_info *vxi)
+{
+    vxdprintk(VXD_CBIT(xid, 6),
+        "vx_set_persistent(%p[#%d])", vxi, vxi->vx_id);
+
+    get_vx_info(vxi);
+    claim_vx_info(vxi, NULL);
+}
+
+void vx_clear_persistent(struct vx_info *vxi)
+{
+    vxdprintk(VXD_CBIT(xid, 6),
+        "vx_clear_persistent(%p[#%d])", vxi, vxi->vx_id);
+
+    release_vx_info(vxi, NULL);
+    put_vx_info(vxi);
+}
+
+void vx_update_persistent(struct vx_info *vxi)
+{
+    if (vx_info_flags(vxi, VXF_PERSISTENT, 0))
+        vx_set_persistent(vxi);
+    else
+        vx_clear_persistent(vxi);
+}
+
+
+/* task must be current or locked */
+
+void exit_vx_info(struct task_struct *p, int code)
+{
+    struct vx_info *vxi = p->vx_info;
+
+    if (vxi) {
+        atomic_dec(&vxi->cvirt.nr_threads);
+        vx_nproc_dec(p);
+
+        vxi->exit_code = code;
+        release_vx_info(vxi, p);
+    }
+}
+
+void exit_vx_info_early(struct task_struct *p, int code)
+{
+    struct vx_info *vxi = p->vx_info;
+
+    if (vxi) {
+        if (vxi->vx_initpid == p->tgid)
+            vx_exit_init(vxi, p, code);
+        if (vxi->vx_reaper == p)
+            vx_set_reaper(vxi, init_pid_ns.child_reaper);
+    }
+}
+

 /* vserver syscall commands below here */

@@ -485,42 +835,44 @@ int vx_set_init(struct vx_info *vxi, struct task_struct *p)

 int vc_task_xid(uint32_t id, void __user *data)
 {
-    xid_t xid;
+    xid_t xid;

-    if (id) {
-        struct task_struct *tsk;
+    if (id) {
+        struct task_struct *tsk;

-        if (!vx_check(0, VX_ADMIN|VX_WATCH))
-            return -EPERM;
+        if (!vx_check(0, VS_ADMIN|VS_WATCH))
+            return -EPERM;

-        read_lock(&tasklist_lock);
-        tsk = find_task_by_pid(id);
-        xid = (tsk) ? tsk->xid : -ESRCH;
-        read_unlock(&tasklist_lock);
-    }
-    else
-        xid = current->xid;
-    return xid;
+        read_lock(&tasklist_lock);
+        tsk = find_task_by_real_pid(id);
+        xid = (tsk) ? tsk->xid : -ESRCH;
+        read_unlock(&tasklist_lock);
+    }
+    else
+        xid = vx_current_xid();
+    return xid;
 }

-int vc_vx_info(uint32_t id, void __user *data)
+int vc_vx_info(struct vx_info *vxi, void __user *data)
 {
-    struct vx_info *vxi;
     struct vcmd_vx_info_v0 vc_data;

-    if (!vx_check(0, VX_ADMIN))
-        return -ENOSYS;
-    if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_RESOURCE))
-        return -EPERM;
-
-    vxi = locate_vx_info(id);
-    if (!vxi)
-        return -ESRCH;
-
     vc_data.xid = vxi->vx_id;
     vc_data.initpid = vxi->vx_initpid;
-    put_vx_info(vxi);
+
+    if (copy_to_user (data, &vc_data, sizeof(vc_data)))
+        return -EFAULT;
+    return 0;
+}
+
+
+int vc_ctx_stat(struct vx_info *vxi, void __user *data)
+{
+    struct vcmd_ctx_stat_v0 vc_data;
+
+    vc_data.usecnt = atomic_read(&vxi->vx_usecnt);
+    vc_data.tasks = atomic_read(&vxi->vx_tasks);

     if (copy_to_user (data, &vc_data, sizeof(vc_data)))
         return -EFAULT;
@@ -532,157 +884,213 @@ int vc_vx_info(struct vx_info *vxi, void __user *data)

 int vc_ctx_create(uint32_t xid, void __user *data)
 {
+    struct vcmd_ctx_create vc_data = { .flagword = VXF_INIT_SET };
     struct vx_info *new_vxi;
     int ret;

-    if (!capable(CAP_SYS_ADMIN))
-        return -EPERM;
+    if (data && copy_from_user (&vc_data, data, sizeof(vc_data)))
+        return -EFAULT;

-    if ((xid >= MIN_D_CONTEXT) && (xid != VX_DYNAMIC_ID))
+    if ((xid > MAX_S_CONTEXT) && (xid != VX_DYNAMIC_ID))
         return -EINVAL;
-
-    if (xid < 1)
+    if (xid < 2)
         return -EINVAL;

-    new_vxi = __loc_vx_info(xid, &ret);
-    if (!new_vxi)
-        return ret;
-    if (!(new_vxi->vx_flags & VXF_STATE_SETUP)) {
-        ret = -EEXIST;
-        goto out_put;
-    }
+    new_vxi = __create_vx_info(xid);
+    if (IS_ERR(new_vxi))
+        return PTR_ERR(new_vxi);
+
+    /* initial flags */
+    new_vxi->vx_flags = vc_data.flagword;
+    ret = -ENOEXEC;
+    if (vs_state_change(new_vxi, VSC_STARTUP))
+        goto out;
+
+    ret = vx_migrate_task(current, new_vxi, (!data));
+    if (ret)
+        goto out;
+
+    /* return context id on success */
     ret = new_vxi->vx_id;
-    vx_migrate_task(current, new_vxi);
-    /* if this fails, we might end up with a hashed vx_info */
-out_put:
+
+    /* get a reference for persistent contexts */
+    if ((vc_data.flagword & VXF_PERSISTENT))
+        vx_set_persistent(new_vxi);
+out:
+    release_vx_info(new_vxi, NULL);
     put_vx_info(new_vxi);
     return ret;
 }

-int vc_ctx_migrate(uint32_t id, void __user *data)
+int vc_ctx_migrate(struct vx_info *vxi, void __user *data)
 {
-    struct vx_info *vxi;
-
-    if (!capable(CAP_SYS_ADMIN))
-        return -EPERM;
+    struct vcmd_ctx_migrate vc_data = { .flagword = 0 };
+    int ret;

-    /* dirty hack until Spectator becomes a cap */
-    if (id == 1) {
-        current->xid = 1;
-        return 0;
-    }
+    if (data && copy_from_user (&vc_data, data, sizeof(vc_data)))
+        return -EFAULT;

-    vxi = locate_vx_info(id);
-    if (!vxi)
-        return -ESRCH;
-    vx_migrate_task(current, vxi);
-    put_vx_info(vxi);
-    return 0;
+    ret = vx_migrate_task(current, vxi, 0);
+    if (ret)
+        return ret;
+    if (vc_data.flagword & VXM_SET_INIT)
+        ret = vx_set_init(vxi, current);
+    if (ret)
+        return ret;
+    if (vc_data.flagword & VXM_SET_REAPER)
+        ret = vx_set_reaper(vxi, current);
+    return ret;
 }

-int vc_get_cflags(uint32_t id, void __user *data)
+int vc_get_cflags(struct vx_info *vxi, void __user *data)
 {
-    struct vx_info *vxi;
     struct vcmd_ctx_flags_v0 vc_data;

-    if (!capable(CAP_SYS_ADMIN))
-        return -EPERM;
-
-    vxi = locate_vx_info(id);
-    if (!vxi)
-        return -ESRCH;
-
     vc_data.flagword = vxi->vx_flags;

     /* special STATE flag handling */
-    vc_data.mask = vx_mask_flags(~0UL, vxi->vx_flags, VXF_ONE_TIME);
-
-    put_vx_info(vxi);
+    vc_data.mask = vs_mask_flags(~0UL, vxi->vx_flags, VXF_ONE_TIME);

     if (copy_to_user (data, &vc_data, sizeof(vc_data)))
         return -EFAULT;
     return 0;
 }

-int vc_set_cflags(uint32_t id, void __user *data)
+int vc_set_cflags(struct vx_info *vxi, void __user *data)
 {
-    struct vx_info *vxi;
     struct vcmd_ctx_flags_v0 vc_data;
     uint64_t mask, trigger;

-    if (!capable(CAP_SYS_ADMIN))
-        return -EPERM;
     if (copy_from_user (&vc_data, data, sizeof(vc_data)))
         return -EFAULT;

-    vxi = locate_vx_info(id);
-    if (!vxi)
-        return -ESRCH;
-
     /* special STATE flag handling */
-    mask = vx_mask_mask(vc_data.mask, vxi->vx_flags, VXF_ONE_TIME);
+    mask = vs_mask_mask(vc_data.mask, vxi->vx_flags, VXF_ONE_TIME);
     trigger = (mask & vxi->vx_flags) ^ (mask & vc_data.flagword);

-    if (trigger & VXF_STATE_SETUP)
-        vx_mask_bcaps(current);
-    if (trigger & VXF_STATE_INIT)
-        if (vxi == current->vx_info)
-            vx_set_init(vxi, current);
+    if (vxi == current->vx_info) {
+        if (trigger & VXF_STATE_SETUP)
+            vx_mask_cap_bset(vxi, current);
+        if (trigger & VXF_STATE_INIT) {
+            int ret;
+
+            ret = vx_set_init(vxi, current);
+            if (ret)
+                return ret;
+            ret = vx_set_reaper(vxi, current);
+            if (ret)
+                return ret;
+        }
+    }

-    vxi->vx_flags = vx_mask_flags(vxi->vx_flags,
+    vxi->vx_flags = vs_mask_flags(vxi->vx_flags,
         vc_data.flagword, mask);
-    put_vx_info(vxi);
+    if (trigger & VXF_PERSISTENT)
+        vx_update_persistent(vxi);
+
     return 0;
 }

-int vc_get_ccaps(uint32_t id, void __user *data)
+static int do_get_caps(struct vx_info *vxi, uint64_t *bcaps, uint64_t *ccaps)
+{
+    if (bcaps)
+        *bcaps = vxi->vx_bcaps;
+    if (ccaps)
+        *ccaps = vxi->vx_ccaps;
+
+    return 0;
+}
+
+int vc_get_ccaps_v0(struct vx_info *vxi, void __user *data)
 {
-    struct vx_info *vxi;
     struct vcmd_ctx_caps_v0 vc_data;
+    int ret;

-    if (!capable(CAP_SYS_ADMIN))
-        return -EPERM;
+    ret = do_get_caps(vxi, &vc_data.bcaps, &vc_data.ccaps);
+    if (ret)
+        return ret;
+    vc_data.cmask = ~0UL;

-    vxi = locate_vx_info(id);
-    if (!vxi)
-        return -ESRCH;
+    if (copy_to_user (data, &vc_data, sizeof(vc_data)))
+        return -EFAULT;
+    return 0;
+}

-    vc_data.bcaps = vxi->vx_bcaps;
-    vc_data.ccaps = vxi->vx_ccaps;
+int vc_get_ccaps(struct vx_info *vxi, void __user *data)
+{
+    struct vcmd_ctx_caps_v1 vc_data;
+    int ret;
+
+    ret = do_get_caps(vxi, NULL, &vc_data.ccaps);
+    if (ret)
+        return ret;
     vc_data.cmask = ~0UL;
-    put_vx_info(vxi);

     if (copy_to_user (data, &vc_data, sizeof(vc_data)))
         return -EFAULT;
     return 0;
 }

-int vc_set_ccaps(uint32_t id, void __user *data)
+static int do_set_caps(struct vx_info *vxi,
+    uint64_t bcaps, uint64_t bmask, uint64_t ccaps, uint64_t cmask)
+{
+    vxi->vx_bcaps = vs_mask_flags(vxi->vx_bcaps, bcaps, bmask);
+    vxi->vx_ccaps = vs_mask_flags(vxi->vx_ccaps, ccaps, cmask);
+
+    return 0;
+}
+
+int vc_set_ccaps_v0(struct vx_info *vxi, void __user *data)
 {
-    struct vx_info *vxi;
     struct vcmd_ctx_caps_v0 vc_data;

-    if (!capable(CAP_SYS_ADMIN))
-        return -EPERM;
     if (copy_from_user (&vc_data, data, sizeof(vc_data)))
         return -EFAULT;

-    vxi = locate_vx_info(id);
-    if (!vxi)
-        return -ESRCH;
-
-    vxi->vx_bcaps &= vc_data.bcaps;
-    vxi->vx_ccaps = vx_mask_flags(vxi->vx_ccaps,
+    /* simulate old &= behaviour for bcaps */
+    return do_set_caps(vxi, 0, ~vc_data.bcaps,
         vc_data.ccaps, vc_data.cmask);
-    put_vx_info(vxi);
+}
+
+int vc_set_ccaps(struct vx_info *vxi, void __user *data)
+{
+    struct vcmd_ctx_caps_v1 vc_data;
+
+    if (copy_from_user (&vc_data, data, sizeof(vc_data)))
+        return -EFAULT;
+
+    return do_set_caps(vxi, 0, 0, vc_data.ccaps, vc_data.cmask);
+}
+
+int vc_get_bcaps(struct vx_info *vxi, void __user *data)
+{
+    struct vcmd_bcaps vc_data;
+    int ret;
+
+    ret = do_get_caps(vxi, &vc_data.bcaps, NULL);
+    if (ret)
+        return ret;
+    vc_data.bmask = ~0UL;
+
+    if (copy_to_user (data, &vc_data, sizeof(vc_data)))
+        return -EFAULT;
     return 0;
 }

+int vc_set_bcaps(struct vx_info *vxi, void __user *data)
+{
+    struct vcmd_bcaps vc_data;
+
+    if (copy_from_user (&vc_data, data, sizeof(vc_data)))
+        return -EFAULT;
+
+    return do_set_caps(vxi, vc_data.bcaps, vc_data.bmask, 0, 0);
+}
+

 #include

-EXPORT_SYMBOL_GPL(rcu_free_vx_info);
-EXPORT_SYMBOL_GPL(vx_info_hash_lock);
+EXPORT_SYMBOL_GPL(free_vx_info);