X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=kernel%2Fvserver%2Fcontext.c;h=d38b23cb0ba334acdff78dbe324588dec4e906bd;hb=97bf2856c6014879bd04983a3e9dfcdac1e7fe85;hp=c897a8697c77658fc89baa8b145a58c9bc0ced3d;hpb=c7b5ebbddf7bcd3651947760f423e3783bbe6573;p=linux-2.6.git diff --git a/kernel/vserver/context.c b/kernel/vserver/context.c index c897a8697..d38b23cb0 100644 --- a/kernel/vserver/context.c +++ b/kernel/vserver/context.c @@ -3,7 +3,7 @@ * * Virtual Server: Context Support * - * Copyright (C) 2003-2004 Herbert Pötzl + * Copyright (C) 2003-2007 Herbert Pötzl * * V0.01 context helper * V0.02 vx_ctx_kill syscall command @@ -13,21 +13,54 @@ * V0.06 task_xid and info commands * V0.07 context flags and caps * V0.08 switch to RCU based hash + * V0.09 revert to non RCU for now + * V0.10 and back to working RCU hash + * V0.11 and back to locking again + * V0.12 referenced context store + * V0.13 separate per cpu data + * V0.14 changed vcmds to vxi arg + * V0.15 added context stat + * V0.16 have __create claim() the vxi * */ -#include #include -#include +#include +#include +#include + +#include +#include +#include #include -#include +#include +#include +#include +#include + #include -#include -#include -#include +#include +#include +#include #include +#include "cvirt_init.h" +#include "cacct_init.h" +#include "limit_init.h" +#include "sched_init.h" + + +atomic_t vx_global_ctotal = ATOMIC_INIT(0); +atomic_t vx_global_cactive = ATOMIC_INIT(0); + + +/* now inactive context structures */ + +static struct hlist_head vx_info_inactive = HLIST_HEAD_INIT; + +static spinlock_t vx_info_inactive_lock = SPIN_LOCK_UNLOCKED; + /* __alloc_vx_info() @@ -37,6 +70,7 @@ static struct vx_info *__alloc_vx_info(xid_t xid) { struct vx_info *new = NULL; + int cpu; vxdprintk(VXD_CBIT(xid, 0), "alloc_vx_info(%d)*", xid); @@ -46,15 +80,22 @@ static struct vx_info *__alloc_vx_info(xid_t xid) return 0; memset (new, 0, sizeof(struct vx_info)); +#ifdef CONFIG_SMP + new->ptr_pc = alloc_percpu(struct _vx_info_pc); + if (!new->ptr_pc) + goto error; +#endif new->vx_id = xid; - INIT_RCU_HEAD(&new->vx_rcu); INIT_HLIST_NODE(&new->vx_hlist); - atomic_set(&new->vx_refcnt, 0); atomic_set(&new->vx_usecnt, 0); + atomic_set(&new->vx_tasks, 0); new->vx_parent = NULL; new->vx_state = 0; - new->vx_lock = SPIN_LOCK_UNLOCKED; - init_waitqueue_head(&new->vx_exit); + init_waitqueue_head(&new->vx_wait); + + /* prepare reaper */ + get_task_struct(init_pid_ns.child_reaper); + new->vx_reaper = init_pid_ns.child_reaper; /* rest of init goes here */ vx_info_init_limit(&new->limit); @@ -62,14 +103,32 @@ static struct vx_info *__alloc_vx_info(xid_t xid) vx_info_init_cvirt(&new->cvirt); vx_info_init_cacct(&new->cacct); + /* per cpu data structures */ + for_each_possible_cpu(cpu) { + vx_info_init_sched_pc( + &vx_per_cpu(new, sched_pc, cpu), cpu); + vx_info_init_cvirt_pc( + &vx_per_cpu(new, cvirt_pc, cpu), cpu); + } - new->vx_flags = VXF_STATE_SETUP|VXF_STATE_INIT; + new->vx_flags = VXF_INIT_SET; new->vx_bcaps = CAP_INIT_EFF_SET; new->vx_ccaps = 0; + new->vx_cap_bset = cap_bset; + + new->reboot_cmd = 0; + new->exit_code = 0; vxdprintk(VXD_CBIT(xid, 0), "alloc_vx_info(%d) = %p", xid, new); + vxh_alloc_vx_info(new); + atomic_inc(&vx_global_ctotal); return new; +#ifdef CONFIG_SMP +error: + kfree(new); + return 0; +#endif } /* __dealloc_vx_info() @@ -78,10 +137,12 @@ static struct vx_info *__alloc_vx_info(xid_t xid) static void __dealloc_vx_info(struct vx_info *vxi) { + int cpu; + vxdprintk(VXD_CBIT(xid, 0), "dealloc_vx_info(%p)", vxi); + vxh_dealloc_vx_info(vxi); - vxi->vx_hlist.next = LIST_POISON1; vxi->vx_id = -1; vx_info_exit_limit(&vxi->limit); @@ -89,68 +150,63 @@ static void __dealloc_vx_info(struct vx_info *vxi) vx_info_exit_cvirt(&vxi->cvirt); vx_info_exit_cacct(&vxi->cacct); - - BUG_ON(atomic_read(&vxi->vx_usecnt)); - BUG_ON(atomic_read(&vxi->vx_refcnt)); - - BUG_ON(vx_info_state(vxi, VXS_HASHED)); - // BUG_ON(!vx_state(vxi, VXS_DEFUNCT)); + for_each_possible_cpu(cpu) { + vx_info_exit_sched_pc( + &vx_per_cpu(vxi, sched_pc, cpu), cpu); + vx_info_exit_cvirt_pc( + &vx_per_cpu(vxi, cvirt_pc, cpu), cpu); + } vxi->vx_state |= VXS_RELEASED; + +#ifdef CONFIG_SMP + free_percpu(vxi->ptr_pc); +#endif kfree(vxi); + atomic_dec(&vx_global_ctotal); } -static inline int __free_vx_info(struct vx_info *vxi) +static void __shutdown_vx_info(struct vx_info *vxi) { - int usecnt, refcnt; - - BUG_ON(!vxi); - - usecnt = atomic_read(&vxi->vx_usecnt); - BUG_ON(usecnt < 0); - - refcnt = atomic_read(&vxi->vx_refcnt); - BUG_ON(refcnt < 0); - - if (!usecnt) - __dealloc_vx_info(vxi); - return usecnt; -} + struct nsproxy *nsproxy; + struct fs_struct *fs; -#if 0 + might_sleep(); -static void __rcu_free_vx_info(struct rcu_head *head) -{ - struct vx_info *vxi = container_of(head, struct vx_info, vx_rcu); + vxi->vx_state |= VXS_SHUTDOWN; + vs_state_change(vxi, VSC_SHUTDOWN); - BUG_ON(!head); - vxdprintk(VXD_CBIT(xid, 3), - "rcu_free_vx_info(%p): uc=%d", vxi, - atomic_read(&vxi->vx_usecnt)); + nsproxy = xchg(&vxi->vx_nsproxy, NULL); + fs = xchg(&vxi->vx_fs, NULL); - __free_vx_info(vxi); + if (nsproxy) + put_nsproxy(nsproxy); + if (fs) + put_fs_struct(fs); } -#endif +/* exported stuff */ void free_vx_info(struct vx_info *vxi) { - struct namespace *namespace; - struct fs_struct *fs; + unsigned long flags; /* context shutdown is mandatory */ - // BUG_ON(vxi->vx_state != VXS_SHUTDOWN); + BUG_ON(!vx_info_state(vxi, VXS_SHUTDOWN)); - namespace = xchg(&vxi->vx_namespace, NULL); - fs = xchg(&vxi->vx_fs, NULL); + BUG_ON(atomic_read(&vxi->vx_usecnt)); + BUG_ON(atomic_read(&vxi->vx_tasks)); - if (namespace) - put_namespace(namespace); - if (fs) - put_fs_struct(fs); + BUG_ON(vx_info_state(vxi, VXS_HASHED)); + + BUG_ON(vxi->vx_nsproxy); + BUG_ON(vxi->vx_fs); + + spin_lock_irqsave(&vx_info_inactive_lock, flags); + hlist_del(&vxi->vx_hlist); + spin_unlock_irqrestore(&vx_info_inactive_lock, flags); - BUG_ON(__free_vx_info(vxi)); - // call_rcu(&i->vx_rcu, __rcu_free_vx_info); + __dealloc_vx_info(vxi); } @@ -158,7 +214,8 @@ void free_vx_info(struct vx_info *vxi) #define VX_HASH_SIZE 13 -struct hlist_head vx_info_hash[VX_HASH_SIZE]; +static struct hlist_head vx_info_hash[VX_HASH_SIZE] = + { [0 ... VX_HASH_SIZE-1] = HLIST_HEAD_INIT }; static spinlock_t vx_info_hash_lock = SPIN_LOCK_UNLOCKED; @@ -179,12 +236,18 @@ static inline void __hash_vx_info(struct vx_info *vxi) { struct hlist_head *head; + vxd_assert_lock(&vx_info_hash_lock); vxdprintk(VXD_CBIT(xid, 4), "__hash_vx_info: %p[#%d]", vxi, vxi->vx_id); - get_vx_info(vxi); + vxh_hash_vx_info(vxi); + + /* context must not be hashed */ + BUG_ON(vx_info_state(vxi, VXS_HASHED)); + vxi->vx_state |= VXS_HASHED; head = &vx_info_hash[__hashval(vxi->vx_id)]; - hlist_add_head_rcu(&vxi->vx_hlist, head); + hlist_add_head(&vxi->vx_hlist, head); + atomic_inc(&vx_global_cactive); } /* __unhash_vx_info() @@ -194,33 +257,53 @@ static inline void __hash_vx_info(struct vx_info *vxi) static inline void __unhash_vx_info(struct vx_info *vxi) { + unsigned long flags; + + vxd_assert_lock(&vx_info_hash_lock); vxdprintk(VXD_CBIT(xid, 4), - "__unhash_vx_info: %p[#%d]", vxi, vxi->vx_id); + "__unhash_vx_info: %p[#%d.%d.%d]", vxi, vxi->vx_id, + atomic_read(&vxi->vx_usecnt), atomic_read(&vxi->vx_tasks)); + vxh_unhash_vx_info(vxi); + + /* context must be hashed */ + BUG_ON(!vx_info_state(vxi, VXS_HASHED)); + /* but without tasks */ + BUG_ON(atomic_read(&vxi->vx_tasks)); + vxi->vx_state &= ~VXS_HASHED; - hlist_del_rcu(&vxi->vx_hlist); - put_vx_info(vxi); + hlist_del_init(&vxi->vx_hlist); + spin_lock_irqsave(&vx_info_inactive_lock, flags); + hlist_add_head(&vxi->vx_hlist, &vx_info_inactive); + spin_unlock_irqrestore(&vx_info_inactive_lock, flags); + atomic_dec(&vx_global_cactive); } /* __lookup_vx_info() - * requires the rcu_read_lock() + * requires the hash_lock to be held * doesn't increment the vx_refcnt */ static inline struct vx_info *__lookup_vx_info(xid_t xid) { struct hlist_head *head = &vx_info_hash[__hashval(xid)]; struct hlist_node *pos; + struct vx_info *vxi; - hlist_for_each_rcu(pos, head) { - struct vx_info *vxi = - hlist_entry(pos, struct vx_info, vx_hlist); + vxd_assert_lock(&vx_info_hash_lock); + hlist_for_each(pos, head) { + vxi = hlist_entry(pos, struct vx_info, vx_hlist); - if ((vxi->vx_id == xid) && - vx_info_state(vxi, VXS_HASHED)) - return vxi; + if (vxi->vx_id == xid) + goto found; } - return NULL; + vxi = NULL; +found: + vxdprintk(VXD_CBIT(xid, 0), + "__lookup_vx_info(#%u): %p[#%u]", + xid, vxi, vxi?vxi->vx_id:0); + vxh_lookup_vx_info(vxi, xid); + return vxi; } @@ -234,6 +317,7 @@ static inline xid_t __vx_dynamic_id(void) static xid_t seq = MAX_S_CONTEXT; xid_t barrier = seq; + vxd_assert_lock(&vx_info_hash_lock); do { if (++seq > MAX_S_CONTEXT) seq = MIN_D_CONTEXT; @@ -246,6 +330,8 @@ static inline xid_t __vx_dynamic_id(void) return 0; } +#ifdef CONFIG_VSERVER_LEGACY + /* __loc_vx_info() * locate or create the requested context @@ -262,16 +348,22 @@ static struct vx_info * __loc_vx_info(int id, int *err) return NULL; } + /* required to make dynamic xids unique */ spin_lock(&vx_info_hash_lock); /* dynamic context requested */ if (id == VX_DYNAMIC_ID) { +#ifdef CONFIG_VSERVER_DYNAMIC_IDS id = __vx_dynamic_id(); if (!id) { printk(KERN_ERR "no dynamic context available.\n"); goto out_unlock; } new->vx_id = id; +#else + printk(KERN_ERR "dynamic contexts disabled.\n"); + goto out_unlock; +#endif } /* existing context requested */ else if ((vxi = __lookup_vx_info(id))) { @@ -299,11 +391,81 @@ static struct vx_info * __loc_vx_info(int id, int *err) out_unlock: spin_unlock(&vx_info_hash_lock); + vxh_loc_vx_info(vxi, id); if (new) __dealloc_vx_info(new); return vxi; } +#endif + +/* __create_vx_info() + + * create the requested context + * get(), claim() and hash it */ + +static struct vx_info * __create_vx_info(int id) +{ + struct vx_info *new, *vxi = NULL; + + vxdprintk(VXD_CBIT(xid, 1), "create_vx_info(%d)*", id); + + if (!(new = __alloc_vx_info(id))) + return ERR_PTR(-ENOMEM); + + /* required to make dynamic xids unique */ + spin_lock(&vx_info_hash_lock); + + /* dynamic context requested */ + if (id == VX_DYNAMIC_ID) { +#ifdef CONFIG_VSERVER_DYNAMIC_IDS + id = __vx_dynamic_id(); + if (!id) { + printk(KERN_ERR "no dynamic context available.\n"); + vxi = ERR_PTR(-EAGAIN); + goto out_unlock; + } + new->vx_id = id; +#else + printk(KERN_ERR "dynamic contexts disabled.\n"); + vxi = ERR_PTR(-EINVAL); + goto out_unlock; +#endif + } + /* static context requested */ + else if ((vxi = __lookup_vx_info(id))) { + vxdprintk(VXD_CBIT(xid, 0), + "create_vx_info(%d) = %p (already there)", id, vxi); + if (vx_info_flags(vxi, VXF_STATE_SETUP, 0)) + vxi = ERR_PTR(-EBUSY); + else + vxi = ERR_PTR(-EEXIST); + goto out_unlock; + } +#ifdef CONFIG_VSERVER_DYNAMIC_IDS + /* dynamic xid creation blocker */ + else if (id >= MIN_D_CONTEXT) { + vxdprintk(VXD_CBIT(xid, 0), + "create_vx_info(%d) (dynamic rejected)", id); + vxi = ERR_PTR(-EINVAL); + goto out_unlock; + } +#endif + + /* new context */ + vxdprintk(VXD_CBIT(xid, 0), + "create_vx_info(%d) = %p (new)", id, new); + claim_vx_info(new, NULL); + __hash_vx_info(get_vx_info(new)); + vxi = new, new = NULL; + +out_unlock: + spin_unlock(&vx_info_hash_lock); + vxh_create_vx_info(IS_ERR(vxi)?NULL:vxi, id); + if (new) + __dealloc_vx_info(new); + return vxi; +} /* exported stuff */ @@ -311,54 +473,50 @@ out_unlock: void unhash_vx_info(struct vx_info *vxi) { + __shutdown_vx_info(vxi); spin_lock(&vx_info_hash_lock); __unhash_vx_info(vxi); spin_unlock(&vx_info_hash_lock); + __wakeup_vx_info(vxi); } -/* locate_vx_info() + +/* lookup_vx_info() * search for a vx_info and get() it * negative id means current */ -struct vx_info *locate_vx_info(int id) +struct vx_info *lookup_vx_info(int id) { - struct vx_info *vxi; + struct vx_info *vxi = NULL; if (id < 0) { vxi = get_vx_info(current->vx_info); - } else { - rcu_read_lock(); + } else if (id > 1) { + spin_lock(&vx_info_hash_lock); vxi = get_vx_info(__lookup_vx_info(id)); - rcu_read_unlock(); + spin_unlock(&vx_info_hash_lock); } return vxi; } -/* vx_info_is_hashed() +/* xid_is_hashed() * verify that xid is still hashed */ -int vx_info_is_hashed(xid_t xid) +int xid_is_hashed(xid_t xid) { int hashed; - rcu_read_lock(); + spin_lock(&vx_info_hash_lock); hashed = (__lookup_vx_info(xid) != NULL); - rcu_read_unlock(); + spin_unlock(&vx_info_hash_lock); return hashed; } #ifdef CONFIG_VSERVER_LEGACY -#if 0 -struct vx_info *alloc_vx_info(xid_t xid) -{ - return __alloc_vx_info(xid); -} -#endif - -struct vx_info *locate_or_create_vx_info(int id) +struct vx_info *lookup_or_create_vx_info(int id) { int err; @@ -369,16 +527,29 @@ struct vx_info *locate_or_create_vx_info(int id) #ifdef CONFIG_PROC_FS +/* get_xid_list() + + * get a subset of hashed xids for proc + * assumes size is at least one */ + int get_xid_list(int index, unsigned int *xids, int size) { int hindex, nr_xids = 0; - rcu_read_lock(); + /* only show current and children */ + if (!vx_check(0, VS_ADMIN|VS_WATCH)) { + if (index > 0) + return 0; + xids[nr_xids] = vx_current_xid(); + return 1; + } + for (hindex = 0; hindex < VX_HASH_SIZE; hindex++) { struct hlist_head *head = &vx_info_hash[hindex]; struct hlist_node *pos; - hlist_for_each_rcu(pos, head) { + spin_lock(&vx_info_hash_lock); + hlist_for_each(pos, head) { struct vx_info *vxi; if (--index > 0) @@ -386,22 +557,45 @@ int get_xid_list(int index, unsigned int *xids, int size) vxi = hlist_entry(pos, struct vx_info, vx_hlist); xids[nr_xids] = vxi->vx_id; - if (++nr_xids >= size) + if (++nr_xids >= size) { + spin_unlock(&vx_info_hash_lock); goto out; + } } + /* keep the lock time short */ + spin_unlock(&vx_info_hash_lock); } out: - rcu_read_unlock(); return nr_xids; } #endif +#ifdef CONFIG_VSERVER_DEBUG + +void dump_vx_info_inactive(int level) +{ + struct hlist_node *entry, *next; + + hlist_for_each_safe(entry, next, &vx_info_inactive) { + struct vx_info *vxi = + list_entry(entry, struct vx_info, vx_hlist); + + dump_vx_info(vxi, level); + } +} + +#endif + int vx_migrate_user(struct task_struct *p, struct vx_info *vxi) { struct user_struct *new_user, *old_user; if (!p || !vxi) BUG(); + + if (vx_info_flags(vxi, VXF_INFO_PRIVATE, 0)) + return -EACCES; + new_user = alloc_uid(vxi->vx_id, p->uid); if (!new_user) return -ENOMEM; @@ -416,47 +610,28 @@ int vx_migrate_user(struct task_struct *p, struct vx_info *vxi) return 0; } -void vx_mask_bcaps(struct task_struct *p) +void vx_mask_cap_bset(struct vx_info *vxi, struct task_struct *p) { - struct vx_info *vxi = p->vx_info; - - p->cap_effective &= vxi->vx_bcaps; - p->cap_inheritable &= vxi->vx_bcaps; - p->cap_permitted &= vxi->vx_bcaps; + p->cap_effective &= vxi->vx_cap_bset; + p->cap_inheritable &= vxi->vx_cap_bset; + p->cap_permitted &= vxi->vx_cap_bset; } #include -static inline int vx_nofiles_task(struct task_struct *tsk) -{ - struct files_struct *files = tsk->files; - unsigned long *obptr; - int count, total; - - spin_lock(&files->file_lock); - obptr = files->open_fds->fds_bits; - count = files->max_fds / (sizeof(unsigned long) * 8); - for (total = 0; count > 0; count--) { - if (*obptr) - total += hweight_long(*obptr); - obptr++; - } - spin_unlock(&files->file_lock); - return total; -} - -#if 0 - -static inline int vx_openfd_task(struct task_struct *tsk) +static int vx_openfd_task(struct task_struct *tsk) { struct files_struct *files = tsk->files; + struct fdtable *fdt; const unsigned long *bptr; int count, total; + /* no rcu_read_lock() because of spin_lock() */ spin_lock(&files->file_lock); - bptr = files->open_fds->fds_bits; - count = files->max_fds / (sizeof(unsigned long) * 8); + fdt = files_fdtable(files); + bptr = fdt->open_fds->fds_bits; + count = fdt->max_fds / (sizeof(unsigned long) * 8); for (total = 0; count > 0; count--) { if (*bptr) total += hweight_long(*bptr); @@ -466,14 +641,18 @@ static inline int vx_openfd_task(struct task_struct *tsk) return total; } -#endif + +/* for *space compatibility */ + +asmlinkage long sys_unshare(unsigned long); /* * migrate task to new context * gets vxi, puts old_vxi on change + * optionally unshares namespaces (hack) */ -int vx_migrate_task(struct task_struct *p, struct vx_info *vxi) +int vx_migrate_task(struct task_struct *p, struct vx_info *vxi, int unshare) { struct vx_info *old_vxi; int ret = 0; @@ -481,71 +660,171 @@ int vx_migrate_task(struct task_struct *p, struct vx_info *vxi) if (!p || !vxi) BUG(); - old_vxi = task_get_vx_info(p); - if (old_vxi == vxi) - goto out; - vxdprintk(VXD_CBIT(xid, 5), "vx_migrate_task(%p,%p[#%d.%d])", p, vxi, vxi->vx_id, atomic_read(&vxi->vx_usecnt)); + if (vx_info_flags(vxi, VXF_INFO_PRIVATE, 0) && + !vx_info_flags(vxi, VXF_STATE_SETUP, 0)) + return -EACCES; + + if (vx_info_state(vxi, VXS_SHUTDOWN)) + return -EFAULT; + + old_vxi = task_get_vx_info(p); + if (old_vxi == vxi) + goto out; + if (!(ret = vx_migrate_user(p, vxi))) { - int nofiles; + int openfd; task_lock(p); - // openfd = vx_openfd_task(p); - nofiles = vx_nofiles_task(p); + openfd = vx_openfd_task(p); if (old_vxi) { atomic_dec(&old_vxi->cvirt.nr_threads); atomic_dec(&old_vxi->cvirt.nr_running); - atomic_dec(&old_vxi->limit.rcur[RLIMIT_NPROC]); + __rlim_dec(&old_vxi->limit, RLIMIT_NPROC); /* FIXME: what about the struct files here? */ - // atomic_sub(nofiles, &old_vxi->limit.rcur[RLIMIT_NOFILE]); - // atomic_sub(openfd, &old_vxi->limit.rcur[RLIMIT_OPENFD]); + __rlim_sub(&old_vxi->limit, VLIMIT_OPENFD, openfd); + /* account for the executable */ + __rlim_dec(&old_vxi->limit, VLIMIT_DENTRY); } atomic_inc(&vxi->cvirt.nr_threads); atomic_inc(&vxi->cvirt.nr_running); - atomic_inc(&vxi->limit.rcur[RLIMIT_NPROC]); + __rlim_inc(&vxi->limit, RLIMIT_NPROC); /* FIXME: what about the struct files here? */ - // atomic_add(nofiles, &vxi->limit.rcur[RLIMIT_NOFILE]); - // atomic_add(openfd, &vxi->limit.rcur[RLIMIT_OPENFD]); + __rlim_add(&vxi->limit, VLIMIT_OPENFD, openfd); + /* account for the executable */ + __rlim_inc(&vxi->limit, VLIMIT_DENTRY); + + if (old_vxi) { + release_vx_info(old_vxi, p); + clr_vx_info(&p->vx_info); + } + claim_vx_info(vxi, p); + set_vx_info(&p->vx_info, vxi); + p->xid = vxi->vx_id; vxdprintk(VXD_CBIT(xid, 5), "moved task %p into vxi:%p[#%d]", p, vxi, vxi->vx_id); - /* should be handled in set_vx_info !! */ - if (old_vxi) - clr_vx_info(&p->vx_info); - set_vx_info(&p->vx_info, vxi); - p->xid = vxi->vx_id; - vx_mask_bcaps(p); + vx_mask_cap_bset(vxi, p); task_unlock(p); - /* obsoleted by clr/set */ - // put_vx_info(old_vxi); + /* hack for *spaces to provide compatibility */ + if (unshare) { + ret = sys_unshare(CLONE_NEWUTS|CLONE_NEWIPC); + vx_set_space(vxi, CLONE_NEWUTS|CLONE_NEWIPC); + } } out: put_vx_info(old_vxi); return ret; } +int vx_set_reaper(struct vx_info *vxi, struct task_struct *p) +{ + struct task_struct *old_reaper; + + if (!vxi) + return -EINVAL; + + vxdprintk(VXD_CBIT(xid, 6), + "vx_set_reaper(%p[#%d],%p[#%d,%d])", + vxi, vxi->vx_id, p, p->xid, p->pid); + + old_reaper = vxi->vx_reaper; + if (old_reaper == p) + return 0; + + /* set new child reaper */ + get_task_struct(p); + vxi->vx_reaper = p; + put_task_struct(old_reaper); + return 0; +} + int vx_set_init(struct vx_info *vxi, struct task_struct *p) { if (!vxi) return -EINVAL; - if (vxi->vx_initpid) - return -EPERM; vxdprintk(VXD_CBIT(xid, 6), "vx_set_init(%p[#%d],%p[#%d,%d,%d])", vxi, vxi->vx_id, p, p->xid, p->pid, p->tgid); + vxi->vx_flags &= ~VXF_STATE_INIT; vxi->vx_initpid = p->tgid; return 0; } +void vx_exit_init(struct vx_info *vxi, struct task_struct *p, int code) +{ + vxdprintk(VXD_CBIT(xid, 6), + "vx_exit_init(%p[#%d],%p[#%d,%d,%d])", + vxi, vxi->vx_id, p, p->xid, p->pid, p->tgid); + + vxi->exit_code = code; + vxi->vx_initpid = 0; +} + + +void vx_set_persistent(struct vx_info *vxi) +{ + vxdprintk(VXD_CBIT(xid, 6), + "vx_set_persistent(%p[#%d])", vxi, vxi->vx_id); + + get_vx_info(vxi); + claim_vx_info(vxi, NULL); +} + +void vx_clear_persistent(struct vx_info *vxi) +{ + vxdprintk(VXD_CBIT(xid, 6), + "vx_clear_persistent(%p[#%d])", vxi, vxi->vx_id); + + release_vx_info(vxi, NULL); + put_vx_info(vxi); +} + +void vx_update_persistent(struct vx_info *vxi) +{ + if (vx_info_flags(vxi, VXF_PERSISTENT, 0)) + vx_set_persistent(vxi); + else + vx_clear_persistent(vxi); +} + + +/* task must be current or locked */ + +void exit_vx_info(struct task_struct *p, int code) +{ + struct vx_info *vxi = p->vx_info; + + if (vxi) { + atomic_dec(&vxi->cvirt.nr_threads); + vx_nproc_dec(p); + + vxi->exit_code = code; + release_vx_info(vxi, p); + } +} + +void exit_vx_info_early(struct task_struct *p, int code) +{ + struct vx_info *vxi = p->vx_info; + + if (vxi) { + if (vxi->vx_initpid == p->tgid) + vx_exit_init(vxi, p, code); + if (vxi->vx_reaper == p) + vx_set_reaper(vxi, init_pid_ns.child_reaper); + } +} + /* vserver syscall commands below here */ @@ -561,7 +840,7 @@ int vc_task_xid(uint32_t id, void __user *data) if (id) { struct task_struct *tsk; - if (!vx_check(0, VX_ADMIN|VX_WATCH)) + if (!vx_check(0, VS_ADMIN|VS_WATCH)) return -EPERM; read_lock(&tasklist_lock); @@ -570,28 +849,30 @@ int vc_task_xid(uint32_t id, void __user *data) read_unlock(&tasklist_lock); } else - xid = current->xid; + xid = vx_current_xid(); return xid; } -int vc_vx_info(uint32_t id, void __user *data) +int vc_vx_info(struct vx_info *vxi, void __user *data) { - struct vx_info *vxi; struct vcmd_vx_info_v0 vc_data; - if (!vx_check(0, VX_ADMIN)) - return -ENOSYS; - if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_RESOURCE)) - return -EPERM; - - vxi = locate_vx_info(id); - if (!vxi) - return -ESRCH; - vc_data.xid = vxi->vx_id; vc_data.initpid = vxi->vx_initpid; - put_vx_info(vxi); + + if (copy_to_user (data, &vc_data, sizeof(vc_data))) + return -EFAULT; + return 0; +} + + +int vc_ctx_stat(struct vx_info *vxi, void __user *data) +{ + struct vcmd_ctx_stat_v0 vc_data; + + vc_data.usecnt = atomic_read(&vxi->vx_usecnt); + vc_data.tasks = atomic_read(&vxi->vx_tasks); if (copy_to_user (data, &vc_data, sizeof(vc_data))) return -EFAULT; @@ -603,159 +884,213 @@ int vc_vx_info(uint32_t id, void __user *data) int vc_ctx_create(uint32_t xid, void __user *data) { + struct vcmd_ctx_create vc_data = { .flagword = VXF_INIT_SET }; struct vx_info *new_vxi; int ret; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; + if (data && copy_from_user (&vc_data, data, sizeof(vc_data))) + return -EFAULT; - if ((xid >= MIN_D_CONTEXT) && (xid != VX_DYNAMIC_ID)) + if ((xid > MAX_S_CONTEXT) && (xid != VX_DYNAMIC_ID)) return -EINVAL; - - if (xid < 1) + if (xid < 2) return -EINVAL; - new_vxi = __loc_vx_info(xid, &ret); - if (!new_vxi) - return ret; - if (!(new_vxi->vx_flags & VXF_STATE_SETUP)) { - ret = -EEXIST; - goto out_put; - } + new_vxi = __create_vx_info(xid); + if (IS_ERR(new_vxi)) + return PTR_ERR(new_vxi); + /* initial flags */ + new_vxi->vx_flags = vc_data.flagword; + + ret = -ENOEXEC; + if (vs_state_change(new_vxi, VSC_STARTUP)) + goto out; + + ret = vx_migrate_task(current, new_vxi, (!data)); + if (ret) + goto out; + + /* return context id on success */ ret = new_vxi->vx_id; - vx_migrate_task(current, new_vxi); - /* if this fails, we might end up with a hashed vx_info */ -out_put: + + /* get a reference for persistent contexts */ + if ((vc_data.flagword & VXF_PERSISTENT)) + vx_set_persistent(new_vxi); +out: + release_vx_info(new_vxi, NULL); put_vx_info(new_vxi); return ret; } -int vc_ctx_migrate(uint32_t id, void __user *data) +int vc_ctx_migrate(struct vx_info *vxi, void __user *data) { - struct vx_info *vxi; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; + struct vcmd_ctx_migrate vc_data = { .flagword = 0 }; + int ret; - /* dirty hack until Spectator becomes a cap */ - if (id == 1) { - current->xid = 1; - return 0; - } + if (data && copy_from_user (&vc_data, data, sizeof(vc_data))) + return -EFAULT; - vxi = locate_vx_info(id); - if (!vxi) - return -ESRCH; - vx_migrate_task(current, vxi); - put_vx_info(vxi); - return 0; + ret = vx_migrate_task(current, vxi, 0); + if (ret) + return ret; + if (vc_data.flagword & VXM_SET_INIT) + ret = vx_set_init(vxi, current); + if (ret) + return ret; + if (vc_data.flagword & VXM_SET_REAPER) + ret = vx_set_reaper(vxi, current); + return ret; } -int vc_get_cflags(uint32_t id, void __user *data) +int vc_get_cflags(struct vx_info *vxi, void __user *data) { - struct vx_info *vxi; struct vcmd_ctx_flags_v0 vc_data; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - vxi = locate_vx_info(id); - if (!vxi) - return -ESRCH; - vc_data.flagword = vxi->vx_flags; /* special STATE flag handling */ - vc_data.mask = vx_mask_flags(~0UL, vxi->vx_flags, VXF_ONE_TIME); - - put_vx_info(vxi); + vc_data.mask = vs_mask_flags(~0UL, vxi->vx_flags, VXF_ONE_TIME); if (copy_to_user (data, &vc_data, sizeof(vc_data))) return -EFAULT; return 0; } -int vc_set_cflags(uint32_t id, void __user *data) +int vc_set_cflags(struct vx_info *vxi, void __user *data) { - struct vx_info *vxi; struct vcmd_ctx_flags_v0 vc_data; uint64_t mask, trigger; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; if (copy_from_user (&vc_data, data, sizeof(vc_data))) return -EFAULT; - vxi = locate_vx_info(id); - if (!vxi) - return -ESRCH; - /* special STATE flag handling */ - mask = vx_mask_mask(vc_data.mask, vxi->vx_flags, VXF_ONE_TIME); + mask = vs_mask_mask(vc_data.mask, vxi->vx_flags, VXF_ONE_TIME); trigger = (mask & vxi->vx_flags) ^ (mask & vc_data.flagword); - if (trigger & VXF_STATE_SETUP) - vx_mask_bcaps(current); - if (trigger & VXF_STATE_INIT) - if (vxi == current->vx_info) - vx_set_init(vxi, current); + if (vxi == current->vx_info) { + if (trigger & VXF_STATE_SETUP) + vx_mask_cap_bset(vxi, current); + if (trigger & VXF_STATE_INIT) { + int ret; + + ret = vx_set_init(vxi, current); + if (ret) + return ret; + ret = vx_set_reaper(vxi, current); + if (ret) + return ret; + } + } - vxi->vx_flags = vx_mask_flags(vxi->vx_flags, + vxi->vx_flags = vs_mask_flags(vxi->vx_flags, vc_data.flagword, mask); - put_vx_info(vxi); + if (trigger & VXF_PERSISTENT) + vx_update_persistent(vxi); + return 0; } -int vc_get_ccaps(uint32_t id, void __user *data) +static int do_get_caps(struct vx_info *vxi, uint64_t *bcaps, uint64_t *ccaps) +{ + if (bcaps) + *bcaps = vxi->vx_bcaps; + if (ccaps) + *ccaps = vxi->vx_ccaps; + + return 0; +} + +int vc_get_ccaps_v0(struct vx_info *vxi, void __user *data) { - struct vx_info *vxi; struct vcmd_ctx_caps_v0 vc_data; + int ret; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; + ret = do_get_caps(vxi, &vc_data.bcaps, &vc_data.ccaps); + if (ret) + return ret; + vc_data.cmask = ~0UL; - vxi = locate_vx_info(id); - if (!vxi) - return -ESRCH; + if (copy_to_user (data, &vc_data, sizeof(vc_data))) + return -EFAULT; + return 0; +} - vc_data.bcaps = vxi->vx_bcaps; - vc_data.ccaps = vxi->vx_ccaps; +int vc_get_ccaps(struct vx_info *vxi, void __user *data) +{ + struct vcmd_ctx_caps_v1 vc_data; + int ret; + + ret = do_get_caps(vxi, NULL, &vc_data.ccaps); + if (ret) + return ret; vc_data.cmask = ~0UL; - put_vx_info(vxi); if (copy_to_user (data, &vc_data, sizeof(vc_data))) return -EFAULT; return 0; } -int vc_set_ccaps(uint32_t id, void __user *data) +static int do_set_caps(struct vx_info *vxi, + uint64_t bcaps, uint64_t bmask, uint64_t ccaps, uint64_t cmask) +{ + vxi->vx_bcaps = vs_mask_flags(vxi->vx_bcaps, bcaps, bmask); + vxi->vx_ccaps = vs_mask_flags(vxi->vx_ccaps, ccaps, cmask); + + return 0; +} + +int vc_set_ccaps_v0(struct vx_info *vxi, void __user *data) { - struct vx_info *vxi; struct vcmd_ctx_caps_v0 vc_data; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; if (copy_from_user (&vc_data, data, sizeof(vc_data))) return -EFAULT; - vxi = locate_vx_info(id); - if (!vxi) - return -ESRCH; - - vxi->vx_bcaps &= vc_data.bcaps; - vxi->vx_ccaps = vx_mask_flags(vxi->vx_ccaps, + /* simulate old &= behaviour for bcaps */ + return do_set_caps(vxi, 0, ~vc_data.bcaps, vc_data.ccaps, vc_data.cmask); - put_vx_info(vxi); +} + +int vc_set_ccaps(struct vx_info *vxi, void __user *data) +{ + struct vcmd_ctx_caps_v1 vc_data; + + if (copy_from_user (&vc_data, data, sizeof(vc_data))) + return -EFAULT; + + return do_set_caps(vxi, 0, 0, vc_data.ccaps, vc_data.cmask); +} + +int vc_get_bcaps(struct vx_info *vxi, void __user *data) +{ + struct vcmd_bcaps vc_data; + int ret; + + ret = do_get_caps(vxi, &vc_data.bcaps, NULL); + if (ret) + return ret; + vc_data.bmask = ~0UL; + + if (copy_to_user (data, &vc_data, sizeof(vc_data))) + return -EFAULT; return 0; } +int vc_set_bcaps(struct vx_info *vxi, void __user *data) +{ + struct vcmd_bcaps vc_data; + + if (copy_from_user (&vc_data, data, sizeof(vc_data))) + return -EFAULT; + + return do_set_caps(vxi, vc_data.bcaps, vc_data.bmask, 0, 0); +} + #include -// EXPORT_SYMBOL_GPL(rcu_free_vx_info); EXPORT_SYMBOL_GPL(free_vx_info); -EXPORT_SYMBOL_GPL(vx_info_hash_lock); -EXPORT_SYMBOL_GPL(unhash_vx_info);