2 * linux/kernel/vserver/context.c
4 * Virtual Server: Context Support
6 * Copyright (C) 2003-2005 Herbert Pötzl
9 * V0.02 vx_ctx_kill syscall command
10 * V0.03 replaced context_info calls
11 * V0.04 redesign of struct (de)alloc
12 * V0.05 rlimit basic implementation
13 * V0.06 task_xid and info commands
14 * V0.07 context flags and caps
15 * V0.08 switch to RCU based hash
16 * V0.09 revert to non RCU for now
17 * V0.10 and back to working RCU hash
18 * V0.11 and back to locking again
19 * V0.12 have __create claim() the vxi
23 #include <linux/slab.h>
24 #include <linux/types.h>
25 #include <linux/namespace.h>
27 #include <linux/sched.h>
28 #include <linux/vs_base.h>
29 #include <linux/vs_context.h>
30 #include <linux/vs_limit.h>
31 #include <linux/vserver/network.h>
32 #include <linux/vserver/legacy.h>
33 #include <linux/vserver/limit.h>
34 #include <linux/vserver/debug.h>
35 #include <linux/vserver/limit_int.h>
36 #include <linux/vserver/context_cmd.h>
38 #include <linux/err.h>
39 #include <asm/errno.h>
41 #include "cvirt_init.h"
42 #include "limit_init.h"
43 #include "sched_init.h"
48 * allocate an initialized vx_info struct
49 * doesn't make it visible (hash) */
51 static struct vx_info *__alloc_vx_info(xid_t xid)
/* Allocates a vx_info for context id 'xid' and initializes its fields;
 * the struct is NOT made visible (hashing happens separately).
 * NOTE(review): this excerpt is missing lines (e.g. the opening brace
 * and the kmalloc NULL check) — verify against the full source. */
53 struct vx_info *new = NULL;
55 vxdprintk(VXD_CBIT(xid, 0), "alloc_vx_info(%d)*", xid);
57 /* would this benefit from a slab cache? */
58 new = kmalloc(sizeof(struct vx_info), GFP_KERNEL);
/* zero the whole struct before field-by-field initialization */
62 memset (new, 0, sizeof(struct vx_info));
64 INIT_HLIST_NODE(&new->vx_hlist);
/* fresh context: no users and no tasks attached yet */
65 atomic_set(&new->vx_usecnt, 0);
66 atomic_set(&new->vx_tasks, 0);
67 new->vx_parent = NULL;
69 init_waitqueue_head(&new->vx_wait);
/* pin the global child_reaper as the context's initial reaper */
72 get_task_struct(child_reaper);
73 new->vx_reaper = child_reaper;
75 /* rest of init goes here */
76 vx_info_init_limit(&new->limit);
77 vx_info_init_sched(&new->sched);
78 vx_info_init_cvirt(&new->cvirt);
79 vx_info_init_cacct(&new->cacct);
/* default flag and bounding-capability sets for a new context */
81 new->vx_flags = VXF_INIT_SET;
82 new->vx_bcaps = CAP_INIT_EFF_SET;
88 vxdprintk(VXD_CBIT(xid, 0),
89 "alloc_vx_info(%d) = %p", xid, new);
90 vxh_alloc_vx_info(new);
94 /* __dealloc_vx_info()
96 * final disposal of vx_info */
98 static void __dealloc_vx_info(struct vx_info *vxi)
/* Final disposal of a vx_info: tears down the subsystem state set up by
 * __alloc_vx_info() and marks the struct released.
 * NOTE(review): the opening brace and the final kfree are not visible
 * in this excerpt — confirm against the full source. */
100 vxdprintk(VXD_CBIT(xid, 0),
101 "dealloc_vx_info(%p)", vxi);
102 vxh_dealloc_vx_info(vxi);
/* poison the hash link so a stale list access is caught loudly */
104 vxi->vx_hlist.next = LIST_POISON1;
107 vx_info_exit_limit(&vxi->limit);
108 vx_info_exit_sched(&vxi->sched);
109 vx_info_exit_cvirt(&vxi->cvirt);
110 vx_info_exit_cacct(&vxi->cacct);
112 vxi->vx_state |= VXS_RELEASED;
116 static void __shutdown_vx_info(struct vx_info *vxi)
/* Shuts a context down: marks it SHUTDOWN, signals the state change and
 * drops the namespace/fs references held by the context.
 * NOTE(review): the release call for 'fs' is not visible in this
 * excerpt — confirm it is not leaked in the full source. */
118 struct namespace *namespace;
119 struct fs_struct *fs;
123 vxi->vx_state |= VXS_SHUTDOWN;
124 vs_state_change(vxi, VSC_SHUTDOWN);
/* atomically detach the namespace so it is dropped exactly once */
126 namespace = xchg(&vxi->vx_namespace, NULL);
128 put_namespace(namespace);
130 fs = xchg(&vxi->vx_fs, NULL);
137 void free_vx_info(struct vx_info *vxi)
/* Last-reference disposal: asserts all release invariants (shutdown
 * done, no users/tasks, unhashed, namespace dropped), then frees. */
139 /* context shutdown is mandatory */
140 BUG_ON(!vx_info_state(vxi, VXS_SHUTDOWN));
142 BUG_ON(atomic_read(&vxi->vx_usecnt));
143 BUG_ON(atomic_read(&vxi->vx_tasks));
145 BUG_ON(vx_info_state(vxi, VXS_HASHED));
147 BUG_ON(vxi->vx_namespace);
150 __dealloc_vx_info(vxi);
154 /* hash table for vx_info hash */
/* 13 buckets, indexed by xid % VX_HASH_SIZE (see __hashval below) */
156 #define VX_HASH_SIZE 13
158 struct hlist_head vx_info_hash[VX_HASH_SIZE];
/* protects vx_info_hash and the VXS_HASHED state transitions */
160 static spinlock_t vx_info_hash_lock = SPIN_LOCK_UNLOCKED;
163 static inline unsigned int __hashval(xid_t xid)
/* Maps an xid to its bucket index in vx_info_hash. */
165 return (xid % VX_HASH_SIZE);
172 * add the vxi to the global hash table
173 * requires the hash_lock to be held */
175 static inline void __hash_vx_info(struct vx_info *vxi)
/* Adds 'vxi' to the global hash. Caller must hold vx_info_hash_lock
 * (asserted below) and the context must not already be hashed. */
177 struct hlist_head *head;
179 vxd_assert_lock(&vx_info_hash_lock);
180 vxdprintk(VXD_CBIT(xid, 4),
181 "__hash_vx_info: %p[#%d]", vxi, vxi->vx_id);
182 vxh_hash_vx_info(vxi);
184 /* context must not be hashed */
185 BUG_ON(vx_info_state(vxi, VXS_HASHED));
/* mark HASHED before linking; both happen under the same lock */
187 vxi->vx_state |= VXS_HASHED;
188 head = &vx_info_hash[__hashval(vxi->vx_id)];
189 hlist_add_head(&vxi->vx_hlist, head);
192 /* __unhash_vx_info()
194 * remove the vxi from the global hash table
195 * takes the hash_lock itself */
197 static inline void __unhash_vx_info(struct vx_info *vxi)
/* Removes 'vxi' from the global hash. Unlike __hash_vx_info(), which
 * asserts the lock is already held, this acquires vx_info_hash_lock
 * itself. The context must currently be hashed. */
199 vxdprintk(VXD_CBIT(xid, 4),
200 "__unhash_vx_info: %p[#%d]", vxi, vxi->vx_id);
201 spin_lock(&vx_info_hash_lock);
202 vxh_unhash_vx_info(vxi);
204 /* context must be hashed */
205 BUG_ON(!vx_info_state(vxi, VXS_HASHED));
/* clear HASHED and unlink under the same lock hold */
207 vxi->vx_state &= ~VXS_HASHED;
208 hlist_del(&vxi->vx_hlist);
209 spin_unlock(&vx_info_hash_lock);
213 /* __lookup_vx_info()
215 * requires the hash_lock to be held
216 * doesn't increment the vx_refcnt */
218 static inline struct vx_info *__lookup_vx_info(xid_t xid)
/* Bucket walk for 'xid'; caller must hold vx_info_hash_lock. Returns
 * the matching vx_info WITHOUT taking a reference.
 * NOTE(review): the declaration of 'vxi' and the loop-exit/not-found
 * lines are missing from this excerpt. */
220 struct hlist_head *head = &vx_info_hash[__hashval(xid)];
221 struct hlist_node *pos;
224 vxd_assert_lock(&vx_info_hash_lock);
225 hlist_for_each(pos, head) {
226 vxi = hlist_entry(pos, struct vx_info, vx_hlist);
228 if (vxi->vx_id == xid)
233 vxdprintk(VXD_CBIT(xid, 0),
234 "__lookup_vx_info(#%u): %p[#%u]",
235 xid, vxi, vxi?vxi->vx_id:0);
236 vxh_lookup_vx_info(vxi, xid);
243 * find unused dynamic xid
244 * requires the hash_lock to be held */
246 static inline xid_t __vx_dynamic_id(void)
/* Finds an unused dynamic xid by scanning from a static cursor 'seq';
 * caller must hold vx_info_hash_lock.
 * NOTE(review): the wrap-around assignment, the do-loop opener, the
 * 'barrier' setup and the return statements are missing here. */
248 static xid_t seq = MAX_S_CONTEXT;
251 vxd_assert_lock(&vx_info_hash_lock);
253 if (++seq > MAX_S_CONTEXT)
255 if (!__lookup_vx_info(seq)) {
256 vxdprintk(VXD_CBIT(xid, 4),
257 "__vx_dynamic_id: [#%d]", seq);
260 } while (barrier != seq);
264 #ifdef CONFIG_VSERVER_LEGACY
268 * locate or create the requested context
269 * get() it and if new hash it */
271 static struct vx_info * __loc_vx_info(int id, int *err)
/* Legacy lookup-or-create: returns an existing context or hashes a
 * freshly allocated one (get()ed); '*err' carries the failure code.
 * NOTE(review): several error-path lines (gotos, '*err' assignments,
 * returns) are missing from this excerpt. */
273 struct vx_info *new, *vxi = NULL;
275 vxdprintk(VXD_CBIT(xid, 1), "loc_vx_info(%d)*", id);
/* allocate before taking the lock; disposed again if unused below */
277 if (!(new = __alloc_vx_info(id))) {
282 /* required to make dynamic xids unique */
283 spin_lock(&vx_info_hash_lock);
285 /* dynamic context requested */
286 if (id == VX_DYNAMIC_ID) {
287 id = __vx_dynamic_id();
289 printk(KERN_ERR "no dynamic context available.\n");
294 /* existing context requested */
295 else if ((vxi = __lookup_vx_info(id))) {
296 /* context in setup is not available */
297 if (vxi->vx_flags & VXF_STATE_SETUP) {
298 vxdprintk(VXD_CBIT(xid, 0),
299 "loc_vx_info(%d) = %p (not available)", id, vxi);
303 vxdprintk(VXD_CBIT(xid, 0),
304 "loc_vx_info(%d) = %p (found)", id, vxi);
311 /* new context requested */
312 vxdprintk(VXD_CBIT(xid, 0),
313 "loc_vx_info(%d) = %p (new)", id, new);
/* hash the new context with a reference already taken */
314 __hash_vx_info(get_vx_info(new));
315 vxi = new, new = NULL;
319 spin_unlock(&vx_info_hash_lock);
320 vxh_loc_vx_info(vxi, id);
/* presumably guarded by 'if (new)' — that line is missing here */
322 __dealloc_vx_info(new);
328 /* __create_vx_info()
330 * create the requested context
331 * get(), claim() and hash it */
333 static struct vx_info * __create_vx_info(int id)
/* Creates context 'id' (or a dynamic one): allocates, then under the
 * hash lock resolves dynamic ids, rejects already-existing or
 * dynamic-range static ids, and finally get()s, claim()s and hashes
 * the new context. Returns the vx_info or an ERR_PTR.
 * NOTE(review): several goto/unlock lines of the error paths are
 * missing from this excerpt. */
335 struct vx_info *new, *vxi = NULL;
337 vxdprintk(VXD_CBIT(xid, 1), "create_vx_info(%d)*", id);
339 if (!(new = __alloc_vx_info(id)))
340 return ERR_PTR(-ENOMEM);
342 /* required to make dynamic xids unique */
343 spin_lock(&vx_info_hash_lock);
345 /* dynamic context requested */
346 if (id == VX_DYNAMIC_ID) {
347 id = __vx_dynamic_id();
349 printk(KERN_ERR "no dynamic context available.\n");
350 vxi = ERR_PTR(-EAGAIN);
355 /* static context requested */
356 else if ((vxi = __lookup_vx_info(id))) {
357 vxdprintk(VXD_CBIT(xid, 0),
358 "create_vx_info(%d) = %p (already there)", id, vxi);
/* a context still in setup yields EBUSY, otherwise EEXIST */
359 if (vx_info_flags(vxi, VXF_STATE_SETUP, 0))
360 vxi = ERR_PTR(-EBUSY);
362 vxi = ERR_PTR(-EEXIST);
365 /* dynamic xid creation blocker */
366 else if (id >= MIN_D_CONTEXT) {
367 vxdprintk(VXD_CBIT(xid, 0),
368 "create_vx_info(%d) (dynamic rejected)", id);
369 vxi = ERR_PTR(-EINVAL);
374 vxdprintk(VXD_CBIT(xid, 0),
375 "create_vx_info(%d) = %p (new)", id, new);
/* new context: claim (no owner task yet), get and hash it */
376 claim_vx_info(new, NULL);
377 __hash_vx_info(get_vx_info(new));
378 vxi = new, new = NULL;
381 spin_unlock(&vx_info_hash_lock);
382 vxh_create_vx_info(IS_ERR(vxi)?NULL:vxi, id);
/* presumably guarded by 'if (new)' — that line is missing here */
384 __dealloc_vx_info(new);
392 void unhash_vx_info(struct vx_info *vxi)
/* Takes a context out of service: shutdown, removal from the hash,
 * then wake-up of any waiters on the context. */
394 __shutdown_vx_info(vxi);
395 __unhash_vx_info(vxi);
396 __wakeup_vx_info(vxi);
402 * search for a vx_info and get() it
403 * negative id means current */
405 struct vx_info *lookup_vx_info(int id)
/* Finds a context by id and returns it get()ed (NULL if not hashed);
 * per the header comment a negative id means the current context.
 * NOTE(review): the branch choosing between the current-context path
 * and the hash lookup is missing from this excerpt. */
407 struct vx_info *vxi = NULL;
410 vxi = get_vx_info(current->vx_info);
412 spin_lock(&vx_info_hash_lock);
413 vxi = get_vx_info(__lookup_vx_info(id));
414 spin_unlock(&vx_info_hash_lock);
421 * verify that xid is still hashed */
423 int xid_is_hashed(xid_t xid)
/* Returns nonzero iff 'xid' is currently in the context hash; takes no
 * reference on the context. */
427 spin_lock(&vx_info_hash_lock);
428 hashed = (__lookup_vx_info(xid) != NULL);
429 spin_unlock(&vx_info_hash_lock);
433 #ifdef CONFIG_VSERVER_LEGACY
435 struct vx_info *lookup_or_create_vx_info(int id)
/* Legacy (CONFIG_VSERVER_LEGACY) entry: thin wrapper around
 * __loc_vx_info(); the local 'err' declaration is not visible here. */
439 return __loc_vx_info(id, &err);
444 #ifdef CONFIG_PROC_FS
446 int get_xid_list(int index, unsigned int *xids, int size)
/* Fills 'xids' with up to 'size' hashed context ids (for /proc); walks
 * the buckets taking the hash lock per bucket to keep hold times short.
 * NOTE(review): the 'index' skip logic, the 'vxi' declaration and the
 * return value are missing from this excerpt. */
448 int hindex, nr_xids = 0;
450 for (hindex = 0; hindex < VX_HASH_SIZE; hindex++) {
451 struct hlist_head *head = &vx_info_hash[hindex];
452 struct hlist_node *pos;
454 spin_lock(&vx_info_hash_lock);
455 hlist_for_each(pos, head) {
461 vxi = hlist_entry(pos, struct vx_info, vx_hlist);
462 xids[nr_xids] = vxi->vx_id;
/* output buffer full: unlock and stop early */
463 if (++nr_xids >= size) {
464 spin_unlock(&vx_info_hash_lock);
468 /* keep the lock time short */
469 spin_unlock(&vx_info_hash_lock);
477 int vx_migrate_user(struct task_struct *p, struct vx_info *vxi)
/* Re-accounts task 'p' against a per-context user_struct (keyed by
 * vxi->vx_id and p->uid), moving the process count between users.
 * NOTE(review): the 'old_user' setup, allocation-failure handling and
 * the actual user switch are missing from this excerpt. */
479 struct user_struct *new_user, *old_user;
483 new_user = alloc_uid(vxi->vx_id, p->uid);
/* only move counts when the accounting bucket actually changes */
488 if (new_user != old_user) {
489 atomic_inc(&new_user->processes);
490 atomic_dec(&old_user->processes);
497 void vx_mask_bcaps(struct vx_info *vxi, struct task_struct *p)
/* Restricts all three capability sets of task 'p' to the context's
 * bounding-capability set. */
499 p->cap_effective &= vxi->vx_bcaps;
500 p->cap_inheritable &= vxi->vx_bcaps;
501 p->cap_permitted &= vxi->vx_bcaps;
505 #include <linux/file.h>
507 static int vx_openfd_task(struct task_struct *tsk)
/* Counts the open file descriptors of 'tsk' by popcounting the
 * open_fds bitmap under files->file_lock.
 * NOTE(review): declarations of fdt/count/total, the bptr advance and
 * the return statement are missing from this excerpt. */
509 struct files_struct *files = tsk->files;
511 const unsigned long *bptr;
514 /* no rcu_read_lock() because of spin_lock() */
515 spin_lock(&files->file_lock);
516 fdt = files_fdtable(files);
517 bptr = fdt->open_fds->fds_bits;
/* number of bitmap words covering max_fds */
518 count = fdt->max_fds / (sizeof(unsigned long) * 8);
519 for (total = 0; count > 0; count--) {
521 total += hweight_long(*bptr);
524 spin_unlock(&files->file_lock);
529 * migrate task to new context
530 * gets vxi, puts old_vxi on change
533 int vx_migrate_task(struct task_struct *p, struct vx_info *vxi)
/* Moves task 'p' into context 'vxi': re-accounts the user, transfers
 * thread/process/openfd counters from the old context, swaps the
 * task's vx_info reference and applies the context's capability mask.
 * Gets 'vxi'; puts the old context on change (see header comment).
 * NOTE(review): several guard/error lines are missing from this
 * excerpt (NULL checks, the guards around the counter transfers and
 * the final return). */
535 struct vx_info *old_vxi;
541 old_vxi = task_get_vx_info(p);
545 vxdprintk(VXD_CBIT(xid, 5),
546 "vx_migrate_task(%p,%p[#%d.%d])", p, vxi,
547 vxi->vx_id, atomic_read(&vxi->vx_usecnt));
/* the counter transfer only happens if user migration succeeded */
549 if (!(ret = vx_migrate_user(p, vxi))) {
553 openfd = vx_openfd_task(p);
/* move accounting out of the old context ... */
556 atomic_dec(&old_vxi->cvirt.nr_threads);
557 atomic_dec(&old_vxi->cvirt.nr_running);
558 atomic_dec(&old_vxi->limit.rcur[RLIMIT_NPROC]);
559 /* FIXME: what about the struct files here? */
560 atomic_sub(openfd, &old_vxi->limit.rcur[VLIMIT_OPENFD]);
/* ... and into the new one */
562 atomic_inc(&vxi->cvirt.nr_threads);
563 atomic_inc(&vxi->cvirt.nr_running);
564 atomic_inc(&vxi->limit.rcur[RLIMIT_NPROC]);
565 /* FIXME: what about the struct files here? */
566 atomic_add(openfd, &vxi->limit.rcur[VLIMIT_OPENFD]);
/* release the task's claim on the old context, attach the new one */
569 release_vx_info(old_vxi, p);
570 clr_vx_info(&p->vx_info);
572 claim_vx_info(vxi, p);
573 set_vx_info(&p->vx_info, vxi);
576 vxdprintk(VXD_CBIT(xid, 5),
577 "moved task %p into vxi:%p[#%d]",
580 vx_mask_bcaps(vxi, p);
/* balance the task_get_vx_info() above */
584 put_vx_info(old_vxi);
588 int vx_set_reaper(struct vx_info *vxi, struct task_struct *p)
/* Installs 'p' as the context's child reaper, dropping the reference
 * held on the previous reaper.
 * NOTE(review): the get_task_struct()/assignment of the new reaper and
 * the return value are missing from this excerpt. */
590 struct task_struct *old_reaper;
595 vxdprintk(VXD_CBIT(xid, 6),
596 "vx_set_reaper(%p[#%d],%p[#%d,%d])",
597 vxi, vxi->vx_id, p, p->xid, p->pid);
599 old_reaper = vxi->vx_reaper;
603 /* set new child reaper */
606 put_task_struct(old_reaper);
610 int vx_set_init(struct vx_info *vxi, struct task_struct *p)
/* Records 'p' (its tgid) as the context's init process and clears the
 * one-time INIT state flag. */
615 vxdprintk(VXD_CBIT(xid, 6),
616 "vx_set_init(%p[#%d],%p[#%d,%d,%d])",
617 vxi, vxi->vx_id, p, p->xid, p->pid, p->tgid);
619 vxi->vx_flags &= ~VXF_STATE_INIT;
620 vxi->vx_initpid = p->tgid;
624 void vx_exit_init(struct vx_info *vxi, struct task_struct *p, int code)
/* Called when the context's init process exits: records its exit code.
 * NOTE(review): a reset of vx_initpid is not visible in this excerpt —
 * confirm against the full source. */
626 vxdprintk(VXD_CBIT(xid, 6),
627 "vx_exit_init(%p[#%d],%p[#%d,%d,%d])",
628 vxi, vxi->vx_id, p, p->xid, p->pid, p->tgid);
630 vxi->exit_code = code;
634 void vx_set_persistent(struct vx_info *vxi)
/* Makes the context persistent via an owner-less claim; a matching
 * get_vx_info() is presumably nearby but missing from this excerpt. */
636 vxdprintk(VXD_CBIT(xid, 6),
637 "vx_set_persistent(%p[#%d])", vxi, vxi->vx_id);
640 claim_vx_info(vxi, NULL);
643 void vx_clear_persistent(struct vx_info *vxi)
/* Drops the owner-less claim taken by vx_set_persistent(). */
645 vxdprintk(VXD_CBIT(xid, 6),
646 "vx_clear_persistent(%p[#%d])", vxi, vxi->vx_id);
648 release_vx_info(vxi, NULL);
652 void vx_update_persistent(struct vx_info *vxi)
/* Syncs the persistence claim with the current VXF_PERSISTENT flag. */
654 if (vx_info_flags(vxi, VXF_PERSISTENT, 0))
655 vx_set_persistent(vxi);
/* presumably the 'else' branch — that keyword line is missing here */
657 vx_clear_persistent(vxi);
661 /* task must be current or locked */
663 void exit_vx_info(struct task_struct *p, int code)
/* Per-task exit accounting: decrements the context's thread count,
 * records the exit code and releases the task's claim. Caller must be
 * 'current' or hold the task locked.
 * NOTE(review): the NULL check on vxi is missing from this excerpt. */
665 struct vx_info *vxi = p->vx_info;
668 atomic_dec(&vxi->cvirt.nr_threads);
671 vxi->exit_code = code;
672 release_vx_info(vxi, p);
676 void exit_vx_info_early(struct task_struct *p, int code)
/* Early exit hook: if 'p' is the context's init process, record its
 * exit; if it is the context's reaper, fall back to the global
 * child_reaper. */
678 struct vx_info *vxi = p->vx_info;
681 if (vxi->vx_initpid == p->tgid)
682 vx_exit_init(vxi, p, code);
683 if (vxi->vx_reaper == p)
684 vx_set_reaper(vxi, child_reaper);
689 /* vserver syscall commands below here */
691 /* task xid and vx_info functions */
693 #include <asm/uaccess.h>
696 int vc_task_xid(uint32_t id, void __user *data)
/* Syscall: returns the xid of task 'id', or of current — the branch
 * selecting between the two paths is missing from this excerpt.
 * Requires VX_ADMIN|VX_WATCH context. */
701 struct task_struct *tsk;
703 if (!vx_check(0, VX_ADMIN|VX_WATCH))
/* task lookup by pid must happen under the tasklist lock */
706 read_lock(&tasklist_lock);
707 tsk = find_task_by_real_pid(id);
708 xid = (tsk) ? tsk->xid : -ESRCH;
709 read_unlock(&tasklist_lock);
712 xid = vx_current_xid();
717 int vc_vx_info(uint32_t id, void __user *data)
/* Syscall: copies basic context info (xid, initpid) for context 'id'
 * to userspace. Requires VX_ADMIN context plus CAP_SYS_ADMIN and
 * CAP_SYS_RESOURCE.
 * NOTE(review): the error returns and the put_vx_info() balancing the
 * lookup are missing from this excerpt. */
720 struct vcmd_vx_info_v0 vc_data;
722 if (!vx_check(0, VX_ADMIN))
724 if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_RESOURCE))
727 vxi = lookup_vx_info(id);
731 vc_data.xid = vxi->vx_id;
732 vc_data.initpid = vxi->vx_initpid;
735 if (copy_to_user (data, &vc_data, sizeof(vc_data)))
741 /* context functions */
743 int vc_ctx_create(uint32_t xid, void __user *data)
/* Syscall: creates context 'xid' (or a dynamic one), applies the
 * requested flag word, migrates the caller into it and returns the new
 * context id. Requires CAP_SYS_ADMIN.
 * NOTE(review): the early error returns and the failure path after
 * vs_state_change()/vx_migrate_task() are missing from this excerpt. */
745 struct vcmd_ctx_create vc_data = { .flagword = VXF_INIT_SET };
746 struct vx_info *new_vxi;
749 if (!capable(CAP_SYS_ADMIN))
751 if (data && copy_from_user (&vc_data, data, sizeof(vc_data)))
/* only static ids up to MAX_S_CONTEXT or the dynamic marker allowed */
754 if ((xid > MAX_S_CONTEXT) && (xid != VX_DYNAMIC_ID))
759 new_vxi = __create_vx_info(xid);
761 return PTR_ERR(new_vxi);
764 new_vxi->vx_flags = vc_data.flagword;
767 if (vs_state_change(new_vxi, VSC_STARTUP))
770 ret = vx_migrate_task(current, new_vxi);
774 /* return context id on success */
775 ret = new_vxi->vx_id;
777 /* get a reference for persistent contexts */
778 if ((vc_data.flagword & VXF_PERSISTENT))
779 vx_set_persistent(new_vxi);
/* drop the creation claim/reference taken by __create_vx_info() */
781 release_vx_info(new_vxi, NULL);
782 put_vx_info(new_vxi);
787 int vc_ctx_migrate(uint32_t id, void __user *data)
/* Syscall: migrates the calling task into context 'id', optionally
 * making it the context's init and/or reaper. Requires CAP_SYS_ADMIN.
 * NOTE(review): error returns and the put_vx_info() are missing from
 * this excerpt; vx_migrate_task()'s return value is not checked on the
 * visible line — confirm against the full source. */
789 struct vcmd_ctx_migrate vc_data = { .flagword = 0 };
792 if (!capable(CAP_SYS_ADMIN))
794 if (data && copy_from_user (&vc_data, data, sizeof(vc_data)))
797 /* dirty hack until Spectator becomes a cap */
803 vxi = lookup_vx_info(id);
806 vx_migrate_task(current, vxi);
807 if (vc_data.flagword & VXM_SET_INIT)
808 vx_set_init(vxi, current);
809 if (vc_data.flagword & VXM_SET_REAPER)
810 vx_set_reaper(vxi, current);
816 int vc_get_cflags(uint32_t id, void __user *data)
/* Syscall: reads the context's flag word plus the mask of currently
 * settable flags (one-time STATE flags are masked via VXF_ONE_TIME).
 * Requires CAP_SYS_ADMIN. */
819 struct vcmd_ctx_flags_v0 vc_data;
821 if (!capable(CAP_SYS_ADMIN))
824 vxi = lookup_vx_info(id);
828 vc_data.flagword = vxi->vx_flags;
830 /* special STATE flag handling */
831 vc_data.mask = vx_mask_flags(~0UL, vxi->vx_flags, VXF_ONE_TIME);
835 if (copy_to_user (data, &vc_data, sizeof(vc_data)))
840 int vc_set_cflags(uint32_t id, void __user *data)
/* Syscall: updates the context's flag word under the user-supplied
 * mask. On the caller's own context, clearing VXF_STATE_SETUP applies
 * the capability mask and clearing VXF_STATE_INIT installs the caller
 * as init and reaper; toggling VXF_PERSISTENT syncs the persistence
 * reference. Requires CAP_SYS_ADMIN. */
843 struct vcmd_ctx_flags_v0 vc_data;
844 uint64_t mask, trigger;
846 if (!capable(CAP_SYS_ADMIN))
848 if (copy_from_user (&vc_data, data, sizeof(vc_data)))
851 vxi = lookup_vx_info(id);
855 /* special STATE flag handling */
856 mask = vx_mask_mask(vc_data.mask, vxi->vx_flags, VXF_ONE_TIME);
/* bits that actually change value under the effective mask */
857 trigger = (mask & vxi->vx_flags) ^ (mask & vc_data.flagword);
859 if (vxi == current->vx_info) {
860 if (trigger & VXF_STATE_SETUP)
861 vx_mask_bcaps(vxi, current);
862 if (trigger & VXF_STATE_INIT) {
863 vx_set_init(vxi, current);
864 vx_set_reaper(vxi, current);
868 vxi->vx_flags = vx_mask_flags(vxi->vx_flags,
869 vc_data.flagword, mask);
870 if (trigger & VXF_PERSISTENT)
871 vx_update_persistent(vxi);
877 int vc_get_ccaps(uint32_t id, void __user *data)
/* Syscall: reads the context's bounding (bcaps) and context (ccaps)
 * capability sets; cmask reports all bits as relevant. Requires
 * CAP_SYS_ADMIN. */
880 struct vcmd_ctx_caps_v0 vc_data;
882 if (!capable(CAP_SYS_ADMIN))
885 vxi = lookup_vx_info(id);
889 vc_data.bcaps = vxi->vx_bcaps;
890 vc_data.ccaps = vxi->vx_ccaps;
891 vc_data.cmask = ~0UL;
894 if (copy_to_user (data, &vc_data, sizeof(vc_data)))
899 int vc_set_ccaps(uint32_t id, void __user *data)
/* Syscall: narrows the context's bounding caps (note the &= — bcaps
 * can only ever be reduced) and updates ccaps under the supplied
 * cmask. Requires CAP_SYS_ADMIN. */
902 struct vcmd_ctx_caps_v0 vc_data;
904 if (!capable(CAP_SYS_ADMIN))
906 if (copy_from_user (&vc_data, data, sizeof(vc_data)))
909 vxi = lookup_vx_info(id);
913 vxi->vx_bcaps &= vc_data.bcaps;
914 vxi->vx_ccaps = vx_mask_flags(vxi->vx_ccaps,
915 vc_data.ccaps, vc_data.cmask);
920 #include <linux/module.h>
922 EXPORT_SYMBOL_GPL(free_vx_info);