/*
 *  linux/kernel/vserver/context.c
 *
 *  Virtual Server: Context Support
 *
 *  Copyright (C) 2003-2005  Herbert Pötzl
 *
 *  V0.02  vx_ctx_kill syscall command
 *  V0.03  replaced context_info calls
 *  V0.04  redesign of struct (de)alloc
 *  V0.05  rlimit basic implementation
 *  V0.06  task_xid and info commands
 *  V0.07  context flags and caps
 *  V0.08  switch to RCU based hash
 *  V0.09  revert to non RCU for now
 *  V0.10  and back to working RCU hash
 *  V0.11  and back to locking again
 *
 */
#include <linux/config.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/namespace.h>

#include <linux/sched.h>
#include <linux/vserver/network.h>
#include <linux/vserver/legacy.h>
#include <linux/vserver/limit.h>
#include <linux/vserver/debug.h>

#include <linux/vs_context.h>
#include <linux/vserver/context_cmd.h>

#include <linux/err.h>
#include <asm/errno.h>

#include "cvirt_init.h"
#include "limit_init.h"
#include "sched_init.h"

/*	__alloc_vx_info()

	* allocate an initialized vx_info struct
	* doesn't make it visible (hash)			*/

static struct vx_info *__alloc_vx_info(xid_t xid)
{
	struct vx_info *new = NULL;

	vxdprintk(VXD_CBIT(xid, 0), "alloc_vx_info(%d)*", xid);

	/* would this benefit from a slab cache? */
	new = kmalloc(sizeof(struct vx_info), GFP_KERNEL);
	if (!new)
		return NULL;

	memset (new, 0, sizeof(struct vx_info));
	new->vx_id = xid;
	INIT_HLIST_NODE(&new->vx_hlist);
	atomic_set(&new->vx_usecnt, 0);
	atomic_set(&new->vx_tasks, 0);
	new->vx_parent = NULL;
	init_waitqueue_head(&new->vx_wait);

	/* rest of init goes here */
	vx_info_init_limit(&new->limit);
	vx_info_init_sched(&new->sched);
	vx_info_init_cvirt(&new->cvirt);
	vx_info_init_cacct(&new->cacct);

	new->vx_flags = VXF_INIT_SET;
	new->vx_bcaps = CAP_INIT_EFF_SET;

	vxdprintk(VXD_CBIT(xid, 0),
		"alloc_vx_info(%d) = %p", xid, new);
	vxh_alloc_vx_info(new);
	return new;
}

/*	__dealloc_vx_info()

	* final disposal of vx_info			*/

static void __dealloc_vx_info(struct vx_info *vxi)
{
	vxdprintk(VXD_CBIT(xid, 0),
		"dealloc_vx_info(%p)", vxi);
	vxh_dealloc_vx_info(vxi);

	vxi->vx_hlist.next = LIST_POISON1;

	vx_info_exit_limit(&vxi->limit);
	vx_info_exit_sched(&vxi->sched);
	vx_info_exit_cvirt(&vxi->cvirt);
	vx_info_exit_cacct(&vxi->cacct);

	vxi->vx_state |= VXS_RELEASED;
	kfree(vxi);
}

static void __shutdown_vx_info(struct vx_info *vxi)
{
	struct namespace *namespace;
	struct fs_struct *fs;

	might_sleep();

	vxi->vx_state |= VXS_SHUTDOWN;
	vs_state_change(vxi, VSC_SHUTDOWN);

	namespace = xchg(&vxi->vx_namespace, NULL);
	if (namespace)
		put_namespace(namespace);

	fs = xchg(&vxi->vx_fs, NULL);
	if (fs)
		put_fs_struct(fs);
}

void free_vx_info(struct vx_info *vxi)
{
	/* context shutdown is mandatory */
	BUG_ON(!vx_info_state(vxi, VXS_SHUTDOWN));

	BUG_ON(atomic_read(&vxi->vx_usecnt));
	BUG_ON(atomic_read(&vxi->vx_tasks));

	BUG_ON(vx_info_state(vxi, VXS_HASHED));

	BUG_ON(vxi->vx_namespace);
	BUG_ON(vxi->vx_fs);

	__dealloc_vx_info(vxi);
}

/*	hash table for vx_info hash */

#define VX_HASH_SIZE	13

struct hlist_head vx_info_hash[VX_HASH_SIZE];

static spinlock_t vx_info_hash_lock = SPIN_LOCK_UNLOCKED;


static inline unsigned int __hashval(xid_t xid)
{
	return (xid % VX_HASH_SIZE);
}

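/* Illustration (not in the original source): the bucket is simply
 * xid mod 13, so e.g. xids 1, 14 and 27 all land in bucket 1 and
 * are chained behind the same hlist head:
 *
 *	__hashval(1)  == 1
 *	__hashval(14) == 1
 *	__hashval(27) == 1
 *
 * Lookups therefore walk a short collision chain under
 * vx_info_hash_lock; see __lookup_vx_info() below. */
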
/*	__hash_vx_info()

	* add the vxi to the global hash table
	* requires the hash_lock to be held			*/

static inline void __hash_vx_info(struct vx_info *vxi)
{
	struct hlist_head *head;

	vxd_assert_lock(&vx_info_hash_lock);
	vxdprintk(VXD_CBIT(xid, 4),
		"__hash_vx_info: %p[#%d]", vxi, vxi->vx_id);
	vxh_hash_vx_info(vxi);

	/* context must not be hashed */
	BUG_ON(vx_info_state(vxi, VXS_HASHED));

	vxi->vx_state |= VXS_HASHED;
	head = &vx_info_hash[__hashval(vxi->vx_id)];
	hlist_add_head(&vxi->vx_hlist, head);
}

/*	__unhash_vx_info()

	* remove the vxi from the global hash table
	* requires the hash_lock to be held			*/

static inline void __unhash_vx_info(struct vx_info *vxi)
{
	vxd_assert_lock(&vx_info_hash_lock);
	vxdprintk(VXD_CBIT(xid, 4),
		"__unhash_vx_info: %p[#%d]", vxi, vxi->vx_id);
	vxh_unhash_vx_info(vxi);

	/* context must be hashed */
	BUG_ON(!vx_info_state(vxi, VXS_HASHED));

	vxi->vx_state &= ~VXS_HASHED;
	hlist_del(&vxi->vx_hlist);
}

/*	__lookup_vx_info()

	* requires the hash_lock to be held
	* doesn't increment the vx_refcnt			*/

static inline struct vx_info *__lookup_vx_info(xid_t xid)
{
	struct hlist_head *head = &vx_info_hash[__hashval(xid)];
	struct hlist_node *pos;
	struct vx_info *vxi;

	vxd_assert_lock(&vx_info_hash_lock);
	hlist_for_each(pos, head) {
		vxi = hlist_entry(pos, struct vx_info, vx_hlist);

		if (vxi->vx_id == xid)
			goto found;
	}
	vxi = NULL;
found:
	vxdprintk(VXD_CBIT(xid, 0),
		"__lookup_vx_info(#%u): %p[#%u]",
		xid, vxi, vxi ? vxi->vx_id : 0);
	vxh_lookup_vx_info(vxi, xid);
	return vxi;
}

/*	__vx_dynamic_id()

	* find unused dynamic xid
	* requires the hash_lock to be held			*/

static inline xid_t __vx_dynamic_id(void)
{
	static xid_t seq = MAX_S_CONTEXT;
	xid_t barrier = seq;

	vxd_assert_lock(&vx_info_hash_lock);
	do {
		if (++seq > MAX_S_CONTEXT)
			seq = MIN_D_CONTEXT;
		if (!__lookup_vx_info(seq)) {
			vxdprintk(VXD_CBIT(xid, 4),
				"__vx_dynamic_id: [#%d]", seq);
			return seq;
		}
	} while (barrier != seq);
	return 0;
}

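/* Illustration (not in the original source): the search starts one
 * past the last id handed out, wraps from MAX_S_CONTEXT back to
 * MIN_D_CONTEXT, and gives up with 0 once it comes around to the
 * barrier again, i.e. when every dynamic xid is taken. Callers must
 * treat a return of 0 as "no dynamic context available". */
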
#ifdef	CONFIG_VSERVER_LEGACY

/*	__loc_vx_info()

	* locate or create the requested context
	* get() it and if new hash it				*/

static struct vx_info * __loc_vx_info(int id, int *err)
{
	struct vx_info *new, *vxi = NULL;

	vxdprintk(VXD_CBIT(xid, 1), "loc_vx_info(%d)*", id);

	if (!(new = __alloc_vx_info(id))) {
		*err = -ENOMEM;
		return NULL;
	}

	/* required to make dynamic xids unique */
	spin_lock(&vx_info_hash_lock);

	/* dynamic context requested */
	if (id == VX_DYNAMIC_ID) {
		id = __vx_dynamic_id();
		if (!id) {
			printk(KERN_ERR "no dynamic context available.\n");
			goto out_unlock;
		}
		new->vx_id = id;
	}
	/* existing context requested */
	else if ((vxi = __lookup_vx_info(id))) {
		/* context in setup is not available */
		if (vxi->vx_flags & VXF_STATE_SETUP) {
			vxdprintk(VXD_CBIT(xid, 0),
				"loc_vx_info(%d) = %p (not available)", id, vxi);
			vxi = NULL;
			*err = -EBUSY;
		} else {
			vxdprintk(VXD_CBIT(xid, 0),
				"loc_vx_info(%d) = %p (found)", id, vxi);
			get_vx_info(vxi);
			*err = 0;
		}
		goto out_unlock;
	}

	/* new context requested */
	vxdprintk(VXD_CBIT(xid, 0),
		"loc_vx_info(%d) = %p (new)", id, new);
	__hash_vx_info(get_vx_info(new));
	vxi = new, new = NULL;
	*err = 1;

out_unlock:
	spin_unlock(&vx_info_hash_lock);
	vxh_loc_vx_info(vxi, id);
	if (new)
		__dealloc_vx_info(new);
	return vxi;
}

#endif	/* CONFIG_VSERVER_LEGACY */

/*	__create_vx_info()

	* create the requested context
	* get() and hash it					*/

static struct vx_info * __create_vx_info(int id)
{
	struct vx_info *new, *vxi = NULL;

	vxdprintk(VXD_CBIT(xid, 1), "create_vx_info(%d)*", id);

	if (!(new = __alloc_vx_info(id)))
		return ERR_PTR(-ENOMEM);

	/* required to make dynamic xids unique */
	spin_lock(&vx_info_hash_lock);

	/* dynamic context requested */
	if (id == VX_DYNAMIC_ID) {
		id = __vx_dynamic_id();
		if (!id) {
			printk(KERN_ERR "no dynamic context available.\n");
			vxi = ERR_PTR(-EAGAIN);
			goto out_unlock;
		}
		new->vx_id = id;
	}
	/* static context requested */
	else if ((vxi = __lookup_vx_info(id))) {
		vxdprintk(VXD_CBIT(xid, 0),
			"create_vx_info(%d) = %p (already there)", id, vxi);
		if (vx_info_flags(vxi, VXF_STATE_SETUP, 0))
			vxi = ERR_PTR(-EBUSY);
		else
			vxi = ERR_PTR(-EEXIST);
		goto out_unlock;
	}
	/* dynamic xid creation blocker */
	else if (id >= MIN_D_CONTEXT) {
		vxdprintk(VXD_CBIT(xid, 0),
			"create_vx_info(%d) (dynamic rejected)", id);
		vxi = ERR_PTR(-EINVAL);
		goto out_unlock;
	}

	/* new context requested */
	vxdprintk(VXD_CBIT(xid, 0),
		"create_vx_info(%d) = %p (new)", id, new);
	__hash_vx_info(get_vx_info(new));
	vxi = new, new = NULL;

out_unlock:
	spin_unlock(&vx_info_hash_lock);
	vxh_create_vx_info(IS_ERR(vxi) ? NULL : vxi, id);
	if (new)
		__dealloc_vx_info(new);
	return vxi;
}

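/* Summary of the error cases above (illustrative, assuming the usual
 * vserver semantics): -EAGAIN when the dynamic id space is exhausted,
 * -EBUSY when the xid exists but is still in VXF_STATE_SETUP, -EEXIST
 * when it exists and is fully set up, and -EINVAL when a static
 * create asks for an id inside the dynamic range (>= MIN_D_CONTEXT). */
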
void unhash_vx_info(struct vx_info *vxi)
{
	__shutdown_vx_info(vxi);
	spin_lock(&vx_info_hash_lock);
	__unhash_vx_info(vxi);
	spin_unlock(&vx_info_hash_lock);
	__wakeup_vx_info(vxi);
}

/*	locate_vx_info()

	* search for a vx_info and get() it
	* negative id means current				*/

struct vx_info *locate_vx_info(int id)
{
	struct vx_info *vxi = NULL;

	if (id < 0) {
		vxi = get_vx_info(current->vx_info);
	} else {
		spin_lock(&vx_info_hash_lock);
		vxi = get_vx_info(__lookup_vx_info(id));
		spin_unlock(&vx_info_hash_lock);
	}
	return vxi;
}

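/* Sketch of the expected caller pattern (mirrors the vc_*() handlers
 * further down; not part of the original source):
 *
 *	struct vx_info *vxi = locate_vx_info(id);
 *
 *	if (!vxi)
 *		return -ESRCH;
 *	... use vxi ...
 *	put_vx_info(vxi);
 *
 * locate_vx_info() takes the reference, the caller must drop it. */
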
/*	xid_is_hashed()

	* verify that xid is still hashed			*/

int xid_is_hashed(xid_t xid)
{
	int hashed;

	spin_lock(&vx_info_hash_lock);
	hashed = (__lookup_vx_info(xid) != NULL);
	spin_unlock(&vx_info_hash_lock);
	return hashed;
}

#ifdef	CONFIG_VSERVER_LEGACY

struct vx_info *locate_or_create_vx_info(int id)
{
	int err;

	return __loc_vx_info(id, &err);
}

#endif

#ifdef	CONFIG_PROC_FS

int get_xid_list(int index, unsigned int *xids, int size)
{
	int hindex, nr_xids = 0;

	for (hindex = 0; hindex < VX_HASH_SIZE; hindex++) {
		struct hlist_head *head = &vx_info_hash[hindex];
		struct hlist_node *pos;

		spin_lock(&vx_info_hash_lock);
		hlist_for_each(pos, head) {
			struct vx_info *vxi;

			if (--index > 0)
				continue;
			vxi = hlist_entry(pos, struct vx_info, vx_hlist);
			xids[nr_xids] = vxi->vx_id;
			if (++nr_xids >= size) {
				spin_unlock(&vx_info_hash_lock);
				return nr_xids;
			}
		}
		/* keep the lock time short */
		spin_unlock(&vx_info_hash_lock);
	}
	return nr_xids;
}

#endif	/* CONFIG_PROC_FS */

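/* Usage sketch (hypothetical caller, not from the original source):
 * a /proc reader would pass a fixed-size array and use the index
 * argument to skip entries already reported, e.g.
 *
 *	unsigned int xids[64];
 *	int nr = get_xid_list(index, xids, ARRAY_SIZE(xids));
 *
 * where nr is the number of slots actually filled. */
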
int vx_migrate_user(struct task_struct *p, struct vx_info *vxi)
{
	struct user_struct *new_user, *old_user;

	new_user = alloc_uid(vxi->vx_id, p->uid);
	if (!new_user)
		return -ENOMEM;

	old_user = p->user;
	if (new_user != old_user) {
		atomic_inc(&new_user->processes);
		atomic_dec(&old_user->processes);
		p->user = new_user;
	}
	free_uid(old_user);
	return 0;
}

void vx_mask_bcaps(struct task_struct *p)
{
	struct vx_info *vxi = p->vx_info;

	p->cap_effective &= vxi->vx_bcaps;
	p->cap_inheritable &= vxi->vx_bcaps;
	p->cap_permitted &= vxi->vx_bcaps;
}

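/* Example (illustrative): a context whose vx_bcaps lacks CAP_SYS_ADMIN
 * has that bit cleared from the task's effective, inheritable and
 * permitted sets alike. This is applied on migration (vx_migrate_task)
 * and again when VXF_STATE_SETUP is cleared (see vc_set_cflags). */
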
#include <linux/file.h>

static int vx_openfd_task(struct task_struct *tsk)
{
	struct files_struct *files = tsk->files;
	const unsigned long *bptr;
	int count, total;

	spin_lock(&files->file_lock);
	bptr = files->open_fds->fds_bits;
	count = files->max_fds / (sizeof(unsigned long) * 8);
	for (total = 0; count > 0; count--) {
		if (*bptr)
			total += hweight_long(*bptr);
		bptr++;
	}
	spin_unlock(&files->file_lock);
	return total;
}

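/* Worked example (not in the original source): on a 32 bit box with
 * files->max_fds == 256, count is 256 / (4 * 8) == 8, so the loop
 * scans eight bitmap words; hweight_long() adds the number of set
 * bits in each word, yielding the total number of open fds. */
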
/*	vx_migrate_task()

	* migrate task to new context
	* gets vxi, puts old_vxi on change			*/

int vx_migrate_task(struct task_struct *p, struct vx_info *vxi)
{
	struct vx_info *old_vxi;
	int ret = 0;

	old_vxi = task_get_vx_info(p);
	if (old_vxi == vxi)
		goto out;

	vxdprintk(VXD_CBIT(xid, 5),
		"vx_migrate_task(%p,%p[#%d.%d])", p, vxi,
		vxi->vx_id, atomic_read(&vxi->vx_usecnt));

	if (!(ret = vx_migrate_user(p, vxi))) {
		int openfd;

		task_lock(p);
		openfd = vx_openfd_task(p);

		if (old_vxi) {
			atomic_dec(&old_vxi->cvirt.nr_threads);
			atomic_dec(&old_vxi->cvirt.nr_running);
			atomic_dec(&old_vxi->limit.rcur[RLIMIT_NPROC]);
			/* FIXME: what about the struct files here? */
			atomic_sub(openfd, &old_vxi->limit.rcur[VLIMIT_OPENFD]);
		}
		atomic_inc(&vxi->cvirt.nr_threads);
		atomic_inc(&vxi->cvirt.nr_running);
		atomic_inc(&vxi->limit.rcur[RLIMIT_NPROC]);
		/* FIXME: what about the struct files here? */
		atomic_add(openfd, &vxi->limit.rcur[VLIMIT_OPENFD]);

		if (old_vxi) {
			release_vx_info(old_vxi, p);
			clr_vx_info(&p->vx_info);
		}
		claim_vx_info(vxi, p);
		set_vx_info(&p->vx_info, vxi);
		p->xid = vxi->vx_id;
		vx_mask_bcaps(p);
		task_unlock(p);

		vxdprintk(VXD_CBIT(xid, 5),
			"moved task %p into vxi:%p[#%d]",
			p, vxi, vxi->vx_id);
	}
out:
	put_vx_info(old_vxi);
	return ret;
}

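/* Reference lifecycle sketch (illustrative): task_get_vx_info() takes
 * a temporary reference on the old context, claim_vx_info() plus
 * set_vx_info() install the new one with a task reference, and
 * release_vx_info()/clr_vx_info() plus the final put_vx_info() drop
 * the old context, which can then be freed once it is unhashed. */
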
int vx_set_init(struct vx_info *vxi, struct task_struct *p)
{
	if (!vxi)
		return -EINVAL;
	if (vxi->vx_initpid)
		return -EPERM;

	vxdprintk(VXD_CBIT(xid, 6),
		"vx_set_init(%p[#%d],%p[#%d,%d,%d])",
		vxi, vxi->vx_id, p, p->xid, p->pid, p->tgid);

	vxi->vx_initpid = p->tgid;
	return 0;
}

/* vserver syscall commands below here */

/* task xid and vx_info functions */

#include <asm/uaccess.h>


int vc_task_xid(uint32_t id, void __user *data)
{
	xid_t xid;

	if (id) {
		struct task_struct *tsk;

		if (!vx_check(0, VX_ADMIN|VX_WATCH))
			return -EPERM;

		read_lock(&tasklist_lock);
		tsk = find_task_by_real_pid(id);
		xid = (tsk) ? tsk->xid : -ESRCH;
		read_unlock(&tasklist_lock);
	} else
		xid = vx_current_xid();
	return xid;
}

int vc_vx_info(uint32_t id, void __user *data)
{
	struct vx_info *vxi;
	struct vcmd_vx_info_v0 vc_data;

	if (!vx_check(0, VX_ADMIN))
		return -ENOSYS;
	if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_RESOURCE))
		return -EPERM;

	vxi = locate_vx_info(id);
	if (!vxi)
		return -ESRCH;

	vc_data.xid = vxi->vx_id;
	vc_data.initpid = vxi->vx_initpid;
	put_vx_info(vxi);

	if (copy_to_user (data, &vc_data, sizeof(vc_data)))
		return -EFAULT;
	return 0;
}

/* context functions */

int vc_ctx_create(uint32_t xid, void __user *data)
{
	struct vcmd_ctx_create vc_data = { .flagword = VXF_INIT_SET };
	struct vx_info *new_vxi;
	int ret;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (data && copy_from_user (&vc_data, data, sizeof(vc_data)))
		return -EFAULT;

	if ((xid > MAX_S_CONTEXT) && (xid != VX_DYNAMIC_ID))
		return -EINVAL;
	if (xid < 2)
		return -EINVAL;

	new_vxi = __create_vx_info(xid);
	if (IS_ERR(new_vxi))
		return PTR_ERR(new_vxi);

	/* initial flags */
	new_vxi->vx_flags = vc_data.flagword;

	vs_state_change(new_vxi, VSC_STARTUP);
	ret = new_vxi->vx_id;
	vx_migrate_task(current, new_vxi);
	/* if this fails, we might end up with a hashed vx_info */
	put_vx_info(new_vxi);
	return ret;
}

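/* Flow example (illustrative): vc_ctx_create(100, NULL) builds and
 * hashes a fresh vx_info with the VXF_INIT_SET default flags, moves
 * the calling task into it via vx_migrate_task(), and returns the
 * new xid (100). The context stays alive through the task reference
 * even after the local put_vx_info(). */
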
int vc_ctx_migrate(uint32_t id, void __user *data)
{
	struct vx_info *vxi;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	/* dirty hack until Spectator becomes a cap */
	if (id == 1) {
		current->xid = 1;
		return 0;
	}

	vxi = locate_vx_info(id);
	if (!vxi)
		return -ESRCH;
	vx_migrate_task(current, vxi);
	put_vx_info(vxi);
	return 0;
}

int vc_get_cflags(uint32_t id, void __user *data)
{
	struct vx_info *vxi;
	struct vcmd_ctx_flags_v0 vc_data;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	vxi = locate_vx_info(id);
	if (!vxi)
		return -ESRCH;

	vc_data.flagword = vxi->vx_flags;

	/* special STATE flag handling */
	vc_data.mask = vx_mask_flags(~0UL, vxi->vx_flags, VXF_ONE_TIME);

	put_vx_info(vxi);

	if (copy_to_user (data, &vc_data, sizeof(vc_data)))
		return -EFAULT;
	return 0;
}

int vc_set_cflags(uint32_t id, void __user *data)
{
	struct vx_info *vxi;
	struct vcmd_ctx_flags_v0 vc_data;
	uint64_t mask, trigger;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (copy_from_user (&vc_data, data, sizeof(vc_data)))
		return -EFAULT;

	vxi = locate_vx_info(id);
	if (!vxi)
		return -ESRCH;

	/* special STATE flag handling */
	mask = vx_mask_mask(vc_data.mask, vxi->vx_flags, VXF_ONE_TIME);
	trigger = (mask & vxi->vx_flags) ^ (mask & vc_data.flagword);

	if (trigger & VXF_STATE_SETUP)
		vx_mask_bcaps(current);
	if (trigger & VXF_STATE_INIT)
		if (vxi == current->vx_info)
			vx_set_init(vxi, current);

	vxi->vx_flags = vx_mask_flags(vxi->vx_flags,
		vc_data.flagword, mask);
	put_vx_info(vxi);
	return 0;
}

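/* Worked example (not in the original source): assume VXF_STATE_SETUP
 * is set in vxi->vx_flags and the caller passes a flagword with that
 * bit clear while including it in the mask. Then the XOR leaves the
 * VXF_STATE_SETUP bit set in trigger, vx_mask_bcaps() is applied to
 * the caller, and the final vx_mask_flags() clears the bit, which
 * finishes context setup. */
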
int vc_get_ccaps(uint32_t id, void __user *data)
{
	struct vx_info *vxi;
	struct vcmd_ctx_caps_v0 vc_data;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	vxi = locate_vx_info(id);
	if (!vxi)
		return -ESRCH;

	vc_data.bcaps = vxi->vx_bcaps;
	vc_data.ccaps = vxi->vx_ccaps;
	vc_data.cmask = ~0UL;
	put_vx_info(vxi);

	if (copy_to_user (data, &vc_data, sizeof(vc_data)))
		return -EFAULT;
	return 0;
}

int vc_set_ccaps(uint32_t id, void __user *data)
{
	struct vx_info *vxi;
	struct vcmd_ctx_caps_v0 vc_data;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (copy_from_user (&vc_data, data, sizeof(vc_data)))
		return -EFAULT;

	vxi = locate_vx_info(id);
	if (!vxi)
		return -ESRCH;

	vxi->vx_bcaps &= vc_data.bcaps;
	vxi->vx_ccaps = vx_mask_flags(vxi->vx_ccaps,
		vc_data.ccaps, vc_data.cmask);
	put_vx_info(vxi);
	return 0;
}

#include <linux/module.h>

EXPORT_SYMBOL_GPL(free_vx_info);