/*
 *  linux/kernel/vserver/context.c
 *
 *  Virtual Server: Context Support
 *
 *  Copyright (C) 2003-2005  Herbert Pötzl
 *
 *  V0.02  vx_ctx_kill syscall command
 *  V0.03  replaced context_info calls
 *  V0.04  redesign of struct (de)alloc
 *  V0.05  rlimit basic implementation
 *  V0.06  task_xid and info commands
 *  V0.07  context flags and caps
 *  V0.08  switch to RCU based hash
 *  V0.09  revert to non RCU for now
 *  V0.10  and back to working RCU hash
 *  V0.11  and back to locking again
 *
 */
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/namespace.h>
#include <linux/sched.h>
#include <linux/vs_base.h>
#include <linux/vs_context.h>
#include <linux/vs_limit.h>
#include <linux/vserver/network.h>
#include <linux/vserver/legacy.h>
#include <linux/vserver/limit.h>
#include <linux/vserver/debug.h>
#include <linux/vserver/limit_int.h>
#include <linux/vserver/context_cmd.h>

#include <linux/err.h>
#include <asm/errno.h>

#include "cvirt_init.h"
#include "limit_init.h"
#include "sched_init.h"
/*	__alloc_vx_info()

	* allocate an initialized vx_info struct
	* doesn't make it visible (hash)			*/

static struct vx_info *__alloc_vx_info(xid_t xid)
{
	struct vx_info *new = NULL;

	vxdprintk(VXD_CBIT(xid, 0), "alloc_vx_info(%d)*", xid);

	/* would this benefit from a slab cache? */
	new = kmalloc(sizeof(struct vx_info), GFP_KERNEL);
	if (!new)
		return NULL;

	memset(new, 0, sizeof(struct vx_info));
	new->vx_id = xid;
	INIT_HLIST_NODE(&new->vx_hlist);
	atomic_set(&new->vx_usecnt, 0);
	atomic_set(&new->vx_tasks, 0);
	new->vx_parent = NULL;
	init_waitqueue_head(&new->vx_wait);

	/* prepare reaper */
	get_task_struct(child_reaper);
	new->vx_reaper = child_reaper;

	/* rest of init goes here */
	vx_info_init_limit(&new->limit);
	vx_info_init_sched(&new->sched);
	vx_info_init_cvirt(&new->cvirt);
	vx_info_init_cacct(&new->cacct);

	new->vx_flags = VXF_INIT_SET;
	new->vx_bcaps = CAP_INIT_EFF_SET;

	vxdprintk(VXD_CBIT(xid, 0),
		"alloc_vx_info(%d) = %p", xid, new);
	vxh_alloc_vx_info(new);
	return new;
}
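
/* Note on reference counting, as used throughout this file: vx_usecnt
 * counts plain references taken via get_vx_info()/put_vx_info(), while
 * vx_tasks counts tasks bound to the context via claim_vx_info()/
 * release_vx_info().  free_vx_info() below expects both counters to
 * have reached zero before final disposal. */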
/*	__dealloc_vx_info()

	* final disposal of vx_info			*/

static void __dealloc_vx_info(struct vx_info *vxi)
{
	vxdprintk(VXD_CBIT(xid, 0),
		"dealloc_vx_info(%p)", vxi);
	vxh_dealloc_vx_info(vxi);

	vxi->vx_hlist.next = LIST_POISON1;

	vx_info_exit_limit(&vxi->limit);
	vx_info_exit_sched(&vxi->sched);
	vx_info_exit_cvirt(&vxi->cvirt);
	vx_info_exit_cacct(&vxi->cacct);

	vxi->vx_state |= VXS_RELEASED;
	kfree(vxi);
}
static void __shutdown_vx_info(struct vx_info *vxi)
{
	struct namespace *namespace;
	struct fs_struct *fs;

	might_sleep();

	vxi->vx_state |= VXS_SHUTDOWN;
	vs_state_change(vxi, VSC_SHUTDOWN);

	namespace = xchg(&vxi->vx_namespace, NULL);
	if (namespace)
		put_namespace(namespace);

	fs = xchg(&vxi->vx_fs, NULL);
	if (fs)
		put_fs_struct(fs);
}
void free_vx_info(struct vx_info *vxi)
{
	/* context shutdown is mandatory */
	BUG_ON(!vx_info_state(vxi, VXS_SHUTDOWN));

	BUG_ON(atomic_read(&vxi->vx_usecnt));
	BUG_ON(atomic_read(&vxi->vx_tasks));

	BUG_ON(vx_info_state(vxi, VXS_HASHED));

	BUG_ON(vxi->vx_namespace);
	BUG_ON(vxi->vx_fs);

	__dealloc_vx_info(vxi);
}
/*	hash table for vx_info hash */

#define VX_HASH_SIZE	13

struct hlist_head vx_info_hash[VX_HASH_SIZE];

static spinlock_t vx_info_hash_lock = SPIN_LOCK_UNLOCKED;


static inline unsigned int __hashval(xid_t xid)
{
	return (xid % VX_HASH_SIZE);
}
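
/* Example: with VX_HASH_SIZE == 13, xid 100 hashes to bucket
 * 100 % 13 == 9; contexts whose xids map to the same bucket are
 * chained on one hlist and told apart by comparing vx_id. */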
/*	__hash_vx_info()

	* add the vxi to the global hash table
	* requires the hash_lock to be held			*/

static inline void __hash_vx_info(struct vx_info *vxi)
{
	struct hlist_head *head;

	vxd_assert_lock(&vx_info_hash_lock);
	vxdprintk(VXD_CBIT(xid, 4),
		"__hash_vx_info: %p[#%d]", vxi, vxi->vx_id);
	vxh_hash_vx_info(vxi);

	/* context must not be hashed */
	BUG_ON(vx_info_state(vxi, VXS_HASHED));

	vxi->vx_state |= VXS_HASHED;
	head = &vx_info_hash[__hashval(vxi->vx_id)];
	hlist_add_head(&vxi->vx_hlist, head);
}
/*	__unhash_vx_info()

	* remove the vxi from the global hash table
	* requires the hash_lock to be held			*/

static inline void __unhash_vx_info(struct vx_info *vxi)
{
	vxd_assert_lock(&vx_info_hash_lock);
	vxdprintk(VXD_CBIT(xid, 4),
		"__unhash_vx_info: %p[#%d]", vxi, vxi->vx_id);
	vxh_unhash_vx_info(vxi);

	/* context must be hashed */
	BUG_ON(!vx_info_state(vxi, VXS_HASHED));

	vxi->vx_state &= ~VXS_HASHED;
	hlist_del(&vxi->vx_hlist);
}
/*	__lookup_vx_info()

	* requires the hash_lock to be held
	* doesn't increment the vx_refcnt			*/

static inline struct vx_info *__lookup_vx_info(xid_t xid)
{
	struct hlist_head *head = &vx_info_hash[__hashval(xid)];
	struct hlist_node *pos;
	struct vx_info *vxi;

	vxd_assert_lock(&vx_info_hash_lock);
	hlist_for_each(pos, head) {
		vxi = hlist_entry(pos, struct vx_info, vx_hlist);

		if (vxi->vx_id == xid)
			goto found;
	}
	vxi = NULL;
found:
	vxdprintk(VXD_CBIT(xid, 0),
		"__lookup_vx_info(#%u): %p[#%u]",
		xid, vxi, vxi ? vxi->vx_id : 0);
	vxh_lookup_vx_info(vxi, xid);
	return vxi;
}
/*	__vx_dynamic_id()

	* find unused dynamic xid
	* requires the hash_lock to be held			*/

static inline xid_t __vx_dynamic_id(void)
{
	static xid_t seq = MAX_S_CONTEXT;
	xid_t barrier = seq;

	vxd_assert_lock(&vx_info_hash_lock);
	do {
		if (++seq > MAX_S_CONTEXT)
			seq = MIN_D_CONTEXT;
		if (!__lookup_vx_info(seq)) {
			vxdprintk(VXD_CBIT(xid, 4),
				"__vx_dynamic_id: [#%d]", seq);
			return seq;
		}
	} while (barrier != seq);
	return 0;
}
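
/* The dynamic xid search above starts one past the last value it
 * handed out, wraps from MAX_S_CONTEXT back to MIN_D_CONTEXT, and
 * gives up (returning 0) once it has come full circle without finding
 * a free slot -- hence the "no dynamic context available" messages in
 * the callers below. */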
#ifdef	CONFIG_VSERVER_LEGACY

/*	__loc_vx_info()

	* locate or create the requested context
	* get() it and if new hash it				*/

static struct vx_info *__loc_vx_info(int id, int *err)
{
	struct vx_info *new, *vxi = NULL;

	vxdprintk(VXD_CBIT(xid, 1), "loc_vx_info(%d)*", id);

	if (!(new = __alloc_vx_info(id))) {
		*err = -ENOMEM;
		return NULL;
	}

	/* required to make dynamic xids unique */
	spin_lock(&vx_info_hash_lock);

	/* dynamic context requested */
	if (id == VX_DYNAMIC_ID) {
		id = __vx_dynamic_id();
		if (!id) {
			printk(KERN_ERR "no dynamic context available.\n");
			goto out_unlock;
		}
		new->vx_id = id;
	}
	/* existing context requested */
	else if ((vxi = __lookup_vx_info(id))) {
		/* context in setup is not available */
		if (vxi->vx_flags & VXF_STATE_SETUP) {
			vxdprintk(VXD_CBIT(xid, 0),
				"loc_vx_info(%d) = %p (not available)", id, vxi);
			vxi = NULL;
			*err = -EBUSY;
		} else {
			vxdprintk(VXD_CBIT(xid, 0),
				"loc_vx_info(%d) = %p (found)", id, vxi);
			get_vx_info(vxi);
			*err = 0;
		}
		goto out_unlock;
	}

	/* new context requested */
	vxdprintk(VXD_CBIT(xid, 0),
		"loc_vx_info(%d) = %p (new)", id, new);
	__hash_vx_info(get_vx_info(new));
	vxi = new, new = NULL;
	*err = 1;

out_unlock:
	spin_unlock(&vx_info_hash_lock);
	vxh_loc_vx_info(vxi, id);
	if (new)
		__dealloc_vx_info(new);
	return vxi;
}

#endif
/*	__create_vx_info()

	* create the requested context
	* get() and hash it					*/

static struct vx_info *__create_vx_info(int id)
{
	struct vx_info *new, *vxi = NULL;

	vxdprintk(VXD_CBIT(xid, 1), "create_vx_info(%d)*", id);

	if (!(new = __alloc_vx_info(id)))
		return ERR_PTR(-ENOMEM);

	/* required to make dynamic xids unique */
	spin_lock(&vx_info_hash_lock);

	/* dynamic context requested */
	if (id == VX_DYNAMIC_ID) {
		id = __vx_dynamic_id();
		if (!id) {
			printk(KERN_ERR "no dynamic context available.\n");
			vxi = ERR_PTR(-EAGAIN);
			goto out_unlock;
		}
		new->vx_id = id;
	}
	/* static context requested */
	else if ((vxi = __lookup_vx_info(id))) {
		vxdprintk(VXD_CBIT(xid, 0),
			"create_vx_info(%d) = %p (already there)", id, vxi);
		if (vx_info_flags(vxi, VXF_STATE_SETUP, 0))
			vxi = ERR_PTR(-EBUSY);
		else
			vxi = ERR_PTR(-EEXIST);
		goto out_unlock;
	}
	/* dynamic xid creation blocker */
	else if (id >= MIN_D_CONTEXT) {
		vxdprintk(VXD_CBIT(xid, 0),
			"create_vx_info(%d) (dynamic rejected)", id);
		vxi = ERR_PTR(-EINVAL);
		goto out_unlock;
	}

	/* new context requested */
	vxdprintk(VXD_CBIT(xid, 0),
		"create_vx_info(%d) = %p (new)", id, new);
	__hash_vx_info(get_vx_info(new));
	vxi = new, new = NULL;

out_unlock:
	spin_unlock(&vx_info_hash_lock);
	vxh_create_vx_info(IS_ERR(vxi) ? NULL : vxi, id);
	if (new)
		__dealloc_vx_info(new);
	return vxi;
}
void unhash_vx_info(struct vx_info *vxi)
{
	__shutdown_vx_info(vxi);
	spin_lock(&vx_info_hash_lock);
	__unhash_vx_info(vxi);
	spin_unlock(&vx_info_hash_lock);
	__wakeup_vx_info(vxi);
}
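
/* Ordering matters here: the context is shut down first (dropping its
 * namespace and fs references), then removed from the hash under the
 * hash lock, and only afterwards are tasks waiting on the context
 * (see vx_wait in __alloc_vx_info) woken, so they cannot look it up
 * again. */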
/*	lookup_vx_info()

	* search for a vx_info and get() it
	* negative id means current				*/

struct vx_info *lookup_vx_info(int id)
{
	struct vx_info *vxi = NULL;

	if (id < 0) {
		vxi = get_vx_info(current->vx_info);
	} else {
		spin_lock(&vx_info_hash_lock);
		vxi = get_vx_info(__lookup_vx_info(id));
		spin_unlock(&vx_info_hash_lock);
	}
	return vxi;
}
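
/* Typical caller pattern (illustrative only; it mirrors the vc_*
 * command handlers further below):
 *
 *	vxi = lookup_vx_info(id);
 *	if (!vxi)
 *		return -ESRCH;
 *	...
 *	put_vx_info(vxi);
 */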
/*	xid_is_hashed()

	* verify that xid is still hashed			*/

int xid_is_hashed(xid_t xid)
{
	int hashed;

	spin_lock(&vx_info_hash_lock);
	hashed = (__lookup_vx_info(xid) != NULL);
	spin_unlock(&vx_info_hash_lock);
	return hashed;
}
#ifdef	CONFIG_VSERVER_LEGACY

struct vx_info *lookup_or_create_vx_info(int id)
{
	int err;

	return __loc_vx_info(id, &err);
}

#endif
#ifdef	CONFIG_PROC_FS

int get_xid_list(int index, unsigned int *xids, int size)
{
	int hindex, nr_xids = 0;

	for (hindex = 0; hindex < VX_HASH_SIZE; hindex++) {
		struct hlist_head *head = &vx_info_hash[hindex];
		struct hlist_node *pos;

		spin_lock(&vx_info_hash_lock);
		hlist_for_each(pos, head) {
			struct vx_info *vxi;

			if (--index > 0)
				continue;

			vxi = hlist_entry(pos, struct vx_info, vx_hlist);
			xids[nr_xids] = vxi->vx_id;
			if (++nr_xids >= size) {
				spin_unlock(&vx_info_hash_lock);
				goto out;
			}
		}
		/* keep the lock time short */
		spin_unlock(&vx_info_hash_lock);
	}
out:
	return nr_xids;
}

#endif
int vx_migrate_user(struct task_struct *p, struct vx_info *vxi)
{
	struct user_struct *new_user, *old_user;

	if (!p || !vxi)
		BUG();
	new_user = alloc_uid(vxi->vx_id, p->uid);
	if (!new_user)
		return -ENOMEM;

	old_user = p->user;
	if (new_user != old_user) {
		atomic_inc(&new_user->processes);
		atomic_dec(&old_user->processes);
		p->user = new_user;
	}
	free_uid(old_user);
	return 0;
}
void vx_mask_bcaps(struct vx_info *vxi, struct task_struct *p)
{
	p->cap_effective &= vxi->vx_bcaps;
	p->cap_inheritable &= vxi->vx_bcaps;
	p->cap_permitted &= vxi->vx_bcaps;
}
#include <linux/file.h>

static int vx_openfd_task(struct task_struct *tsk)
{
	struct files_struct *files = tsk->files;
	struct fdtable *fdt;
	const unsigned long *bptr;
	int count, total;

	/* no rcu_read_lock() because of spin_lock() */
	spin_lock(&files->file_lock);
	fdt = files_fdtable(files);
	bptr = fdt->open_fds->fds_bits;
	count = fdt->max_fds / (sizeof(unsigned long) * 8);
	for (total = 0; count > 0; count--) {
		if (*bptr)
			total += hweight_long(*bptr);
		bptr++;
	}
	spin_unlock(&files->file_lock);
	return total;
}
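
/* vx_openfd_task() walks the open_fds bitmap word by word and sums the
 * population count (hweight_long) of each word, which yields the number
 * of currently open file descriptors without touching the struct file
 * pointers themselves. */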
/*	vx_migrate_task()

	* migrate task to new context
	* gets vxi, puts old_vxi on change			*/

int vx_migrate_task(struct task_struct *p, struct vx_info *vxi)
{
	struct vx_info *old_vxi;
	int ret = 0;

	if (!p || !vxi)
		BUG();

	old_vxi = task_get_vx_info(p);
	if (old_vxi == vxi)
		goto out;

	vxdprintk(VXD_CBIT(xid, 5),
		"vx_migrate_task(%p,%p[#%d.%d])", p, vxi,
		vxi->vx_id, atomic_read(&vxi->vx_usecnt));

	if (!(ret = vx_migrate_user(p, vxi))) {
		int openfd;

		openfd = vx_openfd_task(p);

		if (old_vxi) {
			atomic_dec(&old_vxi->cvirt.nr_threads);
			atomic_dec(&old_vxi->cvirt.nr_running);
			atomic_dec(&old_vxi->limit.rcur[RLIMIT_NPROC]);
			/* FIXME: what about the struct files here? */
			atomic_sub(openfd, &old_vxi->limit.rcur[VLIMIT_OPENFD]);
		}
		atomic_inc(&vxi->cvirt.nr_threads);
		atomic_inc(&vxi->cvirt.nr_running);
		atomic_inc(&vxi->limit.rcur[RLIMIT_NPROC]);
		/* FIXME: what about the struct files here? */
		atomic_add(openfd, &vxi->limit.rcur[VLIMIT_OPENFD]);

		if (old_vxi) {
			release_vx_info(old_vxi, p);
			clr_vx_info(&p->vx_info);
		}
		claim_vx_info(vxi, p);
		set_vx_info(&p->vx_info, vxi);
		p->xid = vxi->vx_id;

		vxdprintk(VXD_CBIT(xid, 5),
			"moved task %p into vxi:%p[#%d]",
			p, vxi, vxi->vx_id);

		vx_mask_bcaps(vxi, p);
	}
out:
	put_vx_info(old_vxi);
	return ret;
}
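
/* On migration the per-context accounting (thread and running counters,
 * RLIMIT_NPROC and the VLIMIT_OPENFD usage computed above) is moved
 * from the old context to the new one before the task's vx_info pointer
 * is switched and its capabilities are masked. */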
int vx_set_reaper(struct vx_info *vxi, struct task_struct *p)
{
	struct task_struct *old_reaper;

	if (!vxi)
		return -EINVAL;

	vxdprintk(VXD_CBIT(xid, 6),
		"vx_set_reaper(%p[#%d],%p[#%d,%d])",
		vxi, vxi->vx_id, p, p->xid, p->pid);

	old_reaper = vxi->vx_reaper;
	if (old_reaper == p)
		return 0;

	/* set new child reaper */
	get_task_struct(p);
	vxi->vx_reaper = p;
	put_task_struct(old_reaper);
	return 0;
}
int vx_set_init(struct vx_info *vxi, struct task_struct *p)
{
	if (!vxi)
		return -EINVAL;

	vxdprintk(VXD_CBIT(xid, 6),
		"vx_set_init(%p[#%d],%p[#%d,%d,%d])",
		vxi, vxi->vx_id, p, p->xid, p->pid, p->tgid);

	vxi->vx_flags &= ~VXF_STATE_INIT;
	vxi->vx_initpid = p->tgid;
	return 0;
}
void vx_exit_init(struct vx_info *vxi, struct task_struct *p, int code)
{
	vxdprintk(VXD_CBIT(xid, 6),
		"vx_exit_init(%p[#%d],%p[#%d,%d,%d])",
		vxi, vxi->vx_id, p, p->xid, p->pid, p->tgid);

	vxi->exit_code = code;
	vxi->vx_initpid = 0;
}
void vx_set_persistent(struct vx_info *vxi)
{
	vxdprintk(VXD_CBIT(xid, 6),
		"vx_set_persistent(%p[#%d])", vxi, vxi->vx_id);

	get_vx_info(vxi);
	claim_vx_info(vxi, current);
}
void vx_clear_persistent(struct vx_info *vxi)
{
	vxdprintk(VXD_CBIT(xid, 6),
		"vx_clear_persistent(%p[#%d])", vxi, vxi->vx_id);

	release_vx_info(vxi, current);
	put_vx_info(vxi);
}
void vx_update_persistent(struct vx_info *vxi)
{
	if (vx_info_flags(vxi, VXF_PERSISTENT, 0))
		vx_set_persistent(vxi);
	else
		vx_clear_persistent(vxi);
}
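
/* A persistent context holds an extra reference and a claim by the
 * task that set the flag, so it survives even when no task is running
 * inside it; clearing VXF_PERSISTENT drops that reference again via
 * vx_update_persistent(). */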
/*	task must be current or locked		*/

void exit_vx_info(struct task_struct *p, int code)
{
	struct vx_info *vxi = p->vx_info;

	if (vxi) {
		atomic_dec(&vxi->cvirt.nr_threads);
		vx_nproc_dec(p);

		vxi->exit_code = code;
		release_vx_info(vxi, p);
	}
}
void exit_vx_info_early(struct task_struct *p, int code)
{
	struct vx_info *vxi = p->vx_info;

	if (vxi) {
		if (vxi->vx_initpid == p->tgid)
			vx_exit_init(vxi, p, code);
		if (vxi->vx_reaper == p)
			vx_set_reaper(vxi, child_reaper);
	}
}
/* vserver syscall commands below here */

/* task xid and vx_info functions */

#include <asm/uaccess.h>
int vc_task_xid(uint32_t id, void __user *data)
{
	xid_t xid;

	if (id) {
		struct task_struct *tsk;

		if (!vx_check(0, VX_ADMIN|VX_WATCH))
			return -EPERM;

		read_lock(&tasklist_lock);
		tsk = find_task_by_real_pid(id);
		xid = (tsk) ? tsk->xid : -ESRCH;
		read_unlock(&tasklist_lock);
	}
	else
		xid = vx_current_xid();
	return xid;
}
int vc_vx_info(uint32_t id, void __user *data)
{
	struct vx_info *vxi;
	struct vcmd_vx_info_v0 vc_data;

	if (!vx_check(0, VX_ADMIN))
		return -ENOSYS;
	if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_RESOURCE))
		return -EPERM;

	vxi = lookup_vx_info(id);
	if (!vxi)
		return -ESRCH;

	vc_data.xid = vxi->vx_id;
	vc_data.initpid = vxi->vx_initpid;
	put_vx_info(vxi);

	if (copy_to_user(data, &vc_data, sizeof(vc_data)))
		return -EFAULT;
	return 0;
}
/* context functions */

int vc_ctx_create(uint32_t xid, void __user *data)
{
	struct vcmd_ctx_create vc_data = { .flagword = VXF_INIT_SET };
	struct vx_info *new_vxi;
	int ret;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (data && copy_from_user(&vc_data, data, sizeof(vc_data)))
		return -EFAULT;

	if ((xid > MAX_S_CONTEXT) && (xid != VX_DYNAMIC_ID))
		return -EINVAL;
	if (xid < 2)
		return -EINVAL;

	new_vxi = __create_vx_info(xid);
	if (IS_ERR(new_vxi))
		return PTR_ERR(new_vxi);

	/* initial flags */
	new_vxi->vx_flags = vc_data.flagword;

	/* get a reference for persistent contexts */
	if ((vc_data.flagword & VXF_PERSISTENT))
		vx_set_persistent(new_vxi);

	ret = -ENOEXEC;
	if (vs_state_change(new_vxi, VSC_STARTUP))
		goto out_unhash;
	ret = vx_migrate_task(current, new_vxi);
	if (!ret) {
		/* return context id on success */
		ret = new_vxi->vx_id;
		goto out;
	}
out_unhash:
	/* prepare for context disposal */
	new_vxi->vx_state |= VXS_SHUTDOWN;
	if ((vc_data.flagword & VXF_PERSISTENT))
		vx_clear_persistent(new_vxi);
	__unhash_vx_info(new_vxi);
out:
	put_vx_info(new_vxi);
	return ret;
}
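
/* If startup or the migration of the creating task fails, the freshly
 * created context is unwound: it is marked for shutdown, any persistent
 * reference is dropped again, it is unhashed, and the final
 * put_vx_info() disposes of it. */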
int vc_ctx_migrate(uint32_t id, void __user *data)
{
	struct vcmd_ctx_migrate vc_data = { .flagword = 0 };
	struct vx_info *vxi;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (data && copy_from_user(&vc_data, data, sizeof(vc_data)))
		return -EFAULT;

	/* dirty hack until Spectator becomes a cap */

	vxi = lookup_vx_info(id);
	if (!vxi)
		return -ESRCH;

	vx_migrate_task(current, vxi);
	if (vc_data.flagword & VXM_SET_INIT)
		vx_set_init(vxi, current);
	if (vc_data.flagword & VXM_SET_REAPER)
		vx_set_reaper(vxi, current);
	put_vx_info(vxi);
	return 0;
}
int vc_get_cflags(uint32_t id, void __user *data)
{
	struct vx_info *vxi;
	struct vcmd_ctx_flags_v0 vc_data;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	vxi = lookup_vx_info(id);
	if (!vxi)
		return -ESRCH;

	vc_data.flagword = vxi->vx_flags;

	/* special STATE flag handling */
	vc_data.mask = vx_mask_flags(~0UL, vxi->vx_flags, VXF_ONE_TIME);

	put_vx_info(vxi);

	if (copy_to_user(data, &vc_data, sizeof(vc_data)))
		return -EFAULT;
	return 0;
}
int vc_set_cflags(uint32_t id, void __user *data)
{
	struct vx_info *vxi;
	struct vcmd_ctx_flags_v0 vc_data;
	uint64_t mask, trigger;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (copy_from_user(&vc_data, data, sizeof(vc_data)))
		return -EFAULT;

	vxi = lookup_vx_info(id);
	if (!vxi)
		return -ESRCH;

	/* special STATE flag handling */
	mask = vx_mask_mask(vc_data.mask, vxi->vx_flags, VXF_ONE_TIME);
	trigger = (mask & vxi->vx_flags) ^ (mask & vc_data.flagword);

	if (vxi == current->vx_info) {
		if (trigger & VXF_STATE_SETUP)
			vx_mask_bcaps(vxi, current);
		if (trigger & VXF_STATE_INIT) {
			vx_set_init(vxi, current);
			vx_set_reaper(vxi, current);
		}
	}

	vxi->vx_flags = vx_mask_flags(vxi->vx_flags,
		vc_data.flagword, mask);
	if (trigger & VXF_PERSISTENT)
		vx_update_persistent(vxi);

	put_vx_info(vxi);
	return 0;
}
int vc_get_ccaps(uint32_t id, void __user *data)
{
	struct vx_info *vxi;
	struct vcmd_ctx_caps_v0 vc_data;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	vxi = lookup_vx_info(id);
	if (!vxi)
		return -ESRCH;

	vc_data.bcaps = vxi->vx_bcaps;
	vc_data.ccaps = vxi->vx_ccaps;
	vc_data.cmask = ~0UL;
	put_vx_info(vxi);

	if (copy_to_user(data, &vc_data, sizeof(vc_data)))
		return -EFAULT;
	return 0;
}
int vc_set_ccaps(uint32_t id, void __user *data)
{
	struct vx_info *vxi;
	struct vcmd_ctx_caps_v0 vc_data;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (copy_from_user(&vc_data, data, sizeof(vc_data)))
		return -EFAULT;

	vxi = lookup_vx_info(id);
	if (!vxi)
		return -ESRCH;

	vxi->vx_bcaps &= vc_data.bcaps;
	vxi->vx_ccaps = vx_mask_flags(vxi->vx_ccaps,
		vc_data.ccaps, vc_data.cmask);
	put_vx_info(vxi);
	return 0;
}
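
/* Note that vx_bcaps can only ever be reduced here (it is ANDed with
 * the request), while vx_ccaps is set under the caller-supplied mask
 * via vx_mask_flags(). */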
#include <linux/module.h>

EXPORT_SYMBOL_GPL(free_vx_info);