2 * linux/kernel/vserver/context.c
4 * Virtual Server: Context Support
6 * Copyright (C) 2003-2005 Herbert Pötzl
9 * V0.02 vx_ctx_kill syscall command
10 * V0.03 replaced context_info calls
11 * V0.04 redesign of struct (de)alloc
12 * V0.05 rlimit basic implementation
13 * V0.06 task_xid and info commands
14 * V0.07 context flags and caps
15 * V0.08 switch to RCU based hash
16 * V0.09 revert to non RCU for now
17 * V0.10 and back to working RCU hash
21 #include <linux/config.h>
22 #include <linux/slab.h>
23 #include <linux/types.h>
24 #include <linux/namespace.h>
26 #include <linux/sched.h>
27 #include <linux/vserver/network.h>
28 #include <linux/vserver/legacy.h>
29 #include <linux/vserver/limit.h>
30 #include <linux/vserver/debug.h>
31 #include <linux/vs_context.h>
32 #include <linux/vserver/context_cmd.h>
33 #include <linux/ckrm_events.h> /* needed for ckrm_cb_xid() */
35 #include <asm/errno.h>
37 #include "cvirt_init.h"
38 #include "limit_init.h"
39 #include "sched_init.h"
44 * allocate an initialized vx_info struct
45 * doesn't make it visible (hash) */
/* __alloc_vx_info()
 *
 * Allocate and zero-initialize a fresh vx_info for @xid and set up its
 * embedded subsystems (limit, sched, cvirt, cacct).  The new context
 * is NOT hashed here, i.e. not yet globally visible.
 * NOTE(review): this excerpt elides lines (the kmalloc NULL check and
 * the final return are not visible) -- confirm against the full file. */
47 static struct vx_info *__alloc_vx_info(xid_t xid)
49 struct vx_info *new = NULL;
51 vxdprintk(VXD_CBIT(xid, 0), "alloc_vx_info(%d)*", xid);
53 /* would this benefit from a slab cache? */
54 new = kmalloc(sizeof(struct vx_info), GFP_KERNEL);
58 memset (new, 0, sizeof(struct vx_info));
/* both counters start at zero; callers get()/put() as needed */
60 INIT_RCU_HEAD(&new->vx_rcu);
61 INIT_HLIST_NODE(&new->vx_hlist);
62 atomic_set(&new->vx_refcnt, 0);
63 atomic_set(&new->vx_usecnt, 0);
64 new->vx_parent = NULL;
66 new->vx_lock = SPIN_LOCK_UNLOCKED;
67 init_waitqueue_head(&new->vx_exit);
69 /* rest of init goes here */
70 vx_info_init_limit(&new->limit);
71 vx_info_init_sched(&new->sched);
72 vx_info_init_cvirt(&new->cvirt);
73 vx_info_init_cacct(&new->cacct);
/* new contexts start in setup+init state; these one-time flags are
   cleared later via the cflags command path */
76 new->vx_flags = VXF_STATE_SETUP|VXF_STATE_INIT;
77 new->vx_bcaps = CAP_INIT_EFF_SET;
80 vxdprintk(VXD_CBIT(xid, 0),
81 "alloc_vx_info(%d) = %p", xid, new);
82 vxh_alloc_vx_info(new);
86 /* __dealloc_vx_info()
88 * final disposal of vx_info */
/* __dealloc_vx_info()
 *
 * Final disposal of a vx_info: tear down the embedded subsystems and
 * mark the struct released.  Both refcnt and usecnt must already be
 * zero, and the context must no longer be hashed (BUG otherwise).
 * NOTE(review): the actual kfree appears elided from this excerpt. */
90 static void __dealloc_vx_info(struct vx_info *vxi)
92 vxdprintk(VXD_CBIT(xid, 0),
93 "dealloc_vx_info(%p)", vxi);
94 vxh_dealloc_vx_info(vxi);
/* poison the hlist link so a stale traversal faults loudly */
96 vxi->vx_hlist.next = LIST_POISON1;
99 vx_info_exit_limit(&vxi->limit);
100 vx_info_exit_sched(&vxi->sched);
101 vx_info_exit_cvirt(&vxi->cvirt);
102 vx_info_exit_cacct(&vxi->cacct);
/* sanity: nobody may still hold or use this context */
105 BUG_ON(atomic_read(&vxi->vx_usecnt));
106 BUG_ON(atomic_read(&vxi->vx_refcnt));
108 BUG_ON(vx_info_state(vxi, VXS_HASHED));
109 // BUG_ON(!vx_state(vxi, VXS_DEFUNCT));
111 vxi->vx_state |= VXS_RELEASED;
/* __free_vx_info()
 *
 * Check that usecnt and refcnt permit disposal and then deallocate.
 * NOTE(review): the conditional logic between the counter reads and
 * the __dealloc call (and the return value) is elided in this view --
 * presumably it bails out when either counter is non-zero; confirm. */
115 static inline int __free_vx_info(struct vx_info *vxi)
121 usecnt = atomic_read(&vxi->vx_usecnt);
124 refcnt = atomic_read(&vxi->vx_refcnt);
128 __dealloc_vx_info(vxi);
/* __rcu_put_vx_info()
 *
 * RCU callback scheduled by __unhash_vx_info(); recovers the vx_info
 * from its embedded rcu_head after the grace period.
 * NOTE(review): the put/free step after the debug print is elided in
 * this excerpt -- confirm the drop of the unhash reference here. */
132 static void __rcu_put_vx_info(struct rcu_head *head)
134 struct vx_info *vxi = container_of(head, struct vx_info, vx_rcu);
136 vxdprintk(VXD_CBIT(xid, 3),
137 "__rcu_put_vx_info(%p[#%d]): %d,%d",
139 atomic_read(&vxi->vx_usecnt),
140 atomic_read(&vxi->vx_refcnt));
/* __shutdown_vx_info()
 *
 * Release the namespace and fs_struct references held by the context.
 * xchg() atomically detaches each pointer so the release happens at
 * most once even under concurrent shutdown.
 * NOTE(review): the NULL checks and the fs_struct put are elided here. */
144 void __shutdown_vx_info(struct vx_info *vxi)
146 struct namespace *namespace;
147 struct fs_struct *fs;
151 namespace = xchg(&vxi->vx_namespace, NULL);
153 put_namespace(namespace);
155 fs = xchg(&vxi->vx_fs, NULL);
/* free_vx_info()
 *
 * Public disposal entry point (exported below).  The context must be
 * fully shut down: not hashed and with no namespace reference left.
 * __free_vx_info() returning non-zero (counters busy) is a bug. */
162 void free_vx_info(struct vx_info *vxi)
164 /* context shutdown is mandatory */
165 // BUG_ON(vxi->vx_state != VXS_SHUTDOWN);
167 BUG_ON(vxi->vx_state & VXS_HASHED);
169 BUG_ON(vxi->vx_namespace);
172 BUG_ON(__free_vx_info(vxi));
176 /* hash table for vx_info hash */
/* global xid -> vx_info hash table: writers serialize on
 * vx_info_hash_lock, readers traverse the chains under RCU
 * (see __lookup_vx_info below). */
178 #define VX_HASH_SIZE 13
180 struct hlist_head vx_info_hash[VX_HASH_SIZE];
182 static spinlock_t vx_info_hash_lock = SPIN_LOCK_UNLOCKED;
/* __hashval()
 *
 * Trivial bucket selector: xid modulo the (prime) table size. */
185 static inline unsigned int __hashval(xid_t xid)
187 return (xid % VX_HASH_SIZE);
194 * add the vxi to the global hash table
195 * requires the hash_lock to be held */
/* __hash_vx_info()
 *
 * Insert @vxi into the global hash table (RCU-safe head insert) and
 * mark it VXS_HASHED.  Caller must hold vx_info_hash_lock.
 * NOTE(review): the get_vx_info() taken by callers (see __loc_vx_info)
 * is the reference the hash owns -- confirm in the full source. */
197 static inline void __hash_vx_info(struct vx_info *vxi)
199 struct hlist_head *head;
201 vxdprintk(VXD_CBIT(xid, 4),
202 "__hash_vx_info: %p[#%d]", vxi, vxi->vx_id);
203 vxh_hash_vx_info(vxi);
/* state flag must be set before the entry becomes reachable */
206 vxi->vx_state |= VXS_HASHED;
207 head = &vx_info_hash[__hashval(vxi->vx_id)];
208 hlist_add_head_rcu(&vxi->vx_hlist, head);
211 /* __unhash_vx_info()
213 * remove the vxi from the global hash table
214 * requires the hash_lock to be held */
/* __unhash_vx_info()
 *
 * Remove @vxi from the global hash table (RCU deletion) and defer the
 * reference drop to __rcu_put_vx_info after the grace period, so
 * concurrent lock-free lookups stay safe.  Caller must hold
 * vx_info_hash_lock. */
216 static inline void __unhash_vx_info(struct vx_info *vxi)
218 vxdprintk(VXD_CBIT(xid, 4),
219 "__unhash_vx_info: %p[#%d]", vxi, vxi->vx_id);
220 vxh_unhash_vx_info(vxi);
/* clear HASHED first: lookups test this flag under RCU */
222 vxi->vx_state &= ~VXS_HASHED;
223 hlist_del_rcu(&vxi->vx_hlist);
225 call_rcu(&vxi->vx_rcu, __rcu_put_vx_info);
229 /* __lookup_vx_info()
231 * requires the rcu_read_lock()
232 * doesn't increment the vx_refcnt */
/* __lookup_vx_info()
 *
 * Find the hashed vx_info for @xid; requires rcu_read_lock() held and
 * does NOT take a reference -- callers must get() it themselves if they
 * keep the pointer past the RCU section.
 * NOTE(review): the match/exit path after the state test is elided in
 * this excerpt (presumably a goto/break to the debug print). */
234 static inline struct vx_info *__lookup_vx_info(xid_t xid)
236 struct hlist_head *head = &vx_info_hash[__hashval(xid)];
237 struct hlist_node *pos;
240 hlist_for_each_rcu(pos, head) {
241 vxi = hlist_entry(pos, struct vx_info, vx_hlist);
/* only entries still marked HASHED count as found */
243 if ((vxi->vx_id == xid) &&
244 vx_info_state(vxi, VXS_HASHED))
249 vxdprintk(VXD_CBIT(xid, 0),
250 "__lookup_vx_info(#%u): %p[#%u]",
251 xid, vxi, vxi?vxi->vx_id:0);
252 vxh_lookup_vx_info(xid, vxi);
259 * find unused dynamic xid
260 * requires the rcu_read_lock()
261 * requires the hash_lock to be held */
/* __vx_dynamic_id()
 *
 * Find an unused dynamic xid by scanning a wrapping sequence counter
 * above MAX_S_CONTEXT.  Requires rcu_read_lock() and vx_info_hash_lock
 * held (the static @seq is only safe under the hash lock).
 * NOTE(review): the wrap-around assignment, the `barrier` initializer
 * and the failure return are elided in this excerpt -- presumably the
 * loop gives up after one full cycle; confirm against the full file. */
263 static inline xid_t __vx_dynamic_id(void)
265 static xid_t seq = MAX_S_CONTEXT;
269 if (++seq > MAX_S_CONTEXT)
271 if (!__lookup_vx_info(seq)) {
272 vxdprintk(VXD_CBIT(xid, 4),
273 "__vx_dynamic_id: [#%d]", seq);
276 } while (barrier != seq);
282 * locate or create the requested context
283 * get() it and if new hash it */
/* __loc_vx_info()
 *
 * Locate or create the context for @id, get() it, and hash it when
 * newly created.  @id == VX_DYNAMIC_ID picks a free dynamic xid.
 * On failure *err is set (paths partially elided in this excerpt).
 * Returns the referenced vx_info, or presumably NULL on error --
 * the unlock/return interleaving is not fully visible here. */
285 static struct vx_info * __loc_vx_info(int id, int *err)
287 struct vx_info *new, *vxi = NULL;
289 vxdprintk(VXD_CBIT(xid, 1), "loc_vx_info(%d)*", id);
/* allocate speculatively before taking the hash lock, since
   __alloc_vx_info() may sleep (GFP_KERNEL) */
291 if (!(new = __alloc_vx_info(id))) {
296 /* FIXME is this required at all ? */
298 /* required to make dynamic xids unique */
299 spin_lock(&vx_info_hash_lock);
301 /* dynamic context requested */
302 if (id == VX_DYNAMIC_ID) {
303 id = __vx_dynamic_id();
305 printk(KERN_ERR "no dynamic context available.\n");
310 /* existing context requested */
311 else if ((vxi = __lookup_vx_info(id))) {
312 /* context in setup is not available */
313 if (vxi->vx_flags & VXF_STATE_SETUP) {
314 vxdprintk(VXD_CBIT(xid, 0),
315 "loc_vx_info(%d) = %p (not available)", id, vxi);
319 vxdprintk(VXD_CBIT(xid, 0),
320 "loc_vx_info(%d) = %p (found)", id, vxi);
327 /* new context requested */
328 vxdprintk(VXD_CBIT(xid, 0),
329 "loc_vx_info(%d) = %p (new)", id, new);
/* the hash table owns the reference taken here */
330 __hash_vx_info(get_vx_info(new));
331 vxi = new, new = NULL;
335 spin_unlock(&vx_info_hash_lock);
337 vxh_loc_vx_info(id, vxi);
/* the speculative allocation was not consumed -- dispose of it */
339 __dealloc_vx_info(new);
/* unhash_vx_info()
 *
 * Public removal entry point (exported below): shut the context down
 * (drop namespace/fs references) and remove it from the global hash
 * under the hash lock. */
348 void unhash_vx_info(struct vx_info *vxi)
350 __shutdown_vx_info(vxi);
351 spin_lock(&vx_info_hash_lock);
352 __unhash_vx_info(vxi);
353 spin_unlock(&vx_info_hash_lock);
358 * search for a vx_info and get() it
359 * negative id means current */
/* locate_vx_info()
 *
 * Search for the vx_info of @id and return it with a reference held;
 * a negative id means the current task's context.
 * NOTE(review): the id-sign branch and the rcu_read_lock/unlock pair
 * around the hash lookup are elided in this excerpt -- confirm. */
361 struct vx_info *locate_vx_info(int id)
366 vxi = get_vx_info(current->vx_info);
369 vxi = get_vx_info(__lookup_vx_info(id));
375 /* vx_info_is_hashed()
377 * verify that xid is still hashed */
/* vx_info_is_hashed()
 *
 * Return non-zero when @xid currently resolves in the hash table.
 * NOTE(review): the rcu_read_lock/unlock bracket and the return are
 * elided in this excerpt. */
379 int vx_info_is_hashed(xid_t xid)
384 hashed = (__lookup_vx_info(xid) != NULL);
389 #ifdef CONFIG_VSERVER_LEGACY
/* alloc_vx_info()
 *
 * Legacy API (CONFIG_VSERVER_LEGACY) wrapper: allocate an unhashed
 * vx_info for @xid. */
392 struct vx_info *alloc_vx_info(xid_t xid)
394 return __alloc_vx_info(xid);
/* locate_or_create_vx_info()
 *
 * Legacy API wrapper around __loc_vx_info(); the local err variable's
 * declaration is elided in this excerpt and its value is discarded. */
398 struct vx_info *locate_or_create_vx_info(int id)
402 return __loc_vx_info(id, &err);
407 #ifdef CONFIG_PROC_FS
/* get_xid_list()
 *
 * /proc helper: walk every hash bucket and collect up to @size xids
 * into @xids, returning the number gathered.
 * NOTE(review): the handling of @index (skip count), the rcu bracket
 * and the final return are elided in this excerpt -- confirm. */
409 int get_xid_list(int index, unsigned int *xids, int size)
411 int hindex, nr_xids = 0;
414 for (hindex = 0; hindex < VX_HASH_SIZE; hindex++) {
415 struct hlist_head *head = &vx_info_hash[hindex];
416 struct hlist_node *pos;
418 hlist_for_each_rcu(pos, head) {
424 vxi = hlist_entry(pos, struct vx_info, vx_hlist);
425 xids[nr_xids] = vxi->vx_id;
/* stop as soon as the caller's buffer is full */
426 if (++nr_xids >= size)
/* vx_migrate_user()
 *
 * Move task @p onto the per-context user_struct for (xid, uid) so that
 * per-user accounting (process counts) is tracked inside the context.
 * NOTE(review): the alloc_uid failure path, the old_user assignment
 * and the switch_uid/put are elided in this excerpt. */
436 int vx_migrate_user(struct task_struct *p, struct vx_info *vxi)
438 struct user_struct *new_user, *old_user;
442 new_user = alloc_uid(vxi->vx_id, p->uid);
447 if (new_user != old_user) {
/* transfer the process accounting between user structs */
448 atomic_inc(&new_user->processes);
449 atomic_dec(&old_user->processes);
/* vx_mask_bcaps()
 *
 * Clamp all three capability sets of @p to the context's bounding
 * capability mask (vx_bcaps); capabilities outside the mask are
 * permanently dropped for this task. */
456 void vx_mask_bcaps(struct task_struct *p)
458 struct vx_info *vxi = p->vx_info;
460 p->cap_effective &= vxi->vx_bcaps;
461 p->cap_inheritable &= vxi->vx_bcaps;
462 p->cap_permitted &= vxi->vx_bcaps;
466 #include <linux/file.h>
/* vx_nofiles_task()
 *
 * Count the open file descriptors of @tsk by popcounting the open_fds
 * bitmap word-by-word under files->file_lock.
 * NOTE(review): the obptr advance inside the loop and the return of
 * `total` are elided in this excerpt. */
468 static inline int vx_nofiles_task(struct task_struct *tsk)
470 struct files_struct *files = tsk->files;
471 unsigned long *obptr;
474 spin_lock(&files->file_lock);
475 obptr = files->open_fds->fds_bits;
/* number of bitmap words covering max_fds descriptors */
476 count = files->max_fds / (sizeof(unsigned long) * 8);
477 for (total = 0; count > 0; count--) {
479 total += hweight_long(*obptr);
482 spin_unlock(&files->file_lock);
/* vx_openfd_task()
 *
 * Same bitmap popcount as vx_nofiles_task() above (duplicate logic;
 * only one of the two is actually used -- see the commented-out
 * vx_openfd_task call in vx_migrate_task).
 * NOTE(review): the bptr advance and the return are elided here. */
488 static inline int vx_openfd_task(struct task_struct *tsk)
490 struct files_struct *files = tsk->files;
491 const unsigned long *bptr;
494 spin_lock(&files->file_lock);
495 bptr = files->open_fds->fds_bits;
496 count = files->max_fds / (sizeof(unsigned long) * 8);
497 for (total = 0; count > 0; count--) {
499 total += hweight_long(*bptr);
502 spin_unlock(&files->file_lock);
509 * migrate task to new context
510 * gets vxi, puts old_vxi on change
/* vx_migrate_task()
 *
 * Move task @p into context @vxi: migrate its user accounting, shift
 * the thread/running/NPROC counters from the old context to the new
 * one, and swap p->vx_info.  Gets @vxi, puts the old context when it
 * changes.  Returns 0 on success or the vx_migrate_user() error.
 * NOTE(review): several guard lines (NULL/same-context checks, the
 * task locking around the counter transfer) are elided here. */
513 int vx_migrate_task(struct task_struct *p, struct vx_info *vxi)
515 struct vx_info *old_vxi;
521 old_vxi = task_get_vx_info(p);
525 vxdprintk(VXD_CBIT(xid, 5),
526 "vx_migrate_task(%p,%p[#%d.%d])", p, vxi,
527 vxi->vx_id, atomic_read(&vxi->vx_usecnt));
529 if (!(ret = vx_migrate_user(p, vxi))) {
533 // openfd = vx_openfd_task(p);
534 nofiles = vx_nofiles_task(p);
/* debit the old context's accounting ... */
537 atomic_dec(&old_vxi->cvirt.nr_threads);
538 atomic_dec(&old_vxi->cvirt.nr_running);
539 atomic_dec(&old_vxi->limit.rcur[RLIMIT_NPROC]);
540 /* FIXME: what about the struct files here? */
541 // atomic_sub(nofiles, &old_vxi->limit.rcur[RLIMIT_NOFILE]);
542 // atomic_sub(openfd, &old_vxi->limit.rcur[RLIMIT_OPENFD]);
/* ... and credit the new one */
544 atomic_inc(&vxi->cvirt.nr_threads);
545 atomic_inc(&vxi->cvirt.nr_running);
546 atomic_inc(&vxi->limit.rcur[RLIMIT_NPROC]);
547 /* FIXME: what about the struct files here? */
548 // atomic_add(nofiles, &vxi->limit.rcur[RLIMIT_NOFILE]);
549 // atomic_add(openfd, &vxi->limit.rcur[RLIMIT_OPENFD]);
551 vxdprintk(VXD_CBIT(xid, 5),
552 "moved task %p into vxi:%p[#%d]",
555 /* should be handled in set_vx_info !! */
/* clr+set swaps p->vx_info and adjusts the reference counts */
557 clr_vx_info(&p->vx_info);
558 set_vx_info(&p->vx_info, vxi);
563 /* obsoleted by clr/set */
564 // put_vx_info(old_vxi);
/* drop the reference taken by task_get_vx_info() above */
571 put_vx_info(old_vxi);
/* vx_set_init()
 *
 * Record task @p (its tgid) as the init process of context @vxi.
 * NOTE(review): validation/early-return lines and the return value
 * are elided in this excerpt. */
575 int vx_set_init(struct vx_info *vxi, struct task_struct *p)
582 vxdprintk(VXD_CBIT(xid, 6),
583 "vx_set_init(%p[#%d],%p[#%d,%d,%d])",
584 vxi, vxi->vx_id, p, p->xid, p->pid, p->tgid);
586 vxi->vx_initpid = p->tgid;
591 /* vserver syscall commands below here */
593 /* task xid and vx_info functions */
595 #include <asm/uaccess.h>
/* vc_task_xid()
 *
 * vserver syscall: return the xid of the task with pid @id, or the
 * caller's own xid when id is 0.  Restricted to admin/watch contexts.
 * NOTE(review): the id!=0 branch structure and the final return are
 * elided in this excerpt. */
598 int vc_task_xid(uint32_t id, void __user *data)
603 struct task_struct *tsk;
605 if (!vx_check(0, VX_ADMIN|VX_WATCH))
/* look the task up under the tasklist lock; -ESRCH when absent */
608 read_lock(&tasklist_lock);
609 tsk = find_task_by_real_pid(id);
610 xid = (tsk) ? tsk->xid : -ESRCH;
611 read_unlock(&tasklist_lock);
614 xid = vx_current_xid();
/* vc_vx_info()
 *
 * vserver syscall: copy basic context info (xid, initpid) for @id to
 * userspace.  Requires the admin context plus CAP_SYS_ADMIN and
 * CAP_SYS_RESOURCE.
 * NOTE(review): error returns, the vxi NULL check and put_vx_info are
 * elided in this excerpt. */
619 int vc_vx_info(uint32_t id, void __user *data)
622 struct vcmd_vx_info_v0 vc_data;
624 if (!vx_check(0, VX_ADMIN))
626 if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_RESOURCE))
629 vxi = locate_vx_info(id);
633 vc_data.xid = vxi->vx_id;
634 vc_data.initpid = vxi->vx_initpid;
637 if (copy_to_user (data, &vc_data, sizeof(vc_data)))
643 /* context functions */
/* vc_ctx_create()
 *
 * vserver syscall: create (or locate) context @xid, migrate the
 * calling task into it, and return the resulting xid.  Static xids
 * above MIN_D_CONTEXT are rejected unless VX_DYNAMIC_ID is requested.
 * An existing context no longer in SETUP state is refused (-EEXIST,
 * presumably -- the error assignment is elided in this excerpt). */
645 int vc_ctx_create(uint32_t xid, void __user *data)
647 struct vx_info *new_vxi;
650 if (!capable(CAP_SYS_ADMIN))
653 if ((xid >= MIN_D_CONTEXT) && (xid != VX_DYNAMIC_ID))
659 new_vxi = __loc_vx_info(xid, &ret);
662 if (!(new_vxi->vx_flags & VXF_STATE_SETUP)) {
667 ret = new_vxi->vx_id;
668 vx_migrate_task(current, new_vxi);
669 /* if this fails, we might end up with a hashed vx_info */
/* drop the reference __loc_vx_info() handed us */
671 put_vx_info(new_vxi);
/* vc_ctx_migrate()
 *
 * vserver syscall: migrate the calling task into context @id.
 * NOTE(review): the spectator hack branch, the vxi NULL check and the
 * put_vx_info/return are elided in this excerpt. */
676 int vc_ctx_migrate(uint32_t id, void __user *data)
680 if (!capable(CAP_SYS_ADMIN))
683 /* dirty hack until Spectator becomes a cap */
689 vxi = locate_vx_info(id);
692 vx_migrate_task(current, vxi);
/* vc_get_cflags()
 *
 * vserver syscall: copy the context flag word and its mask for @id to
 * userspace.  One-time flags (VXF_ONE_TIME) are filtered out of the
 * reported mask by vx_mask_flags.
 * NOTE(review): error returns and put_vx_info are elided here. */
698 int vc_get_cflags(uint32_t id, void __user *data)
701 struct vcmd_ctx_flags_v0 vc_data;
703 if (!capable(CAP_SYS_ADMIN))
706 vxi = locate_vx_info(id);
710 vc_data.flagword = vxi->vx_flags;
712 /* special STATE flag handling */
713 vc_data.mask = vx_mask_flags(~0UL, vxi->vx_flags, VXF_ONE_TIME);
717 if (copy_to_user (data, &vc_data, sizeof(vc_data)))
/* vc_set_cflags()
 *
 * vserver syscall: update the context flags of @id from userspace.
 * `trigger` holds the maskable bits whose value actually changes;
 * leaving VXF_STATE_SETUP re-applies the capability bound to the
 * caller, and leaving VXF_STATE_INIT on the caller's own context
 * records it as the context init process.
 * NOTE(review): error returns, the vxi NULL check and put_vx_info are
 * elided in this excerpt. */
722 int vc_set_cflags(uint32_t id, void __user *data)
725 struct vcmd_ctx_flags_v0 vc_data;
726 uint64_t mask, trigger;
728 if (!capable(CAP_SYS_ADMIN))
730 if (copy_from_user (&vc_data, data, sizeof(vc_data)))
733 vxi = locate_vx_info(id);
737 /* special STATE flag handling */
738 mask = vx_mask_mask(vc_data.mask, vxi->vx_flags, VXF_ONE_TIME);
/* bits within the mask that differ between old and requested flags */
739 trigger = (mask & vxi->vx_flags) ^ (mask & vc_data.flagword);
741 if (trigger & VXF_STATE_SETUP)
742 vx_mask_bcaps(current);
743 if (trigger & VXF_STATE_INIT)
744 if (vxi == current->vx_info)
745 vx_set_init(vxi, current);
747 vxi->vx_flags = vx_mask_flags(vxi->vx_flags,
748 vc_data.flagword, mask);
/* vc_get_ccaps()
 *
 * vserver syscall: copy the bounding and context capability sets of
 * @id to userspace (the reported cmask covers all bits).
 * NOTE(review): error returns and put_vx_info are elided here. */
753 int vc_get_ccaps(uint32_t id, void __user *data)
756 struct vcmd_ctx_caps_v0 vc_data;
758 if (!capable(CAP_SYS_ADMIN))
761 vxi = locate_vx_info(id);
765 vc_data.bcaps = vxi->vx_bcaps;
766 vc_data.ccaps = vxi->vx_ccaps;
767 vc_data.cmask = ~0UL;
770 if (copy_to_user (data, &vc_data, sizeof(vc_data)))
/* vc_set_ccaps()
 *
 * vserver syscall: update the capability sets of @id from userspace.
 * Bounding caps can only shrink (&=); context caps are merged under
 * the caller-supplied cmask via vx_mask_flags.
 * NOTE(review): error returns, the vxi NULL check and put_vx_info are
 * elided in this excerpt. */
775 int vc_set_ccaps(uint32_t id, void __user *data)
778 struct vcmd_ctx_caps_v0 vc_data;
780 if (!capable(CAP_SYS_ADMIN))
782 if (copy_from_user (&vc_data, data, sizeof(vc_data)))
785 vxi = locate_vx_info(id);
789 vxi->vx_bcaps &= vc_data.bcaps;
790 vxi->vx_ccaps = vx_mask_flags(vxi->vx_ccaps,
791 vc_data.ccaps, vc_data.cmask);
796 #include <linux/module.h>
798 EXPORT_SYMBOL_GPL(free_vx_info);
799 EXPORT_SYMBOL_GPL(unhash_vx_info);