/*
 *  linux/kernel/vserver/context.c
 *
 *  Virtual Server: Context Support
 *
 *  Copyright (C) 2003-2004  Herbert Pötzl
 *
 *  V0.02  vx_ctx_kill syscall command
 *  V0.03  replaced context_info calls
 *  V0.04  redesign of struct (de)alloc
 *  V0.05  rlimit basic implementation
 *  V0.06  task_xid and info commands
 *  V0.07  context flags and caps
 *  V0.08  switch to RCU based hash
 *
 */

#include <linux/config.h>
#include <linux/slab.h>
#include <linux/vserver.h>
#include <linux/vserver/legacy.h>
#include <linux/vs_base.h>
#include <linux/vs_context.h>
#include <linux/kernel_stat.h>
#include <linux/namespace.h>
#include <linux/rcupdate.h>

#define CKRM_VSERVER_INTEGRATION
#ifdef CKRM_VSERVER_INTEGRATION
#include <linux/ckrm.h>
#endif /* CKRM_VSERVER_INTEGRATION */

#include <asm/errno.h>
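
/*
 * Reference handling in this file, in short: contexts are reached
 * either through the owning task (task_get_vx_info()) or through the
 * xid hash (__lookup_vx_info()); every get_vx_info() must be paired
 * with a put_vx_info(), and __dealloc_vx_info() BUG()s if either
 * vx_usecnt or vx_refcnt is still nonzero at disposal time.
 */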

/*	__alloc_vx_info()
 *
 *	allocate an initialized vx_info struct
 *	doesn't make it visible (hash)		*/

static struct vx_info *__alloc_vx_info(xid_t xid)
{
	struct vx_info *new = NULL;

	vxdprintk(VXD_CBIT(xid, 0), "alloc_vx_info(%d)*", xid);

	/* would this benefit from a slab cache? */
	new = kmalloc(sizeof(struct vx_info), GFP_KERNEL);
	if (!new)
		return NULL;

	memset(new, 0, sizeof(struct vx_info));
	new->vx_id = xid;
	INIT_RCU_HEAD(&new->vx_rcu);
	INIT_HLIST_NODE(&new->vx_hlist);
	atomic_set(&new->vx_refcnt, 0);
	atomic_set(&new->vx_usecnt, 0);

	/* rest of init goes here */
	vx_info_init_limit(&new->limit);
	vx_info_init_sched(&new->sched);
	vx_info_init_cvirt(&new->cvirt);
	vx_info_init_cacct(&new->cacct);

	new->vx_flags = VXF_STATE_SETUP | VXF_STATE_INIT;
	new->vx_bcaps = CAP_INIT_EFF_SET;

	vxdprintk(VXD_CBIT(xid, 0),
		"alloc_vx_info(%d) = %p", xid, new);
	return new;
}

/*	__dealloc_vx_info()
 *
 *	final disposal of vx_info		*/

static void __dealloc_vx_info(struct vx_info *vxi)
{
	vxdprintk(VXD_CBIT(xid, 0),
		"dealloc_vx_info(%p)", vxi);

	vxi->vx_hlist.next = LIST_POISON1;
	vxi->vx_id = -1;

	if (vxi->vx_namespace)
		put_namespace(vxi->vx_namespace);
	if (vxi->vx_fs)
		put_fs_struct(vxi->vx_fs);

	vx_info_exit_limit(&vxi->limit);
	vx_info_exit_sched(&vxi->sched);
	vx_info_exit_cvirt(&vxi->cvirt);
	vx_info_exit_cacct(&vxi->cacct);

	BUG_ON(atomic_read(&vxi->vx_usecnt));
	BUG_ON(atomic_read(&vxi->vx_refcnt));

	kfree(vxi);
}

/*	hash table for vx_info hash */

#define VX_HASH_SIZE	13

struct hlist_head vx_info_hash[VX_HASH_SIZE];

static spinlock_t vx_info_hash_lock = SPIN_LOCK_UNLOCKED;
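
/*
 * Locking discipline: hash insertions and removals are serialized
 * by vx_info_hash_lock, while lookups run locklessly under
 * rcu_read_lock() using the _rcu hlist primitives. The bucket count
 * is a small prime (13), which keeps plain modulo hashing reasonably
 * well distributed over the xid space.
 */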

static inline unsigned int __hashval(xid_t xid)
{
	return (xid % VX_HASH_SIZE);
}

/*	__hash_vx_info()
 *
 *	add the vxi to the global hash table
 *	requires the hash_lock to be held	*/

static inline void __hash_vx_info(struct vx_info *vxi)
{
	struct hlist_head *head;

	vxdprintk(VXD_CBIT(xid, 4),
		"__hash_vx_info: %p[#%d]", vxi, vxi->vx_id);
	head = &vx_info_hash[__hashval(vxi->vx_id)];
	hlist_add_head_rcu(&vxi->vx_hlist, head);
}

/*	__unhash_vx_info()
 *
 *	remove the vxi from the global hash table
 *	requires the hash_lock to be held	*/

static inline void __unhash_vx_info(struct vx_info *vxi)
{
	vxdprintk(VXD_CBIT(xid, 4),
		"__unhash_vx_info: %p[#%d]", vxi, vxi->vx_id);
	hlist_del_rcu(&vxi->vx_hlist);
}

/*	__lookup_vx_info()
 *
 *	requires the rcu_read_lock()
 *	doesn't increment the vx_refcnt		*/

static inline struct vx_info *__lookup_vx_info(xid_t xid)
{
	struct hlist_head *head = &vx_info_hash[__hashval(xid)];
	struct hlist_node *pos;

	hlist_for_each_rcu(pos, head) {
		struct vx_info *vxi =
			hlist_entry(pos, struct vx_info, vx_hlist);

		if (vxi->vx_id == xid)
			return vxi;
	}
	return NULL;
}

/*	__vx_dynamic_id()
 *
 *	find unused dynamic xid
 *	requires the hash_lock to be held	*/

static inline xid_t __vx_dynamic_id(void)
{
	static xid_t seq = MAX_S_CONTEXT;
	xid_t barrier = seq;

	do {
		if (++seq > MAX_S_CONTEXT)
			seq = MIN_D_CONTEXT;
		if (!__lookup_vx_info(seq)) {
			vxdprintk(VXD_CBIT(xid, 4),
				"__vx_dynamic_id: [#%d]", seq);
			return seq;
		}
	} while (barrier != seq);
	return 0;
}
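
/*
 * The static cursor means successive allocations walk the dynamic
 * range rather than always reusing the lowest free xid; a return
 * value of 0 signals that the whole range is currently in use,
 * which the caller reports as "no dynamic context available".
 */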

/*	__loc_vx_info()
 *
 *	locate or create the requested context
 *	get() it and if new hash it		*/

static struct vx_info *__loc_vx_info(int id, int *err)
{
	struct vx_info *new, *vxi = NULL;

	vxdprintk(VXD_CBIT(xid, 1), "loc_vx_info(%d)*", id);

	if (!(new = __alloc_vx_info(id))) {
		*err = -ENOMEM;
		return NULL;
	}

	spin_lock(&vx_info_hash_lock);

	/* dynamic context requested */
	if (id == VX_DYNAMIC_ID) {
		id = __vx_dynamic_id();
		if (!id) {
			printk(KERN_ERR "no dynamic context available.\n");
			goto out_unlock;
		}
		new->vx_id = id;
	}
	/* existing context requested */
	else if ((vxi = __lookup_vx_info(id))) {
		/* context in setup is not available */
		if (vxi->vx_flags & VXF_STATE_SETUP) {
			vxdprintk(VXD_CBIT(xid, 0),
				"loc_vx_info(%d) = %p (not available)", id, vxi);
			vxi = NULL;
			*err = -EBUSY;
		} else {
			vxdprintk(VXD_CBIT(xid, 0),
				"loc_vx_info(%d) = %p (found)", id, vxi);
			get_vx_info(vxi);
			*err = 0;
		}
		goto out_unlock;
	}

	/* new context requested */
	vxdprintk(VXD_CBIT(xid, 0),
		"loc_vx_info(%d) = %p (new)", id, new);
	__hash_vx_info(get_vx_info(new));
	vxi = new, new = NULL;
	*err = 1;

out_unlock:
	spin_unlock(&vx_info_hash_lock);
	if (new)
		__dealloc_vx_info(new);
	return vxi;
}
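
/*
 * Teardown path: unhash_vx_info() drops the context from the hash
 * under the lock, but the memory is reclaimed only after an RCU
 * grace period, when rcu_free_vx_info() runs and, if no users
 * remain, performs the final __dealloc_vx_info().
 */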

void rcu_free_vx_info(struct rcu_head *head)
{
	struct vx_info *vxi = container_of(head, struct vx_info, vx_rcu);
	int usecnt, refcnt;

	BUG_ON(!vxi || !head);

	usecnt = atomic_read(&vxi->vx_usecnt);
	BUG_ON(usecnt < 0);

	refcnt = atomic_read(&vxi->vx_refcnt);
	BUG_ON(refcnt < 0);

	vxdprintk(VXD_CBIT(xid, 3),
		"rcu_free_vx_info(%p): uc=%d", vxi, usecnt);
	if (!usecnt)
		__dealloc_vx_info(vxi);
	else
		printk("!!! rcu didn't free\n");
}

void unhash_vx_info(struct vx_info *vxi)
{
	spin_lock(&vx_info_hash_lock);
	__unhash_vx_info(vxi);
	spin_unlock(&vx_info_hash_lock);
}

/*	locate_vx_info()
 *
 *	search for a vx_info and get() it
 *	negative id means current		*/

struct vx_info *locate_vx_info(int id)
{
	struct vx_info *vxi;

	if (id < 0) {
		vxi = get_vx_info(current->vx_info);
	} else {
		rcu_read_lock();
		vxi = get_vx_info(__lookup_vx_info(id));
		rcu_read_unlock();
	}
	return vxi;
}
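
/*
 * Typical caller pattern, as used by the vc_* handlers below; the
 * returned context carries a reference that must be dropped again:
 *
 *	struct vx_info *vxi = locate_vx_info(id);
 *
 *	if (!vxi)
 *		return -ESRCH;
 *	...
 *	put_vx_info(vxi);
 */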

/*	vx_info_is_hashed()
 *
 *	verify that xid is still hashed		*/

int vx_info_is_hashed(xid_t xid)
{
	int hashed;

	rcu_read_lock();
	hashed = (__lookup_vx_info(xid) != NULL);
	rcu_read_unlock();
	return hashed;
}

#ifdef CONFIG_VSERVER_LEGACY

struct vx_info *alloc_vx_info(xid_t xid)
{
	return __alloc_vx_info(xid);
}

struct vx_info *locate_or_create_vx_info(int id)
{
	int err;

	return __loc_vx_info(id, &err);
}

#endif

#ifdef CONFIG_PROC_FS

/* RCU-safe hlist walk: prefetch the next entry, and order each
 * pointer load against the subsequent dereference of the entry */
#define hlist_for_each_rcu(pos, head) \
	for (pos = (head)->first; pos && ({ prefetch(pos->next); 1;}); \
		pos = pos->next, ({ smp_read_barrier_depends(); 0;}))

int get_xid_list(int index, unsigned int *xids, int size)
{
	int hindex, nr_xids = 0;

	rcu_read_lock();
	for (hindex = 0; hindex < VX_HASH_SIZE; hindex++) {
		struct hlist_head *head = &vx_info_hash[hindex];
		struct hlist_node *pos;

		hlist_for_each_rcu(pos, head) {
			struct vx_info *vxi;

			if (--index > 0)
				continue;

			vxi = hlist_entry(pos, struct vx_info, vx_hlist);
			xids[nr_xids] = vxi->vx_id;
			if (++nr_xids >= size)
				goto out;
		}
	}
out:
	rcu_read_unlock();
	return nr_xids;
}

#endif	/* CONFIG_PROC_FS */
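
/*
 * vx_migrate_user() below re-accounts the task against a user_struct
 * looked up by (xid, uid); note the two-argument alloc_uid() of the
 * patched kernel, which keeps identical uids in different contexts
 * accounted separately.
 */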

int vx_migrate_user(struct task_struct *p, struct vx_info *vxi)
{
	struct user_struct *new_user, *old_user;

	if (!p || !vxi)
		BUG();
	new_user = alloc_uid(vxi->vx_id, p->uid);
	if (!new_user)
		return -ENOMEM;

	old_user = p->user;
	if (new_user != old_user) {
		atomic_inc(&new_user->processes);
		atomic_dec(&old_user->processes);
		p->user = new_user;
	}
	free_uid(old_user);
	return 0;
}

void vx_mask_bcaps(struct task_struct *p)
{
	struct vx_info *vxi = p->vx_info;

	p->cap_effective &= vxi->vx_bcaps;
	p->cap_inheritable &= vxi->vx_bcaps;
	p->cap_permitted &= vxi->vx_bcaps;
}

#include <linux/file.h>

static inline int vx_nofiles_task(struct task_struct *tsk)
{
	struct files_struct *files = tsk->files;
	const unsigned long *obptr;
	int count, total;

	spin_lock(&files->file_lock);
	obptr = files->open_fds->fds_bits;
	count = files->max_fds / (sizeof(unsigned long) * 8);
	for (total = 0; count > 0; count--) {
		if (*obptr)
			total += hweight_long(*obptr);
		obptr++;
	}
	spin_unlock(&files->file_lock);
	return total;
}

static inline int vx_openfd_task(struct task_struct *tsk)
{
	struct files_struct *files = tsk->files;
	const unsigned long *bptr;
	int count, total;

	spin_lock(&files->file_lock);
	bptr = files->open_fds->fds_bits;
	count = files->max_fds / (sizeof(unsigned long) * 8);
	for (total = 0; count > 0; count--) {
		if (*bptr)
			total += hweight_long(*bptr);
		bptr++;
	}
	spin_unlock(&files->file_lock);
	return total;
}
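
/*
 * Both helpers above count the set bits of the open_fds bitmap under
 * files->file_lock; the result feeds the RLIMIT_NOFILE accounting
 * that vx_migrate_task() transfers from the old to the new context.
 */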

/*	vx_migrate_task()
 *
 *	migrate task to new context
 *	gets vxi, puts old_vxi on change	*/

int vx_migrate_task(struct task_struct *p, struct vx_info *vxi)
{
	struct vx_info *old_vxi;
	int ret = 0;

	if (!p || !vxi)
		BUG();

	old_vxi = task_get_vx_info(p);
	if (old_vxi == vxi)
		goto out;

	vxdprintk(VXD_CBIT(xid, 5),
		"vx_migrate_task(%p,%p[#%d.%d])", p, vxi,
		vxi->vx_id, atomic_read(&vxi->vx_usecnt));

	if (!(ret = vx_migrate_user(p, vxi))) {
		int nofiles;

		task_lock(p);
		// openfd = vx_openfd_task(p);
		nofiles = vx_nofiles_task(p);

		if (old_vxi) {
			atomic_dec(&old_vxi->cacct.nr_threads);
			atomic_dec(&old_vxi->limit.rcur[RLIMIT_NPROC]);
			atomic_sub(nofiles, &old_vxi->limit.rcur[RLIMIT_NOFILE]);
			// atomic_sub(openfd, &old_vxi->limit.rcur[RLIMIT_OPENFD]);
		}
		atomic_inc(&vxi->cacct.nr_threads);
		atomic_inc(&vxi->limit.rcur[RLIMIT_NPROC]);
		atomic_add(nofiles, &vxi->limit.rcur[RLIMIT_NOFILE]);
		// atomic_add(openfd, &vxi->limit.rcur[RLIMIT_OPENFD]);

		vxdprintk(VXD_CBIT(xid, 5),
			"moved task %p into vxi:%p[#%d]",
			p, vxi, vxi->vx_id);

		/* should be handled in set_vx_info !! */
		if (old_vxi)
			clr_vx_info(&p->vx_info);
		set_vx_info(&p->vx_info, vxi);
		p->xid = vxi->vx_id;
		vx_mask_bcaps(p);
		task_unlock(p);

		/* obsoleted by clr/set */
		// put_vx_info(old_vxi);
	}
out:
#ifdef CKRM_VSERVER_INTEGRATION
	/* notify CKRM of the xid change */
	ckrm_cb_xid(p);
#endif /* CKRM_VSERVER_INTEGRATION */

	put_vx_info(old_vxi);
	return ret;
}
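
/*
 * vx_set_init() records the context's init process (by tgid) and
 * refuses to overwrite a vx_initpid that is already set.
 */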

int vx_set_init(struct vx_info *vxi, struct task_struct *p)
{
	if (!vxi)
		return -EINVAL;
	if (vxi->vx_initpid)
		return -EPERM;

	vxi->vx_initpid = p->tgid;
	return 0;
}

/* vserver syscall commands below here */

/* task xid and vx_info functions */

#include <asm/uaccess.h>
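
/*
 * Convention for the vc_* command handlers below: check permissions
 * first, take a reference on the target context via locate_vx_info(),
 * exchange a versioned vcmd struct with user space, and return a
 * negative errno on failure.
 */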

int vc_task_xid(uint32_t id, void __user *data)
{
	xid_t xid;

	if (id) {
		struct task_struct *tsk;

		if (!vx_check(0, VX_ADMIN|VX_WATCH))
			return -EPERM;

		read_lock(&tasklist_lock);
		tsk = find_task_by_pid(id);
		xid = (tsk) ? tsk->xid : -ESRCH;
		read_unlock(&tasklist_lock);
	} else
		xid = current->xid;
	return xid;
}

int vc_vx_info(uint32_t id, void __user *data)
{
	struct vx_info *vxi;
	struct vcmd_vx_info_v0 vc_data;

	if (!vx_check(0, VX_ADMIN))
		return -ENOSYS;
	if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_RESOURCE))
		return -EPERM;

	vxi = locate_vx_info(id);
	if (!vxi)
		return -ESRCH;

	vc_data.xid = vxi->vx_id;
	vc_data.initpid = vxi->vx_initpid;
	put_vx_info(vxi);

	if (copy_to_user(data, &vc_data, sizeof(vc_data)))
		return -EFAULT;
	return 0;
}

/* context functions */

int vc_ctx_create(uint32_t xid, void __user *data)
{
	struct vx_info *new_vxi;
	int ret;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if ((xid >= MIN_D_CONTEXT) && (xid != VX_DYNAMIC_ID))
		return -EINVAL;
	if (xid < 1)
		return -EINVAL;

	new_vxi = __loc_vx_info(xid, &ret);
	if (!new_vxi)
		return ret;
	if (!(new_vxi->vx_flags & VXF_STATE_SETUP)) {
		ret = -EEXIST;
		goto out_put;
	}

	ret = new_vxi->vx_id;
	vx_migrate_task(current, new_vxi);
	/* if this fails, we might end up with a hashed vx_info */
out_put:
	put_vx_info(new_vxi);
	return ret;
}

int vc_ctx_migrate(uint32_t id, void __user *data)
{
	struct vx_info *vxi;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	/* dirty hack until Spectator becomes a cap */
	if (id == 1) {
		current->xid = 1;
		return 0;
	}

	vxi = locate_vx_info(id);
	if (!vxi)
		return -ESRCH;
	vx_migrate_task(current, vxi);
	put_vx_info(vxi);
	return 0;
}

int vc_get_cflags(uint32_t id, void __user *data)
{
	struct vx_info *vxi;
	struct vcmd_ctx_flags_v0 vc_data;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	vxi = locate_vx_info(id);
	if (!vxi)
		return -ESRCH;

	vc_data.flagword = vxi->vx_flags;

	/* special STATE flag handling */
	vc_data.mask = vx_mask_flags(~0UL, vxi->vx_flags, VXF_ONE_TIME);

	put_vx_info(vxi);

	if (copy_to_user(data, &vc_data, sizeof(vc_data)))
		return -EFAULT;
	return 0;
}

int vc_set_cflags(uint32_t id, void __user *data)
{
	struct vx_info *vxi;
	struct vcmd_ctx_flags_v0 vc_data;
	uint64_t mask, trigger;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (copy_from_user(&vc_data, data, sizeof(vc_data)))
		return -EFAULT;

	vxi = locate_vx_info(id);
	if (!vxi)
		return -ESRCH;

	/* special STATE flag handling */
	mask = vx_mask_mask(vc_data.mask, vxi->vx_flags, VXF_ONE_TIME);
	/* trigger holds the flag bits this request actually changes */
	trigger = (mask & vxi->vx_flags) ^ (mask & vc_data.flagword);

	if (trigger & VXF_STATE_SETUP)
		vx_mask_bcaps(current);
	if (trigger & VXF_STATE_INIT)
		if (vxi == current->vx_info)
			vx_set_init(vxi, current);

	vxi->vx_flags = vx_mask_flags(vxi->vx_flags,
		vc_data.flagword, mask);
	put_vx_info(vxi);
	return 0;
}

int vc_get_ccaps(uint32_t id, void __user *data)
{
	struct vx_info *vxi;
	struct vcmd_ctx_caps_v0 vc_data;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	vxi = locate_vx_info(id);
	if (!vxi)
		return -ESRCH;

	vc_data.bcaps = vxi->vx_bcaps;
	vc_data.ccaps = vxi->vx_ccaps;
	vc_data.cmask = ~0UL;
	put_vx_info(vxi);

	if (copy_to_user(data, &vc_data, sizeof(vc_data)))
		return -EFAULT;
	return 0;
}

int vc_set_ccaps(uint32_t id, void __user *data)
{
	struct vx_info *vxi;
	struct vcmd_ctx_caps_v0 vc_data;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (copy_from_user(&vc_data, data, sizeof(vc_data)))
		return -EFAULT;

	vxi = locate_vx_info(id);
	if (!vxi)
		return -ESRCH;

	/* bcaps can only be reduced, never extended */
	vxi->vx_bcaps &= vc_data.bcaps;
	vxi->vx_ccaps = vx_mask_flags(vxi->vx_ccaps,
		vc_data.ccaps, vc_data.cmask);
	put_vx_info(vxi);
	return 0;
}

#include <linux/module.h>

EXPORT_SYMBOL_GPL(rcu_free_vx_info);
EXPORT_SYMBOL_GPL(vx_info_hash_lock);
EXPORT_SYMBOL_GPL(unhash_vx_info);