/*
 *  linux/kernel/vserver/context.c
 *
 *  Virtual Server: Context Support
 *
 *  Copyright (C) 2003-2004  Herbert Pötzl
 *
 *  V0.01  context helper
 *  V0.02  vx_ctx_kill syscall command
 *  V0.03  replaced context_info calls
 *  V0.04  redesign of struct (de)alloc
 *  V0.05  rlimit basic implementation
 *  V0.06  task_xid and info commands
 *  V0.07  context flags and caps
 *  V0.08  switch to RCU based hash
 *
 */

#include <linux/config.h>
#include <linux/slab.h>
#include <linux/vserver.h>
#include <linux/vserver/legacy.h>
#include <linux/vs_base.h>
#include <linux/vs_context.h>
#include <linux/kernel_stat.h>
#include <linux/namespace.h>
#include <linux/rcupdate.h>

#define CKRM_VSERVER_INTEGRATION
#ifdef CKRM_VSERVER_INTEGRATION
#include <linux/ckrm.h>
#endif /* CKRM_VSERVER_INTEGRATION */

#include <asm/errno.h>


/*      __alloc_vx_info()

        * allocate an initialized vx_info struct
        * doesn't make it visible (hash)                        */

static struct vx_info *__alloc_vx_info(xid_t xid)
{
        struct vx_info *new = NULL;

        vxdprintk(VXD_CBIT(xid, 0), "alloc_vx_info(%d)*", xid);

        /* would this benefit from a slab cache? */
        new = kmalloc(sizeof(struct vx_info), GFP_KERNEL);
        if (!new)
                return NULL;

        memset(new, 0, sizeof(struct vx_info));
        new->vx_id = xid;
        INIT_RCU_HEAD(&new->vx_rcu);
        INIT_HLIST_NODE(&new->vx_hlist);
        atomic_set(&new->vx_refcnt, 0);
        atomic_set(&new->vx_usecnt, 0);

        /* rest of init goes here */
        vx_info_init_limit(&new->limit);
        vx_info_init_sched(&new->sched);
        vx_info_init_cvirt(&new->cvirt);
        vx_info_init_cacct(&new->cacct);

        new->vx_flags = VXF_STATE_SETUP|VXF_STATE_INIT;
        new->vx_bcaps = CAP_INIT_EFF_SET;
        new->vx_ccaps = 0;

        vxdprintk(VXD_CBIT(xid, 0),
                "alloc_vx_info(%d) = %p", xid, new);
        return new;
}

/*      __dealloc_vx_info()

        * final disposal of vx_info                             */

static void __dealloc_vx_info(struct vx_info *vxi)
{
        vxdprintk(VXD_CBIT(xid, 0),
                "dealloc_vx_info(%p)", vxi);

        vxi->vx_hlist.next = LIST_POISON1;
        vxi->vx_id = -1;

        if (vxi->vx_namespace)
                put_namespace(vxi->vx_namespace);
        if (vxi->vx_fs)
                put_fs_struct(vxi->vx_fs);

        vx_info_exit_limit(&vxi->limit);
        vx_info_exit_sched(&vxi->sched);
        vx_info_exit_cvirt(&vxi->cvirt);
        vx_info_exit_cacct(&vxi->cacct);

        BUG_ON(atomic_read(&vxi->vx_usecnt));
        BUG_ON(atomic_read(&vxi->vx_refcnt));

        kfree(vxi);
}


/*      global hash table for vx_info lookup */

#define VX_HASH_SIZE    13

struct hlist_head vx_info_hash[VX_HASH_SIZE];

static spinlock_t vx_info_hash_lock = SPIN_LOCK_UNLOCKED;


static inline unsigned int __hashval(xid_t xid)
{
        return (xid % VX_HASH_SIZE);
}


/*      __hash_vx_info()

        * add the vxi to the global hash table
        * requires the hash_lock to be held                     */

static inline void __hash_vx_info(struct vx_info *vxi)
{
        struct hlist_head *head;

        vxdprintk(VXD_CBIT(xid, 4),
                "__hash_vx_info: %p[#%d]", vxi, vxi->vx_id);
        get_vx_info(vxi);
        head = &vx_info_hash[__hashval(vxi->vx_id)];
        hlist_add_head_rcu(&vxi->vx_hlist, head);
}

/*      __unhash_vx_info()

        * remove the vxi from the global hash table
        * requires the hash_lock to be held                     */

static inline void __unhash_vx_info(struct vx_info *vxi)
{
        vxdprintk(VXD_CBIT(xid, 4),
                "__unhash_vx_info: %p[#%d]", vxi, vxi->vx_id);
        hlist_del_rcu(&vxi->vx_hlist);
        put_vx_info(vxi);
}


/*      __lookup_vx_info()

        * requires the rcu_read_lock()
        * doesn't increment the vx_refcnt                       */

static inline struct vx_info *__lookup_vx_info(xid_t xid)
{
        struct hlist_head *head = &vx_info_hash[__hashval(xid)];
        struct hlist_node *pos;

        hlist_for_each_rcu(pos, head) {
                struct vx_info *vxi =
                        hlist_entry(pos, struct vx_info, vx_hlist);

                if (vxi->vx_id == xid)
                        return vxi;
        }
        return NULL;
}
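
/*      usage sketch for __lookup_vx_info(); not compiled (#if 0),
        the function name is invented for the example.  a caller
        must hold rcu_read_lock() and, as locate_vx_info() below
        does, take a reference before the read lock is dropped,
        since the entry may be unhashed and freed afterwards.    */

#if 0
static struct vx_info *example_lookup_get(xid_t xid)
{
        struct vx_info *vxi;

        rcu_read_lock();
        vxi = get_vx_info(__lookup_vx_info(xid));  /* pin before unlock */
        rcu_read_unlock();
        return vxi;     /* caller drops it with put_vx_info() */
}
#endif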


/*      __vx_dynamic_id()

        * find unused dynamic xid
        * requires the hash_lock to be held                     */

static inline xid_t __vx_dynamic_id(void)
{
        static xid_t seq = MAX_S_CONTEXT;
        xid_t barrier = seq;

        do {
                if (++seq > MAX_S_CONTEXT)
                        seq = MIN_D_CONTEXT;
                if (!__lookup_vx_info(seq)) {
                        vxdprintk(VXD_CBIT(xid, 4),
                                "__vx_dynamic_id: [#%d]", seq);
                        return seq;
                }
        } while (barrier != seq);
        return 0;
}
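
/*      worked example: assuming the usual layout where the dynamic
        range MIN_D_CONTEXT..MAX_S_CONTEXT sits at the top of the id
        space, seq starts at MAX_S_CONTEXT, so the first ++seq wraps
        to MIN_D_CONTEXT and ids are probed upwards from there.  if
        the probe comes back around to the saved barrier, every
        dynamic id is in use and 0 is returned.                  */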

/*      __loc_vx_info()

        * locate or create the requested context
        * get() it and if new hash it                           */

static struct vx_info * __loc_vx_info(int id, int *err)
{
        struct vx_info *new, *vxi = NULL;

        vxdprintk(VXD_CBIT(xid, 1), "loc_vx_info(%d)*", id);

        if (!(new = __alloc_vx_info(id))) {
                *err = -ENOMEM;
                return NULL;
        }

        spin_lock(&vx_info_hash_lock);

        /* dynamic context requested */
        if (id == VX_DYNAMIC_ID) {
                id = __vx_dynamic_id();
                if (!id) {
                        printk(KERN_ERR "no dynamic context available.\n");
                        *err = -EAGAIN;
                        goto out_unlock;
                }
                new->vx_id = id;
        }
        /* existing context requested */
        else if ((vxi = __lookup_vx_info(id))) {
                /* context in setup is not available */
                if (vxi->vx_flags & VXF_STATE_SETUP) {
                        vxdprintk(VXD_CBIT(xid, 0),
                                "loc_vx_info(%d) = %p (not available)", id, vxi);
                        vxi = NULL;
                        *err = -EBUSY;
                } else {
                        vxdprintk(VXD_CBIT(xid, 0),
                                "loc_vx_info(%d) = %p (found)", id, vxi);
                        get_vx_info(vxi);
                        *err = 0;
                }
                goto out_unlock;
        }

        /* new context requested */
        vxdprintk(VXD_CBIT(xid, 0),
                "loc_vx_info(%d) = %p (new)", id, new);
        __hash_vx_info(get_vx_info(new));
        vxi = new, new = NULL;
        *err = 1;

out_unlock:
        spin_unlock(&vx_info_hash_lock);
        if (new)
                __dealloc_vx_info(new);
        return vxi;
}
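
/*      sketch of the __loc_vx_info() caller contract; not compiled
        (#if 0), names invented for the example.  a non-NULL return
        carries a reference: *err == 1 means a new context was
        created (and hashed), *err == 0 means an existing one was
        found.  a NULL return leaves the reason in *err
        (-ENOMEM, -EBUSY or -EAGAIN).                            */

#if 0
static void example_loc(int id)
{
        int err;
        struct vx_info *vxi = __loc_vx_info(id, &err);

        if (!vxi)
                return;         /* err holds the error code */
        if (err == 1)
                ;               /* new context, still in SETUP state */
        put_vx_info(vxi);       /* drop the reference when done */
}
#endif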


/*      exported stuff                                          */


void rcu_free_vx_info(struct rcu_head *head)
{
        struct vx_info *vxi = container_of(head, struct vx_info, vx_rcu);
        int usecnt, refcnt;

        BUG_ON(!vxi || !head);

        usecnt = atomic_read(&vxi->vx_usecnt);
        BUG_ON(usecnt < 0);

        refcnt = atomic_read(&vxi->vx_refcnt);
        BUG_ON(refcnt < 0);

        vxdprintk(VXD_CBIT(xid, 3),
                "rcu_free_vx_info(%p): uc=%d", vxi, usecnt);
        if (!usecnt)
                __dealloc_vx_info(vxi);
        else
                printk(KERN_WARNING "rcu_free_vx_info(%p): "
                        "not freed, usecnt=%d\n", vxi, usecnt);
}

void unhash_vx_info(struct vx_info *vxi)
{
        spin_lock(&vx_info_hash_lock);
        __unhash_vx_info(vxi);
        spin_unlock(&vx_info_hash_lock);
}

/*      locate_vx_info()

        * search for a vx_info and get() it
        * negative id means current                             */

struct vx_info *locate_vx_info(int id)
{
        struct vx_info *vxi;

        if (id < 0) {
                vxi = get_vx_info(current->vx_info);
        } else {
                rcu_read_lock();
                vxi = get_vx_info(__lookup_vx_info(id));
                rcu_read_unlock();
        }
        return vxi;
}
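
/*      usage sketch for locate_vx_info(); not compiled (#if 0),
        function name invented for the example.  every non-NULL
        result carries a reference that the caller must drop with
        put_vx_info(), exactly as the vc_* commands below do.    */

#if 0
static int example_query_initpid(int id)
{
        struct vx_info *vxi = locate_vx_info(id);
        int initpid;

        if (!vxi)
                return -ESRCH;
        initpid = vxi->vx_initpid;
        put_vx_info(vxi);
        return initpid;
}
#endif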

/*      vx_info_is_hashed()

        * verify that xid is still hashed                       */

int vx_info_is_hashed(xid_t xid)
{
        int hashed;

        rcu_read_lock();
        hashed = (__lookup_vx_info(xid) != NULL);
        rcu_read_unlock();
        return hashed;
}

#ifdef  CONFIG_VSERVER_LEGACY

#if 0
struct vx_info *alloc_vx_info(xid_t xid)
{
        return __alloc_vx_info(xid);
}
#endif

struct vx_info *locate_or_create_vx_info(int id)
{
        int err;

        return __loc_vx_info(id, &err);
}

#endif

#ifdef  CONFIG_PROC_FS

/* local fallback, guarded in case <linux/list.h> already provides
   the helper (which __lookup_vx_info above relies on anyway) */
#ifndef hlist_for_each_rcu
#define hlist_for_each_rcu(pos, head) \
        for (pos = (head)->first; pos && ({ prefetch(pos->next); 1;}); \
                pos = pos->next, ({ smp_read_barrier_depends(); 0;}))
#endif

int get_xid_list(int index, unsigned int *xids, int size)
{
        int hindex, nr_xids = 0;

        rcu_read_lock();
        for (hindex = 0; hindex < VX_HASH_SIZE; hindex++) {
                struct hlist_head *head = &vx_info_hash[hindex];
                struct hlist_node *pos;

                hlist_for_each_rcu(pos, head) {
                        struct vx_info *vxi;

                        if (--index > 0)
                                continue;

                        vxi = hlist_entry(pos, struct vx_info, vx_hlist);
                        xids[nr_xids] = vxi->vx_id;
                        if (++nr_xids >= size)
                                goto out;
                }
        }
out:
        rcu_read_unlock();
        return nr_xids;
}
#endif
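
/*      usage sketch for get_xid_list(); not compiled (#if 0),
        buffer size and function name invented for the example.
        index is a 1-based start position: index 1 begins at the
        first hashed entry, and the return value is the number of
        xids actually stored.                                    */

#if 0
static void example_dump_xids(void)
{
        unsigned int xids[64];
        int i, nr;

        nr = get_xid_list(1, xids, 64);
        for (i = 0; i < nr; i++)
                printk(KERN_DEBUG "xid: %u\n", xids[i]);
}
#endif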

int vx_migrate_user(struct task_struct *p, struct vx_info *vxi)
{
        struct user_struct *new_user, *old_user;

        BUG_ON(!p || !vxi);
        new_user = alloc_uid(vxi->vx_id, p->uid);
        if (!new_user)
                return -ENOMEM;

        old_user = p->user;
        if (new_user != old_user) {
                atomic_inc(&new_user->processes);
                atomic_dec(&old_user->processes);
                p->user = new_user;
        }
        free_uid(old_user);
        return 0;
}

void vx_mask_bcaps(struct task_struct *p)
{
        struct vx_info *vxi = p->vx_info;

        p->cap_effective &= vxi->vx_bcaps;
        p->cap_inheritable &= vxi->vx_bcaps;
        p->cap_permitted &= vxi->vx_bcaps;
}


#include <linux/file.h>

static inline int vx_nofiles_task(struct task_struct *tsk)
{
        struct files_struct *files = tsk->files;
        const unsigned long *obptr;
        int count, total;

        spin_lock(&files->file_lock);
        obptr = files->open_fds->fds_bits;
        count = files->max_fds / (sizeof(unsigned long) * 8);
        for (total = 0; count > 0; count--) {
                if (*obptr)
                        total += hweight_long(*obptr);
                obptr++;
        }
        spin_unlock(&files->file_lock);
        return total;
}
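
/*      worked example: with fds 0, 1, 2 and 9 open, the first word
        of the open_fds bitmap is 0x207, hweight_long(0x207) is 4,
        and vx_nofiles_task() returns 4.                         */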

#if 0

static inline int vx_openfd_task(struct task_struct *tsk)
{
        struct files_struct *files = tsk->files;
        const unsigned long *bptr;
        int count, total;

        spin_lock(&files->file_lock);
        bptr = files->open_fds->fds_bits;
        count = files->max_fds / (sizeof(unsigned long) * 8);
        for (total = 0; count > 0; count--) {
                if (*bptr)
                        total += hweight_long(*bptr);
                bptr++;
        }
        spin_unlock(&files->file_lock);
        return total;
}

#endif

/*
 *      migrate task to new context
 *      gets vxi, puts old_vxi on change
 */

int vx_migrate_task(struct task_struct *p, struct vx_info *vxi)
{
        struct vx_info *old_vxi;
        int ret = 0;

        BUG_ON(!p || !vxi);

        old_vxi = task_get_vx_info(p);
        if (old_vxi == vxi)
                goto out;

        vxdprintk(VXD_CBIT(xid, 5),
                "vx_migrate_task(%p,%p[#%d.%d])", p, vxi,
                vxi->vx_id, atomic_read(&vxi->vx_usecnt));

        if (!(ret = vx_migrate_user(p, vxi))) {
                int nofiles;

                task_lock(p);
                // openfd = vx_openfd_task(p);
                nofiles = vx_nofiles_task(p);

                if (old_vxi) {
                        atomic_dec(&old_vxi->cacct.nr_threads);
                        atomic_dec(&old_vxi->limit.rcur[RLIMIT_NPROC]);
                        atomic_sub(nofiles, &old_vxi->limit.rcur[RLIMIT_NOFILE]);
                        // atomic_sub(openfd, &old_vxi->limit.rcur[RLIMIT_OPENFD]);
                }
                atomic_inc(&vxi->cacct.nr_threads);
                atomic_inc(&vxi->limit.rcur[RLIMIT_NPROC]);
                atomic_add(nofiles, &vxi->limit.rcur[RLIMIT_NOFILE]);
                // atomic_add(openfd, &vxi->limit.rcur[RLIMIT_OPENFD]);

                vxdprintk(VXD_CBIT(xid, 5),
                        "moved task %p into vxi:%p[#%d]",
                        p, vxi, vxi->vx_id);

                /* should be handled in set_vx_info !! */
                if (old_vxi)
                        clr_vx_info(&p->vx_info);
                set_vx_info(&p->vx_info, vxi);
                p->xid = vxi->vx_id;
                vx_mask_bcaps(p);
                task_unlock(p);

                /* obsoleted by clr/set */
                // put_vx_info(old_vxi);
        }
out:
#ifdef CKRM_VSERVER_INTEGRATION
        ckrm_cb_xid(p);
#endif /* CKRM_VSERVER_INTEGRATION */
        put_vx_info(old_vxi);
        return ret;
}

int vx_set_init(struct vx_info *vxi, struct task_struct *p)
{
        if (!vxi)
                return -EINVAL;
        if (vxi->vx_initpid)
                return -EPERM;

        vxi->vx_initpid = p->tgid;
        return 0;
}


/* vserver syscall commands below here */

/* task xid and vx_info functions */

#include <asm/uaccess.h>

int vc_task_xid(uint32_t id, void __user *data)
{
        xid_t xid;

        if (id) {
                struct task_struct *tsk;

                if (!vx_check(0, VX_ADMIN|VX_WATCH))
                        return -EPERM;

                read_lock(&tasklist_lock);
                tsk = find_task_by_pid(id);
                xid = (tsk) ? tsk->xid : -ESRCH;
                read_unlock(&tasklist_lock);
        } else
                xid = current->xid;
        return xid;
}


int vc_vx_info(uint32_t id, void __user *data)
{
        struct vx_info *vxi;
        struct vcmd_vx_info_v0 vc_data;

        if (!vx_check(0, VX_ADMIN))
                return -ENOSYS;
        if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_RESOURCE))
                return -EPERM;

        vxi = locate_vx_info(id);
        if (!vxi)
                return -ESRCH;

        vc_data.xid = vxi->vx_id;
        vc_data.initpid = vxi->vx_initpid;
        put_vx_info(vxi);

        if (copy_to_user(data, &vc_data, sizeof(vc_data)))
                return -EFAULT;
        return 0;
}


/* context functions */

int vc_ctx_create(uint32_t xid, void __user *data)
{
        struct vx_info *new_vxi;
        int ret;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        if ((xid >= MIN_D_CONTEXT) && (xid != VX_DYNAMIC_ID))
                return -EINVAL;

        if (xid < 1)
                return -EINVAL;

        new_vxi = __loc_vx_info(xid, &ret);
        if (!new_vxi)
                return ret;
        if (!(new_vxi->vx_flags & VXF_STATE_SETUP)) {
                ret = -EEXIST;
                goto out_put;
        }

        ret = new_vxi->vx_id;
        vx_migrate_task(current, new_vxi);
        /* if this fails, we might end up with a hashed vx_info */
out_put:
        put_vx_info(new_vxi);
        return ret;
}


int vc_ctx_migrate(uint32_t id, void __user *data)
{
        struct vx_info *vxi;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        /* dirty hack until Spectator becomes a cap */
        if (id == 1) {
                current->xid = 1;
                return 0;
        }

        vxi = locate_vx_info(id);
        if (!vxi)
                return -ESRCH;
        vx_migrate_task(current, vxi);
        put_vx_info(vxi);
        return 0;
}


int vc_get_cflags(uint32_t id, void __user *data)
{
        struct vx_info *vxi;
        struct vcmd_ctx_flags_v0 vc_data;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        vxi = locate_vx_info(id);
        if (!vxi)
                return -ESRCH;

        vc_data.flagword = vxi->vx_flags;

        /* special STATE flag handling */
        vc_data.mask = vx_mask_flags(~0UL, vxi->vx_flags, VXF_ONE_TIME);

        put_vx_info(vxi);

        if (copy_to_user(data, &vc_data, sizeof(vc_data)))
                return -EFAULT;
        return 0;
}

int vc_set_cflags(uint32_t id, void __user *data)
{
        struct vx_info *vxi;
        struct vcmd_ctx_flags_v0 vc_data;
        uint64_t mask, trigger;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
        if (copy_from_user(&vc_data, data, sizeof(vc_data)))
                return -EFAULT;

        vxi = locate_vx_info(id);
        if (!vxi)
                return -ESRCH;

        /* special STATE flag handling */
        mask = vx_mask_mask(vc_data.mask, vxi->vx_flags, VXF_ONE_TIME);
        trigger = (mask & vxi->vx_flags) ^ (mask & vc_data.flagword);

        if (trigger & VXF_STATE_SETUP)
                vx_mask_bcaps(current);
        if (trigger & VXF_STATE_INIT)
                if (vxi == current->vx_info)
                        vx_set_init(vxi, current);

        vxi->vx_flags = vx_mask_flags(vxi->vx_flags,
                vc_data.flagword, mask);
        put_vx_info(vxi);
        return 0;
}
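
/*      worked example for the flag update above, assuming the usual
        definition vx_mask_flags(v, f, m) == ((v & ~m) | (f & m)):
        with vx_flags = VXF_STATE_SETUP, flagword = 0 and a mask
        containing VXF_STATE_SETUP, trigger comes out as
        VXF_STATE_SETUP (the bit changes), so vx_mask_bcaps() is
        applied to current and the SETUP bit is cleared from
        vx_flags.                                                */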

int vc_get_ccaps(uint32_t id, void __user *data)
{
        struct vx_info *vxi;
        struct vcmd_ctx_caps_v0 vc_data;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        vxi = locate_vx_info(id);
        if (!vxi)
                return -ESRCH;

        vc_data.bcaps = vxi->vx_bcaps;
        vc_data.ccaps = vxi->vx_ccaps;
        vc_data.cmask = ~0UL;
        put_vx_info(vxi);

        if (copy_to_user(data, &vc_data, sizeof(vc_data)))
                return -EFAULT;
        return 0;
}

int vc_set_ccaps(uint32_t id, void __user *data)
{
        struct vx_info *vxi;
        struct vcmd_ctx_caps_v0 vc_data;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
        if (copy_from_user(&vc_data, data, sizeof(vc_data)))
                return -EFAULT;

        vxi = locate_vx_info(id);
        if (!vxi)
                return -ESRCH;

        vxi->vx_bcaps &= vc_data.bcaps;
        vxi->vx_ccaps = vx_mask_flags(vxi->vx_ccaps,
                vc_data.ccaps, vc_data.cmask);
        put_vx_info(vxi);
        return 0;
}

#include <linux/module.h>

EXPORT_SYMBOL_GPL(rcu_free_vx_info);
EXPORT_SYMBOL_GPL(vx_info_hash_lock);
EXPORT_SYMBOL_GPL(unhash_vx_info);