patch-2.6.6-vs1.9.1
[linux-2.6.git] / kernel / vserver / context.c
1 /*
2  *  linux/kernel/vserver/context.c
3  *
4  *  Virtual Server: Context Support
5  *
6  *  Copyright (C) 2003-2004  Herbert Pƶtzl
7  *
8  *  V0.01  context helper
9  *  V0.02  vx_ctx_kill syscall command
10  *  V0.03  replaced context_info calls
11  *  V0.04  redesign of struct (de)alloc
12  *  V0.05  rlimit basic implementation
13  *  V0.06  task_xid and info commands
14  *  V0.07  context flags and caps
15  *  V0.08  switch to RCU based hash
16  *
17  */
18
19 #include <linux/config.h>
20 #include <linux/slab.h>
21 #include <linux/vserver/context.h>
22 #include <linux/vserver/legacy.h>
23 #include <linux/vinline.h>
24 #include <linux/kernel_stat.h>
25 #include <linux/namespace.h>
26 #include <linux/rcupdate.h>
27
28 #include <asm/errno.h>
29
30
31 /*      __alloc_vx_info()
32
33         * allocate an initialized vx_info struct
34         * doesn't make it visible (hash)                        */
35
36 static struct vx_info *__alloc_vx_info(xid_t xid)
37 {
38         struct vx_info *new = NULL;
39         
40         vxdprintk("alloc_vx_info(%d)\n", xid);
41
42         /* would this benefit from a slab cache? */
43         new = kmalloc(sizeof(struct vx_info), GFP_KERNEL);
44         if (!new)
45                 return 0;
46
47         memset (new, 0, sizeof(struct vx_info));
48         new->vx_id = xid;
49         INIT_RCU_HEAD(&new->vx_rcu);
50         INIT_HLIST_NODE(&new->vx_hlist);
51         atomic_set(&new->vx_refcnt, 0);
52         atomic_set(&new->vx_usecnt, 0);
53
54         /* rest of init goes here */
55         vx_info_init_limit(&new->limit);
56         vx_info_init_sched(&new->sched);
57         vx_info_init_cvirt(&new->cvirt);
58         vx_info_init_cacct(&new->cacct);
59
60         new->vx_flags = VXF_STATE_SETUP|VXF_STATE_INIT;
61         new->vx_bcaps = CAP_INIT_EFF_SET;
62         new->vx_ccaps = 0;
63
64         vxdprintk("alloc_vx_info(%d) = %p\n", xid, new);
65         return new;
66 }
67
68 /*      __dealloc_vx_info()
69
70         * final disposal of vx_info                             */
71
static void __dealloc_vx_info(struct vx_info *vxi)
{
	vxdprintk("dealloc_vx_info(%p)\n", vxi);

	/* poison the hash linkage and id so a stale reference through
	   the hash shows up quickly */
	vxi->vx_hlist.next = LIST_POISON1;
	vxi->vx_id = -1;

	/* drop namespace/fs references held by this context, if any */
	if (vxi->vx_namespace)
		put_namespace(vxi->vx_namespace);
	if (vxi->vx_fs)
		put_fs_struct(vxi->vx_fs);

	/* tear down the subsystem parts set up in __alloc_vx_info() */
	vx_info_exit_limit(&vxi->limit);
	vx_info_exit_sched(&vxi->sched);
	vx_info_exit_cvirt(&vxi->cvirt);
	vx_info_exit_cacct(&vxi->cacct);

	/* both counters must have dropped to zero by now */
	BUG_ON(atomic_read(&vxi->vx_usecnt));
	BUG_ON(atomic_read(&vxi->vx_refcnt));

	kfree(vxi);
}
94
95
/*	hash table for vx_info hash */

#define VX_HASH_SIZE	13

/* bucket heads; writers hold vx_info_hash_lock, readers use RCU
   (see the _rcu list primitives used below) */
struct hlist_head vx_info_hash[VX_HASH_SIZE];

static spinlock_t vx_info_hash_lock = SPIN_LOCK_UNLOCKED;


/* map a context id to its hash bucket */
static inline unsigned int __hashval(xid_t xid)
{
	return (xid % VX_HASH_SIZE);
}
109
110
111
112 /*      __hash_vx_info()
113
114         * add the vxi to the global hash table
115         * requires the hash_lock to be held                     */
116
static inline void __hash_vx_info(struct vx_info *vxi)
{
	struct hlist_head *head;

	vxdprintk("__hash_vx_info: %p[#%d]\n", vxi, vxi->vx_id);
	/* the hash table itself holds a reference on the vx_info */
	get_vx_info(vxi);
	head = &vx_info_hash[__hashval(vxi->vx_id)];
	hlist_add_head_rcu(&vxi->vx_hlist, head);
}
126
127 /*      __unhash_vx_info()
128
129         * remove the vxi from the global hash table
130         * requires the hash_lock to be held                     */
131
static inline void __unhash_vx_info(struct vx_info *vxi)
{
	vxdprintk("__unhash_vx_info: %p[#%d]\n", vxi, vxi->vx_id);
	hlist_del_rcu(&vxi->vx_hlist);
	/* drop the reference the hash table held on this vx_info */
	put_vx_info(vxi);
}
138
139
140 /*      __lookup_vx_info()
141
142         * requires the rcu_read_lock()
143         * doesn't increment the vx_refcnt                       */
144
static inline struct vx_info *__lookup_vx_info(xid_t xid)
{
	struct hlist_head *head = &vx_info_hash[__hashval(xid)];
	struct hlist_node *pos;

	/* NOTE(review): a plain hlist_for_each is used although the
	   header comment demands rcu_read_lock(); an RCU-safe
	   traversal (with smp_read_barrier_depends, as in the local
	   hlist_for_each_rcu macro below) would seem required --
	   confirm against this kernel's list.h */
	hlist_for_each(pos, head) {
		struct vx_info *vxi =
			hlist_entry(pos, struct vx_info, vx_hlist);

		if (vxi->vx_id == xid) {
			return vxi;
		}
	}
	return NULL;
}
160
161
162 /*      __vx_dynamic_id()
163
164         * find unused dynamic xid
165         * requires the hash_lock to be held                     */
166
static inline xid_t __vx_dynamic_id(void)
{
	/* last id handed out, preserved across calls; serialized by
	   the hash_lock the caller must hold */
	static xid_t seq = MAX_S_CONTEXT;
	xid_t barrier = seq;

	do {
		/* wrap around within (MIN_D_CONTEXT, MAX_S_CONTEXT] */
		if (++seq > MAX_S_CONTEXT)
			seq = MIN_D_CONTEXT;
		if (!__lookup_vx_info(seq))
			return seq;
	} while (barrier != seq);
	/* scanned the whole range: every dynamic id is in use */
	return 0;
}
180
181 /*      __loc_vx_info()
182
183         * locate or create the requested context
184         * get() it and if new hash it                           */
185
/* returns a vx_info with a reference held; *err is -ENOMEM/-EBUSY
   on failure, 0 when an existing context was found, 1 when a new
   one was created and hashed */
static struct vx_info * __loc_vx_info(int id, int *err)
{
	struct vx_info *new, *vxi = NULL;

	vxdprintk("loc_vx_info(%d)\n", id);

	/* allocate up front so nothing sleeps under the hash lock */
	if (!(new = __alloc_vx_info(id))) {
		*err = -ENOMEM;
		return NULL;
	}

	spin_lock(&vx_info_hash_lock);

	/* dynamic context requested */
	if (id == VX_DYNAMIC_ID) {
		id = __vx_dynamic_id();
		if (!id) {
			printk(KERN_ERR "no dynamic context available.\n");
			goto out_unlock;
		}
		new->vx_id = id;
	}
	/* existing context requested */
	else if ((vxi = __lookup_vx_info(id))) {
		/* context in setup is not available */
		if (vxi->vx_flags & VXF_STATE_SETUP) {
			vxdprintk("loc_vx_info(%d) = %p (not available)\n", id, vxi);
			vxi = NULL;
			*err = -EBUSY;
		} else {
			vxdprintk("loc_vx_info(%d) = %p (found)\n", id, vxi);
			get_vx_info(vxi);
			*err = 0;
		}
		goto out_unlock;
	}

	/* new context requested */
	vxdprintk("loc_vx_info(%d) = %p (new)\n", id, new);
	/* one reference for the hash, one for the caller */
	__hash_vx_info(get_vx_info(new));
	vxi = new, new = NULL;
	*err = 1;

out_unlock:
	spin_unlock(&vx_info_hash_lock);
	/* dispose of the preallocated struct when it wasn't used */
	if (new)
		__dealloc_vx_info(new);
	return vxi;
}
235
236
237
238 /*      exported stuff                                          */
239
240
241
242 void rcu_free_vx_info(void *obj)
243 {
244         struct vx_info *vxi = obj;
245         int usecnt, refcnt;
246
247         usecnt = atomic_read(&vxi->vx_usecnt);
248         BUG_ON(usecnt < 0);
249
250         refcnt = atomic_read(&vxi->vx_refcnt);
251         BUG_ON(refcnt < 0);
252
253         if (!usecnt)
254                 __dealloc_vx_info(vxi);
255         else
256                 printk("!!! rcu didn't free\n");
257 }
258
/* remove a context from the global hash, taking the hash lock */
void unhash_vx_info(struct vx_info *vxi)
{
	spin_lock(&vx_info_hash_lock);
	__unhash_vx_info(vxi);
	spin_unlock(&vx_info_hash_lock);
}
265
266 /*      locate_vx_info()
267
268         * search for a vx_info and get() it                     
269         * negative id means current                             */
270
struct vx_info *locate_vx_info(int id)
{
	struct vx_info *vxi;

	if (id < 0) {
		/* negative id: return the caller's own context */
		vxi = get_vx_info(current->vx_info);
	} else {
		rcu_read_lock();
		/* get_vx_info() receives a possibly-NULL lookup
		   result; presumably it tolerates NULL -- verify */
		vxi = get_vx_info(__lookup_vx_info(id));
		rcu_read_unlock();
	}
	return vxi;
}
284
285 /*      vx_info_is_hashed()
286
287         * verify that xid is still hashed                       */
288
289 int vx_info_is_hashed(xid_t xid)
290 {
291         int hashed;
292
293         rcu_read_lock();
294         hashed = (__lookup_vx_info(xid) != NULL);
295         rcu_read_unlock();
296         return hashed;
297 }
298
299 #ifdef  CONFIG_VSERVER_LEGACY
300
301 #if 0
302 struct vx_info *alloc_vx_info(xid_t xid)
303 {
304         return __alloc_vx_info(xid);
305 }
306 #endif
307
/* legacy interface: look up or create context 'id'; the status
   code __loc_vx_info() reports through 'err' is discarded here,
   callers only get the vx_info (or NULL) */
struct vx_info *locate_or_create_vx_info(int id)
{
	int err;

	return __loc_vx_info(id, &err);
}
314
315 #endif
316
317 #ifdef  CONFIG_PROC_FS
318
/* RCU-safe hlist traversal, defined locally because this kernel's
   list.h does not provide it */
#define hlist_for_each_rcu(pos, head) \
	for (pos = (head)->first; pos && ({ prefetch(pos->next); 1;}); \
		pos = pos->next, ({ smp_read_barrier_depends(); 0;}))

/* collect context ids into 'xids' (at most 'size'), walking the
   whole hash under rcu_read_lock(); returns the number stored.
   NOTE(review): 'if (--index > 0) continue;' skips index-1 entries
   (index <= 1 skips nothing) -- confirm callers expect that */
int get_xid_list(int index, unsigned int *xids, int size)
{
	int hindex, nr_xids = 0;

	rcu_read_lock();
	for (hindex = 0; hindex < VX_HASH_SIZE; hindex++) {
		struct hlist_head *head = &vx_info_hash[hindex];
		struct hlist_node *pos;

		hlist_for_each_rcu(pos, head) {
			struct vx_info *vxi;

			if (--index > 0)
				continue;

			vxi = hlist_entry(pos, struct vx_info, vx_hlist);
			xids[nr_xids] = vxi->vx_id;
			if (++nr_xids >= size)
				goto out;
		}
	}
out:
	rcu_read_unlock();
	return nr_xids;
}
348 #endif
349
/* switch p's user_struct accounting over to the per-context uid
   hash of vxi; returns 0 on success, -ENOMEM on failure */
int vx_migrate_user(struct task_struct *p, struct vx_info *vxi)
{
	struct user_struct *new_user, *old_user;

	if (!p || !vxi)
		BUG();
	/* look up / create the (context id, uid) user_struct */
	new_user = alloc_uid(vxi->vx_id, p->uid);
	if (!new_user)
		return -ENOMEM;

	old_user = p->user;
	if (new_user != old_user) {
		/* move the process count to the new user_struct */
		atomic_inc(&new_user->processes);
		atomic_dec(&old_user->processes);
		p->user = new_user;
	}
	/* drops the old reference -- or, when new_user == old_user,
	   the extra reference alloc_uid() took above */
	free_uid(old_user);
	return 0;
}
369
370 void vx_mask_bcaps(struct task_struct *p)
371 {
372         struct vx_info *vxi = p->vx_info;
373
374         p->cap_effective &= vxi->vx_bcaps;
375         p->cap_inheritable &= vxi->vx_bcaps;
376         p->cap_permitted &= vxi->vx_bcaps;
377 }
378
379
380 #include <linux/file.h>
381
382 static inline int vx_nofiles_task(struct task_struct *tsk)
383 {
384         struct files_struct *files = tsk->files;
385         const unsigned long *obptr, *cbptr;
386         int count, total;
387
388         spin_lock(&files->file_lock);
389         obptr = files->open_fds->fds_bits;
390         cbptr = files->close_on_exec->fds_bits;
391         count = files->max_fds / (sizeof(unsigned long) * 8);
392         for (total = 0; count > 0; count--) {
393                 if (*obptr)
394                         total += hweight_long(*obptr);
395                 obptr++;
396         /*      if (*cbptr)
397                         total += hweight_long(*cbptr);
398                 cbptr++; */
399         }
400         spin_unlock(&files->file_lock);
401         return total;
402 }
403
404 static inline int vx_openfd_task(struct task_struct *tsk)
405 {
406         struct files_struct *files = tsk->files;
407         const unsigned long *bptr;
408         int count, total;
409
410         spin_lock(&files->file_lock);
411         bptr = files->open_fds->fds_bits;
412         count = files->max_fds / (sizeof(unsigned long) * 8);
413         for (total = 0; count > 0; count--) {
414                 if (*bptr)
415                         total += hweight_long(*bptr);
416                 bptr++;
417         }
418         spin_unlock(&files->file_lock);
419         return total;
420 }
421
422 /*
423  *      migrate task to new context
424  *      gets vxi, puts old_vxi on change
425  */
426
int vx_migrate_task(struct task_struct *p, struct vx_info *vxi)
{
	struct vx_info *old_vxi;
	int ret = 0;

	if (!p || !vxi)
		BUG();

	/* takes a reference on the task's current context */
	old_vxi = task_get_vx_info(p);
	if (old_vxi == vxi)
		goto out;

	vxdprintk("vx_migrate_task(%p,%p[#%d.%d)\n", p, vxi,
		vxi->vx_id, atomic_read(&vxi->vx_usecnt));

	if (!(ret = vx_migrate_user(p, vxi))) {
		task_lock(p);
		/* move thread/process accounting to the new context */
		if (old_vxi) {
			atomic_dec(&old_vxi->cacct.nr_threads);
			atomic_dec(&old_vxi->limit.res[RLIMIT_NPROC]);
		}
		atomic_inc(&vxi->cacct.nr_threads);
		atomic_inc(&vxi->limit.res[RLIMIT_NPROC]);
		/* charge the task's fds against the new context */
		atomic_add(vx_nofiles_task(p), &vxi->limit.res[RLIMIT_NOFILE]);
		atomic_add(vx_openfd_task(p), &vxi->limit.res[RLIMIT_OPENFD]);
		/* should be handled in set_vx_info !! */
		if (old_vxi)
			clr_vx_info(&p->vx_info);
		set_vx_info(&p->vx_info, vxi);
		p->xid = vxi->vx_id;
		vx_mask_bcaps(p);
		task_unlock(p);

		/* NOTE(review): old_vxi is put here AND again at 'out'
		   below; together with the single task_get_vx_info()
		   above this looks like one put too many on the
		   success path -- verify the get/put pairing */
		put_vx_info(old_vxi);
	}
out:
	put_vx_info(old_vxi);
	return ret;
}
466
467 int vx_set_init(struct vx_info *vxi, struct task_struct *p)
468 {
469         if (!vxi)
470                 return -EINVAL;
471         if (vxi->vx_initpid)
472                 return -EPERM;
473
474         vxi->vx_initpid = p->tgid;
475         return 0;
476 }
477
478
479 /* vserver syscall commands below here */
480
481 /* taks xid and vx_info functions */
482
483 #include <asm/uaccess.h>
484
485
/* return the context id of task 'id' (or of the caller if id==0) */
int vc_task_xid(uint32_t id, void __user *data)
{
	xid_t xid;

	if (id) {
		struct task_struct *tsk;

		/* querying other tasks requires admin/watch context */
		if (!vx_check(0, VX_ADMIN|VX_WATCH))
			return -EPERM;

		read_lock(&tasklist_lock);
		tsk = find_task_by_pid(id);
		/* NOTE(review): -ESRCH is funneled through an xid_t
		   here; if xid_t is narrower than int the error value
		   may be mangled on return -- verify */
		xid = (tsk) ? tsk->xid : -ESRCH;
		read_unlock(&tasklist_lock);
	}
	else
		xid = current->xid;
	return xid;
}
505
506
507 int vc_vx_info(uint32_t id, void __user *data)
508 {
509         struct vx_info *vxi;
510         struct vcmd_vx_info_v0 vc_data;
511
512         if (!vx_check(0, VX_ADMIN))
513                 return -ENOSYS;
514         if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_RESOURCE))
515                 return -EPERM;
516
517         vxi = locate_vx_info(id);
518         if (!vxi)
519                 return -ESRCH;
520
521         vc_data.xid = vxi->vx_id;
522         vc_data.initpid = vxi->vx_initpid;
523         put_vx_info(vxi);
524
525         if (copy_to_user (data, &vc_data, sizeof(vc_data)))
526                 return -EFAULT;
527         return 0;
528 }
529
530
531 /* context functions */
532
/* create (or enter setup of) context 'xid' and migrate the caller
   into it; returns the assigned context id or a negative error */
int vc_ctx_create(uint32_t xid, void __user *data)
{
	struct vx_info *new_vxi;
	int ret;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	/* only static ids, plus the dynamic-id wildcard, are valid */
	if ((xid >= MIN_D_CONTEXT) && (xid != VX_DYNAMIC_ID))
		return -EINVAL;

	if (xid < 1)
		return -EINVAL;

	new_vxi = __loc_vx_info(xid, &ret);
	if (!new_vxi)
		return ret;
	/* an existing context past its setup phase cannot be
	   "created" again */
	if (!(new_vxi->vx_flags & VXF_STATE_SETUP)) {
		ret = -EEXIST;
		goto out_put;
	}

	/* success: return the (possibly dynamically assigned) id */
	ret = new_vxi->vx_id;
	vx_migrate_task(current, new_vxi);
	/* if this fails, we might end up with a hashed vx_info */
out_put:
	put_vx_info(new_vxi);
	return ret;
}
562
563
564 int vc_ctx_migrate(uint32_t id, void __user *data)
565 {
566         struct vx_info *vxi;
567         
568         if (!capable(CAP_SYS_ADMIN))
569                 return -EPERM;
570
571         /* dirty hack until Spectator becomes a cap */
572         if (id == 1) {
573                 current->xid = 1;
574                 return 0;
575         }
576
577         vxi = locate_vx_info(id);
578         if (!vxi)
579                 return -ESRCH;
580         vx_migrate_task(current, vxi);
581         put_vx_info(vxi);
582         return 0;
583 }
584
585
586 int vc_get_cflags(uint32_t id, void __user *data)
587 {
588         struct vx_info *vxi;
589         struct vcmd_ctx_flags_v0 vc_data;
590
591         if (!capable(CAP_SYS_ADMIN))
592                 return -EPERM;
593
594         vxi = locate_vx_info(id);
595         if (!vxi)
596                 return -ESRCH;
597
598         vc_data.flagword = vxi->vx_flags;
599
600         /* special STATE flag handling */
601         vc_data.mask = vx_mask_flags(~0UL, vxi->vx_flags, VXF_ONE_TIME);
602
603         put_vx_info(vxi);
604
605         if (copy_to_user (data, &vc_data, sizeof(vc_data)))
606                 return -EFAULT;
607         return 0;
608 }
609
/* update the flag word of context 'id' from userspace, honoring
   the special one-time STATE flags */
int vc_set_cflags(uint32_t id, void __user *data)
{
	struct vx_info *vxi;
	struct vcmd_ctx_flags_v0 vc_data;
	uint64_t mask, trigger;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (copy_from_user (&vc_data, data, sizeof(vc_data)))
		return -EFAULT;

	vxi = locate_vx_info(id);
	if (!vxi)
		return -ESRCH;

	/* special STATE flag handling (see vx_mask_mask) */
	mask = vx_mask_mask(vc_data.mask, vxi->vx_flags, VXF_ONE_TIME);
	/* bits that actually change under the effective mask */
	trigger = (mask & vxi->vx_flags) ^ (mask & vc_data.flagword);

	/* toggling SETUP re-applies the context's bcaps to the caller */
	if (trigger & VXF_STATE_SETUP)
		vx_mask_bcaps(current);
	/* toggling INIT registers the caller as context init -- but
	   only when it operates on its own context */
	if (trigger & VXF_STATE_INIT)
		if (vxi == current->vx_info)
			vx_set_init(vxi, current);

	vxi->vx_flags = vx_mask_flags(vxi->vx_flags,
		vc_data.flagword, mask);
	put_vx_info(vxi);
	return 0;
}
640
641 int vc_get_ccaps(uint32_t id, void __user *data)
642 {
643         struct vx_info *vxi;
644         struct vcmd_ctx_caps_v0 vc_data;
645
646         if (!capable(CAP_SYS_ADMIN))
647                 return -EPERM;
648
649         vxi = locate_vx_info(id);
650         if (!vxi)
651                 return -ESRCH;
652
653         vc_data.bcaps = vxi->vx_bcaps;
654         vc_data.ccaps = vxi->vx_ccaps;
655         vc_data.cmask = ~0UL;
656         put_vx_info(vxi);
657
658         if (copy_to_user (data, &vc_data, sizeof(vc_data)))
659                 return -EFAULT;
660         return 0;
661 }
662
/* update the capabilities of context 'id' from userspace */
int vc_set_ccaps(uint32_t id, void __user *data)
{
	struct vx_info *vxi;
	struct vcmd_ctx_caps_v0 vc_data;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (copy_from_user (&vc_data, data, sizeof(vc_data)))
		return -EFAULT;

	vxi = locate_vx_info(id);
	if (!vxi)
		return -ESRCH;

	/* bcaps can only ever be reduced (AND), never extended */
	vxi->vx_bcaps &= vc_data.bcaps;
	/* ccaps are rewritten under the caller-supplied mask */
	vxi->vx_ccaps = vx_mask_flags(vxi->vx_ccaps,
		vc_data.ccaps, vc_data.cmask);
	put_vx_info(vxi);
	return 0;
}
683
#include <linux/module.h>

/* exported for use by other parts of the kernel/vserver code */
EXPORT_SYMBOL_GPL(rcu_free_vx_info);
EXPORT_SYMBOL_GPL(vx_info_hash_lock);
688