This commit was manufactured by cvs2svn to create branch 'vserver'.
[linux-2.6.git] / kernel / vserver / context.c
1 /*
2  *  linux/kernel/vserver/context.c
3  *
4  *  Virtual Server: Context Support
5  *
6  *  Copyright (C) 2003-2004  Herbert Pötzl
7  *
8  *  V0.01  context helper
9  *  V0.02  vx_ctx_kill syscall command
10  *  V0.03  replaced context_info calls
11  *  V0.04  redesign of struct (de)alloc
12  *  V0.05  rlimit basic implementation
13  *  V0.06  task_xid and info commands
14  *  V0.07  context flags and caps
15  *
16  */
17
18 #include <linux/config.h>
19 #include <linux/slab.h>
20 #include <linux/vserver/context.h>
21 #include <linux/vserver/legacy.h>
22 #include <linux/vinline.h>
23 #include <linux/kernel_stat.h>
24 #include <linux/namespace.h>
25
26 #include <asm/errno.h>
27
28
29 /*  system functions */
30
31
/* global list of all vx_info contexts; walk/modify only
   while holding vxlist_lock */
LIST_HEAD(vx_infos);

spinlock_t vxlist_lock
	__cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
37
38 /*
39  *      struct vx_info allocation and deallocation
40  */
41
42 static struct vx_info *alloc_vx_info(int id)
43 {
44         struct vx_info *new = NULL;
45         
46         vxdprintk("alloc_vx_info(%d)\n", id);
47         /* would this benefit from a slab cache? */
48         new = kmalloc(sizeof(struct vx_info), GFP_KERNEL);
49         if (!new)
50                 return 0;
51
52         memset (new, 0, sizeof(struct vx_info));
53         new->vx_id = id;
54         INIT_LIST_HEAD(&new->vx_list);
55         /* rest of init goes here */
56         
57         vx_info_init_limit(&new->limit);
58         vx_info_init_sched(&new->sched);
59         vx_info_init_cvirt(&new->cvirt);
60         vx_info_init_cacct(&new->cacct);
61
62         new->vx_flags = VXF_STATE_SETUP|VXF_STATE_INIT;
63         new->vx_bcaps = CAP_INIT_EFF_SET;
64         new->vx_ccaps = 0;
65
66         vxdprintk("alloc_vx_info(%d) = %p\n", id, new);
67         return new;
68 }
69
/*
 *	release all resources held by a vx_info and free it;
 *	must only be called once the refcount has reached zero
 *	(enforced by the BUG_ON below)
 */
void free_vx_info(struct vx_info *vxi)
{
	vxdprintk("free_vx_info(%p)\n", vxi);
	/* drop namespace/fs references taken during context setup */
	if (vxi->vx_namespace)
		put_namespace(vxi->vx_namespace);
	if (vxi->vx_fs)
		put_fs_struct(vxi->vx_fs);

	vx_info_exit_limit(&vxi->limit);
	vx_info_exit_sched(&vxi->sched);
	vx_info_exit_cvirt(&vxi->cvirt);
	vx_info_exit_cacct(&vxi->cacct);

	/* freeing with outstanding references is a bug */
	BUG_ON(atomic_read(&vxi->vx_refcount));
	/* poison the id to help catch use-after-free via stale pointers */
	vxi->vx_id = -1;

	kfree(vxi);
}
88
89
90 /*
91  *      struct vx_info search by id
92  *      assumes vxlist_lock is held
93  */
94
95 static __inline__ struct vx_info *__find_vx_info(int id)
96 {
97         struct vx_info *vxi;
98
99         list_for_each_entry(vxi, &vx_infos, vx_list)
100                 if (vxi->vx_id == id)
101                         return vxi;
102         return 0;
103 }
104
105
106 /*
107  *      struct vx_info ref stuff
108  */
109
110 struct vx_info *find_vx_info(int id)
111 {
112         struct vx_info *vxi;
113         
114         if (id < 0) {
115                 vxi = current->vx_info;
116                 get_vx_info(vxi);
117         } else {
118                 spin_lock(&vxlist_lock);
119                 if ((vxi = __find_vx_info(id)))
120                         get_vx_info(vxi);
121                 spin_unlock(&vxlist_lock);
122         }
123         return vxi;
124 }
125
126 /*
127  *      verify that id is a valid xid
128  */
129
130 int vx_info_id_valid(int id)
131 {
132         int valid;
133
134         spin_lock(&vxlist_lock);
135         valid = (__find_vx_info(id) != NULL);
136         spin_unlock(&vxlist_lock);
137         return valid;
138 }
139
140
141 /*
142  *      dynamic context id ...
143  */
144
/*
 *	hand out the next unused dynamic context id;
 *	caller must hold vxlist_lock — it protects both the list
 *	walk in __find_vx_info and the static sequence counter;
 *	returns 0 when every dynamic id is in use
 */
static __inline__ xid_t __vx_dynamic_id(void)
{
	/* last id handed out; persists across calls */
	static xid_t seq = MAX_S_CONTEXT;
	/* remember the start point to detect a full wrap-around */
	xid_t barrier = seq;

	do {
		/* wrap from MAX_S_CONTEXT back to MIN_D_CONTEXT */
		if (++seq > MAX_S_CONTEXT)
			seq = MIN_D_CONTEXT;
		if (!__find_vx_info(seq))
			return seq;
	} while (barrier != seq);
	return 0;
}
158
159 static struct vx_info * __foc_vx_info(int id, int *err)
160 {
161         struct vx_info *new, *vxi = NULL;
162         
163         vxdprintk("foc_vx_info(%d)\n", id);
164         if (!(new = alloc_vx_info(id))) {
165                 *err = -ENOMEM;
166                 return NULL;
167         }
168
169         /* dirty hack until Spectator becomes a cap */
170         if (id == 0 || id == 1) {
171                 *err = -EBUSY;
172                 return NULL;
173         }
174
175         spin_lock(&vxlist_lock);
176
177         /* dynamic context requested */
178         if (id == VX_DYNAMIC_ID) {
179                 id = __vx_dynamic_id();
180                 if (!id) {
181                         printk(KERN_ERR "no dynamic context available.\n");
182                         goto out_unlock;
183                 }
184                 new->vx_id = id;
185         }
186         /* existing context requested */
187         else if ((vxi = __find_vx_info(id))) {
188                 /* context in setup is not available */
189                 if (vxi->vx_flags & VXF_STATE_SETUP) {
190                         vxdprintk("foc_vx_info(%d) = %p (not available)\n", id, vxi);
191                         vxi = NULL;
192                         *err = -EBUSY;
193                 } else {
194                         vxdprintk("foc_vx_info(%d) = %p (found)\n", id, vxi);
195                         get_vx_info(vxi);
196                         *err = 0;
197                 }
198                 goto out_unlock;
199         }
200
201         /* new context requested */
202         vxdprintk("foc_vx_info(%d) = %p (new)\n", id, new);
203         atomic_set(&new->vx_refcount, 1);
204         list_add(&new->vx_list, &vx_infos);
205         vxi = new, new = NULL;
206         *err = 1;
207
208 out_unlock:
209         spin_unlock(&vxlist_lock);
210         if (new)
211                 free_vx_info(new);
212         return vxi;
213 }
214
215
/*
 *	find an existing context by id or create a new one;
 *	the detailed error code from __foc_vx_info is discarded —
 *	callers only see success (pointer) or failure (NULL)
 */
struct vx_info *find_or_create_vx_info(int id)
{
	int err;

	return __foc_vx_info(id, &err);
}
222
223
224 int vx_migrate_user(struct task_struct *p, struct vx_info *vxi)
225 {
226         struct user_struct *new_user, *old_user;
227         
228         if (!p || !vxi)
229                 BUG();
230         new_user = alloc_uid(vxi->vx_id, p->uid);
231         if (!new_user)
232                 return -ENOMEM;
233
234         old_user = p->user;
235         if (new_user != old_user) {
236                 atomic_inc(&new_user->processes);
237                 atomic_dec(&old_user->processes);
238                 p->user = new_user;
239         }
240         free_uid(old_user);
241         return 0;
242 }
243
/*
 *	reduce the task's capability sets to the basic
 *	capability bound (vx_bcaps) of its context
 */
void vx_mask_bcaps(struct task_struct *p)
{
	struct vx_info *vxi = p->vx_info;

	p->cap_effective &= vxi->vx_bcaps;
	p->cap_inheritable &= vxi->vx_bcaps;
	p->cap_permitted &= vxi->vx_bcaps;
}
252
253
254 #include <linux/file.h>
255
256 static inline int vx_nofiles_task(struct task_struct *tsk)
257 {
258         struct files_struct *files = tsk->files;
259         const unsigned long *obptr, *cbptr;
260         int count, total;
261
262         spin_lock(&files->file_lock);
263         obptr = files->open_fds->fds_bits;
264         cbptr = files->close_on_exec->fds_bits;
265         count = files->max_fds / (sizeof(unsigned long) * 8);
266         for (total = 0; count > 0; count--) {
267                 if (*obptr)
268                         total += hweight_long(*obptr);
269                 obptr++;
270         /*      if (*cbptr)
271                         total += hweight_long(*cbptr);
272                 cbptr++; */
273         }
274         spin_unlock(&files->file_lock);
275         return total;
276 }
277
278 static inline int vx_openfd_task(struct task_struct *tsk)
279 {
280         struct files_struct *files = tsk->files;
281         const unsigned long *bptr;
282         int count, total;
283
284         spin_lock(&files->file_lock);
285         bptr = files->open_fds->fds_bits;
286         count = files->max_fds / (sizeof(unsigned long) * 8);
287         for (total = 0; count > 0; count--) {
288                 if (*bptr)
289                         total += hweight_long(*bptr);
290                 bptr++;
291         }
292         spin_unlock(&files->file_lock);
293         return total;
294 }
295
296 /*
297  *      migrate task to new context
298  *      gets vxi, puts old_vxi on change
299  */
300
301 int vx_migrate_task(struct task_struct *p, struct vx_info *vxi)
302 {
303         struct vx_info *old_vxi = task_get_vx_info(p);
304         int ret = 0;
305         
306         if (!p || !vxi)
307                 BUG();
308
309         vxdprintk("vx_migrate_task(%p,%p[#%d.%d)\n", p, vxi,
310                 vxi->vx_id, atomic_read(&vxi->vx_refcount));
311         if (old_vxi == vxi)
312                 goto out;
313
314         if (!(ret = vx_migrate_user(p, vxi))) {
315                 task_lock(p);
316                 if (old_vxi) {
317                         atomic_dec(&old_vxi->cacct.nr_threads);
318                         atomic_dec(&old_vxi->limit.res[RLIMIT_NPROC]);
319                 }               
320                 atomic_inc(&vxi->cacct.nr_threads);
321                 atomic_inc(&vxi->limit.res[RLIMIT_NPROC]);
322                 atomic_add(vx_nofiles_task(p), &vxi->limit.res[RLIMIT_NOFILE]);
323                 atomic_add(vx_openfd_task(p), &vxi->limit.res[RLIMIT_OPENFD]);
324                 set_vx_info(&p->vx_info, vxi);
325                 p->xid = vxi->vx_id;
326                 vx_mask_bcaps(p);
327                 task_unlock(p);
328
329                 put_vx_info(old_vxi);
330         }
331 out:
332         put_vx_info(old_vxi);
333         return ret;
334 }
335
336 int vx_set_init(struct vx_info *vxi, struct task_struct *p)
337 {
338         if (!vxi)
339                 return -EINVAL;
340         if (vxi->vx_initpid)
341                 return -EPERM;
342
343         vxi->vx_initpid = p->tgid;
344         return 0;
345 }
346
347
348 /* vserver syscall commands below here */
349
350 /* taks xid and vx_info functions */
351
352 #include <asm/uaccess.h>
353
354
355 int vc_task_xid(uint32_t id, void __user *data)
356 {
357         xid_t xid;
358
359         if (id) {
360                 struct task_struct *tsk;
361
362                 if (!vx_check(0, VX_ADMIN|VX_WATCH))
363                         return -EPERM;
364
365                 read_lock(&tasklist_lock);
366                 tsk = find_task_by_pid(id);
367                 xid = (tsk) ? tsk->xid : -ESRCH;
368                 read_unlock(&tasklist_lock);
369         }
370         else
371                 xid = current->xid;
372         return xid;
373 }
374
375
376 int vc_vx_info(uint32_t id, void __user *data)
377 {
378         struct vx_info *vxi;
379         struct vcmd_vx_info_v0 vc_data;
380
381         if (!vx_check(0, VX_ADMIN))
382                 return -ENOSYS;
383         if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_RESOURCE))
384                 return -EPERM;
385
386         vxi = find_vx_info(id);
387         if (!vxi)
388                 return -ESRCH;
389
390         vc_data.xid = vxi->vx_id;
391         vc_data.initpid = vxi->vx_initpid;
392         put_vx_info(vxi);
393
394         if (copy_to_user (data, &vc_data, sizeof(vc_data)))
395                 return -EFAULT;
396         return 0;
397 }
398
399
400 /* context functions */
401
402 int vc_ctx_create(uint32_t xid, void __user *data)
403 {
404         // int ret = -ENOMEM;
405         struct vx_info *new_vxi;
406         int ret;
407
408         if (!capable(CAP_SYS_ADMIN))
409                 return -EPERM;
410
411         if ((xid >= MIN_D_CONTEXT) && (xid != VX_DYNAMIC_ID))
412                 return -EINVAL;
413
414         if (xid < 1)
415                 return -EINVAL;
416
417         new_vxi = __foc_vx_info(xid, &ret);
418         if (!new_vxi)
419                 return ret;
420         if (!(new_vxi->vx_flags & VXF_STATE_SETUP)) {
421                 ret = -EEXIST;
422                 goto out_put;
423         }
424
425         ret = new_vxi->vx_id;
426         vx_migrate_task(current, new_vxi);
427 out_put:
428         put_vx_info(new_vxi);
429         return ret;
430 }
431
432
433 int vc_ctx_migrate(uint32_t id, void __user *data)
434 {
435         struct vx_info *vxi;
436         
437         if (!capable(CAP_SYS_ADMIN))
438                 return -EPERM;
439
440         /* dirty hack until Spectator becomes a cap */
441         if (id == 1) {
442                 current->xid = 1;
443                 return 0;
444         }
445
446         vxi = find_vx_info(id);
447         if (!vxi)
448                 return -ESRCH;
449         vx_migrate_task(current, vxi);
450         put_vx_info(vxi);
451         return 0;
452 }
453
454
455 int vc_get_cflags(uint32_t id, void __user *data)
456 {
457         struct vx_info *vxi;
458         struct vcmd_ctx_flags_v0 vc_data;
459
460         if (!capable(CAP_SYS_ADMIN))
461                 return -EPERM;
462
463         vxi = find_vx_info(id);
464         if (!vxi)
465                 return -ESRCH;
466
467         vc_data.flagword = vxi->vx_flags;
468
469         // vc_data.mask = ~0UL;
470         /* special STATE flag handling */
471         vc_data.mask = vx_mask_flags(~0UL, vxi->vx_flags, VXF_ONE_TIME);
472
473         put_vx_info(vxi);
474
475         if (copy_to_user (data, &vc_data, sizeof(vc_data)))
476                 return -EFAULT;
477         return 0;
478 }
479
480 int vc_set_cflags(uint32_t id, void __user *data)
481 {
482         struct vx_info *vxi;
483         struct vcmd_ctx_flags_v0 vc_data;
484         uint64_t mask, trigger;
485
486         if (!capable(CAP_SYS_ADMIN))
487                 return -EPERM;
488         if (copy_from_user (&vc_data, data, sizeof(vc_data)))
489                 return -EFAULT;
490
491         vxi = find_vx_info(id);
492         if (!vxi)
493                 return -ESRCH;
494
495         /* special STATE flag handling */
496         mask = vx_mask_mask(vc_data.mask, vxi->vx_flags, VXF_ONE_TIME);
497         trigger = (mask & vxi->vx_flags) ^ (mask & vc_data.flagword);
498
499         if (trigger & VXF_STATE_SETUP)
500                 vx_mask_bcaps(current);
501         if (trigger & VXF_STATE_INIT)
502                 if (vxi == current->vx_info)
503                         vx_set_init(vxi, current);
504
505         vxi->vx_flags = vx_mask_flags(vxi->vx_flags,
506                 vc_data.flagword, mask);
507         put_vx_info(vxi);
508         return 0;
509 }
510
511 int vc_get_ccaps(uint32_t id, void __user *data)
512 {
513         struct vx_info *vxi;
514         struct vcmd_ctx_caps_v0 vc_data;
515
516         if (!capable(CAP_SYS_ADMIN))
517                 return -EPERM;
518
519         vxi = find_vx_info(id);
520         if (!vxi)
521                 return -ESRCH;
522
523         vc_data.bcaps = vxi->vx_bcaps;
524         vc_data.ccaps = vxi->vx_ccaps;
525         vc_data.cmask = ~0UL;
526         put_vx_info(vxi);
527
528         if (copy_to_user (data, &vc_data, sizeof(vc_data)))
529                 return -EFAULT;
530         return 0;
531 }
532
533 int vc_set_ccaps(uint32_t id, void __user *data)
534 {
535         struct vx_info *vxi;
536         struct vcmd_ctx_caps_v0 vc_data;
537
538         if (!capable(CAP_SYS_ADMIN))
539                 return -EPERM;
540         if (copy_from_user (&vc_data, data, sizeof(vc_data)))
541                 return -EFAULT;
542
543         vxi = find_vx_info(id);
544         if (!vxi)
545                 return -ESRCH;
546
547         vxi->vx_bcaps &= vc_data.bcaps;
548         vxi->vx_ccaps = vx_mask_flags(vxi->vx_ccaps,
549                 vc_data.ccaps, vc_data.cmask);
550         put_vx_info(vxi);
551         return 0;
552 }
553
#include <linux/module.h>

/* exported for use by other vserver components/modules */
EXPORT_SYMBOL_GPL(free_vx_info);
EXPORT_SYMBOL_GPL(vxlist_lock);
558