patch-2_6_7-vs1_9_1_12
[linux-2.6.git] / kernel / vserver / context.c
1 /*
2  *  linux/kernel/vserver/context.c
3  *
4  *  Virtual Server: Context Support
5  *
6  *  Copyright (C) 2003-2004  Herbert Pƶtzl
7  *
8  *  V0.01  context helper
9  *  V0.02  vx_ctx_kill syscall command
10  *  V0.03  replaced context_info calls
11  *  V0.04  redesign of struct (de)alloc
12  *  V0.05  rlimit basic implementation
13  *  V0.06  task_xid and info commands
14  *  V0.07  context flags and caps
15  *  V0.08  switch to RCU based hash
16  *
17  */
18
19 #include <linux/config.h>
20 #include <linux/slab.h>
21 #include <linux/vserver.h>
22 #include <linux/vserver/legacy.h>
23 #include <linux/vs_base.h>
24 #include <linux/vs_context.h>
25 #include <linux/kernel_stat.h>
26 #include <linux/namespace.h>
27 #include <linux/rcupdate.h>
28
29 #include <asm/errno.h>
30
31
32 /*      __alloc_vx_info()
33
34         * allocate an initialized vx_info struct
35         * doesn't make it visible (hash)                        */
36
37 static struct vx_info *__alloc_vx_info(xid_t xid)
38 {
39         struct vx_info *new = NULL;
40         
41         vxdprintk("alloc_vx_info(%d)\n", xid);
42
43         /* would this benefit from a slab cache? */
44         new = kmalloc(sizeof(struct vx_info), GFP_KERNEL);
45         if (!new)
46                 return 0;
47
48         memset (new, 0, sizeof(struct vx_info));
49         new->vx_id = xid;
50         INIT_RCU_HEAD(&new->vx_rcu);
51         INIT_HLIST_NODE(&new->vx_hlist);
52         atomic_set(&new->vx_refcnt, 0);
53         atomic_set(&new->vx_usecnt, 0);
54
55         /* rest of init goes here */
56         vx_info_init_limit(&new->limit);
57         vx_info_init_sched(&new->sched);
58         vx_info_init_cvirt(&new->cvirt);
59         vx_info_init_cacct(&new->cacct);
60
61         new->vx_flags = VXF_STATE_SETUP|VXF_STATE_INIT;
62         new->vx_bcaps = CAP_INIT_EFF_SET;
63         new->vx_ccaps = 0;
64
65         vxdprintk("alloc_vx_info(%d) = %p\n", xid, new);
66         return new;
67 }
68
/*	__dealloc_vx_info()

	* final disposal of vx_info
	* must only run once all uses and refs are gone		*/

static void __dealloc_vx_info(struct vx_info *vxi)
{
	vxdprintk("dealloc_vx_info(%p)\n", vxi);

	/* poison the hash linkage and id so a stale user trips early */
	vxi->vx_hlist.next = LIST_POISON1;
	vxi->vx_id = -1;

	/* drop the namespace/fs references held for this context */
	if (vxi->vx_namespace)
		put_namespace(vxi->vx_namespace);
	if (vxi->vx_fs)
		put_fs_struct(vxi->vx_fs);

	/* tear down the per-context subsystems (mirrors __alloc_vx_info) */
	vx_info_exit_limit(&vxi->limit);
	vx_info_exit_sched(&vxi->sched);
	vx_info_exit_cvirt(&vxi->cvirt);
	vx_info_exit_cacct(&vxi->cacct);

	/* both counters must have dropped to zero by now */
	BUG_ON(atomic_read(&vxi->vx_usecnt));
	BUG_ON(atomic_read(&vxi->vx_refcnt));

	kfree(vxi);
}
95
96
/*	hash table for vx_info hash */

/* number of hash buckets (prime, to spread xids evenly) */
#define VX_HASH_SIZE    13

/* global xid -> vx_info hash; readers walk it under RCU */
struct hlist_head vx_info_hash[VX_HASH_SIZE];

/* serializes all modifications of vx_info_hash */
static spinlock_t vx_info_hash_lock = SPIN_LOCK_UNLOCKED;
104
105
106 static inline unsigned int __hashval(xid_t xid)
107 {
108         return (xid % VX_HASH_SIZE);
109 }
110
111
112
/*	__hash_vx_info()

	* add the vxi to the global hash table
	* requires the hash_lock to be held			*/

static inline void __hash_vx_info(struct vx_info *vxi)
{
	struct hlist_head *head;

	vxdprintk("__hash_vx_info: %p[#%d]\n", vxi, vxi->vx_id);
	/* the hash itself holds a reference on the context */
	get_vx_info(vxi);
	head = &vx_info_hash[__hashval(vxi->vx_id)];
	hlist_add_head_rcu(&vxi->vx_hlist, head);
}
127
/*	__unhash_vx_info()

	* remove the vxi from the global hash table
	* requires the hash_lock to be held			*/

static inline void __unhash_vx_info(struct vx_info *vxi)
{
	vxdprintk("__unhash_vx_info: %p[#%d]\n", vxi, vxi->vx_id);
	hlist_del_rcu(&vxi->vx_hlist);
	/* drop the reference the hash was holding */
	put_vx_info(vxi);
}
139
140
141 /*      __lookup_vx_info()
142
143         * requires the rcu_read_lock()
144         * doesn't increment the vx_refcnt                       */
145
146 static inline struct vx_info *__lookup_vx_info(xid_t xid)
147 {
148         struct hlist_head *head = &vx_info_hash[__hashval(xid)];
149         struct hlist_node *pos;
150
151         hlist_for_each_rcu(pos, head) {
152                 struct vx_info *vxi =
153                         hlist_entry(pos, struct vx_info, vx_hlist);
154
155                 if (vxi->vx_id == xid) {
156                         return vxi;
157                 }
158         }
159         return NULL;
160 }
161
162
/*	__vx_dynamic_id()

	* find unused dynamic xid
	* requires the hash_lock to be held
	* returns 0 when the dynamic range is exhausted		*/

static inline xid_t __vx_dynamic_id(void)
{
	/* rotor: resume searching after the last id handed out */
	static xid_t seq = MAX_S_CONTEXT;
	xid_t barrier = seq;

	do {
		/* wrap from the top of the range back to its bottom */
		if (++seq > MAX_S_CONTEXT)
			seq = MIN_D_CONTEXT;
		if (!__lookup_vx_info(seq))
			return seq;
	} while (barrier != seq);	/* stop after one full cycle */
	return 0;
}
181
182 /*      __loc_vx_info()
183
184         * locate or create the requested context
185         * get() it and if new hash it                           */
186
187 static struct vx_info * __loc_vx_info(int id, int *err)
188 {
189         struct vx_info *new, *vxi = NULL;
190         
191         vxdprintk("loc_vx_info(%d)\n", id);
192
193         if (!(new = __alloc_vx_info(id))) {
194                 *err = -ENOMEM;
195                 return NULL;
196         }
197
198         spin_lock(&vx_info_hash_lock);
199
200         /* dynamic context requested */
201         if (id == VX_DYNAMIC_ID) {
202                 id = __vx_dynamic_id();
203                 if (!id) {
204                         printk(KERN_ERR "no dynamic context available.\n");
205                         goto out_unlock;
206                 }
207                 new->vx_id = id;
208         }
209         /* existing context requested */
210         else if ((vxi = __lookup_vx_info(id))) {
211                 /* context in setup is not available */
212                 if (vxi->vx_flags & VXF_STATE_SETUP) {
213                         vxdprintk("loc_vx_info(%d) = %p (not available)\n", id, vxi);
214                         vxi = NULL;
215                         *err = -EBUSY;
216                 } else {
217                         vxdprintk("loc_vx_info(%d) = %p (found)\n", id, vxi);
218                         get_vx_info(vxi);
219                         *err = 0;
220                 }
221                 goto out_unlock;
222         }
223
224         /* new context requested */
225         vxdprintk("loc_vx_info(%d) = %p (new)\n", id, new);
226         __hash_vx_info(get_vx_info(new));
227         vxi = new, new = NULL;
228         *err = 1;
229
230 out_unlock:
231         spin_unlock(&vx_info_hash_lock);
232         if (new)
233                 __dealloc_vx_info(new);
234         return vxi;
235 }
236
237
238
239 /*      exported stuff                                          */
240
241
242
243 void rcu_free_vx_info(void *obj)
244 {
245         struct vx_info *vxi = obj;
246         int usecnt, refcnt;
247
248         BUG_ON(!vxi);
249
250         usecnt = atomic_read(&vxi->vx_usecnt);
251         BUG_ON(usecnt < 0);
252
253         refcnt = atomic_read(&vxi->vx_refcnt);
254         BUG_ON(refcnt < 0);
255
256         if (!usecnt)
257                 __dealloc_vx_info(vxi);
258         else
259                 printk("!!! rcu didn't free\n");
260 }

/*	unhash_vx_info()

	* locked wrapper: remove a context from the global hash	*/

void unhash_vx_info(struct vx_info *vxi)
{
	spin_lock(&vx_info_hash_lock);
	__unhash_vx_info(vxi);
	spin_unlock(&vx_info_hash_lock);
}
268
269 /*      locate_vx_info()
270
271         * search for a vx_info and get() it                     
272         * negative id means current                             */
273
274 struct vx_info *locate_vx_info(int id)
275 {
276         struct vx_info *vxi;
277         
278         if (id < 0) {
279                 vxi = get_vx_info(current->vx_info);
280         } else {
281                 rcu_read_lock();
282                 vxi = get_vx_info(__lookup_vx_info(id));
283                 rcu_read_unlock();
284         }
285         return vxi;
286 }
287
288 /*      vx_info_is_hashed()
289
290         * verify that xid is still hashed                       */
291
292 int vx_info_is_hashed(xid_t xid)
293 {
294         int hashed;
295
296         rcu_read_lock();
297         hashed = (__lookup_vx_info(xid) != NULL);
298         rcu_read_unlock();
299         return hashed;
300 }
301
#ifdef	CONFIG_VSERVER_LEGACY

#if 0
/* unused legacy wrapper around __alloc_vx_info, kept for reference */
struct vx_info *alloc_vx_info(xid_t xid)
{
	return __alloc_vx_info(xid);
}
#endif

/* legacy API: find or create the context for id;
 * the found/created distinction reported via err is discarded */
struct vx_info *locate_or_create_vx_info(int id)
{
	int err;

	return __loc_vx_info(id, &err);
}

#endif
319
#ifdef	CONFIG_PROC_FS

/* local RCU-safe hlist walk (no removal of entries while iterating) */
#define hlist_for_each_rcu(pos, head) \
	for (pos = (head)->first; pos && ({ prefetch(pos->next); 1;}); \
		pos = pos->next, ({ smp_read_barrier_depends(); 0;}))

/*	get_xid_list()

	* fill xids[] with up to size context ids, skipping roughly
	* the first index hashed entries (hash order, not sorted)
	* returns the number of ids stored			*/

int get_xid_list(int index, unsigned int *xids, int size)
{
	int hindex, nr_xids = 0;

	rcu_read_lock();
	for (hindex = 0; hindex < VX_HASH_SIZE; hindex++) {
		struct hlist_head *head = &vx_info_hash[hindex];
		struct hlist_node *pos;

		hlist_for_each_rcu(pos, head) {
			struct vx_info *vxi;

			/* skip entries before the requested start index
			   (note: index 0 and 1 behave identically here) */
			if (--index > 0)
				continue;

			vxi = hlist_entry(pos, struct vx_info, vx_hlist);
			xids[nr_xids] = vxi->vx_id;
			if (++nr_xids >= size)
				goto out;
		}
	}
out:
	rcu_read_unlock();
	return nr_xids;
}
#endif
352
353 int vx_migrate_user(struct task_struct *p, struct vx_info *vxi)
354 {
355         struct user_struct *new_user, *old_user;
356         
357         if (!p || !vxi)
358                 BUG();
359         new_user = alloc_uid(vxi->vx_id, p->uid);
360         if (!new_user)
361                 return -ENOMEM;
362
363         old_user = p->user;
364         if (new_user != old_user) {
365                 atomic_inc(&new_user->processes);
366                 atomic_dec(&old_user->processes);
367                 p->user = new_user;
368         }
369         free_uid(old_user);
370         return 0;
371 }
372
373 void vx_mask_bcaps(struct task_struct *p)
374 {
375         struct vx_info *vxi = p->vx_info;
376
377         p->cap_effective &= vxi->vx_bcaps;
378         p->cap_inheritable &= vxi->vx_bcaps;
379         p->cap_permitted &= vxi->vx_bcaps;
380 }
381
382
383 #include <linux/file.h>
384
385 static inline int vx_nofiles_task(struct task_struct *tsk)
386 {
387         struct files_struct *files = tsk->files;
388         const unsigned long *obptr, *cbptr;
389         int count, total;
390
391         spin_lock(&files->file_lock);
392         obptr = files->open_fds->fds_bits;
393         cbptr = files->close_on_exec->fds_bits;
394         count = files->max_fds / (sizeof(unsigned long) * 8);
395         for (total = 0; count > 0; count--) {
396                 if (*obptr)
397                         total += hweight_long(*obptr);
398                 obptr++;
399         /*      if (*cbptr)
400                         total += hweight_long(*cbptr);
401                 cbptr++; */
402         }
403         spin_unlock(&files->file_lock);
404         return total;
405 }
406
407 static inline int vx_openfd_task(struct task_struct *tsk)
408 {
409         struct files_struct *files = tsk->files;
410         const unsigned long *bptr;
411         int count, total;
412
413         spin_lock(&files->file_lock);
414         bptr = files->open_fds->fds_bits;
415         count = files->max_fds / (sizeof(unsigned long) * 8);
416         for (total = 0; count > 0; count--) {
417                 if (*bptr)
418                         total += hweight_long(*bptr);
419                 bptr++;
420         }
421         spin_unlock(&files->file_lock);
422         return total;
423 }
424
425 /*
426  *      migrate task to new context
427  *      gets vxi, puts old_vxi on change
428  */
429
430 int vx_migrate_task(struct task_struct *p, struct vx_info *vxi)
431 {
432         struct vx_info *old_vxi;
433         int ret = 0;
434         
435         if (!p || !vxi)
436                 BUG();
437
438         old_vxi = task_get_vx_info(p);
439         if (old_vxi == vxi)
440                 goto out;
441
442         vxdprintk("vx_migrate_task(%p,%p[#%d.%d)\n", p, vxi,
443                 vxi->vx_id, atomic_read(&vxi->vx_usecnt));
444
445         if (!(ret = vx_migrate_user(p, vxi))) {
446                 int openfd, nofiles;
447
448                 task_lock(p);
449                 openfd = vx_openfd_task(p);
450                 nofiles = vx_nofiles_task(p);
451
452                 if (old_vxi) {
453                         atomic_dec(&old_vxi->cacct.nr_threads);
454                         atomic_dec(&old_vxi->limit.rcur[RLIMIT_NPROC]);
455                         atomic_sub(nofiles, &vxi->limit.rcur[RLIMIT_NOFILE]);
456                         atomic_sub(openfd, &vxi->limit.rcur[RLIMIT_OPENFD]);
457                 }               
458                 atomic_inc(&vxi->cacct.nr_threads);
459                 atomic_inc(&vxi->limit.rcur[RLIMIT_NPROC]);
460                 atomic_add(nofiles, &vxi->limit.rcur[RLIMIT_NOFILE]);
461                 atomic_add(openfd, &vxi->limit.rcur[RLIMIT_OPENFD]);
462                 /* should be handled in set_vx_info !! */
463                 if (old_vxi)
464                         clr_vx_info(&p->vx_info);
465                 set_vx_info(&p->vx_info, vxi);
466                 p->xid = vxi->vx_id;
467                 vx_mask_bcaps(p);
468                 task_unlock(p);
469
470                 /* obsoleted by clr/set */
471                 // put_vx_info(old_vxi);
472         }
473 out:
474         put_vx_info(old_vxi);
475         return ret;
476 }
477
478 int vx_set_init(struct vx_info *vxi, struct task_struct *p)
479 {
480         if (!vxi)
481                 return -EINVAL;
482         if (vxi->vx_initpid)
483                 return -EPERM;
484
485         vxi->vx_initpid = p->tgid;
486         return 0;
487 }
488
489
490 /* vserver syscall commands below here */
491
/* task xid and vx_info functions */
493
494 #include <asm/uaccess.h>
495
496
497 int vc_task_xid(uint32_t id, void __user *data)
498 {
499         xid_t xid;
500
501         if (id) {
502                 struct task_struct *tsk;
503
504                 if (!vx_check(0, VX_ADMIN|VX_WATCH))
505                         return -EPERM;
506
507                 read_lock(&tasklist_lock);
508                 tsk = find_task_by_pid(id);
509                 xid = (tsk) ? tsk->xid : -ESRCH;
510                 read_unlock(&tasklist_lock);
511         }
512         else
513                 xid = current->xid;
514         return xid;
515 }
516
517
518 int vc_vx_info(uint32_t id, void __user *data)
519 {
520         struct vx_info *vxi;
521         struct vcmd_vx_info_v0 vc_data;
522
523         if (!vx_check(0, VX_ADMIN))
524                 return -ENOSYS;
525         if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_RESOURCE))
526                 return -EPERM;
527
528         vxi = locate_vx_info(id);
529         if (!vxi)
530                 return -ESRCH;
531
532         vc_data.xid = vxi->vx_id;
533         vc_data.initpid = vxi->vx_initpid;
534         put_vx_info(vxi);
535
536         if (copy_to_user (data, &vc_data, sizeof(vc_data)))
537                 return -EFAULT;
538         return 0;
539 }
540
541
542 /* context functions */
543
/*	vc_ctx_create()

	* create a new context (or claim one still in setup)
	* returns the new context id on success			*/

int vc_ctx_create(uint32_t xid, void __user *data)
{
	struct vx_info *new_vxi;
	int ret;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	/* only static ids, or the dynamic wildcard, may be created */
	if ((xid >= MIN_D_CONTEXT) && (xid != VX_DYNAMIC_ID))
		return -EINVAL;

	/* xid is unsigned, so this rejects exactly xid == 0 */
	if (xid < 1)
		return -EINVAL;

	new_vxi = __loc_vx_info(xid, &ret);
	if (!new_vxi)
		return ret;
	/* an existing, fully set up context cannot be created again */
	if (!(new_vxi->vx_flags & VXF_STATE_SETUP)) {
		ret = -EEXIST;
		goto out_put;
	}

	ret = new_vxi->vx_id;
	vx_migrate_task(current, new_vxi);
	/* if this fails, we might end up with a hashed vx_info */
out_put:
	put_vx_info(new_vxi);
	return ret;
}
573
574
575 int vc_ctx_migrate(uint32_t id, void __user *data)
576 {
577         struct vx_info *vxi;
578         
579         if (!capable(CAP_SYS_ADMIN))
580                 return -EPERM;
581
582         /* dirty hack until Spectator becomes a cap */
583         if (id == 1) {
584                 current->xid = 1;
585                 return 0;
586         }
587
588         vxi = locate_vx_info(id);
589         if (!vxi)
590                 return -ESRCH;
591         vx_migrate_task(current, vxi);
592         put_vx_info(vxi);
593         return 0;
594 }
595
596
597 int vc_get_cflags(uint32_t id, void __user *data)
598 {
599         struct vx_info *vxi;
600         struct vcmd_ctx_flags_v0 vc_data;
601
602         if (!capable(CAP_SYS_ADMIN))
603                 return -EPERM;
604
605         vxi = locate_vx_info(id);
606         if (!vxi)
607                 return -ESRCH;
608
609         vc_data.flagword = vxi->vx_flags;
610
611         /* special STATE flag handling */
612         vc_data.mask = vx_mask_flags(~0UL, vxi->vx_flags, VXF_ONE_TIME);
613
614         put_vx_info(vxi);
615
616         if (copy_to_user (data, &vc_data, sizeof(vc_data)))
617                 return -EFAULT;
618         return 0;
619 }
620
/*	vc_set_cflags()

	* update the flag word of context id from userspace	*/

int vc_set_cflags(uint32_t id, void __user *data)
{
	struct vx_info *vxi;
	struct vcmd_ctx_flags_v0 vc_data;
	uint64_t mask, trigger;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (copy_from_user (&vc_data, data, sizeof(vc_data)))
		return -EFAULT;

	vxi = locate_vx_info(id);
	if (!vxi)
		return -ESRCH;

	/* special STATE flag handling */
	mask = vx_mask_mask(vc_data.mask, vxi->vx_flags, VXF_ONE_TIME);
	/* trigger: the maskable flags that actually change value */
	trigger = (mask & vxi->vx_flags) ^ (mask & vc_data.flagword);

	/* toggling SETUP applies the bcaps bound to the caller */
	if (trigger & VXF_STATE_SETUP)
		vx_mask_bcaps(current);
	/* toggling INIT on the caller's own context records its pid */
	if (trigger & VXF_STATE_INIT)
		if (vxi == current->vx_info)
			vx_set_init(vxi, current);

	vxi->vx_flags = vx_mask_flags(vxi->vx_flags,
		vc_data.flagword, mask);
	put_vx_info(vxi);
	return 0;
}
651
652 int vc_get_ccaps(uint32_t id, void __user *data)
653 {
654         struct vx_info *vxi;
655         struct vcmd_ctx_caps_v0 vc_data;
656
657         if (!capable(CAP_SYS_ADMIN))
658                 return -EPERM;
659
660         vxi = locate_vx_info(id);
661         if (!vxi)
662                 return -ESRCH;
663
664         vc_data.bcaps = vxi->vx_bcaps;
665         vc_data.ccaps = vxi->vx_ccaps;
666         vc_data.cmask = ~0UL;
667         put_vx_info(vxi);
668
669         if (copy_to_user (data, &vc_data, sizeof(vc_data)))
670                 return -EFAULT;
671         return 0;
672 }
673
674 int vc_set_ccaps(uint32_t id, void __user *data)
675 {
676         struct vx_info *vxi;
677         struct vcmd_ctx_caps_v0 vc_data;
678
679         if (!capable(CAP_SYS_ADMIN))
680                 return -EPERM;
681         if (copy_from_user (&vc_data, data, sizeof(vc_data)))
682                 return -EFAULT;
683
684         vxi = locate_vx_info(id);
685         if (!vxi)
686                 return -ESRCH;
687
688         vxi->vx_bcaps &= vc_data.bcaps;
689         vxi->vx_ccaps = vx_mask_flags(vxi->vx_ccaps,
690                 vc_data.ccaps, vc_data.cmask);
691         put_vx_info(vxi);
692         return 0;
693 }
694
695 #include <linux/module.h>
696
697 EXPORT_SYMBOL_GPL(rcu_free_vx_info);
698 EXPORT_SYMBOL_GPL(vx_info_hash_lock);
699 EXPORT_SYMBOL_GPL(unhash_vx_info);
700