VServer 1.9.2 (patch-2.6.8.1-vs1.9.2.diff)
kernel/vserver/context.c
/*
 *  linux/kernel/vserver/context.c
 *
 *  Virtual Server: Context Support
 *
 *  Copyright (C) 2003-2004  Herbert Pötzl
 *
 *  V0.01  context helper
 *  V0.02  vx_ctx_kill syscall command
 *  V0.03  replaced context_info calls
 *  V0.04  redesign of struct (de)alloc
 *  V0.05  rlimit basic implementation
 *  V0.06  task_xid and info commands
 *  V0.07  context flags and caps
 *  V0.08  switch to RCU based hash
 *
 */

#include <linux/config.h>
#include <linux/slab.h>
#include <linux/vserver.h>
#include <linux/vserver/legacy.h>
#include <linux/vs_base.h>
#include <linux/vs_context.h>
#include <linux/kernel_stat.h>
#include <linux/namespace.h>
#include <linux/rcupdate.h>

#include <asm/errno.h>


/*      __alloc_vx_info()

        * allocate an initialized vx_info struct
        * doesn't make it visible (hash)                        */

static struct vx_info *__alloc_vx_info(xid_t xid)
{
        struct vx_info *new = NULL;

        vxdprintk(VXD_CBIT(xid, 0), "alloc_vx_info(%d)*", xid);

        /* would this benefit from a slab cache? */
        new = kmalloc(sizeof(struct vx_info), GFP_KERNEL);
        if (!new)
                return NULL;

        memset(new, 0, sizeof(struct vx_info));
        new->vx_id = xid;
        INIT_RCU_HEAD(&new->vx_rcu);
        INIT_HLIST_NODE(&new->vx_hlist);
        atomic_set(&new->vx_refcnt, 0);
        atomic_set(&new->vx_usecnt, 0);

        /* rest of init goes here */
        vx_info_init_limit(&new->limit);
        vx_info_init_sched(&new->sched);
        vx_info_init_cvirt(&new->cvirt);
        vx_info_init_cacct(&new->cacct);

        new->vx_flags = VXF_STATE_SETUP|VXF_STATE_INIT;
        new->vx_bcaps = CAP_INIT_EFF_SET;
        new->vx_ccaps = 0;

        vxdprintk(VXD_CBIT(xid, 0),
                "alloc_vx_info(%d) = %p", xid, new);
        return new;
}
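
/*      a minimal sketch of the slab cache hinted at in the comment
        above; not wired up, and vx_info_cachep / vx_cache_init()
        are hypothetical names (2.6 slab API)                    */

#if 0
static kmem_cache_t *vx_info_cachep;

static void __init vx_cache_init(void)
{
        /* dedicated cache, no constructor/destructor needed */
        vx_info_cachep = kmem_cache_create("vx_info_cache",
                sizeof(struct vx_info), 0,
                SLAB_HWCACHE_ALIGN, NULL, NULL);
}

/* __alloc_vx_info() would then allocate via
 *      new = kmem_cache_alloc(vx_info_cachep, GFP_KERNEL);
 * and __dealloc_vx_info() free via
 *      kmem_cache_free(vx_info_cachep, vxi);
 */
#endif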

/*      __dealloc_vx_info()

        * final disposal of vx_info                             */

static void __dealloc_vx_info(struct vx_info *vxi)
{
        vxdprintk(VXD_CBIT(xid, 0),
                "dealloc_vx_info(%p)", vxi);

        vxi->vx_hlist.next = LIST_POISON1;
        vxi->vx_id = -1;

        if (vxi->vx_namespace)
                put_namespace(vxi->vx_namespace);
        if (vxi->vx_fs)
                put_fs_struct(vxi->vx_fs);

        vx_info_exit_limit(&vxi->limit);
        vx_info_exit_sched(&vxi->sched);
        vx_info_exit_cvirt(&vxi->cvirt);
        vx_info_exit_cacct(&vxi->cacct);

        BUG_ON(atomic_read(&vxi->vx_usecnt));
        BUG_ON(atomic_read(&vxi->vx_refcnt));

        kfree(vxi);
}


/*      hash table for vx_info hash */

#define VX_HASH_SIZE    13

struct hlist_head vx_info_hash[VX_HASH_SIZE];

static spinlock_t vx_info_hash_lock = SPIN_LOCK_UNLOCKED;


static inline unsigned int __hashval(xid_t xid)
{
        return (xid % VX_HASH_SIZE);
}



/*      __hash_vx_info()

        * add the vxi to the global hash table
        * requires the hash_lock to be held                     */

static inline void __hash_vx_info(struct vx_info *vxi)
{
        struct hlist_head *head;

        vxdprintk(VXD_CBIT(xid, 4),
                "__hash_vx_info: %p[#%d]", vxi, vxi->vx_id);
        get_vx_info(vxi);
        head = &vx_info_hash[__hashval(vxi->vx_id)];
        hlist_add_head_rcu(&vxi->vx_hlist, head);
}

/*      __unhash_vx_info()

        * remove the vxi from the global hash table
        * requires the hash_lock to be held                     */

static inline void __unhash_vx_info(struct vx_info *vxi)
{
        vxdprintk(VXD_CBIT(xid, 4),
                "__unhash_vx_info: %p[#%d]", vxi, vxi->vx_id);
        hlist_del_rcu(&vxi->vx_hlist);
        put_vx_info(vxi);
}


/*      __lookup_vx_info()

        * requires the rcu_read_lock()
        * doesn't increment the vx_refcnt                       */

static inline struct vx_info *__lookup_vx_info(xid_t xid)
{
        struct hlist_head *head = &vx_info_hash[__hashval(xid)];
        struct hlist_node *pos;

        hlist_for_each_rcu(pos, head) {
                struct vx_info *vxi =
                        hlist_entry(pos, struct vx_info, vx_hlist);

                if (vxi->vx_id == xid)
                        return vxi;
        }
        return NULL;
}
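
/*      illustrative caller (example_lookup() is hypothetical,
        cf. locate_vx_info() below): a reference must be taken
        under rcu_read_lock() before the vxi may be used outside
        the read side                                            */

#if 0
static struct vx_info *example_lookup(xid_t xid)
{
        struct vx_info *vxi;

        rcu_read_lock();
        vxi = __lookup_vx_info(xid);
        if (vxi)
                get_vx_info(vxi);       /* pin before unlock */
        rcu_read_unlock();
        return vxi;     /* caller does put_vx_info() when done */
}
#endif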


/*      __vx_dynamic_id()

        * find unused dynamic xid
        * requires the hash_lock to be held                     */

static inline xid_t __vx_dynamic_id(void)
{
        static xid_t seq = MAX_S_CONTEXT;
        xid_t barrier = seq;

        do {
                if (++seq > MAX_S_CONTEXT)
                        seq = MIN_D_CONTEXT;
                if (!__lookup_vx_info(seq)) {
                        vxdprintk(VXD_CBIT(xid, 4),
                                "__vx_dynamic_id: [#%d]", seq);
                        return seq;
                }
        } while (barrier != seq);
        return 0;
}
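
/*      example walk: seq starts at MAX_S_CONTEXT, so the first
        request wraps to MIN_D_CONTEXT; xids already hashed are
        skipped, and a full cycle back to the barrier without a
        free slot returns 0, which the caller treats as
        exhaustion                                               */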

/*      __loc_vx_info()

        * locate or create the requested context
        * get() it and if new hash it                           */

static struct vx_info * __loc_vx_info(int id, int *err)
{
        struct vx_info *new, *vxi = NULL;

        vxdprintk(VXD_CBIT(xid, 1), "loc_vx_info(%d)*", id);

        if (!(new = __alloc_vx_info(id))) {
                *err = -ENOMEM;
                return NULL;
        }

        spin_lock(&vx_info_hash_lock);

        /* dynamic context requested */
        if (id == VX_DYNAMIC_ID) {
                id = __vx_dynamic_id();
                if (!id) {
                        printk(KERN_ERR "no dynamic context available.\n");
                        *err = -EAGAIN; /* dynamic range exhausted */
                        goto out_unlock;
                }
                new->vx_id = id;
        }
        /* existing context requested */
        else if ((vxi = __lookup_vx_info(id))) {
                /* context in setup is not available */
                if (vxi->vx_flags & VXF_STATE_SETUP) {
                        vxdprintk(VXD_CBIT(xid, 0),
                                "loc_vx_info(%d) = %p (not available)", id, vxi);
                        vxi = NULL;
                        *err = -EBUSY;
                } else {
                        vxdprintk(VXD_CBIT(xid, 0),
                                "loc_vx_info(%d) = %p (found)", id, vxi);
                        get_vx_info(vxi);
                        *err = 0;
                }
                goto out_unlock;
        }

        /* new context requested */
        vxdprintk(VXD_CBIT(xid, 0),
                "loc_vx_info(%d) = %p (new)", id, new);
        __hash_vx_info(get_vx_info(new));
        vxi = new, new = NULL;
        *err = 1;

out_unlock:
        spin_unlock(&vx_info_hash_lock);
        if (new)
                __dealloc_vx_info(new);
        return vxi;
}
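
/*      the resulting contract, summarized:

                vxi = __loc_vx_info(id, &err);

        vxi != NULL, err == 1:  new context created and hashed
        vxi != NULL, err == 0:  existing context found, get()ed
        vxi == NULL, err < 0:   -ENOMEM, -EBUSY or -EAGAIN       */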



/*      exported stuff                                          */



void rcu_free_vx_info(struct rcu_head *head)
{
        struct vx_info *vxi = container_of(head, struct vx_info, vx_rcu);
        int usecnt, refcnt;

        BUG_ON(!vxi || !head);

        usecnt = atomic_read(&vxi->vx_usecnt);
        BUG_ON(usecnt < 0);

        refcnt = atomic_read(&vxi->vx_refcnt);
        BUG_ON(refcnt < 0);

        vxdprintk(VXD_CBIT(xid, 3),
                "rcu_free_vx_info(%p): uc=%d", vxi, usecnt);
        if (!usecnt)
                __dealloc_vx_info(vxi);
        else
                printk(KERN_WARNING "!!! rcu didn't free\n");
}
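
/*      schematically, the release side feeding this callback
        (the exact call site is assumed to live in the headers,
        with the final put of a vx_info doing):

                call_rcu(&vxi->vx_rcu, rcu_free_vx_info);

        so concurrent RCU readers can keep using the struct
        until a grace period has elapsed                         */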

void unhash_vx_info(struct vx_info *vxi)
{
        spin_lock(&vx_info_hash_lock);
        __unhash_vx_info(vxi);
        spin_unlock(&vx_info_hash_lock);
}

/*      locate_vx_info()

        * search for a vx_info and get() it
        * negative id means current                             */

struct vx_info *locate_vx_info(int id)
{
        struct vx_info *vxi;

        if (id < 0) {
                vxi = get_vx_info(current->vx_info);
        } else {
                rcu_read_lock();
                vxi = get_vx_info(__lookup_vx_info(id));
                rcu_read_unlock();
        }
        return vxi;
}
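
/*      typical pairing, as used by the vc_* commands below:

                vxi = locate_vx_info(id);
                if (!vxi)
                        return -ESRCH;
                ...
                put_vx_info(vxi);                                */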

/*      vx_info_is_hashed()

        * verify that xid is still hashed                       */

int vx_info_is_hashed(xid_t xid)
{
        int hashed;

        rcu_read_lock();
        hashed = (__lookup_vx_info(xid) != NULL);
        rcu_read_unlock();
        return hashed;
}

#ifdef  CONFIG_VSERVER_LEGACY

#if 0
struct vx_info *alloc_vx_info(xid_t xid)
{
        return __alloc_vx_info(xid);
}
#endif

struct vx_info *locate_or_create_vx_info(int id)
{
        int err;

        return __loc_vx_info(id, &err);
}

#endif

#ifdef  CONFIG_PROC_FS

#define hlist_for_each_rcu(pos, head) \
        for (pos = (head)->first; pos && ({ prefetch(pos->next); 1;}); \
                pos = pos->next, ({ smp_read_barrier_depends(); 0;}))

int get_xid_list(int index, unsigned int *xids, int size)
{
        int hindex, nr_xids = 0;

        rcu_read_lock();
        for (hindex = 0; hindex < VX_HASH_SIZE; hindex++) {
                struct hlist_head *head = &vx_info_hash[hindex];
                struct hlist_node *pos;

                hlist_for_each_rcu(pos, head) {
                        struct vx_info *vxi;

                        if (--index > 0)
                                continue;

                        vxi = hlist_entry(pos, struct vx_info, vx_hlist);
                        xids[nr_xids] = vxi->vx_id;
                        if (++nr_xids >= size)
                                goto out;
                }
        }
out:
        rcu_read_unlock();
        return nr_xids;
}
#endif
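
/*      illustrative consumer of get_xid_list() (NR_XIDS is a
        hypothetical size; the real caller lives in the proc
        code):                                                   */

#if 0
{
        unsigned int xids[NR_XIDS];
        int nr;

        nr = get_xid_list(1, xids, NR_XIDS);
        /* xids[0..nr-1] now hold hashed xids in hash order;
           the next chunk would pass a higher start index */
}
#endif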

int vx_migrate_user(struct task_struct *p, struct vx_info *vxi)
{
        struct user_struct *new_user, *old_user;

        BUG_ON(!p || !vxi);

        new_user = alloc_uid(vxi->vx_id, p->uid);
        if (!new_user)
                return -ENOMEM;

        old_user = p->user;
        if (new_user != old_user) {
                atomic_inc(&new_user->processes);
                atomic_dec(&old_user->processes);
                p->user = new_user;
        }
        free_uid(old_user);
        return 0;
}

void vx_mask_bcaps(struct task_struct *p)
{
        struct vx_info *vxi = p->vx_info;

        p->cap_effective &= vxi->vx_bcaps;
        p->cap_inheritable &= vxi->vx_bcaps;
        p->cap_permitted &= vxi->vx_bcaps;
}


#include <linux/file.h>

static inline int vx_nofiles_task(struct task_struct *tsk)
{
        struct files_struct *files = tsk->files;
        const unsigned long *obptr;
        int count, total;

        spin_lock(&files->file_lock);
        obptr = files->open_fds->fds_bits;
        count = files->max_fds / (sizeof(unsigned long) * 8);
        for (total = 0; count > 0; count--) {
                if (*obptr)
                        total += hweight_long(*obptr);
                obptr++;
        }
        spin_unlock(&files->file_lock);
        return total;
}
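
/*      worked example for the loop above: with max_fds = 256 and
        32 bit longs, count is 8 words; a first word of 0x13
        (fds 0, 1 and 4 open) contributes hweight_long(0x13) = 3 */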

#if 0

static inline int vx_openfd_task(struct task_struct *tsk)
{
        struct files_struct *files = tsk->files;
        const unsigned long *bptr;
        int count, total;

        spin_lock(&files->file_lock);
        bptr = files->open_fds->fds_bits;
        count = files->max_fds / (sizeof(unsigned long) * 8);
        for (total = 0; count > 0; count--) {
                if (*bptr)
                        total += hweight_long(*bptr);
                bptr++;
        }
        spin_unlock(&files->file_lock);
        return total;
}

#endif

/*
 *      migrate task to new context
 *      gets vxi, puts old_vxi on change
 */

int vx_migrate_task(struct task_struct *p, struct vx_info *vxi)
{
        struct vx_info *old_vxi;
        int ret = 0;

        BUG_ON(!p || !vxi);

        old_vxi = task_get_vx_info(p);
        if (old_vxi == vxi)
                goto out;

        vxdprintk(VXD_CBIT(xid, 5),
                "vx_migrate_task(%p,%p[#%d.%d])", p, vxi,
                vxi->vx_id, atomic_read(&vxi->vx_usecnt));

        if (!(ret = vx_migrate_user(p, vxi))) {
                int nofiles;

                task_lock(p);
                // openfd = vx_openfd_task(p);
                nofiles = vx_nofiles_task(p);

                if (old_vxi) {
                        atomic_dec(&old_vxi->cacct.nr_threads);
                        atomic_dec(&old_vxi->limit.rcur[RLIMIT_NPROC]);
                        atomic_sub(nofiles, &old_vxi->limit.rcur[RLIMIT_NOFILE]);
                        // atomic_sub(openfd, &old_vxi->limit.rcur[RLIMIT_OPENFD]);
                }
                atomic_inc(&vxi->cacct.nr_threads);
                atomic_inc(&vxi->limit.rcur[RLIMIT_NPROC]);
                atomic_add(nofiles, &vxi->limit.rcur[RLIMIT_NOFILE]);
                // atomic_add(openfd, &vxi->limit.rcur[RLIMIT_OPENFD]);

                vxdprintk(VXD_CBIT(xid, 5),
                        "moved task %p into vxi:%p[#%d]",
                        p, vxi, vxi->vx_id);

                /* should be handled in set_vx_info !! */
                if (old_vxi)
                        clr_vx_info(&p->vx_info);
                set_vx_info(&p->vx_info, vxi);
                p->xid = vxi->vx_id;
                vx_mask_bcaps(p);
                task_unlock(p);

                /* obsoleted by clr/set */
                // put_vx_info(old_vxi);
        }
out:
        put_vx_info(old_vxi);
        return ret;
}
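
/*      net effect, by example: for a task with 5 open files,
        migration moves nr_threads and RLIMIT_NPROC by 1 and
        RLIMIT_NOFILE by 5 from old_vxi to vxi, then switches
        p->vx_info and p->xid under task_lock()                  */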

int vx_set_init(struct vx_info *vxi, struct task_struct *p)
{
        if (!vxi)
                return -EINVAL;
        if (vxi->vx_initpid)
                return -EPERM;

        vxi->vx_initpid = p->tgid;
        return 0;
}


/* vserver syscall commands below here */

/* task xid and vx_info functions */

#include <asm/uaccess.h>


int vc_task_xid(uint32_t id, void __user *data)
{
        xid_t xid;

        if (id) {
                struct task_struct *tsk;

                if (!vx_check(0, VX_ADMIN|VX_WATCH))
                        return -EPERM;

                read_lock(&tasklist_lock);
                tsk = find_task_by_pid(id);
                xid = (tsk) ? tsk->xid : -ESRCH;
                read_unlock(&tasklist_lock);
        } else
                xid = current->xid;
        return xid;
}


int vc_vx_info(uint32_t id, void __user *data)
{
        struct vx_info *vxi;
        struct vcmd_vx_info_v0 vc_data;

        if (!vx_check(0, VX_ADMIN))
                return -ENOSYS;
        if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_RESOURCE))
                return -EPERM;

        vxi = locate_vx_info(id);
        if (!vxi)
                return -ESRCH;

        vc_data.xid = vxi->vx_id;
        vc_data.initpid = vxi->vx_initpid;
        put_vx_info(vxi);

        if (copy_to_user(data, &vc_data, sizeof(vc_data)))
                return -EFAULT;
        return 0;
}


/* context functions */

int vc_ctx_create(uint32_t xid, void __user *data)
{
        struct vx_info *new_vxi;
        int ret;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        if ((xid >= MIN_D_CONTEXT) && (xid != VX_DYNAMIC_ID))
                return -EINVAL;

        if (xid < 1)
                return -EINVAL;

        new_vxi = __loc_vx_info(xid, &ret);
        if (!new_vxi)
                return ret;
        if (!(new_vxi->vx_flags & VXF_STATE_SETUP)) {
                ret = -EEXIST;
                goto out_put;
        }

        ret = new_vxi->vx_id;
        vx_migrate_task(current, new_vxi);
        /* if this fails, we might end up with a hashed vx_info */
out_put:
        put_vx_info(new_vxi);
        return ret;
}
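
/*      call pattern from userspace, schematically (the vserver
        syscall multiplexes commands; wrapper and command names
        here are assumptions):

                new_xid = vserver(VCMD_ctx_create, xid, NULL);

        a positive return is the xid actually created (useful
        with VX_DYNAMIC_ID), a negative one is an error above    */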


int vc_ctx_migrate(uint32_t id, void __user *data)
{
        struct vx_info *vxi;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        /* dirty hack until Spectator becomes a cap */
        if (id == 1) {
                current->xid = 1;
                return 0;
        }

        vxi = locate_vx_info(id);
        if (!vxi)
                return -ESRCH;
        vx_migrate_task(current, vxi);
        put_vx_info(vxi);
        return 0;
}


int vc_get_cflags(uint32_t id, void __user *data)
{
        struct vx_info *vxi;
        struct vcmd_ctx_flags_v0 vc_data;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        vxi = locate_vx_info(id);
        if (!vxi)
                return -ESRCH;

        vc_data.flagword = vxi->vx_flags;

        /* special STATE flag handling */
        vc_data.mask = vx_mask_flags(~0UL, vxi->vx_flags, VXF_ONE_TIME);

        put_vx_info(vxi);

        if (copy_to_user(data, &vc_data, sizeof(vc_data)))
                return -EFAULT;
        return 0;
}

int vc_set_cflags(uint32_t id, void __user *data)
{
        struct vx_info *vxi;
        struct vcmd_ctx_flags_v0 vc_data;
        uint64_t mask, trigger;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
        if (copy_from_user(&vc_data, data, sizeof(vc_data)))
                return -EFAULT;

        vxi = locate_vx_info(id);
        if (!vxi)
                return -ESRCH;

        /* special STATE flag handling */
        mask = vx_mask_mask(vc_data.mask, vxi->vx_flags, VXF_ONE_TIME);
        trigger = (mask & vxi->vx_flags) ^ (mask & vc_data.flagword);

        if (trigger & VXF_STATE_SETUP)
                vx_mask_bcaps(current);
        if (trigger & VXF_STATE_INIT)
                if (vxi == current->vx_info)
                        vx_set_init(vxi, current);

        vxi->vx_flags = vx_mask_flags(vxi->vx_flags,
                vc_data.flagword, mask);
        put_vx_info(vxi);
        return 0;
}
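
/*      example of the trigger logic above: VXF_STATE_SETUP is a
        one-time flag, so it stays in the mask only while still
        set; a caller clearing it flips that bit in trigger,
        which fires the vx_mask_bcaps() transition exactly once  */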

int vc_get_ccaps(uint32_t id, void __user *data)
{
        struct vx_info *vxi;
        struct vcmd_ctx_caps_v0 vc_data;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        vxi = locate_vx_info(id);
        if (!vxi)
                return -ESRCH;

        vc_data.bcaps = vxi->vx_bcaps;
        vc_data.ccaps = vxi->vx_ccaps;
        vc_data.cmask = ~0UL;
        put_vx_info(vxi);

        if (copy_to_user(data, &vc_data, sizeof(vc_data)))
                return -EFAULT;
        return 0;
}

int vc_set_ccaps(uint32_t id, void __user *data)
{
        struct vx_info *vxi;
        struct vcmd_ctx_caps_v0 vc_data;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
        if (copy_from_user(&vc_data, data, sizeof(vc_data)))
                return -EFAULT;

        vxi = locate_vx_info(id);
        if (!vxi)
                return -ESRCH;

        vxi->vx_bcaps &= vc_data.bcaps;
        vxi->vx_ccaps = vx_mask_flags(vxi->vx_ccaps,
                vc_data.ccaps, vc_data.cmask);
        put_vx_info(vxi);
        return 0;
}
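
/*      note the asymmetry, by example: bcaps only ever shrink
        (&=), so a capability such as CAP_NET_ADMIN, once dropped
        from a context, cannot be granted back; ccaps can be both
        set and cleared through cmask                            */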

#include <linux/module.h>

EXPORT_SYMBOL_GPL(rcu_free_vx_info);
EXPORT_SYMBOL_GPL(vx_info_hash_lock);
EXPORT_SYMBOL_GPL(unhash_vx_info);
