vserver 1.9.3
[linux-2.6.git] / kernel / vserver / context.c
1 /*
2  *  linux/kernel/vserver/context.c
3  *
4  *  Virtual Server: Context Support
5  *
 *  Copyright (C) 2003-2004  Herbert Pötzl
7  *
8  *  V0.01  context helper
9  *  V0.02  vx_ctx_kill syscall command
10  *  V0.03  replaced context_info calls
11  *  V0.04  redesign of struct (de)alloc
12  *  V0.05  rlimit basic implementation
13  *  V0.06  task_xid and info commands
14  *  V0.07  context flags and caps
15  *  V0.08  switch to RCU based hash
16  *
17  */
18
19 #include <linux/config.h>
20 #include <linux/slab.h>
21 #include <linux/vserver.h>
22 #include <linux/vserver/legacy.h>
23 #include <linux/vs_base.h>
24 #include <linux/vs_context.h>
25 #include <linux/kernel_stat.h>
26 #include <linux/namespace.h>
27 #include <linux/rcupdate.h>
28
29 #include <asm/errno.h>
30
31
32 /*      __alloc_vx_info()
33
34         * allocate an initialized vx_info struct
35         * doesn't make it visible (hash)                        */
36
37 static struct vx_info *__alloc_vx_info(xid_t xid)
38 {
39         struct vx_info *new = NULL;
40
41         vxdprintk(VXD_CBIT(xid, 0), "alloc_vx_info(%d)*", xid);
42
43         /* would this benefit from a slab cache? */
44         new = kmalloc(sizeof(struct vx_info), GFP_KERNEL);
45         if (!new)
46                 return 0;
47
48         memset (new, 0, sizeof(struct vx_info));
49         new->vx_id = xid;
50         INIT_RCU_HEAD(&new->vx_rcu);
51         INIT_HLIST_NODE(&new->vx_hlist);
52         atomic_set(&new->vx_refcnt, 0);
53         atomic_set(&new->vx_usecnt, 0);
54         new->vx_parent = NULL;
55         new->vx_state = 0;
56         new->vx_lock = SPIN_LOCK_UNLOCKED;
57         init_waitqueue_head(&new->vx_exit);
58
59         /* rest of init goes here */
60         vx_info_init_limit(&new->limit);
61         vx_info_init_sched(&new->sched);
62         vx_info_init_cvirt(&new->cvirt);
63         vx_info_init_cacct(&new->cacct);
64
65
66         new->vx_flags = VXF_STATE_SETUP|VXF_STATE_INIT;
67         new->vx_bcaps = CAP_INIT_EFF_SET;
68         new->vx_ccaps = 0;
69
70         vxdprintk(VXD_CBIT(xid, 0),
71                 "alloc_vx_info(%d) = %p", xid, new);
72         return new;
73 }
74
75 /*      __dealloc_vx_info()
76
77         * final disposal of vx_info                             */
78
static void __dealloc_vx_info(struct vx_info *vxi)
{
	vxdprintk(VXD_CBIT(xid, 0),
		"dealloc_vx_info(%p)", vxi);

	/* poison the list pointer and invalidate the id so a stale
	 * reference trips as early as possible */
	vxi->vx_hlist.next = LIST_POISON1;
	vxi->vx_id = -1;

	/* tear down the per-subsystem state, same order as init */
	vx_info_exit_limit(&vxi->limit);
	vx_info_exit_sched(&vxi->sched);
	vx_info_exit_cvirt(&vxi->cvirt);
	vx_info_exit_cacct(&vxi->cacct);


	/* both counters must be zero before the struct is freed */
	BUG_ON(atomic_read(&vxi->vx_usecnt));
	BUG_ON(atomic_read(&vxi->vx_refcnt));

	/* a hashed context must never reach final disposal */
	BUG_ON(vx_info_state(vxi, VXS_HASHED));
	// BUG_ON(!vx_state(vxi, VXS_DEFUNCT));

	vxi->vx_state |= VXS_RELEASED;
	kfree(vxi);
}
102
103 static inline int __free_vx_info(struct vx_info *vxi)
104 {
105         int usecnt, refcnt;
106
107         BUG_ON(!vxi);
108
109         usecnt = atomic_read(&vxi->vx_usecnt);
110         BUG_ON(usecnt < 0);
111
112         refcnt = atomic_read(&vxi->vx_refcnt);
113         BUG_ON(refcnt < 0);
114
115         if (!usecnt)
116                 __dealloc_vx_info(vxi);
117         return usecnt;
118 }
119
#if 0	/* disabled: RCU-based disposal path, kept for reference */

/* RCU callback: final stage of freeing a vx_info after the
 * grace period has elapsed */
static void __rcu_free_vx_info(struct rcu_head *head)
{
	struct vx_info *vxi = container_of(head, struct vx_info, vx_rcu);

	BUG_ON(!head);
	vxdprintk(VXD_CBIT(xid, 3),
		"rcu_free_vx_info(%p): uc=%d", vxi,
		atomic_read(&vxi->vx_usecnt));

	__free_vx_info(vxi);
}

#endif
135
/* release a context's namespace/fs references and dispose of it;
 * the use count must already be zero (checked via BUG_ON below) */
void free_vx_info(struct vx_info *vxi)
{
	struct namespace *namespace;
	struct fs_struct *fs;

	/* context shutdown is mandatory */
	// BUG_ON(vxi->vx_state != VXS_SHUTDOWN);

	/* atomically detach the namespace and fs pointers so each
	 * reference is dropped exactly once, even if racing */
	namespace = xchg(&vxi->vx_namespace, NULL);
	fs = xchg(&vxi->vx_fs, NULL);

	if (namespace)
		put_namespace(namespace);
	if (fs)
		put_fs_struct(fs);

	/* __free_vx_info() returns the use count; nonzero here is a bug */
	BUG_ON(__free_vx_info(vxi));
	// call_rcu(&i->vx_rcu, __rcu_free_vx_info);
}
155
156
/*	hash table for vx_info hash */

#define VX_HASH_SIZE	13	/* small prime bucket count */

/* bucket heads; readers traverse under rcu_read_lock() */
struct hlist_head vx_info_hash[VX_HASH_SIZE];

/* serializes all modifications of vx_info_hash
 * NOTE(review): this is static but EXPORT_SYMBOL_GPL'd at the
 * bottom of the file — confirm the export actually builds */
static spinlock_t vx_info_hash_lock = SPIN_LOCK_UNLOCKED;
164
165
166 static inline unsigned int __hashval(xid_t xid)
167 {
168         return (xid % VX_HASH_SIZE);
169 }
170
171
172
173 /*      __hash_vx_info()
174
175         * add the vxi to the global hash table
176         * requires the hash_lock to be held                     */
177
static inline void __hash_vx_info(struct vx_info *vxi)
{
	struct hlist_head *head;

	vxdprintk(VXD_CBIT(xid, 4),
		"__hash_vx_info: %p[#%d]", vxi, vxi->vx_id);
	/* the hash table holds its own reference on the context */
	get_vx_info(vxi);
	/* mark hashed before insertion so lookups accept the entry */
	vxi->vx_state |= VXS_HASHED;
	head = &vx_info_hash[__hashval(vxi->vx_id)];
	hlist_add_head_rcu(&vxi->vx_hlist, head);
}
189
190 /*      __unhash_vx_info()
191
192         * remove the vxi from the global hash table
193         * requires the hash_lock to be held                     */
194
static inline void __unhash_vx_info(struct vx_info *vxi)
{
	vxdprintk(VXD_CBIT(xid, 4),
		"__unhash_vx_info: %p[#%d]", vxi, vxi->vx_id);
	/* clear the hashed state first so concurrent RCU lookups
	 * stop returning this entry */
	vxi->vx_state &= ~VXS_HASHED;
	hlist_del_rcu(&vxi->vx_hlist);
	/* drop the reference the hash table held */
	put_vx_info(vxi);
}
203
204
205 /*      __lookup_vx_info()
206
207         * requires the rcu_read_lock()
208         * doesn't increment the vx_refcnt                       */
209
210 static inline struct vx_info *__lookup_vx_info(xid_t xid)
211 {
212         struct hlist_head *head = &vx_info_hash[__hashval(xid)];
213         struct hlist_node *pos;
214
215         hlist_for_each_rcu(pos, head) {
216                 struct vx_info *vxi =
217                         hlist_entry(pos, struct vx_info, vx_hlist);
218
219                 if ((vxi->vx_id == xid) &&
220                         vx_info_state(vxi, VXS_HASHED))
221                         return vxi;
222         }
223         return NULL;
224 }
225
226
227 /*      __vx_dynamic_id()
228
229         * find unused dynamic xid
230         * requires the hash_lock to be held                     */
231
static inline xid_t __vx_dynamic_id(void)
{
	/* remembers where the previous search stopped so successive
	 * calls hand out ids round-robin; protected by the hash lock */
	static xid_t seq = MAX_S_CONTEXT;
	xid_t barrier = seq;

	do {
		/* wrap from the top of the range back to the first
		 * dynamic context id */
		if (++seq > MAX_S_CONTEXT)
			seq = MIN_D_CONTEXT;
		if (!__lookup_vx_info(seq)) {
			vxdprintk(VXD_CBIT(xid, 4),
				"__vx_dynamic_id: [#%d]", seq);
			return seq;
		}
	} while (barrier != seq);
	/* full cycle without a free slot: all dynamic ids in use */
	return 0;
}
248
249 /*      __loc_vx_info()
250
251         * locate or create the requested context
252         * get() it and if new hash it                           */
253
254 static struct vx_info * __loc_vx_info(int id, int *err)
255 {
256         struct vx_info *new, *vxi = NULL;
257
258         vxdprintk(VXD_CBIT(xid, 1), "loc_vx_info(%d)*", id);
259
260         if (!(new = __alloc_vx_info(id))) {
261                 *err = -ENOMEM;
262                 return NULL;
263         }
264
265         spin_lock(&vx_info_hash_lock);
266
267         /* dynamic context requested */
268         if (id == VX_DYNAMIC_ID) {
269                 id = __vx_dynamic_id();
270                 if (!id) {
271                         printk(KERN_ERR "no dynamic context available.\n");
272                         goto out_unlock;
273                 }
274                 new->vx_id = id;
275         }
276         /* existing context requested */
277         else if ((vxi = __lookup_vx_info(id))) {
278                 /* context in setup is not available */
279                 if (vxi->vx_flags & VXF_STATE_SETUP) {
280                         vxdprintk(VXD_CBIT(xid, 0),
281                                 "loc_vx_info(%d) = %p (not available)", id, vxi);
282                         vxi = NULL;
283                         *err = -EBUSY;
284                 } else {
285                         vxdprintk(VXD_CBIT(xid, 0),
286                                 "loc_vx_info(%d) = %p (found)", id, vxi);
287                         get_vx_info(vxi);
288                         *err = 0;
289                 }
290                 goto out_unlock;
291         }
292
293         /* new context requested */
294         vxdprintk(VXD_CBIT(xid, 0),
295                 "loc_vx_info(%d) = %p (new)", id, new);
296         __hash_vx_info(get_vx_info(new));
297         vxi = new, new = NULL;
298         *err = 1;
299
300 out_unlock:
301         spin_unlock(&vx_info_hash_lock);
302         if (new)
303                 __dealloc_vx_info(new);
304         return vxi;
305 }
306
307
308
309 /*      exported stuff                                          */
310
311
/* remove a context from the global hash, taking the hash lock */
void unhash_vx_info(struct vx_info *vxi)
{
	spin_lock(&vx_info_hash_lock);
	__unhash_vx_info(vxi);
	spin_unlock(&vx_info_hash_lock);
}
318
319 /*      locate_vx_info()
320
321         * search for a vx_info and get() it
322         * negative id means current                             */
323
324 struct vx_info *locate_vx_info(int id)
325 {
326         struct vx_info *vxi;
327
328         if (id < 0) {
329                 vxi = get_vx_info(current->vx_info);
330         } else {
331                 rcu_read_lock();
332                 vxi = get_vx_info(__lookup_vx_info(id));
333                 rcu_read_unlock();
334         }
335         return vxi;
336 }
337
338 /*      vx_info_is_hashed()
339
340         * verify that xid is still hashed                       */
341
342 int vx_info_is_hashed(xid_t xid)
343 {
344         int hashed;
345
346         rcu_read_lock();
347         hashed = (__lookup_vx_info(xid) != NULL);
348         rcu_read_unlock();
349         return hashed;
350 }
351
352 #ifdef  CONFIG_VSERVER_LEGACY
353
#if 0	/* disabled: raw allocation without hashing, kept for reference */
struct vx_info *alloc_vx_info(xid_t xid)
{
	return __alloc_vx_info(xid);
}
#endif
360
/* legacy wrapper: locate or create a context, discarding the
 * created/found/error status from __loc_vx_info() */
struct vx_info *locate_or_create_vx_info(int id)
{
	int unused_err;

	return __loc_vx_info(id, &unused_err);
}
367
368 #endif
369
370 #ifdef  CONFIG_PROC_FS
371
/* fill xids[] with up to 'size' hashed context ids, skipping
 * entries before 'index'; returns the number of ids stored */
int get_xid_list(int index, unsigned int *xids, int size)
{
	int hindex, nr_xids = 0;

	rcu_read_lock();
	for (hindex = 0; hindex < VX_HASH_SIZE; hindex++) {
		struct hlist_head *head = &vx_info_hash[hindex];
		struct hlist_node *pos;

		hlist_for_each_rcu(pos, head) {
			struct vx_info *vxi;

			/* skip until the requested offset is reached
			 * NOTE(review): with pre-decrement, index values
			 * 0 and 1 both start at the first entry — verify
			 * the intended off-by-one against the proc caller */
			if (--index > 0)
				continue;

			vxi = hlist_entry(pos, struct vx_info, vx_hlist);
			xids[nr_xids] = vxi->vx_id;
			if (++nr_xids >= size)
				goto out;
		}
	}
out:
	rcu_read_unlock();
	return nr_xids;
}
397 #endif
398
399 int vx_migrate_user(struct task_struct *p, struct vx_info *vxi)
400 {
401         struct user_struct *new_user, *old_user;
402
403         if (!p || !vxi)
404                 BUG();
405         new_user = alloc_uid(vxi->vx_id, p->uid);
406         if (!new_user)
407                 return -ENOMEM;
408
409         old_user = p->user;
410         if (new_user != old_user) {
411                 atomic_inc(&new_user->processes);
412                 atomic_dec(&old_user->processes);
413                 p->user = new_user;
414         }
415         free_uid(old_user);
416         return 0;
417 }
418
/* reduce the task's capability sets to the bounding set
 * of its current context */
void vx_mask_bcaps(struct task_struct *p)
{
	struct vx_info *vxi = p->vx_info;

	p->cap_effective &= vxi->vx_bcaps;
	p->cap_inheritable &= vxi->vx_bcaps;
	p->cap_permitted &= vxi->vx_bcaps;
}
427
428
429 #include <linux/file.h>
430
431 static inline int vx_nofiles_task(struct task_struct *tsk)
432 {
433         struct files_struct *files = tsk->files;
434         unsigned long *obptr;
435         int count, total;
436
437         spin_lock(&files->file_lock);
438         obptr = files->open_fds->fds_bits;
439         count = files->max_fds / (sizeof(unsigned long) * 8);
440         for (total = 0; count > 0; count--) {
441                 if (*obptr)
442                         total += hweight_long(*obptr);
443                 obptr++;
444         }
445         spin_unlock(&files->file_lock);
446         return total;
447 }
448
#if 0	/* disabled: near-duplicate of vx_nofiles_task, kept for reference */

/* count the open file descriptors of tsk (same bitmap popcount
 * as vx_nofiles_task above) */
static inline int vx_openfd_task(struct task_struct *tsk)
{
	struct files_struct *files = tsk->files;
	const unsigned long *bptr;
	int count, total;

	spin_lock(&files->file_lock);
	bptr = files->open_fds->fds_bits;
	count = files->max_fds / (sizeof(unsigned long) * 8);
	for (total = 0; count > 0; count--) {
		if (*bptr)
			total += hweight_long(*bptr);
		bptr++;
	}
	spin_unlock(&files->file_lock);
	return total;
}

#endif
470
471 /*
472  *      migrate task to new context
473  *      gets vxi, puts old_vxi on change
474  */
475
int vx_migrate_task(struct task_struct *p, struct vx_info *vxi)
{
	struct vx_info *old_vxi;
	int ret = 0;

	if (!p || !vxi)
		BUG();

	old_vxi = task_get_vx_info(p);
	/* nothing to do if the task is already in the target context */
	if (old_vxi == vxi)
		goto out;

	vxdprintk(VXD_CBIT(xid, 5),
		"vx_migrate_task(%p,%p[#%d.%d])", p, vxi,
		vxi->vx_id, atomic_read(&vxi->vx_usecnt));

	if (!(ret = vx_migrate_user(p, vxi))) {
		int nofiles;

		/* accounting transfer and context switch happen
		 * atomically with respect to the task */
		task_lock(p);
		// openfd = vx_openfd_task(p);
		nofiles = vx_nofiles_task(p);

		/* move the thread/process accounting out of the
		 * old context ... */
		if (old_vxi) {
			atomic_dec(&old_vxi->cvirt.nr_threads);
			atomic_dec(&old_vxi->cvirt.nr_running);
			atomic_dec(&old_vxi->limit.rcur[RLIMIT_NPROC]);
			/* FIXME: what about the struct files here? */
			// atomic_sub(nofiles, &old_vxi->limit.rcur[RLIMIT_NOFILE]);
			// atomic_sub(openfd, &old_vxi->limit.rcur[RLIMIT_OPENFD]);
		}
		/* ... and into the new one */
		atomic_inc(&vxi->cvirt.nr_threads);
		atomic_inc(&vxi->cvirt.nr_running);
		atomic_inc(&vxi->limit.rcur[RLIMIT_NPROC]);
		/* FIXME: what about the struct files here? */
		// atomic_add(nofiles, &vxi->limit.rcur[RLIMIT_NOFILE]);
		// atomic_add(openfd, &vxi->limit.rcur[RLIMIT_OPENFD]);

		vxdprintk(VXD_CBIT(xid, 5),
			"moved task %p into vxi:%p[#%d]",
			p, vxi, vxi->vx_id);

		/* should be handled in set_vx_info !! */
		if (old_vxi)
			clr_vx_info(&p->vx_info);
		set_vx_info(&p->vx_info, vxi);
		p->xid = vxi->vx_id;
		vx_mask_bcaps(p);
		task_unlock(p);

		/* obsoleted by clr/set */
		// put_vx_info(old_vxi);
	}
out:
	/* drop the reference from task_get_vx_info above
	 * NOTE(review): old_vxi may be NULL here — presumably
	 * put_vx_info() tolerates NULL; confirm */
	put_vx_info(old_vxi);
	return ret;
}
533
534 int vx_set_init(struct vx_info *vxi, struct task_struct *p)
535 {
536         if (!vxi)
537                 return -EINVAL;
538         if (vxi->vx_initpid)
539                 return -EPERM;
540
541         vxdprintk(VXD_CBIT(xid, 6),
542                 "vx_set_init(%p[#%d],%p[#%d,%d,%d])",
543                 vxi, vxi->vx_id, p, p->xid, p->pid, p->tgid);
544
545         vxi->vx_initpid = p->tgid;
546         return 0;
547 }
548
549
550 /* vserver syscall commands below here */
551
552 /* taks xid and vx_info functions */
553
554 #include <asm/uaccess.h>
555
556
/* return the context id of task 'id' (current task if id == 0) */
int vc_task_xid(uint32_t id, void __user *data)
{
	xid_t xid;

	if (id) {
		struct task_struct *tsk;

		/* looking up other tasks requires admin/watch context */
		if (!vx_check(0, VX_ADMIN|VX_WATCH))
			return -EPERM;

		read_lock(&tasklist_lock);
		tsk = find_task_by_real_pid(id);
		/* NOTE(review): -ESRCH is funneled through xid_t here;
		 * if xid_t is unsigned the error only survives via the
		 * implicit conversion back to int — confirm */
		xid = (tsk) ? tsk->xid : -ESRCH;
		read_unlock(&tasklist_lock);
	}
	else
		xid = current->xid;
	return xid;
}
576
577
578 int vc_vx_info(uint32_t id, void __user *data)
579 {
580         struct vx_info *vxi;
581         struct vcmd_vx_info_v0 vc_data;
582
583         if (!vx_check(0, VX_ADMIN))
584                 return -ENOSYS;
585         if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_RESOURCE))
586                 return -EPERM;
587
588         vxi = locate_vx_info(id);
589         if (!vxi)
590                 return -ESRCH;
591
592         vc_data.xid = vxi->vx_id;
593         vc_data.initpid = vxi->vx_initpid;
594         put_vx_info(vxi);
595
596         if (copy_to_user (data, &vc_data, sizeof(vc_data)))
597                 return -EFAULT;
598         return 0;
599 }
600
601
602 /* context functions */
603
/* create a new context (or request a dynamic id) and migrate the
 * calling task into it; returns the new xid or a negative errno */
int vc_ctx_create(uint32_t xid, void __user *data)
{
	struct vx_info *new_vxi;
	int ret;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	/* only static ids below MIN_D_CONTEXT or the explicit
	 * dynamic-id request are accepted */
	if ((xid >= MIN_D_CONTEXT) && (xid != VX_DYNAMIC_ID))
		return -EINVAL;

	if (xid < 1)
		return -EINVAL;

	new_vxi = __loc_vx_info(xid, &ret);
	if (!new_vxi)
		return ret;
	/* __loc_vx_info may have returned an existing context;
	 * only a context still in setup counts as newly created */
	if (!(new_vxi->vx_flags & VXF_STATE_SETUP)) {
		ret = -EEXIST;
		goto out_put;
	}

	ret = new_vxi->vx_id;
	vx_migrate_task(current, new_vxi);
	/* if this fails, we might end up with a hashed vx_info */
out_put:
	put_vx_info(new_vxi);
	return ret;
}
633
634
635 int vc_ctx_migrate(uint32_t id, void __user *data)
636 {
637         struct vx_info *vxi;
638
639         if (!capable(CAP_SYS_ADMIN))
640                 return -EPERM;
641
642         /* dirty hack until Spectator becomes a cap */
643         if (id == 1) {
644                 current->xid = 1;
645                 return 0;
646         }
647
648         vxi = locate_vx_info(id);
649         if (!vxi)
650                 return -ESRCH;
651         vx_migrate_task(current, vxi);
652         put_vx_info(vxi);
653         return 0;
654 }
655
656
657 int vc_get_cflags(uint32_t id, void __user *data)
658 {
659         struct vx_info *vxi;
660         struct vcmd_ctx_flags_v0 vc_data;
661
662         if (!capable(CAP_SYS_ADMIN))
663                 return -EPERM;
664
665         vxi = locate_vx_info(id);
666         if (!vxi)
667                 return -ESRCH;
668
669         vc_data.flagword = vxi->vx_flags;
670
671         /* special STATE flag handling */
672         vc_data.mask = vx_mask_flags(~0UL, vxi->vx_flags, VXF_ONE_TIME);
673
674         put_vx_info(vxi);
675
676         if (copy_to_user (data, &vc_data, sizeof(vc_data)))
677                 return -EFAULT;
678         return 0;
679 }
680
/* update the context flags of context 'id' from userspace */
int vc_set_cflags(uint32_t id, void __user *data)
{
	struct vx_info *vxi;
	struct vcmd_ctx_flags_v0 vc_data;
	uint64_t mask, trigger;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (copy_from_user (&vc_data, data, sizeof(vc_data)))
		return -EFAULT;

	vxi = locate_vx_info(id);
	if (!vxi)
		return -ESRCH;

	/* special STATE flag handling */
	mask = vx_mask_mask(vc_data.mask, vxi->vx_flags, VXF_ONE_TIME);
	/* trigger: maskable flags whose value actually changes */
	trigger = (mask & vxi->vx_flags) ^ (mask & vc_data.flagword);

	/* leaving the setup state applies the capability bound;
	 * toggling init registers the caller as context init */
	if (trigger & VXF_STATE_SETUP)
		vx_mask_bcaps(current);
	if (trigger & VXF_STATE_INIT)
		if (vxi == current->vx_info)
			vx_set_init(vxi, current);

	vxi->vx_flags = vx_mask_flags(vxi->vx_flags,
		vc_data.flagword, mask);
	put_vx_info(vxi);
	return 0;
}
711
712 int vc_get_ccaps(uint32_t id, void __user *data)
713 {
714         struct vx_info *vxi;
715         struct vcmd_ctx_caps_v0 vc_data;
716
717         if (!capable(CAP_SYS_ADMIN))
718                 return -EPERM;
719
720         vxi = locate_vx_info(id);
721         if (!vxi)
722                 return -ESRCH;
723
724         vc_data.bcaps = vxi->vx_bcaps;
725         vc_data.ccaps = vxi->vx_ccaps;
726         vc_data.cmask = ~0UL;
727         put_vx_info(vxi);
728
729         if (copy_to_user (data, &vc_data, sizeof(vc_data)))
730                 return -EFAULT;
731         return 0;
732 }
733
734 int vc_set_ccaps(uint32_t id, void __user *data)
735 {
736         struct vx_info *vxi;
737         struct vcmd_ctx_caps_v0 vc_data;
738
739         if (!capable(CAP_SYS_ADMIN))
740                 return -EPERM;
741         if (copy_from_user (&vc_data, data, sizeof(vc_data)))
742                 return -EFAULT;
743
744         vxi = locate_vx_info(id);
745         if (!vxi)
746                 return -ESRCH;
747
748         vxi->vx_bcaps &= vc_data.bcaps;
749         vxi->vx_ccaps = vx_mask_flags(vxi->vx_ccaps,
750                 vc_data.ccaps, vc_data.cmask);
751         put_vx_info(vxi);
752         return 0;
753 }
754
#include <linux/module.h>

/* symbols used by other parts of the vserver subsystem */
// EXPORT_SYMBOL_GPL(rcu_free_vx_info);
EXPORT_SYMBOL_GPL(free_vx_info);
/* NOTE(review): vx_info_hash_lock is declared static above —
 * confirm this export actually links */
EXPORT_SYMBOL_GPL(vx_info_hash_lock);
EXPORT_SYMBOL_GPL(unhash_vx_info);
761