This commit was manufactured by cvs2svn to create tag
[linux-2.6.git] / kernel / vserver / context.c
1 /*
2  *  linux/kernel/vserver/context.c
3  *
4  *  Virtual Server: Context Support
5  *
6  *  Copyright (C) 2003-2004  Herbert Pötzl
7  *
8  *  V0.01  context helper
9  *  V0.02  vx_ctx_kill syscall command
10  *  V0.03  replaced context_info calls
11  *  V0.04  redesign of struct (de)alloc
12  *  V0.05  rlimit basic implementation
13  *  V0.06  task_xid and info commands
14  *  V0.07  context flags and caps
15  *  V0.08  switch to RCU based hash
16  *
17  */
18
19 #include <linux/config.h>
20 #include <linux/slab.h>
21 #include <linux/vserver.h>
22 #include <linux/vserver/legacy.h>
23 #include <linux/vs_base.h>
24 #include <linux/vs_context.h>
25 #include <linux/kernel_stat.h>
26 #include <linux/namespace.h>
27 #include <linux/rcupdate.h>
28
29 #define CKRM_VSERVER_INTEGRATION
30 #ifdef CKRM_VSERVER_INTEGRATION
31 #include <linux/ckrm.h>
32 #endif //CKRM_VSERVER_INTEGRATION
33
34 #include <asm/errno.h>
35
36
37 /*      __alloc_vx_info()
38
39         * allocate an initialized vx_info struct
40         * doesn't make it visible (hash)                        */
41
42 static struct vx_info *__alloc_vx_info(xid_t xid)
43 {
44         struct vx_info *new = NULL;
45
46         vxdprintk(VXD_CBIT(xid, 0), "alloc_vx_info(%d)*", xid);
47
48         /* would this benefit from a slab cache? */
49         new = kmalloc(sizeof(struct vx_info), GFP_KERNEL);
50         if (!new)
51                 return 0;
52
53         memset (new, 0, sizeof(struct vx_info));
54         new->vx_id = xid;
55         INIT_RCU_HEAD(&new->vx_rcu);
56         INIT_HLIST_NODE(&new->vx_hlist);
57         atomic_set(&new->vx_refcnt, 0);
58         atomic_set(&new->vx_usecnt, 0);
59         new->vx_parent = NULL;
60         new->vx_state = 0;
61         new->vx_lock = SPIN_LOCK_UNLOCKED;
62         init_waitqueue_head(&new->vx_exit);
63
64         /* rest of init goes here */
65         vx_info_init_limit(&new->limit);
66         vx_info_init_sched(&new->sched);
67         vx_info_init_cvirt(&new->cvirt);
68         vx_info_init_cacct(&new->cacct);
69
70
71         new->vx_flags = VXF_STATE_SETUP|VXF_STATE_INIT;
72         new->vx_bcaps = CAP_INIT_EFF_SET;
73         new->vx_ccaps = 0;
74
75         vxdprintk(VXD_CBIT(xid, 0),
76                 "alloc_vx_info(%d) = %p", xid, new);
77         return new;
78 }
79
80 /*      __dealloc_vx_info()
81
82         * final disposal of vx_info                             */
83
static void __dealloc_vx_info(struct vx_info *vxi)
{
	vxdprintk(VXD_CBIT(xid, 0),
		"dealloc_vx_info(%p)", vxi);

	/* poison the hash linkage and id so a stale reference
	   oopses instead of being silently reused */
	vxi->vx_hlist.next = LIST_POISON1;
	vxi->vx_id = -1;

	/* tear down the per-subsystem state set up in __alloc_vx_info() */
	vx_info_exit_limit(&vxi->limit);
	vx_info_exit_sched(&vxi->sched);
	vx_info_exit_cvirt(&vxi->cvirt);
	vx_info_exit_cacct(&vxi->cacct);


	/* both counters must be zero before final disposal */
	BUG_ON(atomic_read(&vxi->vx_usecnt));
	BUG_ON(atomic_read(&vxi->vx_refcnt));

	/* a still-hashed context must never be freed */
	BUG_ON(vx_info_state(vxi, VXS_HASHED));
	// BUG_ON(!vx_state(vxi, VXS_DEFUNCT));

	/* mark released right before the memory goes away */
	vxi->vx_state |= VXS_RELEASED;
	kfree(vxi);
}
107
/* dispose of the vx_info if its use count dropped to zero;
   returns the use count seen (nonzero means "still in use") */
static inline int __free_vx_info(struct vx_info *vxi)
{
	int usecnt, refcnt;

	BUG_ON(!vxi);

	usecnt = atomic_read(&vxi->vx_usecnt);
	BUG_ON(usecnt < 0);

	refcnt = atomic_read(&vxi->vx_refcnt);
	BUG_ON(refcnt < 0);

	/* only deallocate when nobody uses the struct anymore */
	if (!usecnt)
		__dealloc_vx_info(vxi);
	return usecnt;
}
124
125 #if 0
126
/* RCU callback variant of the final free; disabled (#if 0)
   together with the call_rcu() in free_vx_info() below */
static void __rcu_free_vx_info(struct rcu_head *head)
{
	struct vx_info *vxi = container_of(head, struct vx_info, vx_rcu);

	/* NOTE(review): checked only after container_of() already
	   used head */
	BUG_ON(!head);
	vxdprintk(VXD_CBIT(xid, 3),
		"rcu_free_vx_info(%p): uc=%d", vxi,
		atomic_read(&vxi->vx_usecnt));

	__free_vx_info(vxi);
}
138
139 #endif
140
/* drop the context's namespace and fs references and dispose
   of the vx_info; the use count must already be zero */
void free_vx_info(struct vx_info *vxi)
{
	struct namespace *namespace;
	struct fs_struct *fs;

	/* context shutdown is mandatory */
	// BUG_ON(vxi->vx_state != VXS_SHUTDOWN);

	/* atomically detach the references so each is dropped
	   at most once, even under concurrent callers */
	namespace = xchg(&vxi->vx_namespace, NULL);
	fs = xchg(&vxi->vx_fs, NULL);

	if (namespace)
		put_namespace(namespace);
	if (fs)
		put_fs_struct(fs);

	/* a nonzero use count here is a refcounting bug */
	BUG_ON(__free_vx_info(vxi));
	// call_rcu(&i->vx_rcu, __rcu_free_vx_info);
}
160
161
/*	hash table for vx_info hash */

#define VX_HASH_SIZE	13

/* buckets of hashed contexts, chained via vx_info.vx_hlist;
   readers walk them under rcu_read_lock() */
struct hlist_head vx_info_hash[VX_HASH_SIZE];

/* serializes all modifications of the hash table */
static spinlock_t vx_info_hash_lock = SPIN_LOCK_UNLOCKED;
169
170
171 static inline unsigned int __hashval(xid_t xid)
172 {
173         return (xid % VX_HASH_SIZE);
174 }
175
176
177
178 /*      __hash_vx_info()
179
180         * add the vxi to the global hash table
181         * requires the hash_lock to be held                     */
182
static inline void __hash_vx_info(struct vx_info *vxi)
{
	struct hlist_head *head;

	vxdprintk(VXD_CBIT(xid, 4),
		"__hash_vx_info: %p[#%d]", vxi, vxi->vx_id);
	/* the hash table holds a reference of its own */
	get_vx_info(vxi);
	/* set VXS_HASHED before publishing, so RCU lookups that
	   find the entry also see it marked as hashed */
	vxi->vx_state |= VXS_HASHED;
	head = &vx_info_hash[__hashval(vxi->vx_id)];
	hlist_add_head_rcu(&vxi->vx_hlist, head);
}
194
195 /*      __unhash_vx_info()
196
197         * remove the vxi from the global hash table
198         * requires the hash_lock to be held                     */
199
static inline void __unhash_vx_info(struct vx_info *vxi)
{
	vxdprintk(VXD_CBIT(xid, 4),
		"__unhash_vx_info: %p[#%d]", vxi, vxi->vx_id);
	/* clear VXS_HASHED first so concurrent RCU lookups skip
	   the entry even before it is unlinked */
	vxi->vx_state &= ~VXS_HASHED;
	hlist_del_rcu(&vxi->vx_hlist);
	/* drop the reference held by the hash table */
	put_vx_info(vxi);
}
208
209
210 /*      __lookup_vx_info()
211
212         * requires the rcu_read_lock()
213         * doesn't increment the vx_refcnt                       */
214
static inline struct vx_info *__lookup_vx_info(xid_t xid)
{
	struct hlist_head *head = &vx_info_hash[__hashval(xid)];
	struct hlist_node *pos;

	hlist_for_each_rcu(pos, head) {
		struct vx_info *vxi =
			hlist_entry(pos, struct vx_info, vx_hlist);

		/* skip entries already marked for removal */
		if ((vxi->vx_id == xid) &&
			vx_info_state(vxi, VXS_HASHED))
			return vxi;
	}
	return NULL;
}
230
231
232 /*      __vx_dynamic_id()
233
234         * find unused dynamic xid
235         * requires the hash_lock to be held                     */
236
static inline xid_t __vx_dynamic_id(void)
{
	/* rotating cursor through the dynamic id range; static,
	   so the hash_lock also serializes access to it */
	static xid_t seq = MAX_S_CONTEXT;
	xid_t barrier = seq;

	do {
		if (++seq > MAX_S_CONTEXT)
			seq = MIN_D_CONTEXT;
		if (!__lookup_vx_info(seq)) {
			vxdprintk(VXD_CBIT(xid, 4),
				"__vx_dynamic_id: [#%d]", seq);
			return seq;
		}
	} while (barrier != seq);
	/* came back around to the start: all ids in use */
	return 0;
}
253
254 /*      __loc_vx_info()
255
256         * locate or create the requested context
257         * get() it and if new hash it                           */
258
259 static struct vx_info * __loc_vx_info(int id, int *err)
260 {
261         struct vx_info *new, *vxi = NULL;
262
263         vxdprintk(VXD_CBIT(xid, 1), "loc_vx_info(%d)*", id);
264
265         if (!(new = __alloc_vx_info(id))) {
266                 *err = -ENOMEM;
267                 return NULL;
268         }
269
270         spin_lock(&vx_info_hash_lock);
271
272         /* dynamic context requested */
273         if (id == VX_DYNAMIC_ID) {
274                 id = __vx_dynamic_id();
275                 if (!id) {
276                         printk(KERN_ERR "no dynamic context available.\n");
277                         goto out_unlock;
278                 }
279                 new->vx_id = id;
280         }
281         /* existing context requested */
282         else if ((vxi = __lookup_vx_info(id))) {
283                 /* context in setup is not available */
284                 if (vxi->vx_flags & VXF_STATE_SETUP) {
285                         vxdprintk(VXD_CBIT(xid, 0),
286                                 "loc_vx_info(%d) = %p (not available)", id, vxi);
287                         vxi = NULL;
288                         *err = -EBUSY;
289                 } else {
290                         vxdprintk(VXD_CBIT(xid, 0),
291                                 "loc_vx_info(%d) = %p (found)", id, vxi);
292                         get_vx_info(vxi);
293                         *err = 0;
294                 }
295                 goto out_unlock;
296         }
297
298         /* new context requested */
299         vxdprintk(VXD_CBIT(xid, 0),
300                 "loc_vx_info(%d) = %p (new)", id, new);
301         __hash_vx_info(get_vx_info(new));
302         vxi = new, new = NULL;
303         *err = 1;
304
305 out_unlock:
306         spin_unlock(&vx_info_hash_lock);
307         if (new)
308                 __dealloc_vx_info(new);
309         return vxi;
310 }
311
312
313
314 /*      exported stuff                                          */
315
316
/* remove a context from the global hash, taking the hash lock */
void unhash_vx_info(struct vx_info *vxi)
{
	spin_lock(&vx_info_hash_lock);
	__unhash_vx_info(vxi);
	spin_unlock(&vx_info_hash_lock);
}
323
324 /*      locate_vx_info()
325
326         * search for a vx_info and get() it
327         * negative id means current                             */
328
329 struct vx_info *locate_vx_info(int id)
330 {
331         struct vx_info *vxi;
332
333         if (id < 0) {
334                 vxi = get_vx_info(current->vx_info);
335         } else {
336                 rcu_read_lock();
337                 vxi = get_vx_info(__lookup_vx_info(id));
338                 rcu_read_unlock();
339         }
340         return vxi;
341 }
342
343 /*      vx_info_is_hashed()
344
345         * verify that xid is still hashed                       */
346
347 int vx_info_is_hashed(xid_t xid)
348 {
349         int hashed;
350
351         rcu_read_lock();
352         hashed = (__lookup_vx_info(xid) != NULL);
353         rcu_read_unlock();
354         return hashed;
355 }
356
357 #ifdef  CONFIG_VSERVER_LEGACY
358
359 #if 0
/* legacy wrapper around __alloc_vx_info() (disabled via #if 0) */
struct vx_info *alloc_vx_info(xid_t xid)
{
	return __alloc_vx_info(xid);
}
364 #endif
365
/* legacy interface: locate or create the context for id;
   the created/found status in err is discarded here */
struct vx_info *locate_or_create_vx_info(int id)
{
	int err;

	return __loc_vx_info(id, &err);
}
372
373 #endif
374
375 #ifdef  CONFIG_PROC_FS
376
/* fill xids[] with up to size context ids, skipping entries
   before index; returns the number of ids stored */
int get_xid_list(int index, unsigned int *xids, int size)
{
	int hindex, nr_xids = 0;

	rcu_read_lock();
	for (hindex = 0; hindex < VX_HASH_SIZE; hindex++) {
		struct hlist_head *head = &vx_info_hash[hindex];
		struct hlist_node *pos;

		hlist_for_each_rcu(pos, head) {
			struct vx_info *vxi;

			/* NOTE(review): skips while --index > 0, so
			   index 0 and 1 behave identically — confirm
			   against the /proc caller's convention */
			if (--index > 0)
				continue;

			vxi = hlist_entry(pos, struct vx_info, vx_hlist);
			xids[nr_xids] = vxi->vx_id;
			if (++nr_xids >= size)
				goto out;
		}
	}
out:
	rcu_read_unlock();
	return nr_xids;
}
402 #endif
403
404 int vx_migrate_user(struct task_struct *p, struct vx_info *vxi)
405 {
406         struct user_struct *new_user, *old_user;
407
408         if (!p || !vxi)
409                 BUG();
410         new_user = alloc_uid(vxi->vx_id, p->uid);
411         if (!new_user)
412                 return -ENOMEM;
413
414         old_user = p->user;
415         if (new_user != old_user) {
416                 atomic_inc(&new_user->processes);
417                 atomic_dec(&old_user->processes);
418                 p->user = new_user;
419         }
420         free_uid(old_user);
421         return 0;
422 }
423
424 void vx_mask_bcaps(struct task_struct *p)
425 {
426         struct vx_info *vxi = p->vx_info;
427
428         p->cap_effective &= vxi->vx_bcaps;
429         p->cap_inheritable &= vxi->vx_bcaps;
430         p->cap_permitted &= vxi->vx_bcaps;
431 }
432
433
434 #include <linux/file.h>
435
436 static inline int vx_nofiles_task(struct task_struct *tsk)
437 {
438         struct files_struct *files = tsk->files;
439         unsigned long *obptr;
440         int count, total;
441
442         spin_lock(&files->file_lock);
443         obptr = files->open_fds->fds_bits;
444         count = files->max_fds / (sizeof(unsigned long) * 8);
445         for (total = 0; count > 0; count--) {
446                 if (*obptr)
447                         total += hweight_long(*obptr);
448                 obptr++;
449         }
450         spin_unlock(&files->file_lock);
451         return total;
452 }
453
454 #if 0
455
/* count open fds of tsk (disabled via #if 0; body currently
   identical to vx_nofiles_task above) */
static inline int vx_openfd_task(struct task_struct *tsk)
{
	struct files_struct *files = tsk->files;
	const unsigned long *bptr;
	int count, total;

	spin_lock(&files->file_lock);
	bptr = files->open_fds->fds_bits;
	count = files->max_fds / (sizeof(unsigned long) * 8);
	for (total = 0; count > 0; count--) {
		/* hweight_long counts the set bits per word */
		if (*bptr)
			total += hweight_long(*bptr);
		bptr++;
	}
	spin_unlock(&files->file_lock);
	return total;
}
473
474 #endif
475
476 /*
477  *      migrate task to new context
478  *      gets vxi, puts old_vxi on change
479  */
480
int vx_migrate_task(struct task_struct *p, struct vx_info *vxi)
{
	struct vx_info *old_vxi;
	int ret = 0;

	if (!p || !vxi)
		BUG();

	old_vxi = task_get_vx_info(p);
	/* already in the target context: nothing to do */
	if (old_vxi == vxi)
		goto out;

	vxdprintk(VXD_CBIT(xid, 5),
		"vx_migrate_task(%p,%p[#%d.%d])", p, vxi,
		vxi->vx_id, atomic_read(&vxi->vx_usecnt));

	if (!(ret = vx_migrate_user(p, vxi))) {
		int nofiles;

		task_lock(p);
		// openfd = vx_openfd_task(p);
		nofiles = vx_nofiles_task(p);

		/* move the thread/process accounting from the old
		   context (if any) to the new one */
		if (old_vxi) {
			atomic_dec(&old_vxi->cvirt.nr_threads);
			atomic_dec(&old_vxi->cvirt.nr_running);
			atomic_dec(&old_vxi->limit.rcur[RLIMIT_NPROC]);
			/* FIXME: what about the struct files here? */
			// atomic_sub(nofiles, &old_vxi->limit.rcur[RLIMIT_NOFILE]);
			// atomic_sub(openfd, &old_vxi->limit.rcur[RLIMIT_OPENFD]);
		}
		atomic_inc(&vxi->cvirt.nr_threads);
		atomic_inc(&vxi->cvirt.nr_running);
		atomic_inc(&vxi->limit.rcur[RLIMIT_NPROC]);
		/* FIXME: what about the struct files here? */
		// atomic_add(nofiles, &vxi->limit.rcur[RLIMIT_NOFILE]);
		// atomic_add(openfd, &vxi->limit.rcur[RLIMIT_OPENFD]);

		vxdprintk(VXD_CBIT(xid, 5),
			"moved task %p into vxi:%p[#%d]",
			p, vxi, vxi->vx_id);

		/* should be handled in set_vx_info !! */
		if (old_vxi)
			clr_vx_info(&p->vx_info);
		set_vx_info(&p->vx_info, vxi);
		p->xid = vxi->vx_id;
		/* apply the new context's capability bound */
		vx_mask_bcaps(p);
		task_unlock(p);

		/* obsoleted by clr/set */
		// put_vx_info(old_vxi);
	}
out:


#ifdef CKRM_VSERVER_INTEGRATION
	/* notify CKRM about p's (possibly changed) xid */
	do {
	  ckrm_cb_xid(p);
	} while (0);
#endif //CKRM_VSERVER_INTEGRATION


	/* drop the reference taken by task_get_vx_info() */
	put_vx_info(old_vxi);
	return ret;
}
547
548 int vx_set_init(struct vx_info *vxi, struct task_struct *p)
549 {
550         if (!vxi)
551                 return -EINVAL;
552         if (vxi->vx_initpid)
553                 return -EPERM;
554
555         vxdprintk(VXD_CBIT(xid, 6),
556                 "vx_set_init(%p[#%d],%p[#%d,%d,%d])",
557                 vxi, vxi->vx_id, p, p->xid, p->pid, p->tgid);
558
559         vxi->vx_initpid = p->tgid;
560         return 0;
561 }
562
563
564 /* vserver syscall commands below here */
565
566 /* taks xid and vx_info functions */
567
568 #include <asm/uaccess.h>
569
570
571 int vc_task_xid(uint32_t id, void __user *data)
572 {
573         xid_t xid;
574
575         if (id) {
576                 struct task_struct *tsk;
577
578                 if (!vx_check(0, VX_ADMIN|VX_WATCH))
579                         return -EPERM;
580
581                 read_lock(&tasklist_lock);
582                 tsk = find_task_by_real_pid(id);
583                 xid = (tsk) ? tsk->xid : -ESRCH;
584                 read_unlock(&tasklist_lock);
585         }
586         else
587                 xid = current->xid;
588         return xid;
589 }
590
591
592 int vc_vx_info(uint32_t id, void __user *data)
593 {
594         struct vx_info *vxi;
595         struct vcmd_vx_info_v0 vc_data;
596
597         if (!vx_check(0, VX_ADMIN))
598                 return -ENOSYS;
599         if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_RESOURCE))
600                 return -EPERM;
601
602         vxi = locate_vx_info(id);
603         if (!vxi)
604                 return -ESRCH;
605
606         vc_data.xid = vxi->vx_id;
607         vc_data.initpid = vxi->vx_initpid;
608         put_vx_info(vxi);
609
610         if (copy_to_user (data, &vc_data, sizeof(vc_data)))
611                 return -EFAULT;
612         return 0;
613 }
614
615
616 /* context functions */
617
/* create a new context (xid == VX_DYNAMIC_ID picks a free
   dynamic id) and migrate the current task into it; returns
   the new xid or a negative error code */
int vc_ctx_create(uint32_t xid, void __user *data)
{
	struct vx_info *new_vxi;
	int ret;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	/* only static ids or the dynamic-id marker are accepted */
	if ((xid >= MIN_D_CONTEXT) && (xid != VX_DYNAMIC_ID))
		return -EINVAL;

	if (xid < 1)
		return -EINVAL;

	new_vxi = __loc_vx_info(xid, &ret);
	if (!new_vxi)
		return ret;
	/* an existing context that already left setup state
	   cannot be "created" again */
	if (!(new_vxi->vx_flags & VXF_STATE_SETUP)) {
		ret = -EEXIST;
		goto out_put;
	}

	ret = new_vxi->vx_id;
	vx_migrate_task(current, new_vxi);
	/* if this fails, we might end up with a hashed vx_info */
out_put:
	put_vx_info(new_vxi);
	return ret;
}
647
648
649 int vc_ctx_migrate(uint32_t id, void __user *data)
650 {
651         struct vx_info *vxi;
652
653         if (!capable(CAP_SYS_ADMIN))
654                 return -EPERM;
655
656         /* dirty hack until Spectator becomes a cap */
657         if (id == 1) {
658                 current->xid = 1;
659                 return 0;
660         }
661
662         vxi = locate_vx_info(id);
663         if (!vxi)
664                 return -ESRCH;
665         vx_migrate_task(current, vxi);
666         put_vx_info(vxi);
667         return 0;
668 }
669
670
671 int vc_get_cflags(uint32_t id, void __user *data)
672 {
673         struct vx_info *vxi;
674         struct vcmd_ctx_flags_v0 vc_data;
675
676         if (!capable(CAP_SYS_ADMIN))
677                 return -EPERM;
678
679         vxi = locate_vx_info(id);
680         if (!vxi)
681                 return -ESRCH;
682
683         vc_data.flagword = vxi->vx_flags;
684
685         /* special STATE flag handling */
686         vc_data.mask = vx_mask_flags(~0UL, vxi->vx_flags, VXF_ONE_TIME);
687
688         put_vx_info(vxi);
689
690         if (copy_to_user (data, &vc_data, sizeof(vc_data)))
691                 return -EFAULT;
692         return 0;
693 }
694
/* update the context's flag word under the caller-supplied
   mask; changes of the one-time STATE flags fire transitions */
int vc_set_cflags(uint32_t id, void __user *data)
{
	struct vx_info *vxi;
	struct vcmd_ctx_flags_v0 vc_data;
	uint64_t mask, trigger;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (copy_from_user (&vc_data, data, sizeof(vc_data)))
		return -EFAULT;

	vxi = locate_vx_info(id);
	if (!vxi)
		return -ESRCH;

	/* special STATE flag handling */
	mask = vx_mask_mask(vc_data.mask, vxi->vx_flags, VXF_ONE_TIME);
	/* trigger has a bit set for every maskable flag whose
	   value actually changes with this request */
	trigger = (mask & vxi->vx_flags) ^ (mask & vc_data.flagword);

	/* setup state toggles: (re)apply the capability bound */
	if (trigger & VXF_STATE_SETUP)
		vx_mask_bcaps(current);
	/* init state toggles on our own context: record current
	   as the context's init process */
	if (trigger & VXF_STATE_INIT)
		if (vxi == current->vx_info)
			vx_set_init(vxi, current);

	vxi->vx_flags = vx_mask_flags(vxi->vx_flags,
		vc_data.flagword, mask);
	put_vx_info(vxi);
	return 0;
}
725
726 int vc_get_ccaps(uint32_t id, void __user *data)
727 {
728         struct vx_info *vxi;
729         struct vcmd_ctx_caps_v0 vc_data;
730
731         if (!capable(CAP_SYS_ADMIN))
732                 return -EPERM;
733
734         vxi = locate_vx_info(id);
735         if (!vxi)
736                 return -ESRCH;
737
738         vc_data.bcaps = vxi->vx_bcaps;
739         vc_data.ccaps = vxi->vx_ccaps;
740         vc_data.cmask = ~0UL;
741         put_vx_info(vxi);
742
743         if (copy_to_user (data, &vc_data, sizeof(vc_data)))
744                 return -EFAULT;
745         return 0;
746 }
747
748 int vc_set_ccaps(uint32_t id, void __user *data)
749 {
750         struct vx_info *vxi;
751         struct vcmd_ctx_caps_v0 vc_data;
752
753         if (!capable(CAP_SYS_ADMIN))
754                 return -EPERM;
755         if (copy_from_user (&vc_data, data, sizeof(vc_data)))
756                 return -EFAULT;
757
758         vxi = locate_vx_info(id);
759         if (!vxi)
760                 return -ESRCH;
761
762         vxi->vx_bcaps &= vc_data.bcaps;
763         vxi->vx_ccaps = vx_mask_flags(vxi->vx_ccaps,
764                 vc_data.ccaps, vc_data.cmask);
765         put_vx_info(vxi);
766         return 0;
767 }
768
#include <linux/module.h>

/* symbols used by other parts of the vserver subsystem */
// EXPORT_SYMBOL_GPL(rcu_free_vx_info);
EXPORT_SYMBOL_GPL(free_vx_info);
EXPORT_SYMBOL_GPL(vx_info_hash_lock);
EXPORT_SYMBOL_GPL(unhash_vx_info);
775