/*
 *  linux/kernel/vserver/context.c
 *
 *  Virtual Server: Context Support
 *
 *  Copyright (C) 2003-2004  Herbert Pötzl
 *
 *  V0.01  context helper
 *  V0.02  vx_ctx_kill syscall command
 *  V0.03  replaced context_info calls
 *  V0.04  redesign of struct (de)alloc
 *  V0.05  rlimit basic implementation
 *  V0.06  task_xid and info commands
 *  V0.07  context flags and caps
 *  V0.08  switch to RCU based hash
 *
 */

#include <linux/config.h>
#include <linux/slab.h>
#include <linux/vserver.h>
#include <linux/vserver/legacy.h>
#include <linux/vs_base.h>
#include <linux/vs_context.h>
#include <linux/kernel_stat.h>
#include <linux/namespace.h>
#include <linux/rcupdate.h>

#define CKRM_VSERVER_INTEGRATION
#ifdef CKRM_VSERVER_INTEGRATION
#include <linux/ckrm.h>
#endif /* CKRM_VSERVER_INTEGRATION */

#include <asm/errno.h>


/*	__alloc_vx_info()

	* allocate an initialized vx_info struct
	* doesn't make it visible (hash)			*/

static struct vx_info *__alloc_vx_info(xid_t xid)
{
	struct vx_info *new = NULL;

	vxdprintk("alloc_vx_info(%d)\n", xid);

	/* would this benefit from a slab cache? */
	new = kmalloc(sizeof(struct vx_info), GFP_KERNEL);
	if (!new)
		return NULL;

	memset(new, 0, sizeof(struct vx_info));
	new->vx_id = xid;
	INIT_RCU_HEAD(&new->vx_rcu);
	INIT_HLIST_NODE(&new->vx_hlist);
	atomic_set(&new->vx_refcnt, 0);
	atomic_set(&new->vx_usecnt, 0);

	/* rest of init goes here */
	vx_info_init_limit(&new->limit);
	vx_info_init_sched(&new->sched);
	vx_info_init_cvirt(&new->cvirt);
	vx_info_init_cacct(&new->cacct);

	new->vx_flags = VXF_STATE_SETUP|VXF_STATE_INIT;
	new->vx_bcaps = CAP_INIT_EFF_SET;
	new->vx_ccaps = 0;

	vxdprintk("alloc_vx_info(%d) = %p\n", xid, new);
	return new;
}

/*	__dealloc_vx_info()

	* final disposal of vx_info				*/

static void __dealloc_vx_info(struct vx_info *vxi)
{
	vxdprintk("dealloc_vx_info(%p)\n", vxi);

	/* poison the hash linkage to catch stale accesses */
	vxi->vx_hlist.next = LIST_POISON1;
	vxi->vx_id = -1;

	if (vxi->vx_namespace)
		put_namespace(vxi->vx_namespace);
	if (vxi->vx_fs)
		put_fs_struct(vxi->vx_fs);

	vx_info_exit_limit(&vxi->limit);
	vx_info_exit_sched(&vxi->sched);
	vx_info_exit_cvirt(&vxi->cvirt);
	vx_info_exit_cacct(&vxi->cacct);

	BUG_ON(atomic_read(&vxi->vx_usecnt));
	BUG_ON(atomic_read(&vxi->vx_refcnt));

	kfree(vxi);
}


/*	hash table for vx_info hash */

#define VX_HASH_SIZE	13

struct hlist_head vx_info_hash[VX_HASH_SIZE];

/* exported at the bottom of this file, so it cannot be static */
spinlock_t vx_info_hash_lock = SPIN_LOCK_UNLOCKED;


static inline unsigned int __hashval(xid_t xid)
{
	return (xid % VX_HASH_SIZE);
}


/*	__hash_vx_info()

	* add the vxi to the global hash table
	* requires the hash_lock to be held			*/

static inline void __hash_vx_info(struct vx_info *vxi)
{
	struct hlist_head *head;

	vxdprintk("__hash_vx_info: %p[#%d]\n", vxi, vxi->vx_id);
	get_vx_info(vxi);
	head = &vx_info_hash[__hashval(vxi->vx_id)];
	hlist_add_head_rcu(&vxi->vx_hlist, head);
}

/*	__unhash_vx_info()

	* remove the vxi from the global hash table
	* requires the hash_lock to be held			*/

static inline void __unhash_vx_info(struct vx_info *vxi)
{
	vxdprintk("__unhash_vx_info: %p[#%d]\n", vxi, vxi->vx_id);
	hlist_del_rcu(&vxi->vx_hlist);
	put_vx_info(vxi);
}


/*	__lookup_vx_info()

	* requires the rcu_read_lock()
	* doesn't increment the vx_refcnt			*/

static inline struct vx_info *__lookup_vx_info(xid_t xid)
{
	struct hlist_head *head = &vx_info_hash[__hashval(xid)];
	struct hlist_node *pos;

	hlist_for_each_rcu(pos, head) {
		struct vx_info *vxi =
			hlist_entry(pos, struct vx_info, vx_hlist);

		if (vxi->vx_id == xid)
			return vxi;
	}
	return NULL;
}


/*	__vx_dynamic_id()

	* find unused dynamic xid
	* requires the hash_lock to be held			*/

static inline xid_t __vx_dynamic_id(void)
{
	static xid_t seq = MAX_S_CONTEXT;
	xid_t barrier = seq;

	do {
		if (++seq > MAX_S_CONTEXT)
			seq = MIN_D_CONTEXT;
		if (!__lookup_vx_info(seq))
			return seq;
	} while (barrier != seq);
	return 0;
}

/*	__loc_vx_info()

	* locate or create the requested context
	* get() it and if new hash it				*/

static struct vx_info *__loc_vx_info(int id, int *err)
{
	struct vx_info *new, *vxi = NULL;

	vxdprintk("loc_vx_info(%d)\n", id);

	if (!(new = __alloc_vx_info(id))) {
		*err = -ENOMEM;
		return NULL;
	}

	spin_lock(&vx_info_hash_lock);

	/* dynamic context requested */
	if (id == VX_DYNAMIC_ID) {
		id = __vx_dynamic_id();
		if (!id) {
			printk(KERN_ERR "no dynamic context available.\n");
			/* assumption: report exhaustion as -EAGAIN,
			   the original left *err unset on this path */
			*err = -EAGAIN;
			goto out_unlock;
		}
		new->vx_id = id;
	}
	/* existing context requested */
	else if ((vxi = __lookup_vx_info(id))) {
		/* context in setup is not available */
		if (vxi->vx_flags & VXF_STATE_SETUP) {
			vxdprintk("loc_vx_info(%d) = %p (not available)\n", id, vxi);
			vxi = NULL;
			*err = -EBUSY;
		} else {
			vxdprintk("loc_vx_info(%d) = %p (found)\n", id, vxi);
			get_vx_info(vxi);
			*err = 0;
		}
		goto out_unlock;
	}

	/* new context requested */
	vxdprintk("loc_vx_info(%d) = %p (new)\n", id, new);
	__hash_vx_info(get_vx_info(new));
	vxi = new, new = NULL;
	*err = 1;

out_unlock:
	spin_unlock(&vx_info_hash_lock);
	if (new)
		__dealloc_vx_info(new);
	return vxi;
}


/*	exported stuff						*/


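/*	rcu_free_vx_info()

	* rcu callback for deferred disposal
	* frees the vx_info unless it is still in use		*/
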
void rcu_free_vx_info(struct rcu_head *head)
{
	struct vx_info *vxi = container_of(head, struct vx_info, vx_rcu);
	int usecnt, refcnt;

	BUG_ON(!vxi || !head);

	usecnt = atomic_read(&vxi->vx_usecnt);
	BUG_ON(usecnt < 0);

	refcnt = atomic_read(&vxi->vx_refcnt);
	BUG_ON(refcnt < 0);

	if (!usecnt)
		__dealloc_vx_info(vxi);
	else
		printk(KERN_ERR "rcu_free_vx_info: %p still in use\n", vxi);
}

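/*	unhash_vx_info()

	* remove the vxi from the global hash table
	* takes the hash_lock					*/
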
void unhash_vx_info(struct vx_info *vxi)
{
	spin_lock(&vx_info_hash_lock);
	__unhash_vx_info(vxi);
	spin_unlock(&vx_info_hash_lock);
}

/*	locate_vx_info()

	* search for a vx_info and get() it
	* negative id means current				*/

struct vx_info *locate_vx_info(int id)
{
	struct vx_info *vxi;

	if (id < 0) {
		vxi = get_vx_info(current->vx_info);
	} else {
		rcu_read_lock();
		vxi = get_vx_info(__lookup_vx_info(id));
		rcu_read_unlock();
	}
	return vxi;
}

/*	vx_info_is_hashed()

	* verify that xid is still hashed			*/

int vx_info_is_hashed(xid_t xid)
{
	int hashed;

	rcu_read_lock();
	hashed = (__lookup_vx_info(xid) != NULL);
	rcu_read_unlock();
	return hashed;
}

#ifdef	CONFIG_VSERVER_LEGACY

#if 0
struct vx_info *alloc_vx_info(xid_t xid)
{
	return __alloc_vx_info(xid);
}
#endif

struct vx_info *locate_or_create_vx_info(int id)
{
	int err;

	return __loc_vx_info(id, &err);
}

#endif

#ifdef	CONFIG_PROC_FS

/* local fallback, guarded in case the list helpers already provide it */
#ifndef hlist_for_each_rcu
#define hlist_for_each_rcu(pos, head) \
	for (pos = (head)->first; pos && ({ prefetch(pos->next); 1;}); \
		pos = pos->next, ({ smp_read_barrier_depends(); 0;}))
#endif

int get_xid_list(int index, unsigned int *xids, int size)
{
	int hindex, nr_xids = 0;

	rcu_read_lock();
	for (hindex = 0; hindex < VX_HASH_SIZE; hindex++) {
		struct hlist_head *head = &vx_info_hash[hindex];
		struct hlist_node *pos;

		hlist_for_each_rcu(pos, head) {
			struct vx_info *vxi;

			if (--index > 0)
				continue;

			vxi = hlist_entry(pos, struct vx_info, vx_hlist);
			xids[nr_xids] = vxi->vx_id;
			if (++nr_xids >= size)
				goto out;
		}
	}
out:
	rcu_read_unlock();
	return nr_xids;
}
#endif

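/*	vx_migrate_user()

	* switch the task to the per context user_struct
	* moves the process accounting on change		*/
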
int vx_migrate_user(struct task_struct *p, struct vx_info *vxi)
{
	struct user_struct *new_user, *old_user;

	BUG_ON(!p || !vxi);
	new_user = alloc_uid(vxi->vx_id, p->uid);
	if (!new_user)
		return -ENOMEM;

	old_user = p->user;
	if (new_user != old_user) {
		atomic_inc(&new_user->processes);
		atomic_dec(&old_user->processes);
		p->user = new_user;
	}
	free_uid(old_user);
	return 0;
}

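/*	vx_mask_bcaps()

	* restrict the task's capability sets
	* to the context bcaps					*/
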
void vx_mask_bcaps(struct task_struct *p)
{
	struct vx_info *vxi = p->vx_info;

	p->cap_effective &= vxi->vx_bcaps;
	p->cap_inheritable &= vxi->vx_bcaps;
	p->cap_permitted &= vxi->vx_bcaps;
}


#include <linux/file.h>

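/*	vx_nofiles_task()

	* count the open files (RLIMIT_NOFILE accounting)
	* takes the file_lock					*/
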
static inline int vx_nofiles_task(struct task_struct *tsk)
{
	struct files_struct *files = tsk->files;
	const unsigned long *obptr, *cbptr;
	int count, total;

	spin_lock(&files->file_lock);
	obptr = files->open_fds->fds_bits;
	cbptr = files->close_on_exec->fds_bits;
	count = files->max_fds / (sizeof(unsigned long) * 8);
	for (total = 0; count > 0; count--) {
		if (*obptr)
			total += hweight_long(*obptr);
		obptr++;
	/*	if (*cbptr)
			total += hweight_long(*cbptr);
		cbptr++; */
	}
	spin_unlock(&files->file_lock);
	return total;
}

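/*	vx_openfd_task()

	* count the open fds (RLIMIT_OPENFD accounting)
	* takes the file_lock					*/
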
static inline int vx_openfd_task(struct task_struct *tsk)
{
	struct files_struct *files = tsk->files;
	const unsigned long *bptr;
	int count, total;

	spin_lock(&files->file_lock);
	bptr = files->open_fds->fds_bits;
	count = files->max_fds / (sizeof(unsigned long) * 8);
	for (total = 0; count > 0; count--) {
		if (*bptr)
			total += hweight_long(*bptr);
		bptr++;
	}
	spin_unlock(&files->file_lock);
	return total;
}

/*
 *	migrate task to new context
 *	gets vxi, puts old_vxi on change
 */

int vx_migrate_task(struct task_struct *p, struct vx_info *vxi)
{
	struct vx_info *old_vxi;
	int ret = 0;

	BUG_ON(!p || !vxi);

	old_vxi = task_get_vx_info(p);
	if (old_vxi == vxi)
		goto out;

	vxdprintk("vx_migrate_task(%p,%p[#%d.%d])\n", p, vxi,
		vxi->vx_id, atomic_read(&vxi->vx_usecnt));

	if (!(ret = vx_migrate_user(p, vxi))) {
		int openfd, nofiles;

		task_lock(p);
		openfd = vx_openfd_task(p);
		nofiles = vx_nofiles_task(p);

		if (old_vxi) {
			/* uncharge the context we are leaving */
			atomic_dec(&old_vxi->cacct.nr_threads);
			atomic_dec(&old_vxi->limit.rcur[RLIMIT_NPROC]);
			atomic_sub(nofiles, &old_vxi->limit.rcur[RLIMIT_NOFILE]);
			atomic_sub(openfd, &old_vxi->limit.rcur[RLIMIT_OPENFD]);
		}
		/* charge the context we are entering */
		atomic_inc(&vxi->cacct.nr_threads);
		atomic_inc(&vxi->limit.rcur[RLIMIT_NPROC]);
		atomic_add(nofiles, &vxi->limit.rcur[RLIMIT_NOFILE]);
		atomic_add(openfd, &vxi->limit.rcur[RLIMIT_OPENFD]);
		/* should be handled in set_vx_info !! */
		if (old_vxi)
			clr_vx_info(&p->vx_info);
		set_vx_info(&p->vx_info, vxi);
		p->xid = vxi->vx_id;
		vx_mask_bcaps(p);
		task_unlock(p);

		/* obsoleted by clr/set */
		/* put_vx_info(old_vxi); */
	}
out:
#ifdef CKRM_VSERVER_INTEGRATION
	ckrm_cb_xid(p);
#endif /* CKRM_VSERVER_INTEGRATION */

	put_vx_info(old_vxi);
	return ret;
}

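/*	vx_set_init()

	* set the init pid for the context
	* fails if one is already set				*/
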
int vx_set_init(struct vx_info *vxi, struct task_struct *p)
{
	if (!vxi)
		return -EINVAL;
	if (vxi->vx_initpid)
		return -EPERM;

	vxi->vx_initpid = p->tgid;
	return 0;
}


/* vserver syscall commands below here */

/* task xid and vx_info functions */

#include <asm/uaccess.h>


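/*	vc_task_xid()

	* return the xid of the task identified by id
	* id 0 means the current task				*/
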
int vc_task_xid(uint32_t id, void __user *data)
{
	xid_t xid;

	if (id) {
		struct task_struct *tsk;

		if (!vx_check(0, VX_ADMIN|VX_WATCH))
			return -EPERM;

		read_lock(&tasklist_lock);
		tsk = find_task_by_pid(id);
		xid = (tsk) ? tsk->xid : -ESRCH;
		read_unlock(&tasklist_lock);
	} else
		xid = current->xid;
	return xid;
}


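/*	vc_vx_info()

	* copy xid and initpid of the context
	* to the caller supplied buffer				*/
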
int vc_vx_info(uint32_t id, void __user *data)
{
	struct vx_info *vxi;
	struct vcmd_vx_info_v0 vc_data;

	if (!vx_check(0, VX_ADMIN))
		return -ENOSYS;
	if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_RESOURCE))
		return -EPERM;

	vxi = locate_vx_info(id);
	if (!vxi)
		return -ESRCH;

	vc_data.xid = vxi->vx_id;
	vc_data.initpid = vxi->vx_initpid;
	put_vx_info(vxi);

	if (copy_to_user(data, &vc_data, sizeof(vc_data)))
		return -EFAULT;
	return 0;
}


/* context functions */

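/*	vc_ctx_create()

	* create a new context and migrate
	* the current task into it				*/
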
int vc_ctx_create(uint32_t xid, void __user *data)
{
	struct vx_info *new_vxi;
	int ret;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if ((xid >= MIN_D_CONTEXT) && (xid != VX_DYNAMIC_ID))
		return -EINVAL;

	if (xid < 1)
		return -EINVAL;

	new_vxi = __loc_vx_info(xid, &ret);
	if (!new_vxi)
		return ret;
	if (!(new_vxi->vx_flags & VXF_STATE_SETUP)) {
		ret = -EEXIST;
		goto out_put;
	}

	ret = new_vxi->vx_id;
	vx_migrate_task(current, new_vxi);
	/* if this fails, we might end up with a hashed vx_info */
out_put:
	put_vx_info(new_vxi);
	return ret;
}


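/*	vc_ctx_migrate()

	* migrate the current task
	* to an existing context				*/
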
int vc_ctx_migrate(uint32_t id, void __user *data)
{
	struct vx_info *vxi;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	/* dirty hack until Spectator becomes a cap */
	if (id == 1) {
		current->xid = 1;
		return 0;
	}

	vxi = locate_vx_info(id);
	if (!vxi)
		return -ESRCH;
	vx_migrate_task(current, vxi);
	put_vx_info(vxi);
	return 0;
}


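/*	vc_get_cflags()

	* copy the context flags and mask
	* to the caller supplied buffer				*/
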
int vc_get_cflags(uint32_t id, void __user *data)
{
	struct vx_info *vxi;
	struct vcmd_ctx_flags_v0 vc_data;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	vxi = locate_vx_info(id);
	if (!vxi)
		return -ESRCH;

	vc_data.flagword = vxi->vx_flags;

	/* special STATE flag handling */
	vc_data.mask = vx_mask_flags(~0UL, vxi->vx_flags, VXF_ONE_TIME);

	put_vx_info(vxi);

	if (copy_to_user(data, &vc_data, sizeof(vc_data)))
		return -EFAULT;
	return 0;
}

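/*	vc_set_cflags()

	* update the context flags
	* one time (STATE) flags get special handling		*/
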
int vc_set_cflags(uint32_t id, void __user *data)
{
	struct vx_info *vxi;
	struct vcmd_ctx_flags_v0 vc_data;
	uint64_t mask, trigger;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (copy_from_user(&vc_data, data, sizeof(vc_data)))
		return -EFAULT;

	vxi = locate_vx_info(id);
	if (!vxi)
		return -ESRCH;

	/* special STATE flag handling */
	mask = vx_mask_mask(vc_data.mask, vxi->vx_flags, VXF_ONE_TIME);
	trigger = (mask & vxi->vx_flags) ^ (mask & vc_data.flagword);

	if (trigger & VXF_STATE_SETUP)
		vx_mask_bcaps(current);
	if ((trigger & VXF_STATE_INIT) &&
		(vxi == current->vx_info))
		vx_set_init(vxi, current);

	vxi->vx_flags = vx_mask_flags(vxi->vx_flags,
		vc_data.flagword, mask);
	put_vx_info(vxi);
	return 0;
}

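/*	vc_get_ccaps()

	* copy the context and base capabilities
	* to the caller supplied buffer				*/
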
int vc_get_ccaps(uint32_t id, void __user *data)
{
	struct vx_info *vxi;
	struct vcmd_ctx_caps_v0 vc_data;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	vxi = locate_vx_info(id);
	if (!vxi)
		return -ESRCH;

	vc_data.bcaps = vxi->vx_bcaps;
	vc_data.ccaps = vxi->vx_ccaps;
	vc_data.cmask = ~0UL;
	put_vx_info(vxi);

	if (copy_to_user(data, &vc_data, sizeof(vc_data)))
		return -EFAULT;
	return 0;
}

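/*	vc_set_ccaps()

	* update the context capabilities
	* bcaps can only be reduced				*/
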
int vc_set_ccaps(uint32_t id, void __user *data)
{
	struct vx_info *vxi;
	struct vcmd_ctx_caps_v0 vc_data;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (copy_from_user(&vc_data, data, sizeof(vc_data)))
		return -EFAULT;

	vxi = locate_vx_info(id);
	if (!vxi)
		return -ESRCH;

	vxi->vx_bcaps &= vc_data.bcaps;
	vxi->vx_ccaps = vx_mask_flags(vxi->vx_ccaps,
		vc_data.ccaps, vc_data.cmask);
	put_vx_info(vxi);
	return 0;
}

#include <linux/module.h>

EXPORT_SYMBOL_GPL(rcu_free_vx_info);
EXPORT_SYMBOL_GPL(vx_info_hash_lock);
EXPORT_SYMBOL_GPL(unhash_vx_info);