Initial revision
authorMark Huang <mlhuang@cs.princeton.edu>
Wed, 2 Jun 2004 20:45:37 +0000 (20:45 +0000)
committerMark Huang <mlhuang@cs.princeton.edu>
Wed, 2 Jun 2004 20:45:37 +0000 (20:45 +0000)
29 files changed:
include/linux/ninline.h [new file with mode: 0644]
include/linux/vinline.h [new file with mode: 0644]
include/linux/vserver.h [new file with mode: 0644]
include/linux/vserver/context.h [new file with mode: 0644]
include/linux/vserver/cvirt.h [new file with mode: 0644]
include/linux/vserver/inode.h [new file with mode: 0644]
include/linux/vserver/legacy.h [new file with mode: 0644]
include/linux/vserver/limit.h [new file with mode: 0644]
include/linux/vserver/namespace.h [new file with mode: 0644]
include/linux/vserver/network.h [new file with mode: 0644]
include/linux/vserver/sched.h [new file with mode: 0644]
include/linux/vserver/signal.h [new file with mode: 0644]
include/linux/vserver/switch.h [new file with mode: 0644]
include/linux/vserver/xid.h [new file with mode: 0644]
kernel/vserver/Kconfig [new file with mode: 0644]
kernel/vserver/Makefile [new file with mode: 0644]
kernel/vserver/context.c [new file with mode: 0644]
kernel/vserver/cvirt.c [new file with mode: 0644]
kernel/vserver/init.c [new file with mode: 0644]
kernel/vserver/inode.c [new file with mode: 0644]
kernel/vserver/legacy.c [new file with mode: 0644]
kernel/vserver/limit.c [new file with mode: 0644]
kernel/vserver/namespace.c [new file with mode: 0644]
kernel/vserver/network.c [new file with mode: 0644]
kernel/vserver/proc.c [new file with mode: 0644]
kernel/vserver/sched.c [new file with mode: 0644]
kernel/vserver/signal.c [new file with mode: 0644]
kernel/vserver/switch.c [new file with mode: 0644]
kernel/vserver/sysctl.c [new file with mode: 0644]

diff --git a/include/linux/ninline.h b/include/linux/ninline.h
new file mode 100644 (file)
index 0000000..d3f7525
--- /dev/null
@@ -0,0 +1,151 @@
+#ifndef _NX_INLINE_H
+#define _NX_INLINE_H
+
+
+// #define NX_DEBUG
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+
+#include "vserver/network.h"
+
+#if defined(NX_DEBUG)
+#define nxdprintk(x...) printk("nxd: " x)
+#else
+#define nxdprintk(x...)
+#endif
+
+
+void free_nx_info(struct nx_info *);
+
+extern int proc_pid_nx_info(struct task_struct *, char *);
+
+
+#define get_nx_info(i) __get_nx_info(i,__FILE__,__LINE__)
+
+static __inline__ struct nx_info *__get_nx_info(struct nx_info *nxi, const char *_file, int _line)
+{
+       if (!nxi)
+               return NULL;
+       nxdprintk("get_nx_info(%p[%d.%d])\t%s:%d\n",
+               nxi, nxi?nxi->nx_id:0, nxi?atomic_read(&nxi->nx_refcount):0,
+               _file, _line);
+       atomic_inc(&nxi->nx_refcount);
+       return nxi;
+}
+
+#define put_nx_info(i) __put_nx_info(i,__FILE__,__LINE__)
+
+static __inline__ void __put_nx_info(struct nx_info *nxi, const char *_file, int _line)
+{
+       if (!nxi)
+               return;
+       nxdprintk("put_nx_info(%p[%d.%d])\t%s:%d\n",
+               nxi, nxi?nxi->nx_id:0, nxi?atomic_read(&nxi->nx_refcount):0,
+               _file, _line);
+       if (atomic_dec_and_lock(&nxi->nx_refcount, &nxlist_lock)) {
+               list_del(&nxi->nx_list);
+               spin_unlock(&nxlist_lock);
+               free_nx_info(nxi);
+       }
+}
+
+
+#define set_nx_info(p,i) __set_nx_info(p,i,__FILE__,__LINE__)
+
+static inline void __set_nx_info(struct nx_info **nxp, struct nx_info *nxi,
+       const char *_file, int _line)
+{
+       BUG_ON(*nxp);
+       if (!nxi)
+               return;
+       nxdprintk("set_nx_info(%p[#%d.%d])\t%s:%d\n",
+               nxi, nxi?nxi->nx_id:0, nxi?atomic_read(&nxi->nx_refcount):0,
+               _file, _line);
+       *nxp = __get_nx_info(nxi, _file, _line);
+}
+
+#define        clr_nx_info(p)  __clr_nx_info(p,__FILE__,__LINE__)
+
+static inline void __clr_nx_info(struct nx_info **nxp,
+       const char *_file, int _line)
+{
+       struct nx_info *nxo = *nxp;
+
+       if (!nxo)
+               return;
+       nxdprintk("clr_nx_info(%p[#%d.%d])\t%s:%d\n",
+               nxo, nxo?nxo->nx_id:0, nxo?atomic_read(&nxo->nx_refcount):0,
+               _file, _line);
+       *nxp = NULL;
+       wmb();
+       __put_nx_info(nxo, _file, _line);
+}
+
+
+#define task_get_nx_info(i)    __task_get_nx_info(i,__FILE__,__LINE__)
+
+static __inline__ struct nx_info *__task_get_nx_info(struct task_struct *p,
+       const char *_file, int _line)
+{
+       struct nx_info *nxi;
+       
+       task_lock(p);
+       nxi = __get_nx_info(p->nx_info, _file, _line);
+       task_unlock(p);
+       return nxi;
+}
+
+#define nx_verify_info(p,i)    \
+       __nx_verify_info((p)->nx_info,i,__FILE__,__LINE__)
+
+static __inline__ void __nx_verify_info(
+       struct nx_info *ipa, struct nx_info *ipb,
+       const char *_file, int _line)
+{
+       if (ipa == ipb)
+               return;
+       printk(KERN_ERR "ip bad assumption (%p==%p) at %s:%d\n",
+               ipa, ipb, _file, _line);
+}
+
+
+#define nx_task_nid(t) ((t)->nid)
+
+#define nx_current_nid() nx_task_nid(current)
+
+#define nx_check(c,m)  __nx_check(nx_current_nid(),c,m)
+
+#define nx_weak_check(c,m)     ((m) ? nx_check(c,m) : 1)
+
+#undef nxdprintk
+#define nxdprintk(x...)
+
+
+#define __nx_flags(v,m,f)      (((v) & (m)) ^ (f))
+
+#define        __nx_task_flags(t,m,f) \
+       (((t) && ((t)->nx_info)) ? \
+               __nx_flags((t)->nx_info->nx_flags,(m),(f)) : 0)
+
+#define nx_current_flags() \
+       ((current->nx_info) ? current->nx_info->nx_flags : 0)
+
+#define nx_flags(m,f)  __nx_flags(nx_current_flags(),(m),(f))
+
+
+#define nx_current_ncaps() \
+       ((current->nx_info) ? current->nx_info->nx_ncaps : 0)
+
+#define nx_ncaps(c)    (nx_current_ncaps() & (c))
+
+
+
+#define        sock_nx_init(s)  do {           \
+       (s)->sk_nid = 0;                \
+       (s)->sk_nx_info = NULL;         \
+       } while (0)
+
+
+
+#endif
diff --git a/include/linux/vinline.h b/include/linux/vinline.h
new file mode 100644 (file)
index 0000000..07bb369
--- /dev/null
@@ -0,0 +1,462 @@
+#ifndef _VX_INLINE_H
+#define _VX_INLINE_H
+
+
+// #define VX_DEBUG
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+
+#include "vserver/context.h"
+#include "vserver/limit.h"
+#include "vserver/cvirt.h"
+
+#if defined(VX_DEBUG)
+#define vxdprintk(x...) printk("vxd: " x)
+#else
+#define vxdprintk(x...)
+#endif
+
+
+
+void free_vx_info(struct vx_info *);
+
+extern int proc_pid_vx_info(struct task_struct *, char *);
+
+
+#define get_vx_info(i) __get_vx_info(i,__FILE__,__LINE__)
+
+static __inline__ struct vx_info *__get_vx_info(struct vx_info *vxi,
+       const char *_file, int _line)
+{
+       if (!vxi)
+               return NULL;
+       vxdprintk("get_vx_info(%p[#%d.%d])\t%s:%d\n",
+               vxi, vxi?vxi->vx_id:0, vxi?atomic_read(&vxi->vx_refcount):0,
+               _file, _line);
+       atomic_inc(&vxi->vx_refcount);
+       return vxi;
+}
+
+#define put_vx_info(i) __put_vx_info(i,__FILE__,__LINE__)
+
+static __inline__ void __put_vx_info(struct vx_info *vxi, const char *_file, int _line)
+{
+       if (!vxi)
+               return;
+       vxdprintk("put_vx_info(%p[#%d.%d])\t%s:%d\n",
+               vxi, vxi?vxi->vx_id:0, vxi?atomic_read(&vxi->vx_refcount):0,
+               _file, _line);
+       if (atomic_dec_and_lock(&vxi->vx_refcount, &vxlist_lock)) {
+               list_del(&vxi->vx_list);
+               spin_unlock(&vxlist_lock);
+               free_vx_info(vxi);
+       }
+}
+
+#define set_vx_info(p,i) __set_vx_info(p,i,__FILE__,__LINE__)
+
+static inline void __set_vx_info(struct vx_info **vxp, struct vx_info *vxi,
+       const char *_file, int _line)
+{
+       BUG_ON(*vxp);
+       if (!vxi)
+               return;
+       vxdprintk("set_vx_info(%p[#%d.%d])\t%s:%d\n",
+               vxi, vxi?vxi->vx_id:0, vxi?atomic_read(&vxi->vx_refcount):0,
+               _file, _line);
+       *vxp = __get_vx_info(vxi, _file, _line);
+}
+
+#define        clr_vx_info(p)  __clr_vx_info(p,__FILE__,__LINE__)
+
+static inline void __clr_vx_info(struct vx_info **vxp,
+       const char *_file, int _line)
+{
+       struct vx_info *vxo = *vxp;
+
+       vxdprintk("clr_vx_info(%p[#%d.%d])\t%s:%d\n",
+               vxo, vxo?vxo->vx_id:0, vxo?atomic_read(&vxo->vx_refcount):0,
+               _file, _line);
+       *vxp = NULL;
+       wmb();
+       __put_vx_info(vxo, _file, _line);
+}
+
+
+#define task_get_vx_info(i)    __task_get_vx_info(i,__FILE__,__LINE__)
+
+static __inline__ struct vx_info *__task_get_vx_info(struct task_struct *p,
+       const char *_file, int _line)
+{
+       struct vx_info *vxi;
+       
+       task_lock(p);
+       vxi = __get_vx_info(p->vx_info, _file, _line);
+       task_unlock(p);
+       return vxi;
+}
+
+
+#define vx_verify_info(p,i)    \
+       __vx_verify_info((p)->vx_info,i,__FILE__,__LINE__)
+
+static __inline__ void __vx_verify_info(
+       struct vx_info *vxa, struct vx_info *vxb,
+       const char *_file, int _line)
+{
+       if (vxa == vxb)
+               return;
+       printk(KERN_ERR "vx bad assumption (%p==%p) at %s:%d\n",
+               vxa, vxb, _file, _line);
+}
+
+
+#define vx_task_xid(t) ((t)->xid)
+
+#define vx_current_xid() vx_task_xid(current)
+
+#define vx_check(c,m)  __vx_check(vx_current_xid(),c,m)
+
+#define vx_weak_check(c,m)     ((m) ? vx_check(c,m) : 1)
+
+
+/*
+ * check current context for ADMIN/WATCH and
+ * optionally against supplied argument
+ */
+static __inline__ int __vx_check(xid_t cid, xid_t id, unsigned int mode)
+{
+       if (mode & VX_ARG_MASK) {
+               if ((mode & VX_IDENT) &&
+                       (id == cid))
+                       return 1;
+       }
+       if (mode & VX_ATR_MASK) {
+               if ((mode & VX_DYNAMIC) &&
+                       (id >= MIN_D_CONTEXT) &&
+                       (id <= MAX_S_CONTEXT))
+                       return 1;
+               if ((mode & VX_STATIC) &&
+                       (id > 1) && (id < MIN_D_CONTEXT))
+                       return 1;
+       }
+       return (((mode & VX_ADMIN) && (cid == 0)) ||
+               ((mode & VX_WATCH) && (cid == 1)));
+}
+
+
+#define __vx_flags(v,m,f)      (((v) & (m)) ^ (f))
+
+#define        __vx_task_flags(t,m,f) \
+       (((t) && ((t)->vx_info)) ? \
+               __vx_flags((t)->vx_info->vx_flags,(m),(f)) : 0)
+
+#define vx_current_flags() \
+       ((current->vx_info) ? current->vx_info->vx_flags : 0)
+
+#define vx_flags(m,f)  __vx_flags(vx_current_flags(),(m),(f))
+
+
+#define vx_current_ccaps() \
+       ((current->vx_info) ? current->vx_info->vx_ccaps : 0)
+
+#define vx_ccaps(c)    (vx_current_ccaps() & (c))
+
+#define vx_current_bcaps() \
+       (((current->vx_info) && !vx_flags(VXF_STATE_SETUP, 0)) ? \
+       current->vx_info->vx_bcaps : cap_bset)
+
+
+#define VX_DEBUG_ACC_RSS   0
+#define VX_DEBUG_ACC_VM    0
+#define VX_DEBUG_ACC_VML   0
+
+#undef vxdprintk
+#if    (VX_DEBUG_ACC_RSS) || (VX_DEBUG_ACC_VM) || (VX_DEBUG_ACC_VML)
+#define vxdprintk(x...) printk("vxd: " x)
+#else
+#define vxdprintk(x...)
+#endif
+
+#define vx_acc_page(m, d, v, r) \
+       __vx_acc_page(&(m->v), m->mm_vx_info, r, d, __FILE__, __LINE__)
+
+static inline void __vx_acc_page(unsigned long *v, struct vx_info *vxi,
+                int res, int dir, char *file, int line)
+{
+        if (v) {
+                if (dir > 0)
+                        ++(*v);
+                else
+                        --(*v);
+        }
+        if (vxi) {
+                if (dir > 0)
+                        atomic_inc(&vxi->limit.res[res]);
+                else
+                        atomic_dec(&vxi->limit.res[res]);
+        }
+}
+
+
+#define vx_acc_pages(m, p, v, r) \
+       __vx_acc_pages(&(m->v), m->mm_vx_info, r, p, __FILE__, __LINE__)
+
+static inline void __vx_acc_pages(unsigned long *v, struct vx_info *vxi,
+                int res, int pages, char *file, int line)
+{
+        if ((res == RLIMIT_RSS && VX_DEBUG_ACC_RSS) ||
+               (res == RLIMIT_AS && VX_DEBUG_ACC_VM) ||
+               (res == RLIMIT_MEMLOCK && VX_DEBUG_ACC_VML))
+               vxdprintk("vx_acc_pages  [%5d,%2d]: %5d += %5d in %s:%d\n",
+                       (vxi?vxi->vx_id:-1), res,
+                       (vxi?atomic_read(&vxi->limit.res[res]):0),
+                       pages, file, line);
+        if (pages == 0)
+                return;
+        if (v)
+                *v += pages;
+        if (vxi)
+                atomic_add(pages, &vxi->limit.res[res]);
+}
+
+
+
+#define vx_acc_vmpage(m,d)     vx_acc_page(m, d, total_vm,  RLIMIT_AS)
+#define vx_acc_vmlpage(m,d)    vx_acc_page(m, d, locked_vm, RLIMIT_MEMLOCK)
+#define vx_acc_rsspage(m,d)    vx_acc_page(m, d, rss,      RLIMIT_RSS)
+
+#define vx_acc_vmpages(m,p)    vx_acc_pages(m, p, total_vm,  RLIMIT_AS)
+#define vx_acc_vmlpages(m,p)   vx_acc_pages(m, p, locked_vm, RLIMIT_MEMLOCK)
+#define vx_acc_rsspages(m,p)   vx_acc_pages(m, p, rss,       RLIMIT_RSS)
+
+#define vx_pages_add(s,r,p)    __vx_acc_pages(0, s, r, p, __FILE__, __LINE__)
+#define vx_pages_sub(s,r,p)    __vx_pages_add(s, r, -(p))
+
+#define vx_vmpages_inc(m)      vx_acc_vmpage(m, 1)
+#define vx_vmpages_dec(m)      vx_acc_vmpage(m,-1)
+#define vx_vmpages_add(m,p)    vx_acc_vmpages(m, p)
+#define vx_vmpages_sub(m,p)    vx_acc_vmpages(m,-(p))
+
+#define vx_vmlocked_inc(m)     vx_acc_vmlpage(m, 1)
+#define vx_vmlocked_dec(m)     vx_acc_vmlpage(m,-1)
+#define vx_vmlocked_add(m,p)   vx_acc_vmlpages(m, p)
+#define vx_vmlocked_sub(m,p)   vx_acc_vmlpages(m,-(p))
+
+#define vx_rsspages_inc(m)     vx_acc_rsspage(m, 1)
+#define vx_rsspages_dec(m)     vx_acc_rsspage(m,-1)
+#define vx_rsspages_add(m,p)   vx_acc_rsspages(m, p)
+#define vx_rsspages_sub(m,p)   vx_acc_rsspages(m,-(p))
+
+
+
+#define vx_pages_avail(m, p, r) \
+        __vx_pages_avail((m)->mm_vx_info, (r), (p), __FILE__, __LINE__)
+
+static inline int __vx_pages_avail(struct vx_info *vxi,
+                int res, int pages, char *file, int line)
+{
+        if ((res == RLIMIT_RSS && VX_DEBUG_ACC_RSS) ||
+                (res == RLIMIT_AS && VX_DEBUG_ACC_VM) ||
+                (res == RLIMIT_MEMLOCK && VX_DEBUG_ACC_VML))
+                printk("vx_pages_avail[%5d,%2d]: %5ld > %5d + %5d in %s:%d\n",
+                        (vxi?vxi->vx_id:-1), res,
+                       (vxi?vxi->limit.rlim[res]:1),
+                        (vxi?atomic_read(&vxi->limit.res[res]):0),
+                       pages, file, line);
+        if (!vxi)
+                return 1;
+        if (vxi->limit.rlim[res] == RLIM_INFINITY)
+                return 1;
+        if (atomic_read(&vxi->limit.res[res]) + pages < vxi->limit.rlim[res])
+                return 1;
+        return 0;
+}
+
+#define vx_vmpages_avail(m,p)  vx_pages_avail(m, p, RLIMIT_AS)
+#define vx_vmlocked_avail(m,p) vx_pages_avail(m, p, RLIMIT_MEMLOCK)
+#define vx_rsspages_avail(m,p) vx_pages_avail(m, p, RLIMIT_RSS)
+
+/* file limits */
+
+#define VX_DEBUG_ACC_FILE      0
+#define VX_DEBUG_ACC_OPENFD    0
+
+#undef vxdprintk
+#if    (VX_DEBUG_ACC_FILE) || (VX_DEBUG_ACC_OPENFD)
+#define vxdprintk(x...) printk("vxd: " x)
+#else
+#define vxdprintk(x...)
+#endif
+
+
+#define vx_acc_cres(v,d,r) \
+       __vx_acc_cres((v), (r), (d), __FILE__, __LINE__)
+
+static inline void __vx_acc_cres(struct vx_info *vxi,
+       int res, int dir, char *file, int line)
+{
+        if (vxi) {
+       if ((res == RLIMIT_NOFILE && VX_DEBUG_ACC_FILE) ||
+                       (res == RLIMIT_OPENFD && VX_DEBUG_ACC_OPENFD))
+       printk("vx_acc_cres[%5d,%2d]: %5d%s in %s:%d\n",
+                        (vxi?vxi->vx_id:-1), res,
+                        (vxi?atomic_read(&vxi->limit.res[res]):0),
+                       (dir>0)?"++":"--", file, line);
+                if (dir > 0)
+                        atomic_inc(&vxi->limit.res[res]);
+                else
+                        atomic_dec(&vxi->limit.res[res]);
+        }
+}
+
+#define vx_files_inc(f)        vx_acc_cres(current->vx_info, 1, RLIMIT_NOFILE)
+#define vx_files_dec(f)        vx_acc_cres(current->vx_info,-1, RLIMIT_NOFILE)
+
+#define vx_openfd_inc(f) vx_acc_cres(current->vx_info, 1, RLIMIT_OPENFD)
+#define vx_openfd_dec(f) vx_acc_cres(current->vx_info,-1, RLIMIT_OPENFD)
+
+#define vx_cres_avail(v,n,r) \
+        __vx_cres_avail((v), (r), (n), __FILE__, __LINE__)
+
+static inline int __vx_cres_avail(struct vx_info *vxi,
+                int res, int num, char *file, int line)
+{
+       if ((res == RLIMIT_NOFILE && VX_DEBUG_ACC_FILE) ||
+               (res == RLIMIT_OPENFD && VX_DEBUG_ACC_OPENFD))
+                printk("vx_cres_avail[%5d,%2d]: %5ld > %5d + %5d in %s:%d\n",
+                        (vxi?vxi->vx_id:-1), res,
+                       (vxi?vxi->limit.rlim[res]:1),
+                        (vxi?atomic_read(&vxi->limit.res[res]):0),
+                       num, file, line);
+        if (!vxi)
+                return 1;
+        if (vxi->limit.rlim[res] == RLIM_INFINITY)
+                return 1;
+        if (vxi->limit.rlim[res] < atomic_read(&vxi->limit.res[res]) + num)
+                return 0;
+        return 1;
+}
+
+#define vx_files_avail(n) \
+       vx_cres_avail(current->vx_info, (n), RLIMIT_NOFILE)
+
+#define vx_openfd_avail(n) \
+       vx_cres_avail(current->vx_info, (n), RLIMIT_OPENFD)
+
+/* socket limits */
+
+#define vx_sock_inc(f) vx_acc_cres(current->vx_info, 1, VLIMIT_SOCK)
+#define vx_sock_dec(f) vx_acc_cres(current->vx_info,-1, VLIMIT_SOCK)
+
+#define vx_sock_avail(n) \
+       vx_cres_avail(current->vx_info, (n), VLIMIT_SOCK)
+
+/* procfs ioctls */
+
+#define        FIOC_GETXFLG    _IOR('x', 5, long)
+#define        FIOC_SETXFLG    _IOW('x', 6, long)
+
+/* utsname virtualization */
+
+static inline struct new_utsname *vx_new_utsname(void)
+{
+       if (current->vx_info)
+               return &current->vx_info->cvirt.utsname;
+       return &system_utsname;
+}
+
+#define vx_new_uts(x)          ((vx_new_utsname())->x)
+
+/* generic flag merging */
+
+#define        vx_mask_flags(v,f,m)    (((v) & ~(m)) | ((f) & (m)))
+
+#define        vx_mask_mask(v,f,m)     (((v) & ~(m)) | ((v) & (f) & (m)))
+
+
+/* socket accounting */
+
+#include <linux/socket.h>
+
+static inline int vx_sock_type(int family)
+{
+       int type = 4;
+
+       if (family > 0 && family < 3)
+               type = family;
+       else if (family == PF_INET6)
+               type = 3;
+       return type;
+}
+
+#define vx_acc_sock(v,f,p,s) \
+       __vx_acc_sock((v), (f), (p), (s), __FILE__, __LINE__)
+
+static inline void __vx_acc_sock(struct vx_info *vxi,
+       int family, int pos, int size, char *file, int line)
+{
+        if (vxi) {
+               int type = vx_sock_type(family);
+
+               atomic_inc(&vxi->cacct.sock[type][pos].count);
+               atomic_add(size, &vxi->cacct.sock[type][pos].total);
+        }
+}
+
+#define vx_sock_recv(sk,s) \
+       vx_acc_sock((sk)->sk_vx_info, (sk)->sk_family, 0, (s))
+#define vx_sock_send(sk,s) \
+       vx_acc_sock((sk)->sk_vx_info, (sk)->sk_family, 1, (s))
+#define vx_sock_fail(sk,s) \
+       vx_acc_sock((sk)->sk_vx_info, (sk)->sk_family, 2, (s))
+
+
+#define        sock_vx_init(s)  do {           \
+       (s)->sk_xid = 0;                \
+       (s)->sk_vx_info = NULL;         \
+       } while (0)
+
+
+/* pid faking stuff */
+
+
+#define vx_map_tgid(v,p) \
+       __vx_map_tgid((v), (p), __FILE__, __LINE__)
+
+static inline int __vx_map_tgid(struct vx_info *vxi, int pid,
+       char *file, int line)
+{
+       if (vxi && __vx_flags(vxi->vx_flags, VXF_INFO_INIT, 0)) {
+               vxdprintk("vx_map_tgid: %p/%llx: %d -> %d in %s:%d\n",
+                       vxi, vxi->vx_flags, pid,
+                       (pid == vxi->vx_initpid)?1:pid,
+                       file, line);
+               if (pid == vxi->vx_initpid)
+                       return 1;
+       }
+       return pid;
+}
+
+#define vx_rmap_tgid(v,p) \
+       __vx_rmap_tgid((v), (p), __FILE__, __LINE__)
+
+static inline int __vx_rmap_tgid(struct vx_info *vxi, int pid,
+       char *file, int line)
+{
+       if (vxi && __vx_flags(vxi->vx_flags, VXF_INFO_INIT, 0)) {
+               vxdprintk("vx_rmap_tgid: %p/%llx: %d -> %d in %s:%d\n",
+                       vxi, vxi->vx_flags, pid,
+                       (pid == 1)?vxi->vx_initpid:pid,
+                       file, line);
+               if ((pid == 1) && vxi->vx_initpid)
+                       return vxi->vx_initpid;
+       }
+       return pid;
+}
+
+#undef vxdprintk
+#define vxdprintk(x...)
+
+#endif
diff --git a/include/linux/vserver.h b/include/linux/vserver.h
new file mode 100644 (file)
index 0000000..2c39ebb
--- /dev/null
@@ -0,0 +1,9 @@
+#ifndef _LINUX_VSERVER_H
+#define _LINUX_VSERVER_H
+
+#include <linux/vserver/context.h>
+#include <linux/vserver/network.h>
+#include <linux/vinline.h>
+#include <linux/ninline.h>
+
+#endif
diff --git a/include/linux/vserver/context.h b/include/linux/vserver/context.h
new file mode 100644 (file)
index 0000000..7692603
--- /dev/null
@@ -0,0 +1,176 @@
+#ifndef _VX_CONTEXT_H
+#define _VX_CONTEXT_H
+
+#include <linux/types.h>
+
+#define MAX_S_CONTEXT  65535   /* Arbitrary limit */
+#define MIN_D_CONTEXT  49152   /* dynamic contexts start here */
+
+#define VX_DYNAMIC_ID  ((uint32_t)-1)          /* id for dynamic context */
+
+#ifdef __KERNEL__
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+
+#define _VX_INFO_DEF_
+#include "cvirt.h"
+#include "limit.h"
+#include "sched.h"
+#undef _VX_INFO_DEF_
+
+struct vx_info {
+       struct list_head vx_list;               /* linked list of contexts */
+       xid_t vx_id;                            /* context id */
+       atomic_t vx_refcount;                   /* refcount */
+       struct vx_info *vx_parent;              /* parent context */
+
+       struct namespace *vx_namespace;         /* private namespace */
+       struct fs_struct *vx_fs;                /* private namespace fs */
+       uint64_t vx_flags;                      /* VX_INFO_xxx */
+       uint64_t vx_bcaps;                      /* bounding caps (system) */
+       uint64_t vx_ccaps;                      /* context caps (vserver) */
+
+       pid_t vx_initpid;                       /* PID of fake init process */
+
+       struct _vx_limit limit;                 /* vserver limits */
+       struct _vx_sched sched;                 /* vserver scheduler */
+       struct _vx_cvirt cvirt;                 /* virtual/bias stuff */
+       struct _vx_cacct cacct;                 /* context accounting */
+
+       char vx_name[65];                       /* vserver name */
+};
+
+
+extern spinlock_t vxlist_lock;
+extern struct list_head vx_infos;
+
+
+#define VX_ADMIN       0x0001
+#define VX_WATCH       0x0002
+#define VX_DUMMY       0x0008
+
+#define VX_IDENT       0x0010
+#define VX_EQUIV       0x0020
+#define VX_PARENT      0x0040
+#define VX_CHILD       0x0080
+
+#define VX_ARG_MASK    0x00F0
+
+#define VX_DYNAMIC     0x0100
+#define VX_STATIC      0x0200
+
+#define VX_ATR_MASK    0x0F00
+
+
+void free_vx_info(struct vx_info *);
+
+extern struct vx_info *find_vx_info(int);
+extern struct vx_info *find_or_create_vx_info(int);
+extern int vx_info_id_valid(int);
+
+extern int vx_migrate_task(struct task_struct *, struct vx_info *);
+
+#endif /* __KERNEL__ */
+
+#include "switch.h"
+
+/* vinfo commands */
+
+#define VCMD_task_xid          VC_CMD(VINFO, 1, 0)
+#define VCMD_task_nid          VC_CMD(VINFO, 2, 0)
+
+#ifdef __KERNEL__
+extern int vc_task_xid(uint32_t, void __user *);
+
+#endif /* __KERNEL__ */
+
+#define VCMD_vx_info           VC_CMD(VINFO, 5, 0)
+#define VCMD_nx_info           VC_CMD(VINFO, 6, 0)
+
+struct  vcmd_vx_info_v0 {
+       uint32_t xid;
+       uint32_t initpid;
+       /* more to come */      
+};
+
+#ifdef __KERNEL__
+extern int vc_vx_info(uint32_t, void __user *);
+
+#endif /* __KERNEL__ */
+
+#define VCMD_ctx_create                VC_CMD(VPROC, 1, 0)
+#define VCMD_ctx_migrate       VC_CMD(PROCMIG, 1, 0)
+
+#ifdef __KERNEL__
+extern int vc_ctx_create(uint32_t, void __user *);
+extern int vc_ctx_migrate(uint32_t, void __user *);
+
+#endif /* __KERNEL__ */
+
+#define VCMD_get_cflags                VC_CMD(FLAGS, 1, 0)
+#define VCMD_set_cflags                VC_CMD(FLAGS, 2, 0)
+
+struct  vcmd_ctx_flags_v0 {
+       uint64_t flagword;
+       uint64_t mask;
+};
+
+#ifdef __KERNEL__
+extern int vc_get_cflags(uint32_t, void __user *);
+extern int vc_set_cflags(uint32_t, void __user *);
+
+#endif /* __KERNEL__ */
+
+#define VXF_INFO_LOCK          0x00000001
+#define VXF_INFO_SCHED         0x00000002
+#define VXF_INFO_NPROC         0x00000004
+#define VXF_INFO_PRIVATE       0x00000008
+
+#define VXF_INFO_INIT          0x00000010
+#define VXF_INFO_HIDE          0x00000020
+#define VXF_INFO_ULIMIT                0x00000040
+#define VXF_INFO_NSPACE                0x00000080
+
+#define VXF_SCHED_HARD         0x00000100
+#define VXF_SCHED_PRIO         0x00000200
+#define VXF_SCHED_PAUSE                0x00000400
+
+#define VXF_VIRT_MEM           0x00010000
+#define VXF_VIRT_UPTIME                0x00020000
+#define VXF_VIRT_CPU           0x00040000
+
+#define VXF_HIDE_MOUNT         0x01000000
+#define VXF_HIDE_NETIF         0x02000000
+
+#define VXF_STATE_SETUP                (1ULL<<32)
+#define VXF_STATE_INIT         (1ULL<<33)
+
+#define        VXF_FORK_RSS            (1ULL<<48)
+
+#define VXF_ONE_TIME           (0x0003ULL<<32)
+
+#define VCMD_get_ccaps         VC_CMD(FLAGS, 3, 0)
+#define VCMD_set_ccaps         VC_CMD(FLAGS, 4, 0)
+
+struct  vcmd_ctx_caps_v0 {
+       uint64_t bcaps;
+       uint64_t ccaps;
+       uint64_t cmask;
+};
+
+#ifdef __KERNEL__
+extern int vc_get_ccaps(uint32_t, void __user *);
+extern int vc_set_ccaps(uint32_t, void __user *);
+
+#endif /* __KERNEL__ */
+
+#define VXC_SET_UTSNAME                0x00000001
+#define VXC_SET_RLIMIT         0x00000002
+
+#define VXC_ICMP_PING          0x00000100
+
+#define VXC_SECURE_MOUNT       0x00010000
+
+
+#endif /* _VX_CONTEXT_H */
diff --git a/include/linux/vserver/cvirt.h b/include/linux/vserver/cvirt.h
new file mode 100644 (file)
index 0000000..ba3a253
--- /dev/null
@@ -0,0 +1,133 @@
+#if    defined(__KERNEL__) && defined(_VX_INFO_DEF_)
+
+#include <linux/utsname.h>
+#include <linux/rwsem.h>
+#include <linux/jiffies.h>
+#include <linux/time.h>
+#include <asm/atomic.h>
+
+/* context sub struct */
+
+struct _vx_cvirt {
+       int max_threads;
+
+       unsigned int bias_cswtch;
+       struct timespec bias_idle;
+       struct timespec bias_tp;
+       uint64_t bias_jiffies;
+
+       struct new_utsname utsname;
+};
+
+struct sock_acc {
+       atomic_t count;
+       atomic_t total;
+};
+
+struct _vx_cacct {
+       atomic_t nr_threads;
+       int nr_running;
+
+       unsigned long total_forks;
+
+       struct sock_acc sock[5][3];
+};
+
+
+static inline long vx_sock_count(struct _vx_cacct *cacct, int type, int pos)
+{
+       return atomic_read(&cacct->sock[type][pos].count);
+}
+
+
+static inline long vx_sock_total(struct _vx_cacct *cacct, int type, int pos)
+{
+       return atomic_read(&cacct->sock[type][pos].total);
+}
+
+
+extern uint64_t vx_idle_jiffies(void);
+
+static inline void vx_info_init_cvirt(struct _vx_cvirt *cvirt)
+{
+       uint64_t idle_jiffies = vx_idle_jiffies();
+
+       // new->virt.bias_cswtch = kstat.context_swtch;
+       cvirt->bias_jiffies = get_jiffies_64();
+
+       jiffies_to_timespec(idle_jiffies, &cvirt->bias_idle);
+       do_posix_clock_monotonic_gettime(&cvirt->bias_tp);
+
+       down_read(&uts_sem);
+       cvirt->utsname = system_utsname;
+       up_read(&uts_sem);
+}
+
+static inline void vx_info_exit_cvirt(struct _vx_cvirt *cvirt)
+{
+       return;
+}
+
+static inline void vx_info_init_cacct(struct _vx_cacct *cacct)
+{
+       int i,j;
+
+       atomic_set(&cacct->nr_threads, 1);
+       for (i=0; i<5; i++) {
+               for (j=0; j<3; j++) {
+                       atomic_set(&cacct->sock[i][j].count, 0);
+                       atomic_set(&cacct->sock[i][j].total, 0);
+               }
+       }
+}
+
+static inline void vx_info_exit_cacct(struct _vx_cacct *cacct)
+{
+       return;
+}
+
+static inline int vx_info_proc_cvirt(struct _vx_cvirt *cvirt, char *buffer)
+{
+       int length = 0;
+       return length;
+}
+
+static inline int vx_info_proc_cacct(struct _vx_cacct *cacct, char *buffer)
+{
+       int i,j, length = 0;
+       static char *type[] = { "UNSPEC", "UNIX", "INET", "INET6", "OTHER" };
+
+       for (i=0; i<5; i++) {
+               length += sprintf(buffer + length,
+                       "%s:", type[i]);
+               for (j=0; j<3; j++) {
+                       length += sprintf(buffer + length,
+                               "\t%12lu/%-12lu"
+                               ,vx_sock_count(cacct, i, j)
+                               ,vx_sock_total(cacct, i, j)
+                               );
+               }       
+               buffer[length++] = '\n';
+       }
+       return length;
+}
+
+#else  /* _VX_INFO_DEF_ */
+#ifndef _VX_CVIRT_H
+#define _VX_CVIRT_H
+
+#include "switch.h"
+
+/*  cvirt vserver commands */
+
+
+#ifdef __KERNEL__
+
+struct timespec;
+
+void vx_vsi_uptime(struct timespec *uptime, struct timespec *idle);
+
+#endif /* __KERNEL__ */
+
+#endif /* _VX_CVIRT_H */
+#endif
diff --git a/include/linux/vserver/inode.h b/include/linux/vserver/inode.h
new file mode 100644 (file)
index 0000000..aa8852f
--- /dev/null
@@ -0,0 +1,67 @@
+#ifndef _VX_INODE_H
+#define _VX_INODE_H
+
+#include "switch.h"
+
+/*  inode vserver commands */
+
+#define VCMD_get_iattr_v0      VC_CMD(INODE, 1, 0)
+#define VCMD_set_iattr_v0      VC_CMD(INODE, 2, 0)
+
+#define VCMD_get_iattr         VC_CMD(INODE, 1, 1)
+#define VCMD_set_iattr         VC_CMD(INODE, 2, 1)
+
+struct  vcmd_ctx_iattr_v0 {
+       /* device handle in id */
+       uint64_t ino;
+       uint32_t xid;
+       uint32_t flags;
+       uint32_t mask;
+};
+
+struct  vcmd_ctx_iattr_v1 {
+       const char __user *name;
+       uint32_t xid;
+       uint32_t flags;
+       uint32_t mask;
+};
+
+
+#define IATTR_XID      0x01000000
+
+#define IATTR_ADMIN    0x00000001
+#define IATTR_WATCH    0x00000002
+#define IATTR_HIDE     0x00000004
+#define IATTR_FLAGS    0x00000007
+
+#define IATTR_BARRIER  0x00010000
+#define IATTR_IUNLINK  0x00020000
+#define IATTR_IMMUTABLE        0x00040000
+
+
+#ifdef CONFIG_PROC_SECURE
+#define IATTR_PROC_DEFAULT     ( IATTR_ADMIN | IATTR_HIDE )
+#define IATTR_PROC_SYMLINK     ( IATTR_ADMIN )
+#else
+#define IATTR_PROC_DEFAULT     ( IATTR_ADMIN )
+#define IATTR_PROC_SYMLINK     ( IATTR_ADMIN )
+#endif
+
+#ifdef __KERNEL__
+
+#define vx_hide_check(c,m)      (((m) & IATTR_HIDE) ? vx_check(c,m) : 1)
+
+extern int vc_get_iattr_v0(uint32_t, void __user *);
+extern int vc_set_iattr_v0(uint32_t, void __user *);
+
+extern int vc_get_iattr(uint32_t, void __user *);
+extern int vc_set_iattr(uint32_t, void __user *);
+
+#endif /* __KERNEL__ */
+
+/* inode ioctls */
+
+#define FIOC_GETXFLG   _IOR('x', 5, long)
+#define FIOC_SETXFLG   _IOW('x', 6, long)
+
+#endif /* _VX_INODE_H */
diff --git a/include/linux/vserver/legacy.h b/include/linux/vserver/legacy.h
new file mode 100644 (file)
index 0000000..1372c0f
--- /dev/null
@@ -0,0 +1,54 @@
+#ifndef _VX_LEGACY_H
+#define _VX_LEGACY_H
+
+#include "switch.h"
+#include "network.h"
+
+/*  compatibility vserver commands */
+
+#define VCMD_new_s_context     VC_CMD(COMPAT, 1, 1)
+#define VCMD_set_ipv4root      VC_CMD(COMPAT, 2, 3)
+
+#define VCMD_create_context    VC_CMD(VSETUP, 1, 0)
+
+/*  compatibility vserver arguments */
+
+struct  vcmd_new_s_context_v1 {
+       uint32_t remove_cap;
+       uint32_t flags;
+};
+
+struct  vcmd_set_ipv4root_v3 {
+       /* number of pairs in id */
+       uint32_t broadcast;
+       struct {
+               uint32_t ip;
+               uint32_t mask;
+       } nx_mask_pair[NB_IPV4ROOT];
+};
+
+
+#define VX_INFO_LOCK           1       /* Can't request a new vx_id */
+#define VX_INFO_NPROC          4       /* Limit number of processes in a context */
+#define VX_INFO_PRIVATE                8       /* No one can join this security context */
+#define VX_INFO_INIT           16      /* This process wants to become the */
+                                       /* logical process 1 of the security */
+                                       /* context */
+#define VX_INFO_HIDEINFO       32      /* Hide some information in /proc */
+#define VX_INFO_ULIMIT         64      /* Use ulimit of the current process */
+                                       /* to become the global limits */
+                                       /* of the context */
+#define VX_INFO_NAMESPACE      128     /* save private namespace */
+
+       
+#define NB_S_CONTEXT   16
+
+#define NB_IPV4ROOT    16
+
+
+#ifdef __KERNEL__
+extern int vc_new_s_context(uint32_t, void __user *);
+extern int vc_set_ipv4root(uint32_t, void __user *);
+
+#endif /* __KERNEL__ */
+#endif /* _VX_LEGACY_H */
diff --git a/include/linux/vserver/limit.h b/include/linux/vserver/limit.h
new file mode 100644 (file)
index 0000000..27496c1
--- /dev/null
@@ -0,0 +1,117 @@
+#if    defined(__KERNEL__) && defined(_VX_INFO_DEF_)
+
+#include <asm/atomic.h>
+#include <asm/resource.h>
+
+/* context sub struct */
+
+#define        RLIMIT_OPENFD   12
+
+#define NUM_RLIMITS    16
+
+#define VLIMIT_SOCK    16
+
+
+struct _vx_limit {
+       atomic_t ticks;
+
+       unsigned long rlim[NUM_RLIMITS];        /* Per context limit */
+       atomic_t res[NUM_RLIMITS];              /* Current value */
+};
+
+static inline void vx_info_init_limit(struct _vx_limit *limit)
+{
+       int lim;
+
+       for (lim=0; lim<NUM_RLIMITS; lim++) {
+               limit->rlim[lim] = RLIM_INFINITY;
+               atomic_set(&limit->res[lim], 0);
+       }
+}
+
+extern unsigned int vx_debug_limit;
+
+static inline void vx_info_exit_limit(struct _vx_limit *limit)
+{
+       int lim, value;
+
+       for (lim=0; lim<NUM_RLIMITS; lim++) {
+               value = atomic_read(&limit->res[lim]);
+               if (value && vx_debug_limit)
+                       printk("!!! limit: %p[%d] = %d on exit.\n",
+                               limit, lim, value);
+       }
+}
+
+
+static inline int vx_info_proc_limit(struct _vx_limit *limit, char *buffer)
+{
+       return sprintf(buffer,
+               "PROC:\t%8d/%ld\n"
+               "VM:\t%8d/%ld\n"
+               "VML:\t%8d/%ld\n"               
+               "RSS:\t%8d/%ld\n"
+               "FILES:\t%8d/%ld\n"
+               "OFD:\t%8d/%ld\n"
+               ,atomic_read(&limit->res[RLIMIT_NPROC])
+               ,limit->rlim[RLIMIT_NPROC]
+               ,atomic_read(&limit->res[RLIMIT_AS])
+               ,limit->rlim[RLIMIT_AS]
+               ,atomic_read(&limit->res[RLIMIT_MEMLOCK])
+               ,limit->rlim[RLIMIT_MEMLOCK]
+               ,atomic_read(&limit->res[RLIMIT_RSS])
+               ,limit->rlim[RLIMIT_RSS]
+               ,atomic_read(&limit->res[RLIMIT_NOFILE])
+               ,limit->rlim[RLIMIT_NOFILE]
+               ,atomic_read(&limit->res[RLIMIT_OPENFD])
+               ,limit->rlim[RLIMIT_OPENFD]
+               );
+}
+
+#else  /* _VX_INFO_DEF_ */
+#ifndef _VX_LIMIT_H
+#define _VX_LIMIT_H
+
+#include "switch.h"
+
+/*  rlimit vserver commands */
+
+#define VCMD_get_rlimit                VC_CMD(RLIMIT, 1, 0)
+#define VCMD_set_rlimit                VC_CMD(RLIMIT, 2, 0)
+#define VCMD_get_rlimit_mask   VC_CMD(RLIMIT, 3, 0)
+
+struct  vcmd_ctx_rlimit_v0 {
+       uint32_t id;
+       uint64_t minimum;
+       uint64_t softlimit;
+       uint64_t maximum;
+};
+
+struct  vcmd_ctx_rlimit_mask_v0 {
+       uint32_t minimum;
+       uint32_t softlimit;
+       uint32_t maximum;
+};
+
+#define CRLIM_UNSET            (0ULL)
+#define CRLIM_INFINITY         (~0ULL)
+#define CRLIM_KEEP             (~1ULL)
+
+#ifdef __KERNEL__
+
+#include <linux/compiler.h>
+
+extern int vc_get_rlimit(uint32_t, void __user *);
+extern int vc_set_rlimit(uint32_t, void __user *);
+extern int vc_get_rlimit_mask(uint32_t, void __user *);
+
+struct sysinfo;
+
+void vx_vsi_meminfo(struct sysinfo *);
+void vx_vsi_swapinfo(struct sysinfo *);
+
+
+#endif /* __KERNEL__ */
+
+#endif /* _VX_LIMIT_H */
+#endif
diff --git a/include/linux/vserver/namespace.h b/include/linux/vserver/namespace.h
new file mode 100644 (file)
index 0000000..140fc79
--- /dev/null
@@ -0,0 +1,55 @@
+#ifndef _VX_NAMESPACE_H
+#define _VX_NAMESPACE_H
+
+#include <linux/types.h>
+
+       
+/* virtual host info names */
+
+#define VCMD_vx_set_vhi_name   VC_CMD(VHOST, 1, 0)
+#define VCMD_vx_get_vhi_name   VC_CMD(VHOST, 2, 0)
+
+struct  vcmd_vx_vhi_name_v0 {
+       uint32_t field;
+       char name[65];
+};
+
+
+enum vx_vhi_name_field {
+       VHIN_CONTEXT=0,
+       VHIN_SYSNAME,
+       VHIN_NODENAME,
+       VHIN_RELEASE,
+       VHIN_VERSION,
+       VHIN_MACHINE,
+       VHIN_DOMAINNAME,
+};
+
+
+#ifdef __KERNEL__
+
+#include <linux/compiler.h>
+
+extern int vc_set_vhi_name(uint32_t, void __user *);
+extern int vc_get_vhi_name(uint32_t, void __user *);
+
+#endif /* __KERNEL__ */
+
+#define VCMD_enter_namespace   VC_CMD(PROCALT, 1, 0)
+#define VCMD_cleanup_namespace VC_CMD(PROCALT, 2, 0)
+#define VCMD_set_namespace     VC_CMD(PROCALT, 3, 0)
+
+#ifdef __KERNEL__
+
+struct vx_info;
+struct namespace;
+struct fs_struct;
+
+extern int vx_set_namespace(struct vx_info *, struct namespace *, struct fs_struct *);
+
+extern int vc_enter_namespace(uint32_t, void __user *);
+extern int vc_cleanup_namespace(uint32_t, void __user *);
+extern int vc_set_namespace(uint32_t, void __user *);
+
+#endif /* __KERNEL__ */
+#endif /* _VX_NAMESPACE_H */
diff --git a/include/linux/vserver/network.h b/include/linux/vserver/network.h
new file mode 100644 (file)
index 0000000..b3c39b0
--- /dev/null
@@ -0,0 +1,142 @@
+#ifndef _VX_NETWORK_H
+#define _VX_NETWORK_H
+
+#define MAX_N_CONTEXT  65535   /* Arbitrary limit */
+
+#define IP_DYNAMIC_ID  ((uint32_t)-1)          /* id for dynamic context */
+
+#define NB_IPV4ROOT    16
+
+#ifdef __KERNEL__
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/utsname.h>
+#include <asm/resource.h>
+#include <asm/atomic.h>
+
+
+struct nx_info {
+       struct list_head nx_list;       /* linked list of nxinfos */
+       nid_t nx_id;                    /* vnet id */
+       atomic_t nx_refcount;
+
+       uint64_t nx_flags;              /* network flag word */
+       uint64_t nx_ncaps;              /* network capabilities */
+
+       int nbipv4;
+       __u32 ipv4[NB_IPV4ROOT];        /* Process can only bind to these IPs */
+                                       /* The first one is used to connect */
+                                       /* and for bind any service */
+                                       /* The others must be used explicitly */
+       __u32 mask[NB_IPV4ROOT];        /* Netmask for each ipv4 */
+                                       /* Used to select the proper source */
+                                       /* address for sockets */
+       __u32 v4_bcast;                 /* Broadcast address to receive UDP  */
+
+       char nx_name[65];               /* network context name */
+};
+
+
+extern spinlock_t nxlist_lock;
+extern struct list_head nx_infos;
+
+
+void free_nx_info(struct nx_info *);
+struct nx_info *create_nx_info(void);
+
+extern struct nx_info *find_nx_info(int);
+extern int nx_info_id_valid(int);
+
+struct in_ifaddr;
+struct net_device;
+
+int ifa_in_nx_info(struct in_ifaddr *, struct nx_info *);
+int dev_in_nx_info(struct net_device *, struct nx_info *);
+
+
+#endif /* __KERNEL__ */
+
+#include "switch.h"
+
+/* vinfo commands */
+
+#define VCMD_task_nid          VC_CMD(VINFO, 2, 0)
+
+#ifdef __KERNEL__
+extern int vc_task_nid(uint32_t, void __user *);
+
+#endif /* __KERNEL__ */
+
+#define VCMD_nx_info           VC_CMD(VINFO, 6, 0)
+
+struct  vcmd_nx_info_v0 {
+       uint32_t nid;
+       /* more to come */      
+};
+
+#ifdef __KERNEL__
+extern int vc_nx_info(uint32_t, void __user *);
+
+#endif /* __KERNEL__ */
+
+#define VCMD_net_create                VC_CMD(VNET, 1, 0)
+#define VCMD_net_migrate       VC_CMD(NETMIG, 1, 0)
+
+#define VCMD_net_add           VC_CMD(NETALT, 1, 0)
+#define VCMD_net_remove                VC_CMD(NETALT, 2, 0)
+
+struct  vcmd_net_nx_v0 {
+       uint16_t type;
+       uint16_t count;
+       uint32_t ip[4];
+       uint32_t mask[4];
+       /* more to come */      
+};
+
+//     IPN_TYPE_IPV4   
+
+
+#ifdef __KERNEL__
+extern int vc_net_create(uint32_t, void __user *);
+extern int vc_net_migrate(uint32_t, void __user *);
+
+#endif /* __KERNEL__ */
+
+#define VCMD_get_nflags                VC_CMD(FLAGS, 5, 0)
+#define VCMD_set_nflags                VC_CMD(FLAGS, 6, 0)
+
+struct  vcmd_net_flags_v0 {
+       uint64_t flagword;
+       uint64_t mask;
+};
+
+#ifdef __KERNEL__
+extern int vc_get_nflags(uint32_t, void __user *);
+extern int vc_set_nflags(uint32_t, void __user *);
+
+#endif /* __KERNEL__ */
+
+#define IPF_STATE_SETUP                (1ULL<<32)
+
+
+#define IPF_ONE_TIME           (0x0001ULL<<32)
+
+#define VCMD_get_ncaps         VC_CMD(FLAGS, 7, 0)
+#define VCMD_set_ncaps         VC_CMD(FLAGS, 8, 0)
+
+struct  vcmd_net_caps_v0 {
+       uint64_t ncaps;
+       uint64_t cmask;
+};
+
+#ifdef __KERNEL__
+extern int vc_get_ncaps(uint32_t, void __user *);
+extern int vc_set_ncaps(uint32_t, void __user *);
+
+#endif /* __KERNEL__ */
+
+#define IPC_WOSSNAME           0x00000001
+
+
+#endif /* _VX_NETWORK_H */
diff --git a/include/linux/vserver/sched.h b/include/linux/vserver/sched.h
new file mode 100644 (file)
index 0000000..d1a2068
--- /dev/null
@@ -0,0 +1,139 @@
+#if    defined(__KERNEL__) && defined(_VX_INFO_DEF_)
+
+#include <linux/spinlock.h>
+#include <linux/jiffies.h>
+#include <asm/atomic.h>
+#include <asm/param.h>
+#include <asm/cpumask.h>
+
+/* context sub struct */
+
+struct _vx_sched {
+       spinlock_t tokens_lock; /* lock for this structure */
+
+       int fill_rate;          /* Fill rate: add X tokens... */
+       int interval;           /* Divisor:   per Y jiffies   */
+       atomic_t tokens;        /* number of CPU tokens in this context */
+       int tokens_min;         /* Limit:     minimum for unhold */
+       int tokens_max;         /* Limit:     no more than N tokens */
+       uint32_t jiffies;       /* add an integral multiple of Y to this */
+
+       uint64_t ticks;         /* token tick events */
+       cpumask_t cpus_allowed; /* cpu mask for context */
+};
+
+static inline void vx_info_init_sched(struct _vx_sched *sched)
+{
+        /* scheduling; hard code starting values as constants */
+        sched->fill_rate       = 1;
+        sched->interval                = 4;
+        sched->tokens_min      = HZ >> 4;
+        sched->tokens_max      = HZ >> 1;
+        sched->jiffies         = jiffies;
+        sched->tokens_lock     = SPIN_LOCK_UNLOCKED;
+
+        atomic_set(&sched->tokens, HZ >> 2);
+       sched->cpus_allowed     = CPU_MASK_ALL;
+}
+
+static inline void vx_info_exit_sched(struct _vx_sched *sched)
+{
+       return;
+}
+
+static inline int vx_info_proc_sched(struct _vx_sched *sched, char *buffer)
+{
+       return sprintf(buffer,
+               "Ticks:\t%16lld\n"
+               "Token:\t\t%8d\n"
+               "FillRate:\t%8d\n"
+               "Interval:\t%8d\n"              
+               "TokensMin:\t%8d\n"
+               "TokensMax:\t%8d\n"
+               ,sched->ticks
+               ,atomic_read(&sched->tokens)
+               ,sched->fill_rate
+               ,sched->interval
+               ,sched->tokens_min
+               ,sched->tokens_max
+               );
+}
+
+
+#else  /* _VX_INFO_DEF_ */
+#ifndef _VX_SCHED_H
+#define _VX_SCHED_H
+
+#include "switch.h"
+
+/*  sched vserver commands */
+
+#define VCMD_set_sched         VC_CMD(SCHED, 1, 2)
+
+struct  vcmd_set_sched_v2 {
+       int32_t fill_rate;
+       int32_t interval;
+       int32_t tokens;
+       int32_t tokens_min;
+       int32_t tokens_max;
+       uint64_t cpu_mask;
+};
+
+#define SCHED_KEEP             (-2)
+
+#ifdef __KERNEL__
+
+extern int vc_set_sched_v1(uint32_t, void __user *);
+extern int vc_set_sched(uint32_t, void __user *);
+
+
+#define VAVAVOOM_RATIO         50
+
+#include "context.h"
+
+
+/* scheduling stuff */
+
+int effective_vavavoom(struct task_struct *, int);
+
+int vx_tokens_recalc(struct vx_info *);
+
+/* new stuff ;) */
+
+static inline int vx_tokens_avail(struct vx_info *vxi)
+{
+       return atomic_read(&vxi->sched.tokens);
+}
+
+static inline void vx_consume_token(struct vx_info *vxi)
+{
+       atomic_dec(&vxi->sched.tokens);
+}
+
+static inline int vx_need_resched(struct task_struct *p)
+{
+#ifdef CONFIG_VSERVER_HARDCPU
+       struct vx_info *vxi = p->vx_info;
+
+       if (vxi) {
+               int tokens;
+
+               p->time_slice--;
+               if (atomic_read(&vxi->vx_refcount) < 1)
+                       printk("need_resched: p=%p, s=%ld, ref=%d, id=%d/%d\n",
+                               p, p->state, atomic_read(&vxi->vx_refcount),
+                               vxi->vx_id, p->xid);
+               if ((tokens = vx_tokens_avail(vxi)) > 0)
+                       vx_consume_token(vxi);
+               return ((p->time_slice == 0) || (tokens < 1));
+       }
+#endif
+       p->time_slice--;
+       return (p->time_slice == 0);
+}
+
+
+#endif /* __KERNEL__ */
+
+#endif /* _VX_SCHED_H */
+#endif
diff --git a/include/linux/vserver/signal.h b/include/linux/vserver/signal.h
new file mode 100644 (file)
index 0000000..3911127
--- /dev/null
@@ -0,0 +1,19 @@
+#ifndef _VX_SIGNAL_H
+#define _VX_SIGNAL_H
+
+#include "switch.h"
+
+/*  context signalling */
+
+#define VCMD_ctx_kill          VC_CMD(PROCTRL, 1, 0)
+
+struct  vcmd_ctx_kill_v0 {
+       int32_t pid;
+       int32_t sig;
+};
+
+#ifdef __KERNEL__
+extern int vc_ctx_kill(uint32_t, void __user *);
+
+#endif /* __KERNEL__ */
+#endif /* _VX_SIGNAL_H */
diff --git a/include/linux/vserver/switch.h b/include/linux/vserver/switch.h
new file mode 100644 (file)
index 0000000..5fef690
--- /dev/null
@@ -0,0 +1,95 @@
+#ifndef _VX_SWITCH_H
+#define _VX_SWITCH_H
+
+#include <linux/types.h>
+
+#define VC_CATEGORY(c)         (((c) >> 24) & 0x3F)
+#define VC_COMMAND(c)          (((c) >> 16) & 0xFF)
+#define VC_VERSION(c)          ((c) & 0xFFF)
+
+#define VC_CMD(c,i,v)          ((((VC_CAT_ ## c) & 0x3F) << 24) \
+                               | (((i) & 0xFF) << 16) | ((v) & 0xFFF))
+
+/*
+
+  Syscall Matrix V2.6
+
+         |VERSION|CREATE |MODIFY |MIGRATE|CONTROL|EXPERIM| |SPECIAL|SPECIAL|
+         |STATS  |DESTROY|ALTER  |CHANGE |LIMIT  |TEST   | |       |       |
+         |INFO   |SETUP  |       |MOVE   |       |       | |       |       |
+  -------+-------+-------+-------+-------+-------+-------+ +-------+-------+
+  SYSTEM |VERSION|VSETUP |VHOST  |       |       |       | |DEVICES|       |
+  HOST   |     00|     01|     02|     03|     04|     05| |     06|     07|
+  -------+-------+-------+-------+-------+-------+-------+ +-------+-------+
+  CPU    |       |VPROC  |PROCALT|PROCMIG|PROCTRL|       | |SCHED. |       |
+  PROCESS|     08|     09|     10|     11|     12|     13| |     14|     15|
+  -------+-------+-------+-------+-------+-------+-------+ +-------+-------+
+  MEMORY |       |       |       |       |       |       | |SWAP   |       |
+         |     16|     17|     18|     19|     20|     21| |     22|     23|
+  -------+-------+-------+-------+-------+-------+-------+ +-------+-------+
+  NETWORK|       |VNET   |NETALT |NETMIG |NETCTL |       | |SERIAL |       |
+         |     24|     25|     26|     27|     28|     29| |     30|     31|
+  -------+-------+-------+-------+-------+-------+-------+ +-------+-------+
+  DISK   |       |       |       |       |       |       | |INODE  |       |
+  VFS    |     32|     33|     34|     35|     36|     37| |     38|     39|
+  -------+-------+-------+-------+-------+-------+-------+ +-------+-------+
+  OTHER  |       |       |       |       |       |       | |VINFO  |       |
+         |     40|     41|     42|     43|     44|     45| |     46|     47|
+  =======+=======+=======+=======+=======+=======+=======+ +=======+=======+
+  SPECIAL|       |       |       |       |FLAGS  |       | |       |       |
+         |     48|     49|     50|     51|     52|     53| |     54|     55|
+  -------+-------+-------+-------+-------+-------+-------+ +-------+-------+
+  SPECIAL|       |       |       |       |RLIMIT |SYSCALL| |       |COMPAT |
+         |     56|     57|     58|     59|     60|TEST 61| |     62|     63|
+  -------+-------+-------+-------+-------+-------+-------+ +-------+-------+
+
+*/
+
+#define VC_CAT_VERSION         0
+
+#define VC_CAT_VSETUP          1
+#define VC_CAT_VHOST           2
+       
+#define VC_CAT_VPROC           9
+#define VC_CAT_PROCALT         10
+#define VC_CAT_PROCMIG         11
+#define VC_CAT_PROCTRL         12
+
+#define VC_CAT_SCHED           14
+
+#define VC_CAT_VNET            25
+#define VC_CAT_NETALT          26
+#define VC_CAT_NETMIG          27
+#define VC_CAT_NETCTRL         28
+
+#define VC_CAT_INODE           38
+
+#define VC_CAT_VINFO           46
+
+#define VC_CAT_FLAGS           52
+#define VC_CAT_RLIMIT          60
+
+#define VC_CAT_SYSTEST         61
+#define VC_CAT_COMPAT          63
+       
+/*  interface version */
+
+#define VCI_VERSION            0x00010016
+
+
+/*  query version */
+
+#define VCMD_get_version       VC_CMD(VERSION, 0, 0)
+
+
+#ifdef __KERNEL__
+
+#include <linux/errno.h>
+
+#define ENOTSUP                -EOPNOTSUPP
+
+#else  /* __KERNEL__ */
+#define __user
+#endif /* __KERNEL__ */
+
+#endif /* _VX_SWITCH_H */
diff --git a/include/linux/vserver/xid.h b/include/linux/vserver/xid.h
new file mode 100644 (file)
index 0000000..ba52c25
--- /dev/null
@@ -0,0 +1,94 @@
+#ifndef _LINUX_XID_H_
+#define _LINUX_XID_H_
+
+#ifdef CONFIG_INOXID_NONE
+
+#define MAX_UID                0xFFFFFFFF
+#define MAX_GID                0xFFFFFFFF
+
+#define INOXID_XID(uid, gid, xid)      (0)
+
+#define XIDINO_UID(uid, xid)           (uid)
+#define XIDINO_GID(gid, xid)           (gid)
+
+#endif
+
+
+#ifdef CONFIG_INOXID_GID16
+
+#define MAX_UID                0xFFFFFFFF
+#define MAX_GID                0x0000FFFF
+
+#define INOXID_XID(uid, gid, xid)      (((gid) >> 16) & 0xFFFF)
+
+#define XIDINO_UID(uid, xid)           (uid)
+#define XIDINO_GID(gid, xid)           (((gid) & 0xFFFF) | ((xid) << 16))
+
+
+#endif
+
+
+#ifdef CONFIG_INOXID_GID24
+
+#define MAX_UID                0x00FFFFFF
+#define MAX_GID                0x00FFFFFF
+
+#define INOXID_XID(uid, gid, xid)      ((((uid) >> 16) & 0xFF00) | (((gid) >> 24) & 0xFF))
+
+#define XIDINO_UID(uid, xid)           (((uid) & 0xFFFFFF) | (((xid) & 0xFF00) << 16))
+#define XIDINO_GID(gid, xid)           (((gid) & 0xFFFFFF) | (((xid) & 0x00FF) << 24))
+
+#endif
+
+
+#ifdef CONFIG_INOXID_GID32
+
+#define MAX_UID                0xFFFFFFFF
+#define MAX_GID                0xFFFFFFFF
+
+#define INOXID_XID(uid, gid, xid)      (xid)
+
+#define XIDINO_UID(uid, xid)           (uid)
+#define XIDINO_GID(gid, xid)           (gid)
+
+#endif
+
+
+#ifdef CONFIG_INOXID_RUNTIME
+
+#define MAX_UID                0xFFFFFFFF
+#define MAX_GID                0xFFFFFFFF
+
+#define INOXID_XID(uid, gid, xid)      (0)
+
+#define XIDINO_UID(uid, xid)           (uid)
+#define XIDINO_GID(gid, xid)           (gid)
+
+#endif
+
+
+#define INOXID_UID(uid, gid)           ((uid) & MAX_UID)
+#define INOXID_GID(uid, gid)           ((gid) & MAX_GID)
+
+static inline uid_t vx_map_uid(uid_t uid)
+{
+       if ((uid > MAX_UID) && (uid != -1))
+               uid = -2;
+       return (uid & MAX_UID);
+}
+
+static inline gid_t vx_map_gid(gid_t gid)
+{
+       if ((gid > MAX_GID) && (gid != -1))
+               gid = -2;
+       return (gid & MAX_GID);
+}
+
+
+#ifdef CONFIG_VSERVER_LEGACY           
+#define FIOC_GETXID    _IOR('x', 1, long)
+#define FIOC_SETXID    _IOW('x', 2, long)
+#define FIOC_SETXIDJ   _IOW('x', 3, long)
+#endif
+
+#endif /* _LINUX_XID_H_ */
diff --git a/kernel/vserver/Kconfig b/kernel/vserver/Kconfig
new file mode 100644 (file)
index 0000000..635d8d4
--- /dev/null
@@ -0,0 +1,72 @@
+#
+# Linux VServer configuration
+#
+
+menu "Linux VServer"
+
+config VSERVER_LEGACY
+       bool    "Enable Legacy Kernel API"
+       default y
+       help
+         This enables the legacy API used in vs1.xx, which allows
+         older tools to be used (for migration purposes).
+
+config PROC_SECURE
+       bool    "Enable Proc Security"
+       depends on PROC_FS
+       default y
+       help
+         Hide proc entries by default for xid>1
+
+config VSERVER_HARDCPU
+       bool    "Enable Hard CPU Limits"
+       depends on EXPERIMENTAL
+       default n
+       help
+         Activate the Hard CPU Limits
+
+choice
+       prompt  "Persistent Inode Context Tagging"
+       default INOXID_GID24
+       help
+         This adds persistent context information to filesystems
+         mounted with the tagxid option. Tagging is a requirement
+         for per context disk limits and per context quota.
+
+
+config INOXID_NONE
+       bool    "Disabled"
+       help
+         no context information is stored for inodes
+
+config INOXID_GID16
+       bool    "UID32/GID16"
+       help
+         reduces GID to 16 bit, but leaves UID at 32 bit.
+
+config INOXID_GID24
+       bool    "UID24/GID24"
+       help
+         uses the upper 8bit from UID and GID for XID tagging
+         which leaves 24bit for UID/GID each, which should be
+         more than sufficient for normal use.
+
+config INOXID_GID32
+       bool    "UID32/GID32"
+       help
+         this uses otherwise reserved inode fields in the on
+         disk representation, which limits the use to a few
+         filesystems (currently ext2 and ext3)
+
+config INOXID_MAGIC
+       bool    "Runtime"
+       depends on EXPERIMENTAL
+       help
+         inodes are tagged when first accessed, this doesn't
+         require any persistent information, but might give
+         funny results for mixed access.
+
+endchoice
+
+endmenu
+
diff --git a/kernel/vserver/Makefile b/kernel/vserver/Makefile
new file mode 100644 (file)
index 0000000..c035a77
--- /dev/null
@@ -0,0 +1,12 @@
+#
+# Makefile for the Linux vserver routines.
+#
+
+
+obj-y          += vserver.o
+
+vserver-y      := switch.o context.o namespace.o sched.o network.o inode.o \
+                  limit.o cvirt.o signal.o proc.o sysctl.o init.o
+
+vserver-$(CONFIG_VSERVER_LEGACY) += legacy.o
+
diff --git a/kernel/vserver/context.c b/kernel/vserver/context.c
new file mode 100644 (file)
index 0000000..538834c
--- /dev/null
@@ -0,0 +1,558 @@
+/*
+ *  linux/kernel/vserver/context.c
+ *
+ *  Virtual Server: Context Support
+ *
+ *  Copyright (C) 2003-2004  Herbert Pötzl
+ *
+ *  V0.01  context helper
+ *  V0.02  vx_ctx_kill syscall command
+ *  V0.03  replaced context_info calls
+ *  V0.04  redesign of struct (de)alloc
+ *  V0.05  rlimit basic implementation
+ *  V0.06  task_xid and info commands
+ *  V0.07  context flags and caps
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/slab.h>
+#include <linux/vserver/context.h>
+#include <linux/vserver/legacy.h>
+#include <linux/vinline.h>
+#include <linux/kernel_stat.h>
+#include <linux/namespace.h>
+
+#include <asm/errno.h>
+
+
+/*  system functions */
+
+
+LIST_HEAD(vx_infos);
+
+spinlock_t vxlist_lock
+       __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
+
+
+/*
+ *     struct vx_info allocation and deallocation
+ */
+
+static struct vx_info *alloc_vx_info(int id)
+{
+       struct vx_info *new = NULL;
+       
+       vxdprintk("alloc_vx_info(%d)\n", id);
+       /* would this benefit from a slab cache? */
+       new = kmalloc(sizeof(struct vx_info), GFP_KERNEL);
+       if (!new)
+               return 0;
+
+       memset (new, 0, sizeof(struct vx_info));
+       new->vx_id = id;
+       INIT_LIST_HEAD(&new->vx_list);
+       /* rest of init goes here */
+       
+       vx_info_init_limit(&new->limit);
+       vx_info_init_sched(&new->sched);
+       vx_info_init_cvirt(&new->cvirt);
+       vx_info_init_cacct(&new->cacct);
+
+       new->vx_flags = VXF_STATE_SETUP|VXF_STATE_INIT;
+       new->vx_bcaps = CAP_INIT_EFF_SET;
+       new->vx_ccaps = 0;
+
+       vxdprintk("alloc_vx_info(%d) = %p\n", id, new);
+       return new;
+}
+
+void free_vx_info(struct vx_info *vxi)
+{
+       vxdprintk("free_vx_info(%p)\n", vxi);
+       if (vxi->vx_namespace)
+               put_namespace(vxi->vx_namespace);
+       if (vxi->vx_fs)
+               put_fs_struct(vxi->vx_fs);
+       
+       vx_info_exit_limit(&vxi->limit);
+       vx_info_exit_sched(&vxi->sched);
+       vx_info_exit_cvirt(&vxi->cvirt);
+       vx_info_exit_cacct(&vxi->cacct);
+       
+       BUG_ON(atomic_read(&vxi->vx_refcount));
+       vxi->vx_id = -1;
+
+       kfree(vxi);
+}
+
+
+/*
+ *     struct vx_info search by id
+ *     assumes vxlist_lock is held
+ */
+
+static __inline__ struct vx_info *__find_vx_info(int id)
+{
+       struct vx_info *vxi;
+
+       list_for_each_entry(vxi, &vx_infos, vx_list)
+               if (vxi->vx_id == id)
+                       return vxi;
+       return 0;
+}
+
+
+/*
+ *     struct vx_info ref stuff
+ */
+
+struct vx_info *find_vx_info(int id)
+{
+       struct vx_info *vxi;
+       
+       if (id < 0) {
+               vxi = current->vx_info;
+               get_vx_info(vxi);
+       } else {
+               spin_lock(&vxlist_lock);
+               if ((vxi = __find_vx_info(id)))
+                       get_vx_info(vxi);
+               spin_unlock(&vxlist_lock);
+       }
+       return vxi;
+}
+
+/*
+ *     verify that id is a valid xid
+ */
+
+int vx_info_id_valid(int id)
+{
+       int valid;
+
+       spin_lock(&vxlist_lock);
+       valid = (__find_vx_info(id) != NULL);
+       spin_unlock(&vxlist_lock);
+       return valid;
+}
+
+
+/*
+ *     dynamic context id ...
+ */
+
+static __inline__ xid_t __vx_dynamic_id(void)
+{
+       static xid_t seq = MAX_S_CONTEXT;
+       xid_t barrier = seq;
+       
+       do {
+               if (++seq > MAX_S_CONTEXT)
+                       seq = MIN_D_CONTEXT;
+               if (!__find_vx_info(seq))
+                       return seq;
+       } while (barrier != seq);
+       return 0;
+}
+
+static struct vx_info * __foc_vx_info(int id, int *err)
+{
+       struct vx_info *new, *vxi = NULL;
+       
+       vxdprintk("foc_vx_info(%d)\n", id);
+       if (!(new = alloc_vx_info(id))) {
+               *err = -ENOMEM;
+               return NULL;
+       }
+
+       /* dirty hack until Spectator becomes a cap */
+       if (id == 0 || id == 1) {
+               *err = -EBUSY;
+               return NULL;
+       }
+
+       spin_lock(&vxlist_lock);
+
+       /* dynamic context requested */
+       if (id == VX_DYNAMIC_ID) {
+               id = __vx_dynamic_id();
+               if (!id) {
+                       printk(KERN_ERR "no dynamic context available.\n");
+                       goto out_unlock;
+               }
+               new->vx_id = id;
+       }
+       /* existing context requested */
+       else if ((vxi = __find_vx_info(id))) {
+               /* context in setup is not available */
+               if (vxi->vx_flags & VXF_STATE_SETUP) {
+                       vxdprintk("foc_vx_info(%d) = %p (not available)\n", id, vxi);
+                       vxi = NULL;
+                       *err = -EBUSY;
+               } else {
+                       vxdprintk("foc_vx_info(%d) = %p (found)\n", id, vxi);
+                       get_vx_info(vxi);
+                       *err = 0;
+               }
+               goto out_unlock;
+       }
+
+       /* new context requested */
+       vxdprintk("foc_vx_info(%d) = %p (new)\n", id, new);
+       atomic_set(&new->vx_refcount, 1);
+       list_add(&new->vx_list, &vx_infos);
+       vxi = new, new = NULL;
+       *err = 1;
+
+out_unlock:
+       spin_unlock(&vxlist_lock);
+       if (new)
+               free_vx_info(new);
+       return vxi;
+}
+
+
+struct vx_info *find_or_create_vx_info(int id)
+{
+       int err;
+
+       return __foc_vx_info(id, &err);
+}
+
+
+int vx_migrate_user(struct task_struct *p, struct vx_info *vxi)
+{
+       struct user_struct *new_user, *old_user;
+       
+       if (!p || !vxi)
+               BUG();
+       new_user = alloc_uid(vxi->vx_id, p->uid);
+       if (!new_user)
+               return -ENOMEM;
+
+       old_user = p->user;
+       if (new_user != old_user) {
+               atomic_inc(&new_user->processes);
+               atomic_dec(&old_user->processes);
+               p->user = new_user;
+       }
+       free_uid(old_user);
+       return 0;
+}
+
+void vx_mask_bcaps(struct task_struct *p)
+{
+       struct vx_info *vxi = p->vx_info;
+
+       p->cap_effective &= vxi->vx_bcaps;
+       p->cap_inheritable &= vxi->vx_bcaps;
+       p->cap_permitted &= vxi->vx_bcaps;
+}
+
+
+#include <linux/file.h>
+
+static inline int vx_nofiles_task(struct task_struct *tsk)
+{
+       struct files_struct *files = tsk->files;
+       const unsigned long *obptr, *cbptr;
+       int count, total;
+
+       spin_lock(&files->file_lock);
+       obptr = files->open_fds->fds_bits;
+       cbptr = files->close_on_exec->fds_bits;
+       count = files->max_fds / (sizeof(unsigned long) * 8);
+       for (total = 0; count > 0; count--) {
+               if (*obptr)
+                       total += hweight_long(*obptr);
+               obptr++;
+       /*      if (*cbptr)
+                       total += hweight_long(*cbptr);
+               cbptr++; */
+       }
+       spin_unlock(&files->file_lock);
+       return total;
+}
+
+static inline int vx_openfd_task(struct task_struct *tsk)
+{
+       struct files_struct *files = tsk->files;
+       const unsigned long *bptr;
+       int count, total;
+
+       spin_lock(&files->file_lock);
+       bptr = files->open_fds->fds_bits;
+       count = files->max_fds / (sizeof(unsigned long) * 8);
+       for (total = 0; count > 0; count--) {
+               if (*bptr)
+                       total += hweight_long(*bptr);
+               bptr++;
+       }
+       spin_unlock(&files->file_lock);
+       return total;
+}
+
+/*
+ *     migrate task to new context
+ *     gets vxi, puts old_vxi on change
+ */
+
+int vx_migrate_task(struct task_struct *p, struct vx_info *vxi)
+{
+       struct vx_info *old_vxi = task_get_vx_info(p);
+       int ret = 0;
+       
+       if (!p || !vxi)
+               BUG();
+
+       vxdprintk("vx_migrate_task(%p,%p[#%d.%d)\n", p, vxi,
+               vxi->vx_id, atomic_read(&vxi->vx_refcount));
+       if (old_vxi == vxi)
+               goto out;
+
+       if (!(ret = vx_migrate_user(p, vxi))) {
+               task_lock(p);
+               if (old_vxi) {
+                       atomic_dec(&old_vxi->cacct.nr_threads);
+                       atomic_dec(&old_vxi->limit.res[RLIMIT_NPROC]);
+               }               
+               atomic_inc(&vxi->cacct.nr_threads);
+               atomic_inc(&vxi->limit.res[RLIMIT_NPROC]);
+               atomic_add(vx_nofiles_task(p), &vxi->limit.res[RLIMIT_NOFILE]);
+               atomic_add(vx_openfd_task(p), &vxi->limit.res[RLIMIT_OPENFD]);
+               set_vx_info(&p->vx_info, vxi);
+               p->xid = vxi->vx_id;
+               vx_mask_bcaps(p);
+               task_unlock(p);
+
+               put_vx_info(old_vxi);
+       }
+out:
+       put_vx_info(old_vxi);
+       return ret;
+}
+
+int vx_set_init(struct vx_info *vxi, struct task_struct *p)
+{
+       if (!vxi)
+               return -EINVAL;
+        if (vxi->vx_initpid)
+                return -EPERM;
+
+        vxi->vx_initpid = p->tgid;
+       return 0;
+}
+
+
+/* vserver syscall commands below here */
+
+/* task xid and vx_info functions */
+
+#include <asm/uaccess.h>
+
+
+int vc_task_xid(uint32_t id, void __user *data)
+{
+        xid_t xid;
+
+        if (id) {
+                struct task_struct *tsk;
+
+                if (!vx_check(0, VX_ADMIN|VX_WATCH))
+                        return -EPERM;
+
+                read_lock(&tasklist_lock);
+                tsk = find_task_by_pid(id);
+                xid = (tsk) ? tsk->xid : -ESRCH;
+                read_unlock(&tasklist_lock);
+        }
+        else
+                xid = current->xid;
+        return xid;
+}
+
+
+int vc_vx_info(uint32_t id, void __user *data)
+{
+       struct vx_info *vxi;
+       struct vcmd_vx_info_v0 vc_data;
+
+       if (!vx_check(0, VX_ADMIN))
+               return -ENOSYS;
+       if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_RESOURCE))
+               return -EPERM;
+
+       vxi = find_vx_info(id);
+       if (!vxi)
+               return -ESRCH;
+
+       vc_data.xid = vxi->vx_id;
+       vc_data.initpid = vxi->vx_initpid;
+       put_vx_info(vxi);
+
+       if (copy_to_user (data, &vc_data, sizeof(vc_data)))
+               return -EFAULT;
+       return 0;
+}
+
+
+/* context functions */
+
+int vc_ctx_create(uint32_t xid, void __user *data)
+{
+        // int ret = -ENOMEM;
+       struct vx_info *new_vxi;
+       int ret;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       if ((xid >= MIN_D_CONTEXT) && (xid != VX_DYNAMIC_ID))
+               return -EINVAL;
+
+       if (xid < 1)
+               return -EINVAL;
+
+       new_vxi = __foc_vx_info(xid, &ret);
+       if (!new_vxi)
+               return ret;
+       if (!(new_vxi->vx_flags & VXF_STATE_SETUP)) {
+               ret = -EEXIST;
+               goto out_put;
+       }
+
+       ret = new_vxi->vx_id;
+       vx_migrate_task(current, new_vxi);
+out_put:
+       put_vx_info(new_vxi);
+       return ret;
+}
+
+
+int vc_ctx_migrate(uint32_t id, void __user *data)
+{
+       struct vx_info *vxi;
+       
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       /* dirty hack until Spectator becomes a cap */
+       if (id == 1) {
+               current->xid = 1;
+               return 0;
+       }
+
+       vxi = find_vx_info(id);
+       if (!vxi)
+               return -ESRCH;
+       vx_migrate_task(current, vxi);
+       put_vx_info(vxi);
+       return 0;
+}
+
+
+int vc_get_cflags(uint32_t id, void __user *data)
+{
+       struct vx_info *vxi;
+       struct vcmd_ctx_flags_v0 vc_data;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       vxi = find_vx_info(id);
+       if (!vxi)
+               return -ESRCH;
+
+       vc_data.flagword = vxi->vx_flags;
+
+       // vc_data.mask = ~0UL;
+       /* special STATE flag handling */
+       vc_data.mask = vx_mask_flags(~0UL, vxi->vx_flags, VXF_ONE_TIME);
+
+       put_vx_info(vxi);
+
+       if (copy_to_user (data, &vc_data, sizeof(vc_data)))
+               return -EFAULT;
+       return 0;
+}
+
+int vc_set_cflags(uint32_t id, void __user *data)
+{
+       struct vx_info *vxi;
+       struct vcmd_ctx_flags_v0 vc_data;
+       uint64_t mask, trigger;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+       if (copy_from_user (&vc_data, data, sizeof(vc_data)))
+               return -EFAULT;
+
+       vxi = find_vx_info(id);
+       if (!vxi)
+               return -ESRCH;
+
+       /* special STATE flag handling */
+       mask = vx_mask_mask(vc_data.mask, vxi->vx_flags, VXF_ONE_TIME);
+       trigger = (mask & vxi->vx_flags) ^ (mask & vc_data.flagword);
+
+       if (trigger & VXF_STATE_SETUP)
+               vx_mask_bcaps(current);
+       if (trigger & VXF_STATE_INIT)
+               if (vxi == current->vx_info)
+                       vx_set_init(vxi, current);
+
+       vxi->vx_flags = vx_mask_flags(vxi->vx_flags,
+               vc_data.flagword, mask);
+       put_vx_info(vxi);
+       return 0;
+}
+
+int vc_get_ccaps(uint32_t id, void __user *data)
+{
+       struct vx_info *vxi;
+       struct vcmd_ctx_caps_v0 vc_data;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       vxi = find_vx_info(id);
+       if (!vxi)
+               return -ESRCH;
+
+       vc_data.bcaps = vxi->vx_bcaps;
+       vc_data.ccaps = vxi->vx_ccaps;
+       vc_data.cmask = ~0UL;
+       put_vx_info(vxi);
+
+       if (copy_to_user (data, &vc_data, sizeof(vc_data)))
+               return -EFAULT;
+       return 0;
+}
+
+int vc_set_ccaps(uint32_t id, void __user *data)
+{
+       struct vx_info *vxi;
+       struct vcmd_ctx_caps_v0 vc_data;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+       if (copy_from_user (&vc_data, data, sizeof(vc_data)))
+               return -EFAULT;
+
+       vxi = find_vx_info(id);
+       if (!vxi)
+               return -ESRCH;
+
+       vxi->vx_bcaps &= vc_data.bcaps;
+       vxi->vx_ccaps = vx_mask_flags(vxi->vx_ccaps,
+               vc_data.ccaps, vc_data.cmask);
+       put_vx_info(vxi);
+       return 0;
+}
+
+#include <linux/module.h>
+
+EXPORT_SYMBOL_GPL(free_vx_info);
+EXPORT_SYMBOL_GPL(vxlist_lock);
+
diff --git a/kernel/vserver/cvirt.c b/kernel/vserver/cvirt.c
new file mode 100644 (file)
index 0000000..2b5c81e
--- /dev/null
@@ -0,0 +1,41 @@
+/*
+ *  linux/kernel/vserver/cvirt.c
+ *
+ *  Virtual Server: Context Virtualization
+ *
+ *  Copyright (C) 2004  Herbert Pötzl
+ *
+ *  V0.01  broken out from limit.c
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/vserver/cvirt.h>
+#include <linux/vserver/context.h>
+#include <linux/vserver/switch.h>
+#include <linux/vinline.h>
+
+#include <asm/errno.h>
+#include <asm/uaccess.h>
+
+
+/* vx_vsi_uptime -- rebase the global uptime (and optionally idle
+ * time) to the current context by subtracting the bias values
+ * stored in the context's cvirt data.  Both timespecs are
+ * adjusted in place; idle may be NULL.
+ * NOTE(review): dereferences current->vx_info without a NULL
+ * check -- confirm callers only invoke this inside a context. */
+void vx_vsi_uptime(struct timespec *uptime, struct timespec *idle)
+{
+       struct vx_info *vxi = current->vx_info;
+
+       set_normalized_timespec(uptime,
+               uptime->tv_sec - vxi->cvirt.bias_tp.tv_sec,
+               uptime->tv_nsec - vxi->cvirt.bias_tp.tv_nsec);
+       if (!idle)
+               return;
+       set_normalized_timespec(idle,
+               idle->tv_sec - vxi->cvirt.bias_idle.tv_sec,
+               idle->tv_nsec - vxi->cvirt.bias_idle.tv_nsec);
+       return;
+}
+
+/* vx_idle_jiffies -- system idle time, approximated by the cpu
+ * time accumulated by init_task (utime + stime).
+ * Fix: '(void)' instead of the old-style unprototyped '()'. */
+uint64_t vx_idle_jiffies(void)
+{
+       return init_task.utime + init_task.stime;
+}
+
diff --git a/kernel/vserver/init.c b/kernel/vserver/init.c
new file mode 100644 (file)
index 0000000..8afd1fc
--- /dev/null
@@ -0,0 +1,42 @@
+/*
+ *  linux/kernel/init.c
+ *
+ *  Virtual Server Init
+ *
+ *  Copyright (C) 2004  Herbert Pötzl
+ *
+ *  V0.01  basic structure
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/errno.h>
+#include <linux/vserver.h>
+// #include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+
+int    vserver_register_sysctl(void);
+void   vserver_unregister_sysctl(void);
+
+
+/* module init: register the vserver sysctl table.  The original
+ * ignored the registration result and always returned 0; report
+ * failures to the module loader instead. */
+static int __init init_vserver(void)
+{
+       int ret;
+
+       ret = vserver_register_sysctl();
+       /* NOTE(review): assumes the usual 0/-errno convention --
+        * confirm against vserver_register_sysctl() */
+       return (ret < 0) ? ret : 0;
+}
+
+
+/* module exit: tear down the vserver sysctl table */
+static void __exit exit_vserver(void)
+{
+       vserver_unregister_sysctl();
+}
+
+
+module_init(init_vserver);
+module_exit(exit_vserver);
+
diff --git a/kernel/vserver/inode.c b/kernel/vserver/inode.c
new file mode 100644 (file)
index 0000000..87e2849
--- /dev/null
@@ -0,0 +1,220 @@
+/*
+ *  linux/kernel/vserver/inode.c
+ *
+ *  Virtual Server: File System Support
+ *
+ *  Copyright (C) 2004  Herbert Pötzl
+ *
+ *  V0.01  separated from vcontext V0.05
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/vinline.h>
+#include <linux/fs.h>
+#include <linux/proc_fs.h>
+#include <linux/namei.h>
+#include <linux/vserver/inode.h>
+
+#include <asm/errno.h>
+#include <asm/uaccess.h>
+
+
+/* __vc_get_iattr -- collect the xid and the extended attribute
+ * flags of an inode.  *flags receives the current flag values,
+ * *mask which of them are meaningful for this inode; *xid is
+ * only filled in on xid-tagged filesystems. */
+static int __vc_get_iattr(struct inode *in, uint32_t *xid, uint32_t *flags, uint32_t *mask)
+{
+       if (!in || !in->i_sb)
+               return -ESRCH;
+
+       *flags = IATTR_XID
+               | (IS_BARRIER(in) ? IATTR_BARRIER : 0)
+               | (IS_IUNLINK(in) ? IATTR_IUNLINK : 0)
+               | (IS_IMMUTABLE(in) ? IATTR_IMMUTABLE : 0);
+       *mask = IATTR_IUNLINK | IATTR_IMMUTABLE;
+
+       /* the barrier flag only makes sense on directories */
+       if (S_ISDIR(in->i_mode))
+               *mask |= IATTR_BARRIER;
+
+       if (in->i_sb->s_flags & MS_TAGXID) {
+               *xid = in->i_xid;
+               *mask |= IATTR_XID;
+       }
+
+       /* proc inodes carry their own per-entry flags */
+       if (in->i_sb->s_magic == PROC_SUPER_MAGIC) {
+               struct proc_dir_entry *entry = PROC_I(in)->pde;
+
+               // check for specific inodes ?
+               /* NOTE(review): IATTR_FLAGS is added to *mask only
+                * when a pde exists, yet *flags is reported from
+                * PROC_I() even without one -- confirm the mask
+                * asymmetry is intentional. */
+               if (entry)
+                       *mask |= IATTR_FLAGS;
+               if (entry)
+                       *flags |= (entry->vx_flags & IATTR_FLAGS);
+               else
+                       *flags |= (PROC_I(in)->vx_flags & IATTR_FLAGS);
+       }
+       return 0;
+}
+
+/* vc_get_iattr -- syscall handler: look up the path supplied in
+ * the vcmd block and return the inode's xid/flags/mask */
+int vc_get_iattr(uint32_t id, void __user *data)
+{
+       struct vcmd_ctx_iattr_v1 vc_data;
+       struct nameidata nd;
+       int ret;
+
+       if (!vx_check(0, VX_ADMIN))
+               return -ENOSYS;
+       if (copy_from_user(&vc_data, data, sizeof(vc_data)))
+               return -EFAULT;
+
+       ret = user_path_walk_link(vc_data.name, &nd);
+       if (!ret) {
+               ret = __vc_get_iattr(nd.dentry->d_inode,
+                       &vc_data.xid, &vc_data.flags, &vc_data.mask);
+               path_release(&nd);
+       }
+
+       /* the block is copied back even when the lookup failed */
+       if (copy_to_user(data, &vc_data, sizeof(vc_data)))
+               ret = -EFAULT;
+       return ret;
+}
+
+/* __vc_set_iattr -- apply xid/flag changes to an inode under
+ * i_sem.  IATTR_FLAGS is only valid on proc inodes, IATTR_XID
+ * only on xid-tagged filesystems.
+ * Fix: propagate the setattr error instead of returning 0. */
+static int __vc_set_iattr(struct dentry *de, uint32_t *xid, uint32_t *flags, uint32_t *mask)
+{
+       struct inode *in = de->d_inode;
+       int error = 0, is_proc = 0;
+
+       if (!in || !in->i_sb)
+               return -ESRCH;
+
+       is_proc = (in->i_sb->s_magic == PROC_SUPER_MAGIC);
+       if ((*mask & IATTR_FLAGS) && !is_proc)
+               return -EINVAL;
+       if ((*mask & IATTR_XID) && !(in->i_sb->s_flags & MS_TAGXID))
+               return -EINVAL;
+
+       down(&in->i_sem);
+       if (*mask & IATTR_XID)
+               in->i_xid = *xid;
+
+       if (*mask & IATTR_FLAGS) {
+               struct proc_dir_entry *entry = PROC_I(in)->pde;
+               unsigned int iflags = PROC_I(in)->vx_flags;
+
+               /* merge the new flag bits under the mask */
+               iflags = (iflags & ~(*mask & IATTR_FLAGS))
+                       | (*flags & IATTR_FLAGS);
+               PROC_I(in)->vx_flags = iflags;
+               if (entry)
+                       entry->vx_flags = iflags;
+       }
+
+       if (*mask & (IATTR_BARRIER | IATTR_IUNLINK | IATTR_IMMUTABLE)) {
+               struct iattr attr;
+
+               /* start from the inode's current attribute flags */
+               attr.ia_valid = ATTR_ATTR_FLAG;
+               attr.ia_attr_flags =
+                       (IS_IMMUTABLE(in) ? ATTR_FLAG_IMMUTABLE : 0) |
+                       (IS_IUNLINK(in) ? ATTR_FLAG_IUNLINK : 0) |
+                       (IS_BARRIER(in) ? ATTR_FLAG_BARRIER : 0);
+
+               if (*mask & IATTR_IMMUTABLE) {
+                       if (*flags & IATTR_IMMUTABLE)
+                               attr.ia_attr_flags |= ATTR_FLAG_IMMUTABLE;
+                       else
+                               attr.ia_attr_flags &= ~ATTR_FLAG_IMMUTABLE;
+               }
+               if (*mask & IATTR_IUNLINK) {
+                       if (*flags & IATTR_IUNLINK)
+                               attr.ia_attr_flags |= ATTR_FLAG_IUNLINK;
+                       else
+                               attr.ia_attr_flags &= ~ATTR_FLAG_IUNLINK;
+               }
+               if (S_ISDIR(in->i_mode) && (*mask & IATTR_BARRIER)) {
+                       if (*flags & IATTR_BARRIER)
+                               attr.ia_attr_flags |= ATTR_FLAG_BARRIER;
+                       else
+                               attr.ia_attr_flags &= ~ATTR_FLAG_BARRIER;
+               }
+               if (in->i_op && in->i_op->setattr)
+                       error = in->i_op->setattr(de, &attr);
+               else {
+                       error = inode_change_ok(in, &attr);
+                       if (!error)
+                               error = inode_setattr(in, &attr);
+               }
+       }
+
+       mark_inode_dirty(in);
+       up(&in->i_sem);
+       /* fix: the original discarded 'error' and returned 0 */
+       return error;
+}
+
+/* vc_set_iattr -- syscall handler: look up the path supplied in
+ * the vcmd block and apply the requested attribute changes */
+int vc_set_iattr(uint32_t id, void __user *data)
+{
+       struct vcmd_ctx_iattr_v1 vc_data;
+       struct nameidata nd;
+       int ret;
+
+       if (!capable(CAP_SYS_ADMIN) || !capable(CAP_LINUX_IMMUTABLE))
+               return -EPERM;
+       if (copy_from_user(&vc_data, data, sizeof(vc_data)))
+               return -EFAULT;
+
+       ret = user_path_walk_link(vc_data.name, &nd);
+       if (!ret) {
+               ret = __vc_set_iattr(nd.dentry,
+                       &vc_data.xid, &vc_data.flags, &vc_data.mask);
+               path_release(&nd);
+       }
+
+       /* copy the (possibly updated) block back in any case */
+       if (copy_to_user(data, &vc_data, sizeof(vc_data)))
+               ret = -EFAULT;
+       return ret;
+}
+
+
+#ifdef CONFIG_VSERVER_LEGACY           
+#include <linux/proc_fs.h>
+
+#define PROC_DYNAMIC_FIRST 0xF0000000UL
+
+/* vx_proc_ioctl -- legacy ioctl interface to read/write the
+ * vx_flags of dynamic proc entries (inodes above
+ * PROC_DYNAMIC_FIRST).  Requires CAP_CONTEXT for both commands.
+ * NOTE(review): 'entry' (the pde) is dereferenced without a
+ * NULL check in both branches -- confirm dynamic proc inodes
+ * always carry a pde. */
+int vx_proc_ioctl(struct inode * inode, struct file * filp,
+       unsigned int cmd, unsigned long arg)
+{
+       struct proc_dir_entry *entry;
+       int error = 0;
+       int flags;
+
+       if (inode->i_ino < PROC_DYNAMIC_FIRST)
+               return -ENOTTY;
+
+       entry = PROC_I(inode)->pde;
+
+       switch(cmd) {
+       case FIOC_GETXFLG: {
+               /* fixme: if stealth, return -ENOTTY */
+               error = -EPERM;
+               flags = entry->vx_flags;
+               if (capable(CAP_CONTEXT))
+                       error = put_user(flags, (int *) arg);
+               break;
+       }
+       case FIOC_SETXFLG: {
+               /* fixme: if stealth, return -ENOTTY */
+               error = -EPERM;
+               if (!capable(CAP_CONTEXT))
+                       break;
+               error = -EROFS;
+               if (IS_RDONLY(inode))
+                       break;
+               error = -EFAULT;
+               if (get_user(flags, (int *) arg))
+                       break;
+               error = 0;
+               entry->vx_flags = flags;
+               break;
+       }
+       default:
+               return -ENOTTY;
+       }
+       return error;
+}
+#endif
+
diff --git a/kernel/vserver/legacy.c b/kernel/vserver/legacy.c
new file mode 100644 (file)
index 0000000..a620ae3
--- /dev/null
@@ -0,0 +1,161 @@
+/*
+ *  linux/kernel/vserver/legacy.c
+ *
+ *  Virtual Server: Legacy Funtions
+ *
+ *  Copyright (C) 2001-2003  Jacques Gelinas
+ *  Copyright (C) 2003-2004  Herbert Pötzl
+ *
+ *  V0.01  broken out from vcontext.c V0.05
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/vserver/legacy.h>
+#include <linux/vserver/context.h>
+#include <linux/vserver/namespace.h>
+#include <linux/vserver.h>
+#include <linux/sched.h>
+#include <linux/namespace.h>
+
+#include <asm/errno.h>
+#include <asm/uaccess.h>
+
+
+
+/* record the init pid of a context; may only be set once */
+static int vx_set_initpid(struct vx_info *vxi, int pid)
+{
+       if (vxi->vx_initpid)
+               return -EPERM;
+       vxi->vx_initpid = pid;
+       return 0;
+}
+
+/* vc_new_s_context -- legacy context creation/migration entry.
+ * ctx == -2 adjusts the current context in place, ctx == 1
+ * enters the Spectator context, anything else creates or joins
+ * context 'ctx' and migrates the caller into it.
+ * NOTE(review): ctx is uint32_t, so 'ctx == -2' really compares
+ * against 0xfffffffe -- works, but worth an explicit constant. */
+int vc_new_s_context(uint32_t ctx, void __user *data)
+{
+       int ret = -ENOMEM;
+       struct vcmd_new_s_context_v1 vc_data;
+       struct vx_info *new_vxi;
+
+       if (copy_from_user(&vc_data, data, sizeof(vc_data)))
+               return -EFAULT;
+
+       /* legacy hack, will be removed soon */
+       if (ctx == -2) {
+               /* assign flags and initpid */
+               if (!current->vx_info)
+                       return -EINVAL;
+               ret = 0;
+               if (vc_data.flags & VX_INFO_INIT)
+                       ret = vx_set_initpid(current->vx_info, current->tgid);
+               if (ret == 0) {
+                       /* We keep the same vx_id, but lower the capabilities */
+                       current->vx_info->vx_bcaps &= (~vc_data.remove_cap);
+                       // current->cap_bset &= (~vc_data.remove_cap);
+                       ret = vx_current_xid();
+                       current->vx_info->vx_flags |= vc_data.flags;
+               }
+               return ret;
+       }
+
+       if (!vx_check(0, VX_ADMIN) ||
+               !capable(CAP_SYS_ADMIN) || vx_flags(VX_INFO_LOCK, 0))
+               return -EPERM;
+
+       /* ugly hack for Spectator */
+       /* NOTE(review): sets current->xid directly without going
+        * through vx_migrate_task -- confirm intended. */
+       if (ctx == 1) {
+               current->xid = 1;
+               return 0;
+       }
+
+       if (((ctx > MAX_S_CONTEXT) && (ctx != VX_DYNAMIC_ID)) ||
+               (ctx == 0))
+               return -EINVAL;
+
+       /* static ids may be created on demand; dynamic-range ids
+        * (>= MIN_D_CONTEXT) must already exist */
+       if ((ctx == VX_DYNAMIC_ID) || (ctx < MIN_D_CONTEXT))
+               new_vxi = find_or_create_vx_info(ctx);
+       else
+               new_vxi = find_vx_info(ctx);
+
+       if (!new_vxi)
+               return -EINVAL;
+       new_vxi->vx_flags &= ~(VXF_STATE_SETUP|VXF_STATE_INIT);
+
+       ret = vx_migrate_task(current, new_vxi);
+       if (ret == 0) {
+               current->vx_info->vx_bcaps &= (~vc_data.remove_cap);
+               // current->cap_bset &= (~vc_data.remove_cap);
+               new_vxi->vx_flags |= vc_data.flags;
+               if (vc_data.flags & VX_INFO_INIT)
+                       vx_set_initpid(new_vxi, current->tgid);
+               if (vc_data.flags & VX_INFO_NAMESPACE)
+                       vx_set_namespace(new_vxi,
+                               current->namespace, current->fs);
+               if (vc_data.flags & VX_INFO_NPROC)
+                       new_vxi->limit.rlim[RLIMIT_NPROC] =
+                               current->rlim[RLIMIT_NPROC].rlim_max;
+               /* on success the new context id is the result */
+               ret = new_vxi->vx_id;
+       }
+       put_vx_info(new_vxi);
+       return ret;
+}
+
+
+
+/*  set ipv4 root (syscall) */
+
+/* vc_set_ipv4root -- assign a set of ipv4 address/mask pairs to
+ * the current task's network context.  Unprivileged callers may
+ * only select a subset of the already installed addresses and
+ * may not change the broadcast address.
+ * Fix: dropped the dead 'nbip < 0' test (nbip is unsigned) and
+ * the redundant 'else if (nxi)' (nxi is non-NULL there). */
+int vc_set_ipv4root(uint32_t nbip, void __user *data)
+{
+       int i, err = -EPERM;
+       struct vcmd_set_ipv4root_v3 vc_data;
+       struct nx_info *new_nxi, *nxi = current->nx_info;
+
+       if (nbip > NB_IPV4ROOT)
+               return -EINVAL;
+       if (copy_from_user (&vc_data, data, sizeof(vc_data)))
+               return -EFAULT;
+
+       if (!nxi || nxi->ipv4[0] == 0 || capable(CAP_NET_ADMIN))
+               // We are allowed to change everything
+               err = 0;
+       else {
+               int found = 0;
+
+               // We are allowed to select a subset of the currently
+               // installed IP numbers. No new one allowed
+               // We can't change the broadcast address though
+               for (i=0; i<nbip; i++) {
+                       int j;
+                       __u32 nxip = vc_data.nx_mask_pair[i].ip;
+                       for (j=0; j<nxi->nbipv4; j++) {
+                               if (nxip == nxi->ipv4[j]) {
+                                       found++;
+                                       break;
+                               }
+                       }
+               }
+               if ((found == nbip) &&
+                       (vc_data.broadcast == nxi->v4_bcast))
+                       err = 0;
+       }
+       if (err)
+               return err;
+
+       new_nxi = create_nx_info();
+       if (!new_nxi)
+               return -EINVAL;
+
+       new_nxi->nbipv4 = nbip;
+       for (i=0; i<nbip; i++) {
+               new_nxi->ipv4[i] = vc_data.nx_mask_pair[i].ip;
+               new_nxi->mask[i] = vc_data.nx_mask_pair[i].mask;
+       }
+       new_nxi->v4_bcast = vc_data.broadcast;
+       /* install the new context and drop the old reference */
+       current->nx_info = new_nxi;
+       current->nid = new_nxi->nx_id;
+       put_nx_info(nxi);
+       return 0;
+}
+
+
diff --git a/kernel/vserver/limit.c b/kernel/vserver/limit.c
new file mode 100644 (file)
index 0000000..5bd2fdc
--- /dev/null
@@ -0,0 +1,149 @@
+/*
+ *  linux/kernel/vserver/limit.c
+ *
+ *  Virtual Server: Context Limits
+ *
+ *  Copyright (C) 2004  Herbert Pötzl
+ *
+ *  V0.01  broken out from vcontext V0.05
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/vserver/limit.h>
+#include <linux/vserver/context.h>
+#include <linux/vserver/switch.h>
+#include <linux/vinline.h>
+
+#include <asm/errno.h>
+#include <asm/uaccess.h>
+
+
+/* resource limits that may be queried/set per context */
+static int is_valid_rlimit(int id)
+{
+       switch (id) {
+       case RLIMIT_NPROC:
+       case RLIMIT_AS:
+       case RLIMIT_RSS:
+       case RLIMIT_MEMLOCK:
+       case RLIMIT_NOFILE:
+               return 1;
+       default:
+               return 0;
+       }
+}
+
+/* translate a context limit to the vcmd representation,
+ * mapping RLIM_INFINITY to CRLIM_INFINITY */
+static inline uint64_t vc_get_rlim(struct vx_info *vxi, int id)
+{
+       unsigned long limit = vxi->limit.rlim[id];
+
+       return (limit == RLIM_INFINITY) ? CRLIM_INFINITY : limit;
+}
+
+/* vc_get_rlimit -- syscall handler: report one context limit.
+ * Only the maximum is tracked; minimum/softlimit are unset. */
+int vc_get_rlimit(uint32_t id, void __user *data)
+{
+       struct vcmd_ctx_rlimit_v0 vc_data;
+       struct vx_info *vxi;
+
+       if (copy_from_user(&vc_data, data, sizeof(vc_data)))
+               return -EFAULT;
+       if (!is_valid_rlimit(vc_data.id))
+               return -ENOTSUPP;
+
+       vxi = find_vx_info(id);
+       if (!vxi)
+               return -ESRCH;
+
+       vc_data.maximum = vc_get_rlim(vxi, vc_data.id);
+       vc_data.minimum = CRLIM_UNSET;
+       vc_data.softlimit = CRLIM_UNSET;
+       put_vx_info(vxi);
+
+       return copy_to_user(data, &vc_data, sizeof(vc_data)) ? -EFAULT : 0;
+}
+
+/* vc_set_rlimit -- syscall handler: set one context limit
+ * (CRLIM_KEEP leaves the current value untouched).
+ * Fix: the leftover debug printk had no loglevel and spammed
+ * the console on every call; tag it KERN_DEBUG. */
+int vc_set_rlimit(uint32_t id, void __user *data)
+{
+       struct vx_info *vxi;
+       struct vcmd_ctx_rlimit_v0 vc_data;
+
+       if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_RESOURCE))
+               return -EPERM;
+       if (copy_from_user (&vc_data, data, sizeof(vc_data)))
+               return -EFAULT;
+       if (!is_valid_rlimit(vc_data.id))
+               return -ENOTSUPP;
+
+       vxi = find_vx_info(id);
+       if (!vxi)
+               return -ESRCH;
+
+       if (vc_data.maximum != CRLIM_KEEP)
+               vxi->limit.rlim[vc_data.id] = vc_data.maximum;
+       printk(KERN_DEBUG "setting [%d] = %d\n", vc_data.id, (int)vc_data.maximum);
+       put_vx_info(vxi);
+
+       return 0;
+}
+
+/* vc_get_rlimit_mask -- report which limits are supported as
+ * minimum / softlimit / maximum (only maxima are implemented) */
+int vc_get_rlimit_mask(uint32_t id, void __user *data)
+{
+       static struct vcmd_ctx_rlimit_mask_v0 mask = {
+               .minimum = 0,
+               .softlimit = 0,
+               .maximum =
+                       (1 << RLIMIT_NPROC) |
+                       (1 << RLIMIT_NOFILE) |
+                       (1 << RLIMIT_MEMLOCK) |
+                       (1 << RLIMIT_AS) |
+                       (1 << RLIMIT_RSS),
+       };
+
+       if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_RESOURCE))
+               return -EPERM;
+       if (copy_to_user(data, &mask, sizeof(mask)))
+               return -EFAULT;
+       return 0;
+}
+
+
+/* vx_vsi_meminfo -- clamp a sysinfo memory report to the
+ * current context's RSS limit and accounted usage */
+void vx_vsi_meminfo(struct sysinfo *val)
+{
+       struct vx_info *vxi = current->vx_info;
+       unsigned long limit, used;
+
+       limit = vxi->limit.rlim[RLIMIT_RSS];
+       if (limit != RLIM_INFINITY)
+               val->totalram = min(val->totalram, limit);
+       used = atomic_read(&vxi->limit.res[RLIMIT_RSS]);
+       val->freeram = (used < val->totalram) ? val->totalram - used : 0;
+       val->bufferram = 0;
+       val->totalhigh = 0;
+       val->freehigh = 0;
+}
+
+/* vx_vsi_swapinfo -- clamp a sysinfo swap report to the context:
+ * virtual swap is the AS limit minus the RSS limit, reduced by
+ * the accounted AS usage.
+ * NOTE(review): if the RSS limit exceeds the AS limit, 'w - v'
+ * underflows (both unsigned) -- confirm callers guarantee
+ * RSS limit <= AS limit. */
+void vx_vsi_swapinfo(struct sysinfo *val)
+{
+       struct vx_info *vxi = current->vx_info;
+       unsigned long w,v;
+
+       v = vxi->limit.rlim[RLIMIT_RSS];
+       w = vxi->limit.rlim[RLIMIT_AS];
+       if (w != RLIM_INFINITY)
+               val->totalswap = min(val->totalswap, w -
+               ((v != RLIM_INFINITY) ? v : 0));
+       w = atomic_read(&vxi->limit.res[RLIMIT_AS]);
+       val->freeswap = (w < val->totalswap) ? val->totalswap - w : 0;
+       return;
+}
+
diff --git a/kernel/vserver/namespace.c b/kernel/vserver/namespace.c
new file mode 100644 (file)
index 0000000..2c76c6f
--- /dev/null
@@ -0,0 +1,195 @@
+/*
+ *  linux/kernel/vserver/namespace.c
+ *
+ *  Virtual Server: Context Namespace Support
+ *
+ *  Copyright (C) 2003-2004  Herbert Pötzl
+ *
+ *  V0.01  broken out from context.c 0.07
+ *  V0.02  added task locking for namespace
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/utsname.h>
+#include <linux/vserver/namespace.h>
+#include <linux/vinline.h>
+#include <linux/namespace.h>
+#include <linux/dcache.h>
+
+#include <asm/errno.h>
+#include <asm/uaccess.h>
+
+
+/* virtual host info names */
+
+/* map a VHIN_* field id to the backing string inside the
+ * context; returns NULL for unknown fields */
+static char * vx_vhi_name(struct vx_info *vxi, int id)
+{
+       switch (id) {
+       case VHIN_CONTEXT:
+               return vxi->vx_name;
+       case VHIN_SYSNAME:
+               return vxi->cvirt.utsname.sysname;
+       case VHIN_NODENAME:
+               return vxi->cvirt.utsname.nodename;
+       case VHIN_RELEASE:
+               return vxi->cvirt.utsname.release;
+       case VHIN_VERSION:
+               return vxi->cvirt.utsname.version;
+       case VHIN_MACHINE:
+               return vxi->cvirt.utsname.machine;
+       case VHIN_DOMAINNAME:
+               return vxi->cvirt.utsname.domainname;
+       }
+       return NULL;
+}
+
+/* vc_set_vhi_name -- set one virtual-host-info string (utsname
+ * field or context name) from user space.
+ * NOTE(review): copies a fixed 65 bytes from vc_data.name --
+ * assumes the vcmd field and all target buffers are at least
+ * 65 bytes and NUL-terminated by the caller; confirm against
+ * struct vcmd_vx_vhi_name_v0. */
+int vc_set_vhi_name(uint32_t id, void __user *data)
+{
+       struct vx_info *vxi;
+       struct vcmd_vx_vhi_name_v0 vc_data;
+       char *name;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+       if (copy_from_user (&vc_data, data, sizeof(vc_data)))
+               return -EFAULT;
+
+       vxi = find_vx_info(id);
+       if (!vxi)
+               return -ESRCH;
+
+       /* unknown field ids yield NULL and are reported as -EFAULT */
+       name = vx_vhi_name(vxi, vc_data.field);
+       if (name)
+               memcpy(name, vc_data.name, 65);
+       put_vx_info(vxi);
+       return (name ? 0 : -EFAULT);
+}
+
+/* vc_get_vhi_name -- read one virtual-host-info string.
+ * Fix: the original returned straight from the copy_to_user
+ * failure path without put_vx_info(), leaking a reference. */
+int vc_get_vhi_name(uint32_t id, void __user *data)
+{
+       struct vx_info *vxi;
+       struct vcmd_vx_vhi_name_v0 vc_data;
+       char *name;
+       int ret = -EFAULT;
+
+       if (copy_from_user (&vc_data, data, sizeof(vc_data)))
+               return -EFAULT;
+
+       vxi = find_vx_info(id);
+       if (!vxi)
+               return -ESRCH;
+
+       /* unknown field ids yield NULL and are reported as -EFAULT */
+       name = vx_vhi_name(vxi, vc_data.field);
+       if (!name)
+               goto out_put;
+
+       memcpy(vc_data.name, name, 65);
+       if (copy_to_user (data, &vc_data, sizeof(vc_data)))
+               goto out_put;
+       ret = 0;
+out_put:
+       put_vx_info(vxi);
+       return ret;
+}
+
+/* namespace functions */
+
+#include <linux/namespace.h>
+
+/* vx_set_namespace -- attach a namespace and a private copy of
+ * the fs_struct to a context; may only be done once */
+int vx_set_namespace(struct vx_info *vxi, struct namespace *ns, struct fs_struct *fs)
+{
+       struct fs_struct *fs_copy;
+
+       if (vxi->vx_namespace)
+               return -EPERM;
+       if (!ns || !fs)
+               return -EINVAL;
+
+       /* the context gets its own copy of the fs_struct ... */
+       fs_copy = copy_fs_struct(fs);
+       if (!fs_copy)
+               return -ENOMEM;
+
+       /* ... but shares (and pins) the namespace */
+       get_namespace(ns);
+       vxi->vx_namespace = ns;
+       vxi->vx_fs = fs_copy;
+       return 0;
+}
+
+/* vc_enter_namespace -- switch the calling task into the
+ * namespace/fs stored in context 'id'.  The task gets a fresh
+ * copy of the context's fs_struct and a new reference on the
+ * namespace; the old ones are released after the switch. */
+int vc_enter_namespace(uint32_t id, void *data)
+{
+       struct vx_info *vxi;
+       struct fs_struct *old_fs, *fs;
+       struct namespace *old_ns;
+       int ret = 0;
+
+       if (!vx_check(0, VX_ADMIN))
+               return -ENOSYS;
+
+       vxi = find_vx_info(id);
+       if (!vxi)
+               return -ESRCH;
+
+       ret = -EINVAL;
+       if (!vxi->vx_namespace)
+               goto out_put;
+
+       /* copy outside the task lock; may sleep/allocate */
+       ret = -ENOMEM;
+       fs = copy_fs_struct(vxi->vx_fs);
+       if (!fs)
+               goto out_put;
+
+       ret = 0;
+       /* swap both pointers atomically w.r.t. the task lock */
+       task_lock(current);
+       old_ns = current->namespace;
+       old_fs = current->fs;
+       get_namespace(vxi->vx_namespace);
+       current->namespace = vxi->vx_namespace;
+       current->fs = fs;
+       task_unlock(current);
+
+       /* drop the previous references outside the lock */
+       put_namespace(old_ns);
+       put_fs_struct(old_fs);
+out_put:
+       put_vx_info(vxi);
+       return ret;
+}
+
+/* vc_cleanup_namespace -- unmount everything in the caller's
+ * namespace that is not reachable from current->fs, under the
+ * namespace semaphore and vfsmount_lock.  The 'id' argument is
+ * ignored; this always acts on the calling task. */
+int vc_cleanup_namespace(uint32_t id, void *data)
+{
+       down_write(&current->namespace->sem);
+       // spin_lock(&dcache_lock);
+       spin_lock(&vfsmount_lock);
+       umount_unused(current->namespace->root, current->fs);
+       spin_unlock(&vfsmount_lock);
+       // spin_unlock(&dcache_lock);
+       up_write(&current->namespace->sem);
+       return 0;
+}
+
+/* vc_set_namespace -- store the caller's namespace and fs in
+ * the caller's own context (see vx_set_namespace).
+ * NOTE(review): the guard is 'if (vx_check(...))' while
+ * vc_enter_namespace uses 'if (!vx_check(...))' -- presumably
+ * this restricts the call to guest (non-admin) contexts, but
+ * confirm the polarity is intentional. */
+int vc_set_namespace(uint32_t id, void __user *data)
+{
+       struct fs_struct *fs;
+       struct namespace *ns;
+       struct vx_info *vxi;
+       int ret;
+
+       if (vx_check(0, VX_ADMIN|VX_WATCH))
+               return -ENOSYS;
+
+       /* take references on everything under the task lock */
+       task_lock(current);
+       vxi = get_vx_info(current->vx_info);
+       fs = current->fs;
+       atomic_inc(&fs->count);
+       ns = current->namespace;
+       get_namespace(current->namespace);
+       task_unlock(current);
+
+       ret = vx_set_namespace(vxi, ns, fs);
+
+       /* vx_set_namespace took its own references where needed */
+       put_namespace(ns);
+       put_fs_struct(fs);
+       put_vx_info(vxi);
+       return ret;
+}
+
diff --git a/kernel/vserver/network.c b/kernel/vserver/network.c
new file mode 100644 (file)
index 0000000..479a19b
--- /dev/null
@@ -0,0 +1,513 @@
+/*
+ *  linux/kernel/vserver/network.c
+ *
+ *  Virtual Server: Network Support
+ *
+ *  Copyright (C) 2003-2004  Herbert Pötzl
+ *
+ *  V0.01  broken out from vcontext V0.05
+ *  V0.02  cleaned up implementation
+ *  V0.03  added equiv nx commands
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/slab.h>
+#include <linux/vserver/network.h>
+#include <linux/ninline.h>
+
+#include <asm/errno.h>
+
+
+LIST_HEAD(nx_infos);
+
+spinlock_t nxlist_lock
+       __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
+
+
+/*
+ *     struct nx_info allocation and deallocation
+ */
+
+/* allocate and zero-initialize a struct nx_info */
+static struct nx_info *alloc_nx_info(void)
+{
+       struct nx_info *new;
+
+       nxdprintk("alloc_nx_info()\n");
+       /* would this benefit from a slab cache? */
+       new = kmalloc(sizeof(struct nx_info), GFP_KERNEL);
+       if (!new)
+               return NULL;
+
+       memset(new, 0, sizeof(struct nx_info));
+       /* rest of init goes here */
+
+       nxdprintk("alloc_nx_info() = %p\n", new);
+       return new;
+}
+
+/* release the storage of an nx_info (last reference gone) */
+void free_nx_info(struct nx_info *nxi)
+{
+       nxdprintk("free_nx_info(%p)\n", nxi);
+       kfree(nxi);
+}
+
+/* create a fresh nx_info with the next sequential id and link
+ * it into the global list; returns NULL on allocation failure */
+struct nx_info *create_nx_info(void)
+{
+       static int gnid = 1;
+       struct nx_info *new;
+
+       nxdprintk("create_nx_info()\n");
+       new = alloc_nx_info();
+       if (!new)
+               return NULL;
+
+       spin_lock(&nxlist_lock);
+
+       /* new ip info; gnid is only touched under nxlist_lock */
+       atomic_set(&new->nx_refcount, 1);
+       new->nx_id = gnid++;
+       list_add(&new->nx_list, &nx_infos);
+
+       spin_unlock(&nxlist_lock);
+       return new;
+}
+
+
+/*
+ *     struct nx_info search by id
+ *     assumes nxlist_lock is held
+ */
+
+/* look up an nx_info by id; caller must hold nxlist_lock */
+static __inline__ struct nx_info *__find_nx_info(int id)
+{
+       struct nx_info *entry;
+
+       list_for_each_entry(entry, &nx_infos, nx_list) {
+               if (entry->nx_id == id)
+                       return entry;
+       }
+       return NULL;
+}
+
+
+/*
+ *     struct nx_info ref stuff
+ */
+
+/* find_nx_info -- look up an nx_info by id and take a
+ * reference; a negative id returns the caller's own context */
+struct nx_info *find_nx_info(int id)
+{
+       struct nx_info *nxi;
+
+       if (id < 0) {
+               nxi = current->nx_info;
+               get_nx_info(nxi);
+               return nxi;
+       }
+
+       spin_lock(&nxlist_lock);
+       nxi = __find_nx_info(id);
+       if (nxi)
+               get_nx_info(nxi);
+       spin_unlock(&nxlist_lock);
+       return nxi;
+}
+
+/*
+ *      verify that id is a valid nid
+ */
+
+/* true if 'id' names an existing network context */
+int nx_info_id_valid(int id)
+{
+       int exists;
+
+       spin_lock(&nxlist_lock);
+       exists = (__find_nx_info(id) != NULL);
+       spin_unlock(&nxlist_lock);
+       return exists;
+}
+
+
+/*
+ *     dynamic context id ...
+ */
+
+/* pick the next free id in the dynamic range, scanning at most
+ * one full cycle; returns 0 when the range is exhausted.
+ * Called with nxlist_lock held (see __foc_nx_info); the static
+ * 'seq' cursor is therefore serialized by that lock. */
+static __inline__ nid_t __nx_dynamic_id(void)
+{
+       static nid_t seq = MAX_N_CONTEXT;
+       nid_t barrier = seq;
+
+       do {
+               if (++seq > MAX_N_CONTEXT)
+                       seq = MIN_D_CONTEXT;
+               if (!__find_nx_info(seq))
+                       return seq;
+       } while (barrier != seq);
+       return 0;
+}
+
+/* find-or-create an nx_info.  *err reports the outcome:
+ *   1  -- a new context was created (returned with one ref)
+ *   0  -- an existing context was found (ref taken)
+ *  <0  -- failure (NULL returned)
+ * A spare nx_info is always allocated up front so the list
+ * insertion can happen atomically under nxlist_lock; the spare
+ * is freed when an existing context is found. */
+static struct nx_info * __foc_nx_info(int id, int *err)
+{
+       struct nx_info *new, *nxi = NULL;
+
+       nxdprintk("foc_nx_info(%d)\n", id);
+       // if (!(new = alloc_nx_info(id))) {
+       if (!(new = alloc_nx_info())) {
+               *err = -ENOMEM;
+               return NULL;
+       }
+
+       spin_lock(&nxlist_lock);
+
+       /* dynamic context requested */
+       if (id == IP_DYNAMIC_ID) {
+               id = __nx_dynamic_id();
+               if (!id) {
+                       printk(KERN_ERR "no dynamic context available.\n");
+                       goto out_unlock;
+               }
+               new->nx_id = id;
+       }
+       /* existing context requested */
+       else if ((nxi = __find_nx_info(id))) {
+               /* context in setup is not available */
+               if (nxi->nx_flags & VXF_STATE_SETUP) {
+                       nxdprintk("foc_nx_info(%d) = %p (not available)\n", id, nxi);
+                       nxi = NULL;
+                       *err = -EBUSY;
+               } else {
+                       nxdprintk("foc_nx_info(%d) = %p (found)\n", id, nxi);
+                       get_nx_info(nxi);
+                       *err = 0;
+               }
+               goto out_unlock;
+       }
+
+       /* new context requested */
+       nxdprintk("foc_nx_info(%d) = %p (new)\n", id, new);
+       atomic_set(&new->nx_refcount, 1);
+       list_add(&new->nx_list, &nx_infos);
+       nxi = new, new = NULL;
+       *err = 1;
+
+out_unlock:
+       spin_unlock(&nxlist_lock);
+       if (new)
+               free_nx_info(new);
+       return nxi;
+}
+
+
+/* convenience wrapper around __foc_nx_info that discards the
+ * created/found/error status */
+struct nx_info *find_or_create_nx_info(int id)
+{
+       int err;
+
+       return __foc_nx_info(id, &err);
+}
+
+/*
+ *     migrate task to new network
+ */
+
+/* nx_migrate_task -- move task p into network context nxi.
+ * Two put_nx_info() calls on success are intentional: one drops
+ * the task's reference to its previous context, the other the
+ * temporary reference taken by task_get_nx_info().
+ * Fix: validate p/nxi BEFORE dereferencing them (the original
+ * called task_get_nx_info(p) ahead of the BUG() check). */
+int nx_migrate_task(struct task_struct *p, struct nx_info *nxi)
+{
+       struct nx_info *old_nxi;
+       int ret = 0;
+
+       if (!p || !nxi)
+               BUG();
+
+       old_nxi = task_get_nx_info(p);
+       nxdprintk("nx_migrate_task(%p,%p[#%d.%d)\n", p, nxi,
+               nxi->nx_id, atomic_read(&nxi->nx_refcount));
+       if (old_nxi == nxi)
+               goto out;
+
+       task_lock(p);
+       set_nx_info(&p->nx_info, nxi);
+       p->nid = nxi->nx_id;
+       task_unlock(p);
+
+       /* drop the task's reference to its previous context */
+       put_nx_info(old_nxi);
+out:
+       /* drop the reference taken by task_get_nx_info() */
+       put_nx_info(old_nxi);
+       return ret;
+}
+
+
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+
+/* true if addr is one of the context's assigned ipv4 addresses */
+static inline int __addr_in_nx_info(u32 addr, struct nx_info *nxi)
+{
+       int i;
+
+       for (i = 0; i < nxi->nbipv4; i++) {
+               if (nxi->ipv4[i] == addr)
+                       return 1;
+       }
+       return 0;
+}
+
+/* true if the interface address belongs to the context;
+ * a NULL context sees everything */
+int ifa_in_nx_info(struct in_ifaddr *ifa, struct nx_info *nxi)
+{
+       if (!nxi)
+               return 1;
+       return __addr_in_nx_info(ifa->ifa_address, nxi);
+}
+
+/* true if any address of the device belongs to the context;
+ * a NULL context sees every device */
+int dev_in_nx_info(struct net_device *dev, struct nx_info *nxi)
+{
+       struct in_device *in_dev;
+       struct in_ifaddr *ifa;
+
+       if (!nxi)
+               return 1;
+       in_dev = __in_dev_get(dev);
+       if (!in_dev)
+               return 0;
+
+       for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
+               if (__addr_in_nx_info(ifa->ifa_address, nxi))
+                       return 1;
+       }
+       return 0;
+}
+
+
+
+
+/* vserver syscall commands below here */
+
+/* taks nid and nx_info functions */
+
+#include <asm/uaccess.h>
+
+
+/* vc_task_nid -- return the network context id of task 'id'
+ * (0 means the caller); -ESRCH if no such task */
+int vc_task_nid(uint32_t id, void __user *data)
+{
+       struct task_struct *tsk;
+       nid_t nid;
+
+       if (!id)
+               return current->nid;
+
+       if (!vx_check(0, VX_ADMIN|VX_WATCH))
+               return -EPERM;
+
+       read_lock(&tasklist_lock);
+       tsk = find_task_by_pid(id);
+       nid = tsk ? tsk->nid : -ESRCH;
+       read_unlock(&tasklist_lock);
+       return nid;
+}
+
+
+/* vc_nx_info -- report the id of network context 'id' */
+int vc_nx_info(uint32_t id, void __user *data)
+{
+       struct vcmd_nx_info_v0 vc_data;
+       struct nx_info *nxi;
+
+       if (!vx_check(0, VX_ADMIN))
+               return -ENOSYS;
+       if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_RESOURCE))
+               return -EPERM;
+
+       nxi = find_nx_info(id);
+       if (!nxi)
+               return -ESRCH;
+
+       vc_data.nid = nxi->nx_id;
+       put_nx_info(nxi);
+
+       return copy_to_user(data, &vc_data, sizeof(vc_data)) ? -EFAULT : 0;
+}
+
+
+/* network functions */
+
+/* vc_net_create -- create network context 'nid' (or a dynamic
+ * one for VX_DYNAMIC_ID) and migrate the caller into it.
+ * Joining an already established context (no setup flag) is
+ * rejected with -EEXIST.
+ * NOTE(review): tests nx_flags against VXF_STATE_SETUP (a vx
+ * flag) rather than an IPF_/NXF_ constant -- confirm the flag
+ * spaces are meant to be shared. */
+int vc_net_create(uint32_t nid, void __user *data)
+{
+        // int ret = -ENOMEM;
+       struct nx_info *new_nxi;
+       int ret;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       /* only static ids (plus the dynamic marker) are creatable */
+       if ((nid >= MIN_D_CONTEXT) && (nid != VX_DYNAMIC_ID))
+               return -EINVAL;
+
+       if (nid < 1)
+               return -EINVAL;
+
+       new_nxi = __foc_nx_info(nid, &ret);
+       if (!new_nxi)
+               return ret;
+       if (!(new_nxi->nx_flags & VXF_STATE_SETUP)) {
+               ret = -EEXIST;
+               goto out_put;
+       }
+
+       ret = new_nxi->nx_id;
+       nx_migrate_task(current, new_nxi);
+out_put:
+       put_nx_info(new_nxi);
+       return ret;
+}
+
+
+/* vc_net_migrate -- move the caller into network context 'id' */
+int vc_net_migrate(uint32_t id, void __user *data)
+{
+       struct nx_info *nxi;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       nxi = find_nx_info(id);
+       if (!nxi)
+               return -ESRCH;
+       nx_migrate_task(current, nxi);
+       put_nx_info(nxi);
+       return 0;
+}
+
+/* vc_net_add -- add an ip address to a network context.
+ * Currently a stub: the request is validated and copied in
+ * (vc_data is unused until the TODO below is implemented). */
+int vc_net_add(uint32_t id, void __user *data)
+{
+       struct nx_info *nxi;
+       struct vcmd_net_nx_v0 vc_data;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+       if (copy_from_user (&vc_data, data, sizeof(vc_data)))
+               return -EFAULT;
+
+       nxi = find_nx_info(id);
+       if (!nxi)
+               return -ESRCH;
+
+       // add ip to net context here
+       put_nx_info(nxi);
+       return 0;
+}
+
+/* vc_net_remove -- remove an ip address from a network context.
+ * Currently a stub: the request is validated and copied in
+ * (vc_data is unused until the TODO below is implemented). */
+int vc_net_remove(uint32_t id, void __user *data)
+{
+       struct nx_info *nxi;
+       struct vcmd_net_nx_v0 vc_data;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+       if (copy_from_user (&vc_data, data, sizeof(vc_data)))
+               return -EFAULT;
+
+       nxi = find_nx_info(id);
+       if (!nxi)
+               return -ESRCH;
+
+       // rem ip from net context here
+       put_nx_info(nxi);
+       return 0;
+}
+
+
+
+/* vc_get_nflags -- report the network context flags; one-time
+ * flags that already fired are masked out of the valid mask */
+int vc_get_nflags(uint32_t id, void __user *data)
+{
+       struct vcmd_net_flags_v0 vc_data;
+       struct nx_info *nxi;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       nxi = find_nx_info(id);
+       if (!nxi)
+               return -ESRCH;
+
+       vc_data.flagword = nxi->nx_flags;
+
+       // vc_data.mask = ~0UL;
+       /* special STATE flag handling */
+       vc_data.mask = vx_mask_flags(~0UL, nxi->nx_flags, IPF_ONE_TIME);
+
+       put_nx_info(nxi);
+
+       return copy_to_user(data, &vc_data, sizeof(vc_data)) ? -EFAULT : 0;
+}
+
+/* vc_set_nflags -- update the network context flags under the
+ * caller-supplied mask; one-time flags that already fired are
+ * excluded from the effective mask.
+ * NOTE(review): 'trigger' is computed but unused -- the setup
+ * handler is still commented out below.  Mirrors vc_set_cflags. */
+int vc_set_nflags(uint32_t id, void __user *data)
+{
+       struct nx_info *nxi;
+       struct vcmd_net_flags_v0 vc_data;
+       uint64_t mask, trigger;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+       if (copy_from_user (&vc_data, data, sizeof(vc_data)))
+               return -EFAULT;
+
+       nxi = find_nx_info(id);
+       if (!nxi)
+               return -ESRCH;
+
+       /* special STATE flag handling */
+       mask = vx_mask_mask(vc_data.mask, nxi->nx_flags, IPF_ONE_TIME);
+       trigger = (mask & nxi->nx_flags) ^ (mask & vc_data.flagword);
+       // if (trigger & IPF_STATE_SETUP)
+
+       nxi->nx_flags = vx_mask_flags(nxi->nx_flags,
+               vc_data.flagword, mask);
+       put_nx_info(nxi);
+       return 0;
+}
+
+/* vc_get_ncaps -- report the network capability set */
+int vc_get_ncaps(uint32_t id, void __user *data)
+{
+       struct vcmd_net_caps_v0 vc_data;
+       struct nx_info *nxi;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       nxi = find_nx_info(id);
+       if (!nxi)
+               return -ESRCH;
+
+       vc_data.ncaps = nxi->nx_ncaps;
+       vc_data.cmask = ~0UL;
+       put_nx_info(nxi);
+
+       return copy_to_user(data, &vc_data, sizeof(vc_data)) ? -EFAULT : 0;
+}
+
+/* vc_set_ncaps -- merge new network capabilities under cmask */
+int vc_set_ncaps(uint32_t id, void __user *data)
+{
+       struct vcmd_net_caps_v0 vc_data;
+       struct nx_info *nxi;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+       if (copy_from_user(&vc_data, data, sizeof(vc_data)))
+               return -EFAULT;
+
+       nxi = find_nx_info(id);
+       if (!nxi)
+               return -ESRCH;
+
+       nxi->nx_ncaps = vx_mask_flags(nxi->nx_ncaps,
+               vc_data.ncaps, vc_data.cmask);
+       put_nx_info(nxi);
+       return 0;
+}
+
+
+#include <linux/module.h>
+
+EXPORT_SYMBOL_GPL(free_nx_info);
+EXPORT_SYMBOL_GPL(nxlist_lock);
+
diff --git a/kernel/vserver/proc.c b/kernel/vserver/proc.c
new file mode 100644 (file)
index 0000000..42bc182
--- /dev/null
@@ -0,0 +1,905 @@
+/*
+ *  linux/kernel/vserver/proc.c
+ *
+ *  Virtual Context Support
+ *
+ *  Copyright (C) 2003-2004  Herbert Pötzl
+ *
+ *  V0.01  basic structure
+ *  V0.02  adaptation vs1.3.0
+ *  V0.03  proc permissions
+ *  V0.04  locking/generic
+ *  V0.05  next generation procfs
+ *  V0.06  inode validation
+ *  V0.07  generic rewrite vid
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/errno.h>
+#include <linux/proc_fs.h>
+#include <linux/vserver.h>
+
+#include <asm/uaccess.h>
+#include <asm/unistd.h>
+
+
+static struct proc_dir_entry *proc_virtual;
+
+static struct proc_dir_entry *proc_vnet;
+
+
+/* Inode-number layout for the /proc/virtual and /proc/vnet trees:
+ * the low 16 bits of an inode number select one of the entry types
+ * below, the high bits carry the context id (see fake_ino()).
+ * XID entries start at 32, NID entries at 64, so masking the type
+ * with PROC_VID_MASK (0x60) distinguishes the two families. */
+enum vid_directory_inos {
+       PROC_XID_INO = 32,
+       PROC_XID_INFO,
+       PROC_XID_STATUS,
+       PROC_XID_LIMIT,
+       PROC_XID_SCHED,
+       PROC_XID_CVIRT,
+       PROC_XID_CACCT,
+
+       PROC_NID_INO = 64,
+       PROC_NID_INFO,
+       PROC_NID_STATUS,
+};
+
+#define        PROC_VID_MASK   0x60
+
+
+/* first the actual feeds */
+
+
+/* feed for /proc/virtual/info: VCI version and vserver syscall
+ * number; returns the number of bytes written (vid is unused) */
+static int proc_virtual_info(int vid, char *buffer)
+{
+       int length = 0;
+
+       length += sprintf(buffer + length, "VCIVersion:\t%04x:%04x\n",
+               VCI_VERSION >> 16, VCI_VERSION & 0xFFFF);
+       length += sprintf(buffer + length, "VCISyscall:\t%d\n",
+               __NR_vserver);
+       return length;
+}
+
+
+/* feed for /proc/virtual/<xid>/info: context id, vx_info pointer
+ * and init pid.  Returns bytes written, or 0 if the context is gone. */
+int proc_xid_info (int vid, char *buffer)
+{
+       struct vx_info *vxi;
+       int length;
+
+       vxi = find_vx_info(vid);
+       if (!vxi)
+               return 0;
+       length = sprintf(buffer,
+               "ID:\t%d\n"
+               "Info:\t%p\n"
+               "Init:\t%d\n"
+               ,vxi->vx_id
+               ,vxi
+               ,vxi->vx_initpid
+               );
+       put_vx_info(vxi);
+       return length;
+}
+
+/* feed for /proc/virtual/<xid>/status: refcount, flag/capability
+ * words (64 bit, printed as %016llx) and scheduler tick counter.
+ * Returns bytes written, or 0 if the context is gone. */
+int proc_xid_status (int vid, char *buffer)
+{
+       struct vx_info *vxi;
+       int length;
+
+       vxi = find_vx_info(vid);
+       if (!vxi)
+               return 0;
+       length = sprintf(buffer,
+               "RefC:\t%d\n"           
+               "Flags:\t%016llx\n"
+               "BCaps:\t%016llx\n"
+               "CCaps:\t%016llx\n"
+               "Ticks:\t%d\n"          
+               ,atomic_read(&vxi->vx_refcount)
+               ,vxi->vx_flags
+               ,vxi->vx_bcaps
+               ,vxi->vx_ccaps
+               ,atomic_read(&vxi->limit.ticks)
+               );
+       put_vx_info(vxi);
+       return length;
+}
+
+/* feed for /proc/virtual/<xid>/limit: delegates rendering to
+ * vx_info_proc_limit().  Returns bytes written, 0 if context gone. */
+int proc_xid_limit (int vid, char *buffer)
+{
+       int len = 0;
+       struct vx_info *vxi = find_vx_info(vid);
+
+       if (vxi) {
+               len = vx_info_proc_limit(&vxi->limit, buffer);
+               put_vx_info(vxi);
+       }
+       return len;
+}
+
+/* feed for /proc/virtual/<xid>/sched: delegates rendering to
+ * vx_info_proc_sched().  Returns bytes written, 0 if context gone. */
+int proc_xid_sched (int vid, char *buffer)
+{
+       int len = 0;
+       struct vx_info *vxi = find_vx_info(vid);
+
+       if (vxi) {
+               len = vx_info_proc_sched(&vxi->sched, buffer);
+               put_vx_info(vxi);
+       }
+       return len;
+}
+
+/* feed for /proc/virtual/<xid>/cvirt: delegates rendering to
+ * vx_info_proc_cvirt().  Returns bytes written, 0 if context gone. */
+int proc_xid_cvirt (int vid, char *buffer)
+{
+       int len = 0;
+       struct vx_info *vxi = find_vx_info(vid);
+
+       if (vxi) {
+               len = vx_info_proc_cvirt(&vxi->cvirt, buffer);
+               put_vx_info(vxi);
+       }
+       return len;
+}
+
+/* feed for /proc/virtual/<xid>/cacct: delegates rendering to
+ * vx_info_proc_cacct().  Returns bytes written, 0 if context gone. */
+int proc_xid_cacct (int vid, char *buffer)
+{
+       struct vx_info *vxi;
+       int length;
+
+       vxi = find_vx_info(vid);
+       if (!vxi)
+               return 0;
+       length = vx_info_proc_cacct(&vxi->cacct, buffer);
+       put_vx_info(vxi);
+       return length;
+}
+
+
+/* feed for /proc/vnet/info: VCI version and vserver syscall
+ * number; returns the number of bytes written (vid is unused) */
+static int proc_vnet_info(int vid, char *buffer)
+{
+       int length = 0;
+
+       length += sprintf(buffer + length, "VCIVersion:\t%04x:%04x\n",
+               VCI_VERSION >> 16, VCI_VERSION & 0xFFFF);
+       length += sprintf(buffer + length, "VCISyscall:\t%d\n",
+               __NR_vserver);
+       return length;
+}
+
+/* expand a 32 bit IPv4 address into four dotted-quad components,
+   least significant byte first (assumes the address is stored in
+   network byte order on a little-endian host -- TODO confirm) */
+#define        atoquad(a) \
+       (((a)>>0) & 0xff), (((a)>>8) & 0xff), \
+       (((a)>>16) & 0xff), (((a)>>24) & 0xff)
+
+/* feed for /proc/vnet/<nid>/info: context id, nx_info pointer, and
+ * one "index: addr/mask" line per assigned IPv4 address.
+ * Returns bytes written, or 0 if the context is gone. */
+int proc_nid_info (int vid, char *buffer)
+{
+       struct nx_info *nxi;
+       int length, i;
+
+       nxi = find_nx_info(vid);
+       if (!nxi)
+               return 0;
+       length = sprintf(buffer,
+               "ID:\t%d\n"
+               "Info:\t%p\n"
+               ,nxi->nx_id
+               ,nxi
+               );
+       /* nbipv4 entries in the ipv4[]/mask[] arrays are valid */
+       for (i=0; i<nxi->nbipv4; i++) {
+               length += sprintf(buffer + length,
+                       "%d:\t%d.%d.%d.%d/%d.%d.%d.%d\n", i,
+                       atoquad(nxi->ipv4[i]),
+                       atoquad(nxi->mask[i]));
+       }
+       put_nx_info(nxi);
+       return length;
+}
+
+/* feed for /proc/vnet/<nid>/status: current reference count.
+ * Returns bytes written, or 0 if the context is gone. */
+int proc_nid_status (int vid, char *buffer)
+{
+       int len = 0;
+       struct nx_info *nxi = find_nx_info(vid);
+
+       if (nxi) {
+               len = sprintf(buffer, "RefC:\t%d\n",
+                       atomic_read(&nxi->nx_refcount));
+               put_nx_info(nxi);
+       }
+       return len;
+}
+
+/* here the inode helpers */
+
+
+
+/* inode numbers in these trees encode <vid,type>: context id in the
+   high 16 bits, entry type (enum vid_directory_inos) in the low 16 */
+#define fake_ino(id,ino) (((id)<<16)|(ino))
+
+#define        inode_vid(i)    ((i)->i_ino >> 16)
+#define        inode_type(i)   ((i)->i_ino & 0xFFFF)
+
+/* largest value that can still be multiplied by 10 without
+   overflowing an unsigned int (used by atovid below) */
+#define MAX_MULBY10    ((~0U-9)/10)
+
+
+/* allocate a fresh inode on sb with its number encoding <vid,ino>
+ * (see fake_ino()) and timestamps set to now.  Mode, ops and nlink
+ * are left for the caller to fill in.  Returns NULL on allocation
+ * failure. */
+static struct inode *proc_vid_make_inode(struct super_block * sb,
+       int vid, int ino)
+{
+       struct inode *inode = new_inode(sb);
+
+       if (!inode)
+               goto out;
+
+       inode->i_mtime = inode->i_atime =
+               inode->i_ctime = CURRENT_TIME;
+       inode->i_ino = fake_ino(vid, ino);
+
+       /* entries are owned by root */
+       inode->i_uid = 0;
+       inode->i_gid = 0;
+       // inode->i_xid = xid;
+out:
+       return inode;
+}
+
+/* d_revalidate for per-context dentries: a cached entry stays valid
+ * only while the context id encoded in its inode number still names
+ * a live vx/nx context; otherwise the dentry is dropped so dead
+ * contexts disappear from the tree. */
+static int proc_vid_revalidate(struct dentry * dentry, struct nameidata *nd)
+{
+       struct inode * inode = dentry->d_inode;
+       int vid, valid=0;
+
+       vid = inode_vid(inode);
+       /* PROC_VID_MASK selects the XID vs NID entry family */
+       switch (inode_type(inode) & PROC_VID_MASK) {
+               case PROC_XID_INO:
+                       valid = vx_info_id_valid(vid);
+                       break;
+               case PROC_NID_INO:
+                       valid = nx_info_id_valid(vid);
+                       break;
+       }       
+       if (valid)
+               return 1;
+       d_drop(dentry);
+       return 0;
+}
+
+/*
+static int proc_vid_delete_dentry(struct dentry * dentry)
+{
+        return 1;
+}
+*/
+
+
+#define PROC_BLOCK_SIZE (PAGE_SIZE - 1024)
+
+/* read() for the info files below /proc/virtual/<xid> and
+ * /proc/vnet/<nid>: render the registered feed into one fresh page
+ * and copy the window [*ppos, *ppos+count) to userspace.  Content
+ * is capped at PROC_BLOCK_SIZE.  Returns bytes copied, 0 at EOF, or
+ * a negative errno (-ENOMEM, -EFAULT, or the feed's own error). */
+static ssize_t proc_vid_info_read(struct file * file, char * buf,
+                         size_t count, loff_t *ppos)
+{
+       struct inode * inode = file->f_dentry->d_inode;
+       unsigned long page;
+       ssize_t length;
+       ssize_t end;
+       int vid;
+
+       if (count > PROC_BLOCK_SIZE)
+               count = PROC_BLOCK_SIZE;
+       if (!(page = __get_free_page(GFP_KERNEL)))
+               return -ENOMEM;
+
+       vid = inode_vid(inode);
+       length = PROC_I(inode)->op.proc_vid_read(vid, (char*)page);
+
+       if (length < 0) {
+               free_page(page);
+               return length;
+       }
+       /* Static 4kB (or whatever) block capacity */
+       if (*ppos >= length) {
+               free_page(page);
+               return 0;
+       }
+       if (count + *ppos > length)
+               count = length - *ppos;
+       end = count + *ppos;
+       /* a failed/partial copy must be reported, not ignored */
+       if (copy_to_user(buf, (char *) page + *ppos, count)) {
+               free_page(page);
+               return -EFAULT;
+       }
+       *ppos = end;
+       free_page(page);
+       return count;
+}
+
+
+
+
+
+/* here comes the lower level (vid) */
+
+/* file/dentry operations shared by all per-context info files */
+static struct file_operations proc_vid_info_file_operations = {
+       read:           proc_vid_info_read,
+};
+
+/* revalidation drops dentries of contexts that have gone away */
+static struct dentry_operations proc_vid_dentry_operations = {
+       d_revalidate:   proc_vid_revalidate,
+//     d_delete:       proc_vid_delete_dentry,
+};
+
+
+/* static directory entry descriptor for the fixed set of files in
+ * each per-context directory */
+struct vid_entry {
+       int type;       /* entry type from enum vid_directory_inos */
+       int len;        /* strlen(name), precomputed for lookup */
+       char *name;
+       mode_t mode;
+};
+
+/* build one vid_entry; sizeof(name)-1 = literal length w/o NUL */
+#define E(type,name,mode) {(type),sizeof(name)-1,(name),(mode)}
+
+/* contents of /proc/virtual/<xid>/ (NULL-name terminated) */
+static struct vid_entry vx_base_stuff[] = {
+       E(PROC_XID_INFO,        "info",         S_IFREG|S_IRUGO),
+       E(PROC_XID_STATUS,      "status",       S_IFREG|S_IRUGO),
+       E(PROC_XID_LIMIT,       "limit",        S_IFREG|S_IRUGO),
+       E(PROC_XID_SCHED,       "sched",        S_IFREG|S_IRUGO),
+       E(PROC_XID_CVIRT,       "cvirt",        S_IFREG|S_IRUGO),
+       E(PROC_XID_CACCT,       "cacct",        S_IFREG|S_IRUGO),
+       {0,0,NULL,0}
+};
+
+/* contents of /proc/vnet/<nid>/ (NULL-name terminated) */
+static struct vid_entry vn_base_stuff[] = {
+       E(PROC_NID_INFO,        "info",         S_IFREG|S_IRUGO),
+       E(PROC_NID_STATUS,      "status",       S_IFREG|S_IRUGO),
+       {0,0,NULL,0}
+};
+
+
+
+/* lookup inside a per-context directory (/proc/virtual/<xid>/ or
+ * /proc/vnet/<nid>/): match the name against the static table for
+ * the directory's family, create the file inode and attach the
+ * matching read feed.  Returns ERR_PTR(-ENOENT) for unknown names,
+ * ERR_PTR(-EINVAL) on inode allocation failure, ERR_PTR(0) on
+ * success (dentry instantiated in place). */
+static struct dentry *proc_vid_lookup(struct inode *dir,
+       struct dentry *dentry, struct nameidata *nd)
+{
+       struct inode *inode;
+       struct vid_entry *p;
+       int error;
+
+       error = -ENOENT;
+       inode = NULL;
+
+       /* pick the entry table matching the directory family */
+       switch (inode_type(dir)) {
+               case PROC_XID_INO:
+                       p = vx_base_stuff;      
+                       break;
+               case PROC_NID_INO:
+                       p = vn_base_stuff;      
+                       break;
+               default:
+                       goto out;
+       }
+
+       /* compare length first, then the bytes */
+       for (; p->name; p++) {
+               if (p->len != dentry->d_name.len)
+                       continue;
+               if (!memcmp(dentry->d_name.name, p->name, p->len))
+                       break;
+       }
+       if (!p->name)
+               goto out;
+
+       /* NOTE(review): allocation failure yields -EINVAL here;
+          -ENOMEM would be the more conventional errno */
+       error = -EINVAL;
+       inode = proc_vid_make_inode(dir->i_sb, inode_vid(dir), p->type);
+       if (!inode)
+               goto out;
+
+       /* wire the proper feed function into the proc inode */
+       switch(p->type) {
+               case PROC_XID_INFO:
+                       PROC_I(inode)->op.proc_vid_read = proc_xid_info;
+                       break;
+               case PROC_XID_STATUS:
+                       PROC_I(inode)->op.proc_vid_read = proc_xid_status;
+                       break;
+               case PROC_XID_LIMIT:
+                       PROC_I(inode)->op.proc_vid_read = proc_xid_limit;
+                       break;
+               case PROC_XID_SCHED:
+                       PROC_I(inode)->op.proc_vid_read = proc_xid_sched;
+                       break;
+               case PROC_XID_CVIRT:
+                       PROC_I(inode)->op.proc_vid_read = proc_xid_cvirt;
+                       break;
+               case PROC_XID_CACCT:
+                       PROC_I(inode)->op.proc_vid_read = proc_xid_cacct;
+                       break;
+
+               case PROC_NID_INFO:
+                       PROC_I(inode)->op.proc_vid_read = proc_nid_info;
+                       break;
+               case PROC_NID_STATUS:
+                       PROC_I(inode)->op.proc_vid_read = proc_nid_status;
+                       break;
+               
+               default:
+                       printk("procfs: impossible type (%d)",p->type);
+                       iput(inode);
+                       return ERR_PTR(-EINVAL);
+       }
+       inode->i_mode = p->mode;
+//     inode->i_op = &proc_vid_info_inode_operations;
+       inode->i_fop = &proc_vid_info_file_operations;
+       inode->i_nlink = 1;
+       inode->i_flags|=S_IMMUTABLE;
+       
+       dentry->d_op = &proc_vid_dentry_operations;
+       d_add(dentry, inode);
+       error = 0;
+out:
+       return ERR_PTR(error);
+}
+
+
+/* readdir for a per-context directory: emit "." and ".." first,
+ * then the static entries of the matching family table, resuming
+ * at f_pos on repeated calls.  Returns 1 when the listing is
+ * complete, 0 when filldir's buffer filled up mid-way. */
+static int proc_vid_readdir(struct file * filp,
+       void * dirent, filldir_t filldir)
+{
+       int i, size;
+       struct inode *inode = filp->f_dentry->d_inode;
+       struct vid_entry *p;
+       
+       i = filp->f_pos;
+       switch (i) {
+               case 0:
+                       if (filldir(dirent, ".", 1, i,
+                               inode->i_ino, DT_DIR) < 0)
+                               return 0;
+                       i++;
+                       filp->f_pos++;
+                       /* fall through */
+               case 1:
+                       if (filldir(dirent, "..", 2, i,
+                               PROC_ROOT_INO, DT_DIR) < 0)
+                               return 0;
+                       i++;
+                       filp->f_pos++;
+                       /* fall through */
+               default:
+                       /* i now indexes into the entry table,
+                          past "." and ".." */
+                       i -= 2;
+                       switch (inode_type(inode)) {
+                               case PROC_XID_INO:
+                                       size = sizeof(vx_base_stuff);
+                                       p = vx_base_stuff + i;  
+                                       break;
+                               case PROC_NID_INO:
+                                       size = sizeof(vn_base_stuff);
+                                       p = vn_base_stuff + i;  
+                                       break;
+                               default:
+                                       return 1;
+                       }
+                       /* past the end of the table (incl. the
+                          NULL terminator slot): nothing left */
+                       if (i >= size/sizeof(struct vid_entry))
+                               return 1;
+                       while (p->name) {
+                               if (filldir(dirent, p->name, p->len,
+                                       filp->f_pos, fake_ino(inode_vid(inode),
+                                       p->type), p->mode >> 12) < 0)
+                                       return 0;
+                               filp->f_pos++;
+                               p++;
+                       }
+       }
+       return 1;
+}
+
+
+
+
+/* now the upper level (virtual) */
+
+/* operations for the per-context directories themselves */
+static struct file_operations proc_vid_file_operations = {
+       read:           generic_read_dir,
+       readdir:        proc_vid_readdir,
+};
+
+static struct inode_operations proc_vid_inode_operations = {
+       lookup:         proc_vid_lookup,
+};
+
+
+
+/* parse a decimal context id from a non-NUL-terminated name.
+ * Returns the id, or -1 if the name contains a non-digit character,
+ * starts with '0', or would overflow an int.  (A plain "0" is also
+ * rejected; the host context has no directory of its own.)
+ */
+static __inline__ int atovid(const char *str, int len)
+{
+       int vid, c;
+
+       vid = 0;
+       while (len-- > 0) {
+               c = *str - '0';
+               str++;
+               /* characters below '0' give a negative value and
+                  must be rejected too, not only those above '9' */
+               if (c < 0 || c > 9)
+                       return -1;
+               if (vid >= MAX_MULBY10)
+                       return -1;
+               vid *= 10;
+               vid += c;
+               if (!vid)
+                       return -1;
+       }
+       return vid;
+}
+
+
+/* lookup in /proc/virtual/: the special names "current" (symlink-
+ * mode inode, i_op wiring still commented out) and "info" (global
+ * feed), otherwise a decimal xid naming a context directory, which
+ * is only exposed if the caller passes the vx_check() admin/ident
+ * test.  Returns ERR_PTR(0)/NULL on success. */
+struct dentry *proc_virtual_lookup(struct inode *dir,
+       struct dentry * dentry, struct nameidata *nd)
+{
+       int xid, len, ret;
+       struct vx_info *vxi;
+       const char *name;
+       struct inode *inode;
+
+       name = dentry->d_name.name;
+       len = dentry->d_name.len;
+       ret = -ENOMEM;
+
+       if (len == 7 && !memcmp(name, "current", 7)) {
+               inode = new_inode(dir->i_sb);
+               if (!inode)
+                       goto out;
+               inode->i_mtime = inode->i_atime =
+                       inode->i_ctime = CURRENT_TIME;
+               /* "current" is pinned to xid 1 here; the readdir
+                  side only lists it for current->xid > 1 */
+               inode->i_ino = fake_ino(1, PROC_XID_INO);
+               inode->i_mode = S_IFLNK|S_IRWXUGO;
+               inode->i_uid = inode->i_gid = 0;
+               inode->i_size = 64;
+//             inode->i_op = &proc_current_inode_operations;
+               d_add(dentry, inode);
+               return NULL;
+       }
+       if (len == 4 && !memcmp(name, "info", 4)) {
+               inode = proc_vid_make_inode(dir->i_sb, 0, PROC_XID_INFO);
+               if (!inode)
+                       goto out;
+               inode->i_fop = &proc_vid_info_file_operations;
+               PROC_I(inode)->op.proc_vid_read = proc_virtual_info;
+               inode->i_mode = S_IFREG|S_IRUGO;
+//             inode->i_size = 64;
+//             inode->i_op = &proc_current_inode_operations;
+               d_add(dentry, inode);
+               return NULL;
+       }
+
+       ret = -ENOENT;
+       xid = atovid(name, len);
+       if (xid < 0)
+               goto out;
+       vxi = find_vx_info(xid);
+       if (!vxi)
+               goto out;
+
+       /* only admin/watch contexts or the context itself may see
+          the directory */
+       inode = NULL;
+       if (vx_check(xid, VX_ADMIN|VX_WATCH|VX_IDENT))
+               inode = proc_vid_make_inode(dir->i_sb,
+                       vxi->vx_id, PROC_XID_INO);
+       if (!inode)
+               goto out_release;
+
+       inode->i_mode = S_IFDIR|S_IRUGO;
+       inode->i_op = &proc_vid_inode_operations;
+       inode->i_fop = &proc_vid_file_operations;
+       inode->i_nlink = 2;
+       inode->i_flags|=S_IMMUTABLE;
+
+       dentry->d_op = &proc_vid_dentry_operations;
+       d_add(dentry, inode);
+       ret = 0;
+       
+out_release:
+       put_vx_info(vxi);
+out:
+       return ERR_PTR(ret);
+}
+
+
+/* lookup in /proc/vnet/: mirrors proc_virtual_lookup() for network
+ * contexts ("current", "info", or a decimal nid).  Unlike the xid
+ * side there is no permission check yet (see the `if (1)` below). */
+struct dentry *proc_vnet_lookup(struct inode *dir,
+       struct dentry * dentry, struct nameidata *nd)
+{
+       int nid, len, ret;
+       struct nx_info *nxi;
+       const char *name;
+       struct inode *inode;
+
+       name = dentry->d_name.name;
+       len = dentry->d_name.len;
+       ret = -ENOMEM;
+       if (len == 7 && !memcmp(name, "current", 7)) {
+               inode = new_inode(dir->i_sb);
+               if (!inode)
+                       goto out;
+               inode->i_mtime = inode->i_atime =
+                       inode->i_ctime = CURRENT_TIME;
+               inode->i_ino = fake_ino(1, PROC_NID_INO);
+               inode->i_mode = S_IFLNK|S_IRWXUGO;
+               inode->i_uid = inode->i_gid = 0;
+               inode->i_size = 64;
+//             inode->i_op = &proc_current_inode_operations;
+               d_add(dentry, inode);
+               return NULL;
+       }
+       if (len == 4 && !memcmp(name, "info", 4)) {
+               inode = proc_vid_make_inode(dir->i_sb, 0, PROC_NID_INFO);
+               if (!inode)
+                       goto out;
+               inode->i_fop = &proc_vid_info_file_operations;
+               PROC_I(inode)->op.proc_vid_read = proc_vnet_info;
+               inode->i_mode = S_IFREG|S_IRUGO;
+//             inode->i_size = 64;
+//             inode->i_op = &proc_current_inode_operations;
+               d_add(dentry, inode);
+               return NULL;
+       }
+
+       ret = -ENOENT;
+       nid = atovid(name, len);
+       if (nid < 0)
+               goto out;
+       nxi = find_nx_info(nid);
+       if (!nxi)
+               goto out;
+
+       /* placeholder for an nx_check()-style permission test */
+       inode = NULL;
+       if (1)
+               inode = proc_vid_make_inode(dir->i_sb,
+                       nxi->nx_id, PROC_NID_INO);
+       if (!inode)
+               goto out_release;
+
+       inode->i_mode = S_IFDIR|S_IRUGO;
+       inode->i_op = &proc_vid_inode_operations;
+       inode->i_fop = &proc_vid_file_operations;
+       inode->i_nlink = 2;
+       inode->i_flags|=S_IMMUTABLE;
+
+       dentry->d_op = &proc_vid_dentry_operations;
+       d_add(dentry, inode);
+       ret = 0;
+       
+out_release:
+       put_nx_info(nxi);
+out:
+       return ERR_PTR(ret);
+}
+
+
+
+
+/* digits needed to render a 32 bit id, and max ids listed per
+   readdir batch */
+#define PROC_NUMBUF 10
+#define PROC_MAXVIDS 32
+
+
+/* collect up to PROC_MAXVIDS context ids from the global vx_infos
+ * list, skipping the first (index - 1) entries, under vxlist_lock.
+ * Returns the number of ids stored into xids[].  A non-positive
+ * index starts from the beginning of the list. */
+static int get_xid_list(int index, unsigned int *xids)
+{
+       struct vx_info *p;
+       int nr_xids = 0;
+
+       index--;
+       spin_lock(&vxlist_lock);
+       list_for_each_entry(p, &vx_infos, vx_list) {
+               int xid = p->vx_id;
+
+               /* skip entries already delivered in earlier batches */
+               if (--index >= 0)
+                       continue;
+               xids[nr_xids] = xid;
+               if (++nr_xids >= PROC_MAXVIDS)
+                       break;
+       }
+       spin_unlock(&vxlist_lock);
+       return nr_xids;
+}
+
+/* readdir for /proc/virtual/: fixed entries ".", "..", "info" and
+ * (for guest contexts only) "current", followed by one numeric
+ * directory per live context, fetched in batches via get_xid_list().
+ */
+int proc_virtual_readdir(struct file * filp,
+       void * dirent, filldir_t filldir)
+{
+       unsigned int xid_array[PROC_MAXVIDS];
+       char buf[PROC_NUMBUF];
+       /* nr is sampled before the fixed entries advance f_pos; on a
+          fresh fd it wraps negative, which get_xid_list() treats as
+          "start from the beginning" */
+       unsigned int nr = filp->f_pos-3;
+       unsigned int nr_xids, i;
+       ino_t ino;
+
+       switch ((long)filp->f_pos) {
+               case 0:
+                       ino = fake_ino(0, PROC_XID_INO);
+                       if (filldir(dirent, ".", 1,
+                               filp->f_pos, ino, DT_DIR) < 0)
+                               return 0;
+                       filp->f_pos++;
+                       /* fall through */
+               case 1:
+                       ino = filp->f_dentry->d_parent->d_inode->i_ino;
+                       if (filldir(dirent, "..", 2,
+                               filp->f_pos, ino, DT_DIR) < 0)
+                               return 0;
+                       filp->f_pos++;
+                       /* fall through */
+               case 2:
+                       ino = fake_ino(0, PROC_XID_INFO);
+                       if (filldir(dirent, "info", 4,
+                               filp->f_pos, ino, DT_LNK) < 0)
+                               return 0;
+                       filp->f_pos++;
+                       /* fall through */
+               case 3:
+                       /* "current" only makes sense inside a guest */
+                       if (current->xid > 1) {
+                               ino = fake_ino(1, PROC_XID_INO);
+                               if (filldir(dirent, "current", 7,
+                                       filp->f_pos, ino, DT_LNK) < 0)
+                                       return 0;
+                       }
+                       filp->f_pos++;
+       }
+
+       nr_xids = get_xid_list(nr, xid_array);
+
+       for (i = 0; i < nr_xids; i++) {
+               int xid = xid_array[i];
+               ino_t ino = fake_ino(xid, PROC_XID_INO);
+               unsigned long j = PROC_NUMBUF;
+
+               /* render the id as decimal, right-aligned in buf */
+               do buf[--j] = '0' + (xid % 10); while (xid/=10);
+
+               if (filldir(dirent, buf+j, PROC_NUMBUF-j,
+                       filp->f_pos, ino, DT_DIR) < 0)
+                       break;
+               filp->f_pos++;
+       }
+       return 0;
+}
+
+
+/* operations for the /proc/virtual directory itself */
+static struct file_operations proc_virtual_dir_operations = {
+       read:           generic_read_dir,
+       readdir:        proc_virtual_readdir,
+};
+
+static struct inode_operations proc_virtual_dir_inode_operations = {
+       lookup:         proc_virtual_lookup,
+};
+
+
+
+/* collect up to PROC_MAXVIDS network context ids from the global
+ * nx_infos list, skipping the first (index - 1) entries, under
+ * nxlist_lock.  Mirror of get_xid_list(). */
+static int get_nid_list(int index, unsigned int *nids)
+{
+       struct nx_info *p;
+       int nr_nids = 0;
+
+       index--;
+       spin_lock(&nxlist_lock);
+       list_for_each_entry(p, &nx_infos, nx_list) {
+               int nid = p->nx_id;
+
+               /* skip entries already delivered in earlier batches */
+               if (--index >= 0)
+                       continue;
+               nids[nr_nids] = nid;
+               if (++nr_nids >= PROC_MAXVIDS)
+                       break;
+       }
+       spin_unlock(&nxlist_lock);
+       return nr_nids;
+}
+
+/* readdir for /proc/vnet/: mirror of proc_virtual_readdir() for
+ * network contexts. */
+int proc_vnet_readdir(struct file * filp,
+       void * dirent, filldir_t filldir)
+{
+       unsigned int nid_array[PROC_MAXVIDS];
+       char buf[PROC_NUMBUF];
+       /* sampled before f_pos advances; wraps negative on a fresh
+          fd, meaning "list from the beginning" (see get_nid_list) */
+       unsigned int nr = filp->f_pos-3;
+       unsigned int nr_nids, i;
+       ino_t ino;
+
+       switch ((long)filp->f_pos) {
+               case 0:
+                       ino = fake_ino(0, PROC_NID_INO);
+                       if (filldir(dirent, ".", 1,
+                               filp->f_pos, ino, DT_DIR) < 0)
+                               return 0;
+                       filp->f_pos++;
+                       /* fall through */
+               case 1:
+                       ino = filp->f_dentry->d_parent->d_inode->i_ino;
+                       if (filldir(dirent, "..", 2,
+                               filp->f_pos, ino, DT_DIR) < 0)
+                               return 0;
+                       filp->f_pos++;
+                       /* fall through */
+               case 2:
+                       ino = fake_ino(0, PROC_NID_INFO);
+                       if (filldir(dirent, "info", 4,
+                               filp->f_pos, ino, DT_LNK) < 0)
+                               return 0;
+                       filp->f_pos++;
+                       /* fall through */
+               case 3:
+                       /* NOTE(review): tests current->xid, not a
+                          nid -- verify this is intended */
+                       if (current->xid > 1) {
+                               ino = fake_ino(1, PROC_NID_INO);
+                               if (filldir(dirent, "current", 7,
+                                       filp->f_pos, ino, DT_LNK) < 0)
+                                       return 0;
+                       }
+                       filp->f_pos++;
+       }
+
+       nr_nids = get_nid_list(nr, nid_array);
+
+       for (i = 0; i < nr_nids; i++) {
+               int nid = nid_array[i];
+               ino_t ino = fake_ino(nid, PROC_NID_INO);
+               unsigned long j = PROC_NUMBUF;
+
+               /* render the id as decimal, right-aligned in buf */
+               do buf[--j] = '0' + (nid % 10); while (nid/=10);
+
+               if (filldir(dirent, buf+j, PROC_NUMBUF-j,
+                       filp->f_pos, ino, DT_DIR) < 0)
+                       break;
+               filp->f_pos++;
+       }
+       return 0;
+}
+
+
+/* operations for the /proc/vnet directory itself */
+static struct file_operations proc_vnet_dir_operations = {
+       read:           generic_read_dir,
+       readdir:        proc_vnet_readdir,
+};
+
+static struct inode_operations proc_vnet_dir_inode_operations = {
+       lookup:         proc_vnet_lookup,
+};
+
+
+
+/* create the /proc/virtual and /proc/vnet directories; called once
+ * at boot.  A failed proc_mkdir() simply leaves the corresponding
+ * tree absent (the saved pointer stays NULL). */
+void proc_vx_init(void)
+{
+       struct proc_dir_entry *ent;
+
+       /* NULL parent == top-level /proc (don't pass a literal 0
+          for a pointer argument) */
+       ent = proc_mkdir("virtual", NULL);
+       if (ent) {
+               ent->proc_fops = &proc_virtual_dir_operations;
+               ent->proc_iops = &proc_virtual_dir_inode_operations;
+       }
+       proc_virtual = ent;
+
+       ent = proc_mkdir("vnet", NULL);
+       if (ent) {
+               ent->proc_fops = &proc_vnet_dir_operations;
+               ent->proc_iops = &proc_vnet_dir_inode_operations;
+       }
+       proc_vnet = ent;
+}
+
+
+
+
+/* per pid info */
+
+
+/* append "XID:\t<id>\n" for task p; returns the advanced buffer
+ * position (used by the /proc/<pid>/status code) */
+char *task_vx_info(struct task_struct *p, char *buffer)
+{
+       int len = sprintf(buffer, "XID:\t%d\n", p->xid);
+
+       return buffer + len;
+}
+
+/* /proc/<pid> feed wrapper: returns the number of bytes that
+ * task_vx_info() wrote into buffer */
+int proc_pid_vx_info(struct task_struct *p, char *buffer)
+{
+       return task_vx_info(p, buffer) - buffer;
+}
+
+/* append "NID:\t<id>\n" for task p; returns the advanced buffer
+ * position (used by the /proc/<pid>/status code) */
+char *task_nx_info(struct task_struct *p, char *buffer)
+{
+       int len = sprintf(buffer, "NID:\t%d\n", p->nid);
+
+       return buffer + len;
+}
+
+/* /proc/<pid> feed wrapper: returns the number of bytes that
+ * task_nx_info() wrote into buffer */
+int proc_pid_nx_info(struct task_struct *p, char *buffer)
+{
+       return task_nx_info(p, buffer) - buffer;
+}
+
diff --git a/kernel/vserver/sched.c b/kernel/vserver/sched.c
new file mode 100644 (file)
index 0000000..a75195a
--- /dev/null
@@ -0,0 +1,162 @@
+/*
+ *  linux/kernel/vserver/sched.c
+ *
+ *  Virtual Server: Scheduler Support
+ *
+ *  Copyright (C) 2004  Herbert Pötzl
+ *
+ *  V0.01  adapted Sam Vilains version to 2.6.3
+ *  V0.02  removed legacy interface
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/sched.h>
+#include <linux/vinline.h>
+#include <linux/vserver/context.h>
+#include <linux/vserver/sched.h>
+
+#include <asm/errno.h>
+#include <asm/uaccess.h>
+
+
+/*
+ * recalculate the context's scheduling tokens
+ *
+ * ret > 0 : number of tokens available
+ * ret = 0 : context is paused
+ * ret < 0 : number of jiffies until new tokens arrive
+ *
+ */
+int vx_tokens_recalc(struct vx_info *vxi)
+{
+       long delta, tokens = 0;
+
+       if (__vx_flags(vxi->vx_flags, VXF_SCHED_PAUSE, 0))
+               /* we are paused */
+               return 0;
+
+       delta = jiffies - vxi->sched.jiffies;
+
+       /* at least one full interval has elapsed: refill the bucket */
+       if (delta >= vxi->sched.interval) {
+               /* lockdown scheduler info */
+               spin_lock(&vxi->sched.tokens_lock);
+
+               /* calc integral token part */
+               /* re-read jiffies under the lock; only whole
+                  intervals are credited, the remainder stays
+                  pending for the next recalc */
+               delta = jiffies - vxi->sched.jiffies;
+               tokens = delta / vxi->sched.interval;
+               delta = tokens * vxi->sched.interval;
+               tokens *= vxi->sched.fill_rate;
+
+               atomic_add(tokens, &vxi->sched.tokens);
+               vxi->sched.jiffies += delta;
+               tokens = atomic_read(&vxi->sched.tokens);
+       
+               /* clamp the bucket at tokens_max */
+               if (tokens > vxi->sched.tokens_max) {
+                       tokens = vxi->sched.tokens_max;
+                       atomic_set(&vxi->sched.tokens, tokens);
+               }
+               spin_unlock(&vxi->sched.tokens_lock);
+       } else {
+               /* no new tokens */
+               if ((tokens = vx_tokens_avail(vxi)) < vxi->sched.tokens_min) {
+                       /* enough tokens will be available in */
+                       if (vxi->sched.tokens_min == 0)
+                               return delta - vxi->sched.interval;
+                       return delta - vxi->sched.interval *
+                               vxi->sched.tokens_min / vxi->sched.fill_rate;
+               }
+       }
+       /* we have some tokens left */
+       return tokens;
+}
+
+/*
+ * effective_prio - return the priority that is based on the static
+ * priority but is modified by bonuses/penalties.
+ *
+ * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
+ * into a -4 ... 0 ... +4 bonus/penalty range.
+ *
+ * Additionally, we scale another amount based on the number of
+ * CPU tokens currently held by the context, if the process is
+ * part of a context (and the appropriate SCHED flag is set).
+ * This ranges from -5 ... 0 ... +15, quadratically.
+ *
+ * So, the total bonus is -9 .. 0 .. +19
+ * We use ~50% of the full 0...39 priority range so that:
+ *
+ * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
+ * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
+ *    unless that context is far exceeding its CPU allocation.
+ *
+ * Both properties are important to certain workloads.
+ */
+/* compute the token-based priority bonus ("vavavoom") for task p,
+ * scaled by max_prio and VAVAVOOM_RATIO; quadratic in the token
+ * deficit relative to tokens_max (see comment block above).
+ * NOTE(review): p->vx_info is dereferenced unconditionally --
+ * callers presumably guarantee the task belongs to a context;
+ * confirm before calling for host tasks. */
+int effective_vavavoom(task_t *p, int max_prio)
+{
+       struct vx_info *vxi = p->vx_info;
+       int vavavoom, max;
+
+       /* lots of tokens = lots of vavavoom
+        *      no tokens = no vavavoom      */
+       if ((vavavoom = atomic_read(&vxi->sched.tokens)) >= 0) {
+               max = vxi->sched.tokens_max;
+               /* vavavoom becomes the token deficit; max*max may
+                  overflow int for large tokens_max -- TODO confirm
+                  the configured range keeps this safe */
+               vavavoom = max - vavavoom;
+               max = max * max;
+               vavavoom = max_prio * VAVAVOOM_RATIO / 100
+                       * (vavavoom*vavavoom - (max >> 2)) / max;
+               /*  alternative, geometric mapping
+               vavavoom = -( MAX_USER_PRIO*VAVAVOOM_RATIO/100 * vavavoom
+                       / vxi->sched.tokens_max -
+                       MAX_USER_PRIO*VAVAVOOM_RATIO/100/2); */
+       } else
+               vavavoom = 0;
+       /* vavavoom = ( MAX_USER_PRIO*VAVAVOOM_RATIO/100*tokens_left(p) -
+               MAX_USER_PRIO*VAVAVOOM_RATIO/100/2); */
+
+       return vavavoom;
+}
+
+
+/* vc_set_sched() - update the token-bucket scheduler parameters of
+ * context <xid> from a struct vcmd_set_sched_v2; fields equal to
+ * SCHED_KEEP retain their current value.  All updates and the
+ * subsequent sanity clamping happen under sched.tokens_lock.
+ * Returns 0 on success, -EFAULT, or -ESRCH for an unknown context
+ * (the other vc_* handlers use -ESRCH for this case as well).
+ * NOTE(review): no capable() check here -- presumably done by the
+ * vserver syscall switch; confirm. */
+int vc_set_sched(uint32_t xid, void __user *data)
+{
+       struct vcmd_set_sched_v2 vc_data;
+       struct vx_info *vxi;
+
+       if (copy_from_user (&vc_data, data, sizeof(vc_data)))
+               return -EFAULT;
+       
+       vxi = find_vx_info(xid);
+       if (!vxi)
+               return -ESRCH;
+
+       spin_lock(&vxi->sched.tokens_lock);
+
+       if (vc_data.interval != SCHED_KEEP)
+               vxi->sched.interval = vc_data.interval;
+       if (vc_data.fill_rate != SCHED_KEEP)
+               vxi->sched.fill_rate = vc_data.fill_rate;
+       if (vc_data.tokens_min != SCHED_KEEP)
+               vxi->sched.tokens_min = vc_data.tokens_min;
+       if (vc_data.tokens_max != SCHED_KEEP)
+               vxi->sched.tokens_max = vc_data.tokens_max;
+       if (vc_data.tokens != SCHED_KEEP)
+               atomic_set(&vxi->sched.tokens, vc_data.tokens);
+
+       /* Sanity check the resultant values */
+       if (vxi->sched.fill_rate <= 0)
+               vxi->sched.fill_rate = 1;
+       if (vxi->sched.interval <= 0)
+               vxi->sched.interval = HZ;
+       if (vxi->sched.tokens_max == 0)
+               vxi->sched.tokens_max = 1;
+       if (atomic_read(&vxi->sched.tokens) > vxi->sched.tokens_max)
+               atomic_set(&vxi->sched.tokens, vxi->sched.tokens_max);
+       if (vxi->sched.tokens_min > vxi->sched.tokens_max)
+               vxi->sched.tokens_min = vxi->sched.tokens_max;
+
+       spin_unlock(&vxi->sched.tokens_lock);
+       put_vx_info(vxi);
+       return 0;
+}
+
diff --git a/kernel/vserver/signal.c b/kernel/vserver/signal.c
new file mode 100644 (file)
index 0000000..464ea1b
--- /dev/null
@@ -0,0 +1,85 @@
+/*
+ *  linux/kernel/vserver/signal.c
+ *
+ *  Virtual Server: Signal Support
+ *
+ *  Copyright (C) 2003-2004  Herbert Pötzl
+ *
+ *  V0.01  broken out from vcontext V0.05
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/sched.h>
+
+#include <asm/errno.h>
+#include <asm/uaccess.h>
+
+#include <linux/vinline.h>
+#include <linux/vserver/signal.h>
+
+
+/*
+ * vc_ctx_kill - send a signal to process(es) of context @id.
+ *
+ * @data points to a user space struct vcmd_ctx_kill_v0 carrying the
+ * signal number and a target pid.  pid 0 or -1 signals every thread
+ * group leader of the context (pid -1 additionally spares the
+ * context's init process); any other pid signals just that task.
+ * Caller must hold VX_ADMIN.  Returns the last send_sig_info() result
+ * or -ESRCH when nothing matched.
+ */
+int vc_ctx_kill(uint32_t id, void __user *data)
+{
+	int retval, count=0;
+	struct vcmd_ctx_kill_v0 vc_data;
+	struct siginfo info;
+	struct task_struct *p;
+	struct vx_info *vxi;
+
+	if (!vx_check(0, VX_ADMIN))
+		return -ENOSYS;
+	if (copy_from_user (&vc_data, data, sizeof(vc_data)))
+		return -EFAULT;
+	
+	/* siginfo attributes the signal to the calling task */
+	info.si_signo = vc_data.sig;
+	info.si_errno = 0;
+	info.si_code = SI_USER;
+	info.si_pid = current->pid;
+	info.si_uid = current->uid;
+
+	vxi = find_vx_info(id);
+	if (!vxi)
+		return -ESRCH;
+
+	retval = -ESRCH;
+	read_lock(&tasklist_lock);
+	switch (vc_data.pid) {
+	case -1:
+	case  0:
+		/* broadcast: every thread group leader of the context,
+		   skipping pid 0/1 and (for pid == -1) the context init */
+		for_each_process(p) {
+			int err = 0;
+
+			if (vx_task_xid(p) != id || p->pid <= 1 ||
+				(vc_data.pid && vxi->vx_initpid == p->pid) ||
+				!thread_group_leader(p))
+				continue;
+
+			err = send_sig_info(vc_data.sig, &info, p);
+			++count;
+			/* keep the last non-EPERM result as return value */
+			if (err != -EPERM)
+				retval = err;
+		}
+		break;
+		
+	default:
+	p = find_task_by_pid(vc_data.pid);
+		if (p) {
+			/* redirect to the thread group leader if needed */
+			if (!thread_group_leader(p)) {
+				struct task_struct *tg;
+			
+				tg = find_task_by_pid(p->tgid);
+				if (tg)
+					p = tg;
+			}
+			/* id is uint32_t, so (id == -1) matches the ~0
+			   wildcard, signalling regardless of context */
+			if ((id == -1) || (vx_task_xid(p) == id))
+				retval = send_sig_info(vc_data.sig, &info, p);
+		}
+		break;
+	}
+	read_unlock(&tasklist_lock);
+	put_vx_info(vxi);
+	return retval;
+}
+
+
diff --git a/kernel/vserver/switch.c b/kernel/vserver/switch.c
new file mode 100644 (file)
index 0000000..90fee14
--- /dev/null
@@ -0,0 +1,170 @@
+/*
+ *  linux/kernel/vserver/switch.c
+ *
+ *  Virtual Server: Syscall Switch
+ *
+ *  Copyright (C) 2003-2004  Herbert Pötzl
+ *
+ *  V0.01  syscall switch
+ *  V0.02  added signal to context
+ *  V0.03  added rlimit functions
+ *  V0.04  added iattr, task/xid functions
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/linkage.h>
+#include <asm/errno.h>
+
+#include <linux/vserver/switch.h>
+#include <linux/vinline.h>
+
+
+/* Return the compile-time vserver API version; @id is ignored. */
+static inline int
+vc_get_version(uint32_t id)
+{
+	return VCI_VERSION;
+}
+
+
+#include <linux/vserver/legacy.h>
+#include <linux/vserver/context.h>
+#include <linux/vserver/network.h>
+#include <linux/vserver/namespace.h>
+#include <linux/vserver/sched.h>
+#include <linux/vserver/limit.h>
+#include <linux/vserver/inode.h>
+#include <linux/vserver/signal.h>
+
+
+extern unsigned int vx_debug_switch;
+
+
+extern asmlinkage long
+sys_vserver(uint32_t cmd, uint32_t id, void __user *data)
+{
+
+       if (vx_debug_switch)
+               printk( "vc: VCMD_%02d_%d[%d], %d\n",
+                       VC_CATEGORY(cmd), VC_COMMAND(cmd),
+                       VC_VERSION(cmd), id);
+
+       switch (cmd) {
+       case VCMD_get_version:
+               return vc_get_version(id);
+
+#ifdef CONFIG_VSERVER_LEGACY           
+       case VCMD_new_s_context:
+               return vc_new_s_context(id, data);
+       case VCMD_set_ipv4root:
+               return vc_set_ipv4root(id, data);
+#endif
+
+       case VCMD_task_xid:
+               return vc_task_xid(id, data);
+       case VCMD_vx_info:
+               return vc_vx_info(id, data);
+
+       case VCMD_task_nid:
+               return vc_task_nid(id, data);
+       case VCMD_nx_info:
+               return vc_nx_info(id, data);
+
+       case VCMD_set_namespace:
+               return vc_set_namespace(id, data);
+       case VCMD_cleanup_namespace:
+               return vc_cleanup_namespace(id, data);
+       }
+
+       /* those are allowed while in setup too */
+       if (!vx_check(0, VX_ADMIN|VX_WATCH) &&
+               !vx_flags(VXF_STATE_SETUP,0))
+               return -EPERM;
+
+#ifdef CONFIG_VSERVER_LEGACY
+       switch (cmd) {
+       case VCMD_set_cflags:
+       case VCMD_set_ccaps:
+               if (vx_check(0, VX_WATCH))
+                       return 0;
+       }
+#endif
+
+       switch (cmd) {
+       case VCMD_get_rlimit:
+               return vc_get_rlimit(id, data);
+       case VCMD_set_rlimit:
+               return vc_set_rlimit(id, data);
+       case VCMD_get_rlimit_mask:
+               return vc_get_rlimit_mask(id, data);
+               
+       case VCMD_vx_get_vhi_name:
+               return vc_get_vhi_name(id, data);
+       case VCMD_vx_set_vhi_name:
+               return vc_set_vhi_name(id, data);
+
+       case VCMD_set_cflags:
+               return vc_set_cflags(id, data);
+       case VCMD_get_cflags:
+               return vc_get_cflags(id, data);
+
+       case VCMD_set_ccaps:
+               return vc_set_ccaps(id, data);
+       case VCMD_get_ccaps:
+               return vc_get_ccaps(id, data);
+
+       case VCMD_set_nflags:
+               return vc_set_nflags(id, data);
+       case VCMD_get_nflags:
+               return vc_get_nflags(id, data);
+
+       case VCMD_set_ncaps:
+               return vc_set_ncaps(id, data);
+       case VCMD_get_ncaps:
+               return vc_get_ncaps(id, data);
+
+       case VCMD_set_sched:
+               return vc_set_sched(id, data);
+       }
+
+       /* below here only with VX_ADMIN */
+       if (!vx_check(0, VX_ADMIN|VX_WATCH))
+               return -EPERM;
+
+       switch (cmd) {
+       case VCMD_ctx_kill:
+               return vc_ctx_kill(id, data);
+
+#ifdef CONFIG_VSERVER_LEGACY           
+       case VCMD_create_context:
+               return vc_ctx_create(id, data);
+#endif
+
+       case VCMD_get_iattr:
+               return vc_get_iattr(id, data);
+       case VCMD_set_iattr:
+               return vc_set_iattr(id, data);
+
+       case VCMD_enter_namespace:
+               return vc_enter_namespace(id, data);
+
+       case VCMD_ctx_create:
+#ifdef CONFIG_VSERVER_LEGACY           
+               if (id == 1) {
+                       current->xid = 1;
+                       return 1;
+               }
+#endif
+               return vc_ctx_create(id, data);
+       case VCMD_ctx_migrate:
+               return vc_ctx_migrate(id, data);
+
+       case VCMD_net_create:
+               return vc_net_create(id, data);
+       case VCMD_net_migrate:
+               return vc_net_migrate(id, data);
+
+       }
+       return -ENOSYS;
+}
+
diff --git a/kernel/vserver/sysctl.c b/kernel/vserver/sysctl.c
new file mode 100644 (file)
index 0000000..562fc0e
--- /dev/null
@@ -0,0 +1,150 @@
+/*
+ *  linux/kernel/sysctl.c
+ *
+ *  Virtual Context Support
+ *
+ *  Copyright (C) 2004  Herbert Pötzl
+ *
+ *  V0.01  basic structure
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/errno.h>
+#include <linux/vserver.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/ctype.h>
+#include <linux/sysctl.h>
+#include <linux/fs.h>
+
+#include <asm/uaccess.h>
+#include <asm/unistd.h>
+
+
+/* top-level sysctl id for the "vserver" directory */
+#define CTL_VSERVER	4242	/* unused? */
+
+/* ids of the entries registered below the vserver directory */
+enum {
+        CTL_DEBUG_SWITCH = 1,
+        CTL_DEBUG_LIMIT,
+};
+
+
+/* debug knobs; written through proc_dodebug() below, read elsewhere */
+unsigned int vx_debug_switch = 0;
+unsigned int vx_debug_limit = 0;
+
+
+static struct ctl_table_header *vserver_table_header;
+/* forward declaration; table defined at the end of this file */
+static ctl_table vserver_table[];
+
+
+/*
+ * Register the vserver sysctl hierarchy.  Idempotent: a second call
+ * while the table is already registered is a no-op.
+ */
+void vserver_register_sysctl(void)
+{
+	if (vserver_table_header)
+		return;
+
+	vserver_table_header = register_sysctl_table(vserver_table, 1);
+#ifdef CONFIG_PROC_FS
+//	if (vserver_table[0].de)
+//		vserver_table[0].de->owner = THIS_MODULE;
+#endif
+}
+
+/*
+ * Tear down the vserver sysctl hierarchy.  Safe to call when nothing
+ * is registered.
+ */
+void vserver_unregister_sysctl(void)
+{
+	if (vserver_table_header == NULL)
+		return;
+
+	unregister_sysctl_table(vserver_table_header);
+	vserver_table_header = NULL;
+}
+
+
+/*
+ * proc_dodebug - sysctl handler reading/writing an unsigned int as a
+ * decimal string (appears modelled after sunrpc's proc_dodebug).
+ *
+ * On write: skips leading whitespace, parses a decimal number from the
+ * user buffer and stores it into table->data.  On read: formats
+ * table->data plus a trailing newline into the user buffer.  *lenp is
+ * updated to the number of bytes consumed/produced and file->f_pos is
+ * advanced by the same amount.
+ */
+static int proc_dodebug(ctl_table *table, int write,
+	struct file *file, void *buffer, size_t *lenp)
+{
+	char		tmpbuf[20], *p, c;
+	unsigned int	value;
+	size_t		left, len;
+
+	/* only position 0 is readable; also bail out on a zero-length op */
+	if ((file->f_pos && !write) || !*lenp) {
+		*lenp = 0;
+		return 0;
+	}
+
+	left = *lenp;
+
+	if (write) {
+		if (!access_ok(VERIFY_READ, buffer, left))
+			return -EFAULT;
+		/* skip leading whitespace in the user buffer */
+		p = (char *) buffer;
+		while (left && __get_user(c, p) >= 0 && isspace(c))
+			left--, p++;
+		if (!left)
+			goto done;
+
+		if (left > sizeof(tmpbuf) - 1)
+			return -EINVAL;
+		if (copy_from_user(tmpbuf, p, left))
+			return -EFAULT;
+		tmpbuf[left] = '\0';
+
+		/* parse the decimal digits */
+		for (p = tmpbuf, value = 0; '0' <= *p && *p <= '9'; p++, left--)
+			value = 10 * value + (*p - '0');
+		/* anything but trailing whitespace after the number is an error */
+		if (*p && !isspace(*p))
+			return -EINVAL;
+		while (left && isspace(*p))
+			left--, p++;
+		*(unsigned int *) table->data = value;
+	} else {
+		if (!access_ok(VERIFY_WRITE, buffer, left))
+			return -EFAULT;
+		len = sprintf(tmpbuf, "%d", *(unsigned int *) table->data);
+		if (len > left)
+			len = left;
+		if (__copy_to_user(buffer, tmpbuf, len))
+			return -EFAULT;
+		/* append a newline when there is room left */
+		if ((left -= len) > 0) {
+			if (put_user('\n', (char *)buffer + len))
+				return -EFAULT;
+			left--;
+		}
+	}
+
+done:
+	/* report bytes actually consumed/produced and advance f_pos */
+	*lenp -= left;
+	file->f_pos += *lenp;
+	return 0;
+}
+       
+
+
+/* entries published below the vserver sysctl directory */
+static ctl_table debug_table[] = {
+	{
+		.ctl_name	= CTL_DEBUG_SWITCH,
+		.procname	= "debug_switch",
+		.data		= &vx_debug_switch,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dodebug
+	},
+	{
+		.ctl_name	= CTL_DEBUG_LIMIT,
+		.procname	= "debug_limit",
+		.data		= &vx_debug_limit,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dodebug
+	},
+	{ .ctl_name = 0 }	/* terminator */
+};
+
+/* top-level "vserver" sysctl directory wrapping debug_table */
+static ctl_table vserver_table[] = {
+	{
+		.ctl_name	= CTL_VSERVER,
+		.procname	= "vserver",
+		.mode		= 0555,
+		.child		= debug_table
+	},
+	{ .ctl_name = 0 }	/* terminator */
+};
+