From: Mark Huang Date: Wed, 2 Jun 2004 20:45:37 +0000 (+0000) Subject: Initial revision X-Git-Tag: before-ipod-patch~38 X-Git-Url: http://git.onelab.eu/?a=commitdiff_plain;h=aaa9c8b34087e01cd85c92137198ff2ded23938d;p=linux-2.6.git Initial revision --- diff --git a/include/linux/ninline.h b/include/linux/ninline.h new file mode 100644 index 000000000..d3f752516 --- /dev/null +++ b/include/linux/ninline.h @@ -0,0 +1,151 @@ +#ifndef _NX_INLINE_H +#define _NX_INLINE_H + + +// #define NX_DEBUG + +#include +#include + +#include "vserver/network.h" + +#if defined(NX_DEBUG) +#define nxdprintk(x...) printk("nxd: " x) +#else +#define nxdprintk(x...) +#endif + + +void free_nx_info(struct nx_info *); + +extern int proc_pid_nx_info(struct task_struct *, char *); + + +#define get_nx_info(i) __get_nx_info(i,__FILE__,__LINE__) + +static __inline__ struct nx_info *__get_nx_info(struct nx_info *nxi, const char *_file, int _line) +{ + if (!nxi) + return NULL; + nxdprintk("get_nx_info(%p[%d.%d])\t%s:%d\n", + nxi, nxi?nxi->nx_id:0, nxi?atomic_read(&nxi->nx_refcount):0, + _file, _line); + atomic_inc(&nxi->nx_refcount); + return nxi; +} + +#define put_nx_info(i) __put_nx_info(i,__FILE__,__LINE__) + +static __inline__ void __put_nx_info(struct nx_info *nxi, const char *_file, int _line) +{ + if (!nxi) + return; + nxdprintk("put_nx_info(%p[%d.%d])\t%s:%d\n", + nxi, nxi?nxi->nx_id:0, nxi?atomic_read(&nxi->nx_refcount):0, + _file, _line); + if (atomic_dec_and_lock(&nxi->nx_refcount, &nxlist_lock)) { + list_del(&nxi->nx_list); + spin_unlock(&nxlist_lock); + free_nx_info(nxi); + } +} + + +#define set_nx_info(p,i) __set_nx_info(p,i,__FILE__,__LINE__) + +static inline void __set_nx_info(struct nx_info **nxp, struct nx_info *nxi, + const char *_file, int _line) +{ + BUG_ON(*nxp); + if (!nxi) + return; + nxdprintk("set_nx_info(%p[#%d.%d])\t%s:%d\n", + nxi, nxi?nxi->nx_id:0, nxi?atomic_read(&nxi->nx_refcount):0, + _file, _line); + *nxp = __get_nx_info(nxi, _file, _line); +} + +#define 
clr_nx_info(p) __clr_nx_info(p,__FILE__,__LINE__) + +static inline void __clr_nx_info(struct nx_info **nxp, + const char *_file, int _line) +{ + struct nx_info *nxo = *nxp; + + if (!nxo) + return; + nxdprintk("clr_nx_info(%p[#%d.%d])\t%s:%d\n", + nxo, nxo?nxo->nx_id:0, nxo?atomic_read(&nxo->nx_refcount):0, + _file, _line); + *nxp = NULL; + wmb(); + __put_nx_info(nxo, _file, _line); +} + + +#define task_get_nx_info(i) __task_get_nx_info(i,__FILE__,__LINE__) + +static __inline__ struct nx_info *__task_get_nx_info(struct task_struct *p, + const char *_file, int _line) +{ + struct nx_info *nxi; + + task_lock(p); + nxi = __get_nx_info(p->nx_info, _file, _line); + task_unlock(p); + return nxi; +} + +#define nx_verify_info(p,i) \ + __nx_verify_info((p)->nx_info,i,__FILE__,__LINE__) + +static __inline__ void __nx_verify_info( + struct nx_info *ipa, struct nx_info *ipb, + const char *_file, int _line) +{ + if (ipa == ipb) + return; + printk(KERN_ERR "ip bad assumption (%p==%p) at %s:%d\n", + ipa, ipb, _file, _line); +} + + +#define nx_task_nid(t) ((t)->nid) + +#define nx_current_nid() nx_task_nid(current) + +#define nx_check(c,m) __nx_check(nx_current_nid(),c,m) + +#define nx_weak_check(c,m) ((m) ? nx_check(c,m) : 1) + +#undef nxdprintk +#define nxdprintk(x...) + + +#define __nx_flags(v,m,f) (((v) & (m)) ^ (f)) + +#define __nx_task_flags(t,m,f) \ + (((t) && ((t)->nx_info)) ? \ + __nx_flags((t)->nx_info->nx_flags,(m),(f)) : 0) + +#define nx_current_flags() \ + ((current->nx_info) ? current->nx_info->nx_flags : 0) + +#define nx_flags(m,f) __nx_flags(nx_current_flags(),(m),(f)) + + +#define nx_current_ncaps() \ + ((current->nx_info) ? 
current->nx_info->nx_ncaps : 0) + +#define nx_ncaps(c) (nx_current_ncaps() & (c)) + + + +#define sock_nx_init(s) do { \ + (s)->sk_nid = 0; \ + (s)->sk_nx_info = NULL; \ + } while (0) + + + +#endif diff --git a/include/linux/vinline.h b/include/linux/vinline.h new file mode 100644 index 000000000..07bb3698a --- /dev/null +++ b/include/linux/vinline.h @@ -0,0 +1,462 @@ +#ifndef _VX_INLINE_H +#define _VX_INLINE_H + + +// #define VX_DEBUG + +#include +#include + +#include "vserver/context.h" +#include "vserver/limit.h" +#include "vserver/cvirt.h" + +#if defined(VX_DEBUG) +#define vxdprintk(x...) printk("vxd: " x) +#else +#define vxdprintk(x...) +#endif + + + +void free_vx_info(struct vx_info *); + +extern int proc_pid_vx_info(struct task_struct *, char *); + + +#define get_vx_info(i) __get_vx_info(i,__FILE__,__LINE__) + +static __inline__ struct vx_info *__get_vx_info(struct vx_info *vxi, + const char *_file, int _line) +{ + if (!vxi) + return NULL; + vxdprintk("get_vx_info(%p[#%d.%d])\t%s:%d\n", + vxi, vxi?vxi->vx_id:0, vxi?atomic_read(&vxi->vx_refcount):0, + _file, _line); + atomic_inc(&vxi->vx_refcount); + return vxi; +} + +#define put_vx_info(i) __put_vx_info(i,__FILE__,__LINE__) + +static __inline__ void __put_vx_info(struct vx_info *vxi, const char *_file, int _line) +{ + if (!vxi) + return; + vxdprintk("put_vx_info(%p[#%d.%d])\t%s:%d\n", + vxi, vxi?vxi->vx_id:0, vxi?atomic_read(&vxi->vx_refcount):0, + _file, _line); + if (atomic_dec_and_lock(&vxi->vx_refcount, &vxlist_lock)) { + list_del(&vxi->vx_list); + spin_unlock(&vxlist_lock); + free_vx_info(vxi); + } +} + +#define set_vx_info(p,i) __set_vx_info(p,i,__FILE__,__LINE__) + +static inline void __set_vx_info(struct vx_info **vxp, struct vx_info *vxi, + const char *_file, int _line) +{ + BUG_ON(*vxp); + if (!vxi) + return; + vxdprintk("set_vx_info(%p[#%d.%d])\t%s:%d\n", + vxi, vxi?vxi->vx_id:0, vxi?atomic_read(&vxi->vx_refcount):0, + _file, _line); + *vxp = __get_vx_info(vxi, _file, _line); +} + +#define 
clr_vx_info(p) __clr_vx_info(p,__FILE__,__LINE__) + +static inline void __clr_vx_info(struct vx_info **vxp, + const char *_file, int _line) +{ + struct vx_info *vxo = *vxp; + + vxdprintk("clr_vx_info(%p[#%d.%d])\t%s:%d\n", + vxo, vxo?vxo->vx_id:0, vxo?atomic_read(&vxo->vx_refcount):0, + _file, _line); + *vxp = NULL; + wmb(); + __put_vx_info(vxo, _file, _line); +} + + +#define task_get_vx_info(i) __task_get_vx_info(i,__FILE__,__LINE__) + +static __inline__ struct vx_info *__task_get_vx_info(struct task_struct *p, + const char *_file, int _line) +{ + struct vx_info *vxi; + + task_lock(p); + vxi = __get_vx_info(p->vx_info, _file, _line); + task_unlock(p); + return vxi; +} + + +#define vx_verify_info(p,i) \ + __vx_verify_info((p)->vx_info,i,__FILE__,__LINE__) + +static __inline__ void __vx_verify_info( + struct vx_info *vxa, struct vx_info *vxb, + const char *_file, int _line) +{ + if (vxa == vxb) + return; + printk(KERN_ERR "vx bad assumption (%p==%p) at %s:%d\n", + vxa, vxb, _file, _line); +} + + +#define vx_task_xid(t) ((t)->xid) + +#define vx_current_xid() vx_task_xid(current) + +#define vx_check(c,m) __vx_check(vx_current_xid(),c,m) + +#define vx_weak_check(c,m) ((m) ? vx_check(c,m) : 1) + + +/* + * check current context for ADMIN/WATCH and + * optionally agains supplied argument + */ +static __inline__ int __vx_check(xid_t cid, xid_t id, unsigned int mode) +{ + if (mode & VX_ARG_MASK) { + if ((mode & VX_IDENT) && + (id == cid)) + return 1; + } + if (mode & VX_ATR_MASK) { + if ((mode & VX_DYNAMIC) && + (id >= MIN_D_CONTEXT) && + (id <= MAX_S_CONTEXT)) + return 1; + if ((mode & VX_STATIC) && + (id > 1) && (id < MIN_D_CONTEXT)) + return 1; + } + return (((mode & VX_ADMIN) && (cid == 0)) || + ((mode & VX_WATCH) && (cid == 1))); +} + + +#define __vx_flags(v,m,f) (((v) & (m)) ^ (f)) + +#define __vx_task_flags(t,m,f) \ + (((t) && ((t)->vx_info)) ? \ + __vx_flags((t)->vx_info->vx_flags,(m),(f)) : 0) + +#define vx_current_flags() \ + ((current->vx_info) ? 
current->vx_info->vx_flags : 0) + +#define vx_flags(m,f) __vx_flags(vx_current_flags(),(m),(f)) + + +#define vx_current_ccaps() \ + ((current->vx_info) ? current->vx_info->vx_ccaps : 0) + +#define vx_ccaps(c) (vx_current_ccaps() & (c)) + +#define vx_current_bcaps() \ + (((current->vx_info) && !vx_flags(VXF_STATE_SETUP, 0)) ? \ + current->vx_info->vx_bcaps : cap_bset) + + +#define VX_DEBUG_ACC_RSS 0 +#define VX_DEBUG_ACC_VM 0 +#define VX_DEBUG_ACC_VML 0 + +#undef vxdprintk +#if (VX_DEBUG_ACC_RSS) || (VX_DEBUG_ACC_VM) || (VX_DEBUG_ACC_VML) +#define vxdprintk(x...) printk("vxd: " x) +#else +#define vxdprintk(x...) +#endif + +#define vx_acc_page(m, d, v, r) \ + __vx_acc_page(&(m->v), m->mm_vx_info, r, d, __FILE__, __LINE__) + +static inline void __vx_acc_page(unsigned long *v, struct vx_info *vxi, + int res, int dir, char *file, int line) +{ + if (v) { + if (dir > 0) + ++(*v); + else + --(*v); + } + if (vxi) { + if (dir > 0) + atomic_inc(&vxi->limit.res[res]); + else + atomic_dec(&vxi->limit.res[res]); + } +} + + +#define vx_acc_pages(m, p, v, r) \ + __vx_acc_pages(&(m->v), m->mm_vx_info, r, p, __FILE__, __LINE__) + +static inline void __vx_acc_pages(unsigned long *v, struct vx_info *vxi, + int res, int pages, char *file, int line) +{ + if ((res == RLIMIT_RSS && VX_DEBUG_ACC_RSS) || + (res == RLIMIT_AS && VX_DEBUG_ACC_VM) || + (res == RLIMIT_MEMLOCK && VX_DEBUG_ACC_VML)) + vxdprintk("vx_acc_pages [%5d,%2d]: %5d += %5d in %s:%d\n", + (vxi?vxi->vx_id:-1), res, + (vxi?atomic_read(&vxi->limit.res[res]):0), + pages, file, line); + if (pages == 0) + return; + if (v) + *v += pages; + if (vxi) + atomic_add(pages, &vxi->limit.res[res]); +} + + + +#define vx_acc_vmpage(m,d) vx_acc_page(m, d, total_vm, RLIMIT_AS) +#define vx_acc_vmlpage(m,d) vx_acc_page(m, d, locked_vm, RLIMIT_MEMLOCK) +#define vx_acc_rsspage(m,d) vx_acc_page(m, d, rss, RLIMIT_RSS) + +#define vx_acc_vmpages(m,p) vx_acc_pages(m, p, total_vm, RLIMIT_AS) +#define vx_acc_vmlpages(m,p) vx_acc_pages(m, p, locked_vm, 
RLIMIT_MEMLOCK) +#define vx_acc_rsspages(m,p) vx_acc_pages(m, p, rss, RLIMIT_RSS) + +#define vx_pages_add(s,r,p) __vx_acc_pages(0, s, r, p, __FILE__, __LINE__) +#define vx_pages_sub(s,r,p) __vx_pages_add(s, r, -(p)) + +#define vx_vmpages_inc(m) vx_acc_vmpage(m, 1) +#define vx_vmpages_dec(m) vx_acc_vmpage(m,-1) +#define vx_vmpages_add(m,p) vx_acc_vmpages(m, p) +#define vx_vmpages_sub(m,p) vx_acc_vmpages(m,-(p)) + +#define vx_vmlocked_inc(m) vx_acc_vmlpage(m, 1) +#define vx_vmlocked_dec(m) vx_acc_vmlpage(m,-1) +#define vx_vmlocked_add(m,p) vx_acc_vmlpages(m, p) +#define vx_vmlocked_sub(m,p) vx_acc_vmlpages(m,-(p)) + +#define vx_rsspages_inc(m) vx_acc_rsspage(m, 1) +#define vx_rsspages_dec(m) vx_acc_rsspage(m,-1) +#define vx_rsspages_add(m,p) vx_acc_rsspages(m, p) +#define vx_rsspages_sub(m,p) vx_acc_rsspages(m,-(p)) + + + +#define vx_pages_avail(m, p, r) \ + __vx_pages_avail((m)->mm_vx_info, (r), (p), __FILE__, __LINE__) + +static inline int __vx_pages_avail(struct vx_info *vxi, + int res, int pages, char *file, int line) +{ + if ((res == RLIMIT_RSS && VX_DEBUG_ACC_RSS) || + (res == RLIMIT_AS && VX_DEBUG_ACC_VM) || + (res == RLIMIT_MEMLOCK && VX_DEBUG_ACC_VML)) + printk("vx_pages_avail[%5d,%2d]: %5ld > %5d + %5d in %s:%d\n", + (vxi?vxi->vx_id:-1), res, + (vxi?vxi->limit.rlim[res]:1), + (vxi?atomic_read(&vxi->limit.res[res]):0), + pages, file, line); + if (!vxi) + return 1; + if (vxi->limit.rlim[res] == RLIM_INFINITY) + return 1; + if (atomic_read(&vxi->limit.res[res]) + pages < vxi->limit.rlim[res]) + return 1; + return 0; +} + +#define vx_vmpages_avail(m,p) vx_pages_avail(m, p, RLIMIT_AS) +#define vx_vmlocked_avail(m,p) vx_pages_avail(m, p, RLIMIT_MEMLOCK) +#define vx_rsspages_avail(m,p) vx_pages_avail(m, p, RLIMIT_RSS) + +/* file limits */ + +#define VX_DEBUG_ACC_FILE 0 +#define VX_DEBUG_ACC_OPENFD 0 + +#undef vxdprintk +#if (VX_DEBUG_ACC_FILE) || (VX_DEBUG_ACC_OPENFD) +#define vxdprintk(x...) printk("vxd: " x) +#else +#define vxdprintk(x...) 
+#endif + + +#define vx_acc_cres(v,d,r) \ + __vx_acc_cres((v), (r), (d), __FILE__, __LINE__) + +static inline void __vx_acc_cres(struct vx_info *vxi, + int res, int dir, char *file, int line) +{ + if (vxi) { + if ((res == RLIMIT_NOFILE && VX_DEBUG_ACC_FILE) || + (res == RLIMIT_OPENFD && VX_DEBUG_ACC_OPENFD)) + printk("vx_acc_cres[%5d,%2d]: %5d%s in %s:%d\n", + (vxi?vxi->vx_id:-1), res, + (vxi?atomic_read(&vxi->limit.res[res]):0), + (dir>0)?"++":"--", file, line); + if (dir > 0) + atomic_inc(&vxi->limit.res[res]); + else + atomic_dec(&vxi->limit.res[res]); + } +} + +#define vx_files_inc(f) vx_acc_cres(current->vx_info, 1, RLIMIT_NOFILE) +#define vx_files_dec(f) vx_acc_cres(current->vx_info,-1, RLIMIT_NOFILE) + +#define vx_openfd_inc(f) vx_acc_cres(current->vx_info, 1, RLIMIT_OPENFD) +#define vx_openfd_dec(f) vx_acc_cres(current->vx_info,-1, RLIMIT_OPENFD) + +#define vx_cres_avail(v,n,r) \ + __vx_cres_avail((v), (r), (n), __FILE__, __LINE__) + +static inline int __vx_cres_avail(struct vx_info *vxi, + int res, int num, char *file, int line) +{ + if ((res == RLIMIT_NOFILE && VX_DEBUG_ACC_FILE) || + (res == RLIMIT_OPENFD && VX_DEBUG_ACC_OPENFD)) + printk("vx_cres_avail[%5d,%2d]: %5ld > %5d + %5d in %s:%d\n", + (vxi?vxi->vx_id:-1), res, + (vxi?vxi->limit.rlim[res]:1), + (vxi?atomic_read(&vxi->limit.res[res]):0), + num, file, line); + if (!vxi) + return 1; + if (vxi->limit.rlim[res] == RLIM_INFINITY) + return 1; + if (vxi->limit.rlim[res] < atomic_read(&vxi->limit.res[res]) + num) + return 0; + return 1; +} + +#define vx_files_avail(n) \ + vx_cres_avail(current->vx_info, (n), RLIMIT_NOFILE) + +#define vx_openfd_avail(n) \ + vx_cres_avail(current->vx_info, (n), RLIMIT_OPENFD) + +/* socket limits */ + +#define vx_sock_inc(f) vx_acc_cres(current->vx_info, 1, VLIMIT_SOCK) +#define vx_sock_dec(f) vx_acc_cres(current->vx_info,-1, VLIMIT_SOCK) + +#define vx_sock_avail(n) \ + vx_cres_avail(current->vx_info, (n), VLIMIT_SOCK) + +/* procfs ioctls */ + +#define FIOC_GETXFLG 
_IOR('x', 5, long) +#define FIOC_SETXFLG _IOW('x', 6, long) + +/* utsname virtualization */ + +static inline struct new_utsname *vx_new_utsname(void) +{ + if (current->vx_info) + return &current->vx_info->cvirt.utsname; + return &system_utsname; +} + +#define vx_new_uts(x) ((vx_new_utsname())->x) + +/* generic flag merging */ + +#define vx_mask_flags(v,f,m) (((v) & ~(m)) | ((f) & (m))) + +#define vx_mask_mask(v,f,m) (((v) & ~(m)) | ((v) & (f) & (m))) + + +/* socket accounting */ + +#include + +static inline int vx_sock_type(int family) +{ + int type = 4; + + if (family > 0 && family < 3) + type = family; + else if (family == PF_INET6) + type = 3; + return type; +} + +#define vx_acc_sock(v,f,p,s) \ + __vx_acc_sock((v), (f), (p), (s), __FILE__, __LINE__) + +static inline void __vx_acc_sock(struct vx_info *vxi, + int family, int pos, int size, char *file, int line) +{ + if (vxi) { + int type = vx_sock_type(family); + + atomic_inc(&vxi->cacct.sock[type][pos].count); + atomic_add(size, &vxi->cacct.sock[type][pos].total); + } +} + +#define vx_sock_recv(sk,s) \ + vx_acc_sock((sk)->sk_vx_info, (sk)->sk_family, 0, (s)) +#define vx_sock_send(sk,s) \ + vx_acc_sock((sk)->sk_vx_info, (sk)->sk_family, 1, (s)) +#define vx_sock_fail(sk,s) \ + vx_acc_sock((sk)->sk_vx_info, (sk)->sk_family, 2, (s)) + + +#define sock_vx_init(s) do { \ + (s)->sk_xid = 0; \ + (s)->sk_vx_info = NULL; \ + } while (0) + + +/* pid faking stuff */ + + +#define vx_map_tgid(v,p) \ + __vx_map_tgid((v), (p), __FILE__, __LINE__) + +static inline int __vx_map_tgid(struct vx_info *vxi, int pid, + char *file, int line) +{ + if (vxi && __vx_flags(vxi->vx_flags, VXF_INFO_INIT, 0)) { + vxdprintk("vx_map_tgid: %p/%llx: %d -> %d in %s:%d\n", + vxi, vxi->vx_flags, pid, + (pid == vxi->vx_initpid)?1:pid, + file, line); + if (pid == vxi->vx_initpid) + return 1; + } + return pid; +} + +#define vx_rmap_tgid(v,p) \ + __vx_rmap_tgid((v), (p), __FILE__, __LINE__) + +static inline int __vx_rmap_tgid(struct vx_info *vxi, int pid, + char 
*file, int line) +{ + if (vxi && __vx_flags(vxi->vx_flags, VXF_INFO_INIT, 0)) { + vxdprintk("vx_rmap_tgid: %p/%llx: %d -> %d in %s:%d\n", + vxi, vxi->vx_flags, pid, + (pid == 1)?vxi->vx_initpid:pid, + file, line); + if ((pid == 1) && vxi->vx_initpid) + return vxi->vx_initpid; + } + return pid; +} + +#undef vxdprintk +#define vxdprintk(x...) + +#endif diff --git a/include/linux/vserver.h b/include/linux/vserver.h new file mode 100644 index 000000000..2c39ebbe0 --- /dev/null +++ b/include/linux/vserver.h @@ -0,0 +1,9 @@ +#ifndef _LINUX_VSERVER_H +#define _LINUX_VSERVER_H + +#include +#include +#include +#include + +#endif diff --git a/include/linux/vserver/context.h b/include/linux/vserver/context.h new file mode 100644 index 000000000..76926038e --- /dev/null +++ b/include/linux/vserver/context.h @@ -0,0 +1,176 @@ +#ifndef _VX_CONTEXT_H +#define _VX_CONTEXT_H + +#include + +#define MAX_S_CONTEXT 65535 /* Arbitrary limit */ +#define MIN_D_CONTEXT 49152 /* dynamic contexts start here */ + +#define VX_DYNAMIC_ID ((uint32_t)-1) /* id for dynamic context */ + +#ifdef __KERNEL__ + +#include +#include + +#define _VX_INFO_DEF_ +#include "cvirt.h" +#include "limit.h" +#include "sched.h" +#undef _VX_INFO_DEF_ + +struct vx_info { + struct list_head vx_list; /* linked list of contexts */ + xid_t vx_id; /* context id */ + atomic_t vx_refcount; /* refcount */ + struct vx_info *vx_parent; /* parent context */ + + struct namespace *vx_namespace; /* private namespace */ + struct fs_struct *vx_fs; /* private namespace fs */ + uint64_t vx_flags; /* VX_INFO_xxx */ + uint64_t vx_bcaps; /* bounding caps (system) */ + uint64_t vx_ccaps; /* context caps (vserver) */ + + pid_t vx_initpid; /* PID of fake init process */ + + struct _vx_limit limit; /* vserver limits */ + struct _vx_sched sched; /* vserver scheduler */ + struct _vx_cvirt cvirt; /* virtual/bias stuff */ + struct _vx_cacct cacct; /* context accounting */ + + char vx_name[65]; /* vserver name */ +}; + + +extern spinlock_t 
vxlist_lock; +extern struct list_head vx_infos; + + +#define VX_ADMIN 0x0001 +#define VX_WATCH 0x0002 +#define VX_DUMMY 0x0008 + +#define VX_IDENT 0x0010 +#define VX_EQUIV 0x0020 +#define VX_PARENT 0x0040 +#define VX_CHILD 0x0080 + +#define VX_ARG_MASK 0x00F0 + +#define VX_DYNAMIC 0x0100 +#define VX_STATIC 0x0200 + +#define VX_ATR_MASK 0x0F00 + + +void free_vx_info(struct vx_info *); + +extern struct vx_info *find_vx_info(int); +extern struct vx_info *find_or_create_vx_info(int); +extern int vx_info_id_valid(int); + +extern int vx_migrate_task(struct task_struct *, struct vx_info *); + +#endif /* __KERNEL__ */ + +#include "switch.h" + +/* vinfo commands */ + +#define VCMD_task_xid VC_CMD(VINFO, 1, 0) +#define VCMD_task_nid VC_CMD(VINFO, 2, 0) + +#ifdef __KERNEL__ +extern int vc_task_xid(uint32_t, void __user *); + +#endif /* __KERNEL__ */ + +#define VCMD_vx_info VC_CMD(VINFO, 5, 0) +#define VCMD_nx_info VC_CMD(VINFO, 6, 0) + +struct vcmd_vx_info_v0 { + uint32_t xid; + uint32_t initpid; + /* more to come */ +}; + +#ifdef __KERNEL__ +extern int vc_vx_info(uint32_t, void __user *); + +#endif /* __KERNEL__ */ + +#define VCMD_ctx_create VC_CMD(VPROC, 1, 0) +#define VCMD_ctx_migrate VC_CMD(PROCMIG, 1, 0) + +#ifdef __KERNEL__ +extern int vc_ctx_create(uint32_t, void __user *); +extern int vc_ctx_migrate(uint32_t, void __user *); + +#endif /* __KERNEL__ */ + +#define VCMD_get_cflags VC_CMD(FLAGS, 1, 0) +#define VCMD_set_cflags VC_CMD(FLAGS, 2, 0) + +struct vcmd_ctx_flags_v0 { + uint64_t flagword; + uint64_t mask; +}; + +#ifdef __KERNEL__ +extern int vc_get_cflags(uint32_t, void __user *); +extern int vc_set_cflags(uint32_t, void __user *); + +#endif /* __KERNEL__ */ + +#define VXF_INFO_LOCK 0x00000001 +#define VXF_INFO_SCHED 0x00000002 +#define VXF_INFO_NPROC 0x00000004 +#define VXF_INFO_PRIVATE 0x00000008 + +#define VXF_INFO_INIT 0x00000010 +#define VXF_INFO_HIDE 0x00000020 +#define VXF_INFO_ULIMIT 0x00000040 +#define VXF_INFO_NSPACE 0x00000080 + +#define VXF_SCHED_HARD 
0x00000100 +#define VXF_SCHED_PRIO 0x00000200 +#define VXF_SCHED_PAUSE 0x00000400 + +#define VXF_VIRT_MEM 0x00010000 +#define VXF_VIRT_UPTIME 0x00020000 +#define VXF_VIRT_CPU 0x00040000 + +#define VXF_HIDE_MOUNT 0x01000000 +#define VXF_HIDE_NETIF 0x02000000 + +#define VXF_STATE_SETUP (1ULL<<32) +#define VXF_STATE_INIT (1ULL<<33) + +#define VXF_FORK_RSS (1ULL<<48) + +#define VXF_ONE_TIME (0x0003ULL<<32) + +#define VCMD_get_ccaps VC_CMD(FLAGS, 3, 0) +#define VCMD_set_ccaps VC_CMD(FLAGS, 4, 0) + +struct vcmd_ctx_caps_v0 { + uint64_t bcaps; + uint64_t ccaps; + uint64_t cmask; +}; + +#ifdef __KERNEL__ +extern int vc_get_ccaps(uint32_t, void __user *); +extern int vc_set_ccaps(uint32_t, void __user *); + +#endif /* __KERNEL__ */ + +#define VXC_SET_UTSNAME 0x00000001 +#define VXC_SET_RLIMIT 0x00000002 + +#define VXC_ICMP_PING 0x00000100 + +#define VXC_SECURE_MOUNT 0x00010000 + + +#endif /* _VX_CONTEXT_H */ diff --git a/include/linux/vserver/cvirt.h b/include/linux/vserver/cvirt.h new file mode 100644 index 000000000..ba3a25356 --- /dev/null +++ b/include/linux/vserver/cvirt.h @@ -0,0 +1,133 @@ +#if defined(__KERNEL__) && defined(_VX_INFO_DEF_) + +#include +#include +#include +#include +#include + +/* context sub struct */ + +struct _vx_cvirt { + int max_threads; + + unsigned int bias_cswtch; + struct timespec bias_idle; + struct timespec bias_tp; + uint64_t bias_jiffies; + + struct new_utsname utsname; +}; + +struct sock_acc { + atomic_t count; + atomic_t total; +}; + +struct _vx_cacct { + atomic_t nr_threads; + int nr_running; + + unsigned long total_forks; + + struct sock_acc sock[5][3]; +}; + + +static inline long vx_sock_count(struct _vx_cacct *cacct, int type, int pos) +{ + return atomic_read(&cacct->sock[type][pos].count); +} + + +static inline long vx_sock_total(struct _vx_cacct *cacct, int type, int pos) +{ + return atomic_read(&cacct->sock[type][pos].total); +} + + +extern uint64_t vx_idle_jiffies(void); + +static inline void vx_info_init_cvirt(struct _vx_cvirt 
*cvirt) +{ + uint64_t idle_jiffies = vx_idle_jiffies(); + + // new->virt.bias_cswtch = kstat.context_swtch; + cvirt->bias_jiffies = get_jiffies_64(); + + jiffies_to_timespec(idle_jiffies, &cvirt->bias_idle); + do_posix_clock_monotonic_gettime(&cvirt->bias_tp); + + down_read(&uts_sem); + cvirt->utsname = system_utsname; + up_read(&uts_sem); +} + +static inline void vx_info_exit_cvirt(struct _vx_cvirt *cvirt) +{ + return; +} + +static inline void vx_info_init_cacct(struct _vx_cacct *cacct) +{ + int i,j; + + atomic_set(&cacct->nr_threads, 1); + for (i=0; i<5; i++) { + for (j=0; j<3; j++) { + atomic_set(&cacct->sock[i][j].count, 0); + atomic_set(&cacct->sock[i][j].total, 0); + } + } +} + +static inline void vx_info_exit_cacct(struct _vx_cacct *cacct) +{ + return; +} + +static inline int vx_info_proc_cvirt(struct _vx_cvirt *cvirt, char *buffer) +{ + int length = 0; + return length; +} + +static inline int vx_info_proc_cacct(struct _vx_cacct *cacct, char *buffer) +{ + int i,j, length = 0; + static char *type[] = { "UNSPEC", "UNIX", "INET", "INET6", "OTHER" }; + + for (i=0; i<5; i++) { + length += sprintf(buffer + length, + "%s:", type[i]); + for (j=0; j<3; j++) { + length += sprintf(buffer + length, + "\t%12lu/%-12lu" + ,vx_sock_count(cacct, i, j) + ,vx_sock_total(cacct, i, j) + ); + } + buffer[length++] = '\n'; + } + return length; +} + +#else /* _VX_INFO_DEF_ */ +#ifndef _VX_CVIRT_H +#define _VX_CVIRT_H + +#include "switch.h" + +/* cvirt vserver commands */ + + +#ifdef __KERNEL__ + +struct timespec; + +void vx_vsi_uptime(struct timespec *uptime, struct timespec *idle); + +#endif /* __KERNEL__ */ + +#endif /* _VX_CVIRT_H */ +#endif diff --git a/include/linux/vserver/inode.h b/include/linux/vserver/inode.h new file mode 100644 index 000000000..aa8852f43 --- /dev/null +++ b/include/linux/vserver/inode.h @@ -0,0 +1,67 @@ +#ifndef _VX_INODE_H +#define _VX_INODE_H + +#include "switch.h" + +/* inode vserver commands */ + +#define VCMD_get_iattr_v0 VC_CMD(INODE, 1, 0) +#define 
VCMD_set_iattr_v0 VC_CMD(INODE, 2, 0) + +#define VCMD_get_iattr VC_CMD(INODE, 1, 1) +#define VCMD_set_iattr VC_CMD(INODE, 2, 1) + +struct vcmd_ctx_iattr_v0 { + /* device handle in id */ + uint64_t ino; + uint32_t xid; + uint32_t flags; + uint32_t mask; +}; + +struct vcmd_ctx_iattr_v1 { + const char __user *name; + uint32_t xid; + uint32_t flags; + uint32_t mask; +}; + + +#define IATTR_XID 0x01000000 + +#define IATTR_ADMIN 0x00000001 +#define IATTR_WATCH 0x00000002 +#define IATTR_HIDE 0x00000004 +#define IATTR_FLAGS 0x00000007 + +#define IATTR_BARRIER 0x00010000 +#define IATTR_IUNLINK 0x00020000 +#define IATTR_IMMUTABLE 0x00040000 + + +#ifdef CONFIG_PROC_SECURE +#define IATTR_PROC_DEFAULT ( IATTR_ADMIN | IATTR_HIDE ) +#define IATTR_PROC_SYMLINK ( IATTR_ADMIN ) +#else +#define IATTR_PROC_DEFAULT ( IATTR_ADMIN ) +#define IATTR_PROC_SYMLINK ( IATTR_ADMIN ) +#endif + +#ifdef __KERNEL__ + +#define vx_hide_check(c,m) (((m) & IATTR_HIDE) ? vx_check(c,m) : 1) + +extern int vc_get_iattr_v0(uint32_t, void __user *); +extern int vc_set_iattr_v0(uint32_t, void __user *); + +extern int vc_get_iattr(uint32_t, void __user *); +extern int vc_set_iattr(uint32_t, void __user *); + +#endif /* __KERNEL__ */ + +/* inode ioctls */ + +#define FIOC_GETXFLG _IOR('x', 5, long) +#define FIOC_SETXFLG _IOW('x', 6, long) + +#endif /* _VX_INODE_H */ diff --git a/include/linux/vserver/legacy.h b/include/linux/vserver/legacy.h new file mode 100644 index 000000000..1372c0fa6 --- /dev/null +++ b/include/linux/vserver/legacy.h @@ -0,0 +1,54 @@ +#ifndef _VX_LEGACY_H +#define _VX_LEGACY_H + +#include "switch.h" +#include "network.h" + +/* compatibiliy vserver commands */ + +#define VCMD_new_s_context VC_CMD(COMPAT, 1, 1) +#define VCMD_set_ipv4root VC_CMD(COMPAT, 2, 3) + +#define VCMD_create_context VC_CMD(VSETUP, 1, 0) + +/* compatibiliy vserver arguments */ + +struct vcmd_new_s_context_v1 { + uint32_t remove_cap; + uint32_t flags; +}; + +struct vcmd_set_ipv4root_v3 { + /* number of pairs in id */ + 
uint32_t broadcast; + struct { + uint32_t ip; + uint32_t mask; + } nx_mask_pair[NB_IPV4ROOT]; +}; + + +#define VX_INFO_LOCK 1 /* Can't request a new vx_id */ +#define VX_INFO_NPROC 4 /* Limit number of processes in a context */ +#define VX_INFO_PRIVATE 8 /* No one can join this security context */ +#define VX_INFO_INIT 16 /* This process wants to become the */ + /* logical process 1 of the security */ + /* context */ +#define VX_INFO_HIDEINFO 32 /* Hide some information in /proc */ +#define VX_INFO_ULIMIT 64 /* Use ulimit of the current process */ + /* to become the global limits */ + /* of the context */ +#define VX_INFO_NAMESPACE 128 /* save private namespace */ + + +#define NB_S_CONTEXT 16 + +#define NB_IPV4ROOT 16 + + +#ifdef __KERNEL__ +extern int vc_new_s_context(uint32_t, void __user *); +extern int vc_set_ipv4root(uint32_t, void __user *); + +#endif /* __KERNEL__ */ +#endif /* _VX_LEGACY_H */ diff --git a/include/linux/vserver/limit.h b/include/linux/vserver/limit.h new file mode 100644 index 000000000..27496c1f2 --- /dev/null +++ b/include/linux/vserver/limit.h @@ -0,0 +1,117 @@ +#if defined(__KERNEL__) && defined(_VX_INFO_DEF_) + +#include +#include + +/* context sub struct */ + +#define RLIMIT_OPENFD 12 + +#define NUM_RLIMITS 16 + +#define VLIMIT_SOCK 16 + + +struct _vx_limit { + atomic_t ticks; + + unsigned long rlim[NUM_RLIMITS]; /* Per context limit */ + atomic_t res[NUM_RLIMITS]; /* Current value */ +}; + +static inline void vx_info_init_limit(struct _vx_limit *limit) +{ + int lim; + + for (lim=0; lim<NUM_RLIMITS; lim++) { + limit->rlim[lim] = RLIM_INFINITY; + atomic_set(&limit->res[lim], 0); + } +} + +extern unsigned int vx_debug_limit; + +static inline void vx_info_exit_limit(struct _vx_limit *limit) +{ + int lim, value; + + for (lim=0; lim<NUM_RLIMITS; lim++) { + value = atomic_read(&limit->res[lim]); + if (value && vx_debug_limit) + printk("!!! 
limit: %p[%d] = %d on exit.\n", + limit, lim, value); + } +} + + +static inline int vx_info_proc_limit(struct _vx_limit *limit, char *buffer) +{ + return sprintf(buffer, + "PROC:\t%8d/%ld\n" + "VM:\t%8d/%ld\n" + "VML:\t%8d/%ld\n" + "RSS:\t%8d/%ld\n" + "FILES:\t%8d/%ld\n" + "OFD:\t%8d/%ld\n" + ,atomic_read(&limit->res[RLIMIT_NPROC]) + ,limit->rlim[RLIMIT_NPROC] + ,atomic_read(&limit->res[RLIMIT_AS]) + ,limit->rlim[RLIMIT_AS] + ,atomic_read(&limit->res[RLIMIT_MEMLOCK]) + ,limit->rlim[RLIMIT_MEMLOCK] + ,atomic_read(&limit->res[RLIMIT_RSS]) + ,limit->rlim[RLIMIT_RSS] + ,atomic_read(&limit->res[RLIMIT_NOFILE]) + ,limit->rlim[RLIMIT_NOFILE] + ,atomic_read(&limit->res[RLIMIT_OPENFD]) + ,limit->rlim[RLIMIT_OPENFD] + ); +} + +#else /* _VX_INFO_DEF_ */ +#ifndef _VX_LIMIT_H +#define _VX_LIMIT_H + +#include "switch.h" + +/* rlimit vserver commands */ + +#define VCMD_get_rlimit VC_CMD(RLIMIT, 1, 0) +#define VCMD_set_rlimit VC_CMD(RLIMIT, 2, 0) +#define VCMD_get_rlimit_mask VC_CMD(RLIMIT, 3, 0) + +struct vcmd_ctx_rlimit_v0 { + uint32_t id; + uint64_t minimum; + uint64_t softlimit; + uint64_t maximum; +}; + +struct vcmd_ctx_rlimit_mask_v0 { + uint32_t minimum; + uint32_t softlimit; + uint32_t maximum; +}; + +#define CRLIM_UNSET (0ULL) +#define CRLIM_INFINITY (~0ULL) +#define CRLIM_KEEP (~1ULL) + +#ifdef __KERNEL__ + +#include + +extern int vc_get_rlimit(uint32_t, void __user *); +extern int vc_set_rlimit(uint32_t, void __user *); +extern int vc_get_rlimit_mask(uint32_t, void __user *); + +struct sysinfo; + +void vx_vsi_meminfo(struct sysinfo *); +void vx_vsi_swapinfo(struct sysinfo *); + + +#endif /* __KERNEL__ */ + +#endif /* _VX_LIMIT_H */ +#endif diff --git a/include/linux/vserver/namespace.h b/include/linux/vserver/namespace.h new file mode 100644 index 000000000..140fc79f2 --- /dev/null +++ b/include/linux/vserver/namespace.h @@ -0,0 +1,55 @@ +#ifndef _VX_NAMESPACE_H +#define _VX_NAMESPACE_H + +#include + + +/* virtual host info names */ + +#define VCMD_vx_set_vhi_name 
VC_CMD(VHOST, 1, 0) +#define VCMD_vx_get_vhi_name VC_CMD(VHOST, 2, 0) + +struct vcmd_vx_vhi_name_v0 { + uint32_t field; + char name[65]; +}; + + +enum vx_vhi_name_field { + VHIN_CONTEXT=0, + VHIN_SYSNAME, + VHIN_NODENAME, + VHIN_RELEASE, + VHIN_VERSION, + VHIN_MACHINE, + VHIN_DOMAINNAME, +}; + + +#ifdef __KERNEL__ + +#include + +extern int vc_set_vhi_name(uint32_t, void __user *); +extern int vc_get_vhi_name(uint32_t, void __user *); + +#endif /* __KERNEL__ */ + +#define VCMD_enter_namespace VC_CMD(PROCALT, 1, 0) +#define VCMD_cleanup_namespace VC_CMD(PROCALT, 2, 0) +#define VCMD_set_namespace VC_CMD(PROCALT, 3, 0) + +#ifdef __KERNEL__ + +struct vx_info; +struct namespace; +struct fs_struct; + +extern int vx_set_namespace(struct vx_info *, struct namespace *, struct fs_struct *); + +extern int vc_enter_namespace(uint32_t, void __user *); +extern int vc_cleanup_namespace(uint32_t, void __user *); +extern int vc_set_namespace(uint32_t, void __user *); + +#endif /* __KERNEL__ */ +#endif /* _VX_NAMESPACE_H */ diff --git a/include/linux/vserver/network.h b/include/linux/vserver/network.h new file mode 100644 index 000000000..b3c39b062 --- /dev/null +++ b/include/linux/vserver/network.h @@ -0,0 +1,142 @@ +#ifndef _VX_NETWORK_H +#define _VX_NETWORK_H + +#define MAX_N_CONTEXT 65535 /* Arbitrary limit */ + +#define IP_DYNAMIC_ID ((uint32_t)-1) /* id for dynamic context */ + +#define NB_IPV4ROOT 16 + +#ifdef __KERNEL__ + +#include +#include +#include +#include +#include + + +struct nx_info { + struct list_head nx_list; /* linked list of nxinfos */ + nid_t nx_id; /* vnet id */ + atomic_t nx_refcount; + + uint64_t nx_flags; /* network flag word */ + uint64_t nx_ncaps; /* network capabilities */ + + int nbipv4; + __u32 ipv4[NB_IPV4ROOT]; /* Process can only bind to these IPs */ + /* The first one is used to connect */ + /* and for bind any service */ + /* The other must be used explicity */ + __u32 mask[NB_IPV4ROOT]; /* Netmask for each ipv4 */ + /* Used to select the proper 
source */ + /* address for sockets */ + __u32 v4_bcast; /* Broadcast address to receive UDP */ + + char nx_name[65]; /* network context name */ +}; + + +extern spinlock_t nxlist_lock; +extern struct list_head nx_infos; + + +void free_nx_info(struct nx_info *); +struct nx_info *create_nx_info(void); + +extern struct nx_info *find_nx_info(int); +extern int nx_info_id_valid(int); + +struct in_ifaddr; +struct net_device; + +int ifa_in_nx_info(struct in_ifaddr *, struct nx_info *); +int dev_in_nx_info(struct net_device *, struct nx_info *); + + +#endif /* __KERNEL__ */ + +#include "switch.h" + +/* vinfo commands */ + +#define VCMD_task_nid VC_CMD(VINFO, 2, 0) + +#ifdef __KERNEL__ +extern int vc_task_nid(uint32_t, void __user *); + +#endif /* __KERNEL__ */ + +#define VCMD_nx_info VC_CMD(VINFO, 6, 0) + +struct vcmd_nx_info_v0 { + uint32_t nid; + /* more to come */ +}; + +#ifdef __KERNEL__ +extern int vc_nx_info(uint32_t, void __user *); + +#endif /* __KERNEL__ */ + +#define VCMD_net_create VC_CMD(VNET, 1, 0) +#define VCMD_net_migrate VC_CMD(NETMIG, 1, 0) + +#define VCMD_net_add VC_CMD(NETALT, 1, 0) +#define VCMD_net_remove VC_CMD(NETALT, 2, 0) + +struct vcmd_net_nx_v0 { + uint16_t type; + uint16_t count; + uint32_t ip[4]; + uint32_t mask[4]; + /* more to come */ +}; + +// IPN_TYPE_IPV4 + + +#ifdef __KERNEL__ +extern int vc_net_create(uint32_t, void __user *); +extern int vc_net_migrate(uint32_t, void __user *); + +#endif /* __KERNEL__ */ + +#define VCMD_get_nflags VC_CMD(FLAGS, 5, 0) +#define VCMD_set_nflags VC_CMD(FLAGS, 6, 0) + +struct vcmd_net_flags_v0 { + uint64_t flagword; + uint64_t mask; +}; + +#ifdef __KERNEL__ +extern int vc_get_nflags(uint32_t, void __user *); +extern int vc_set_nflags(uint32_t, void __user *); + +#endif /* __KERNEL__ */ + +#define IPF_STATE_SETUP (1ULL<<32) + + +#define IPF_ONE_TIME (0x0001ULL<<32) + +#define VCMD_get_ncaps VC_CMD(FLAGS, 7, 0) +#define VCMD_set_ncaps VC_CMD(FLAGS, 8, 0) + +struct vcmd_net_caps_v0 { + uint64_t ncaps; + uint64_t 
cmask; +}; + +#ifdef __KERNEL__ +extern int vc_get_ncaps(uint32_t, void __user *); +extern int vc_set_ncaps(uint32_t, void __user *); + +#endif /* __KERNEL__ */ + +#define IPC_WOSSNAME 0x00000001 + + +#endif /* _VX_NETWORK_H */ diff --git a/include/linux/vserver/sched.h b/include/linux/vserver/sched.h new file mode 100644 index 000000000..d1a206800 --- /dev/null +++ b/include/linux/vserver/sched.h @@ -0,0 +1,139 @@ +#if defined(__KERNEL__) && defined(_VX_INFO_DEF_) + +#include +#include +#include +#include +#include + +/* context sub struct */ + +struct _vx_sched { + spinlock_t tokens_lock; /* lock for this structure */ + + int fill_rate; /* Fill rate: add X tokens... */ + int interval; /* Divisor: per Y jiffies */ + atomic_t tokens; /* number of CPU tokens in this context */ + int tokens_min; /* Limit: minimum for unhold */ + int tokens_max; /* Limit: no more than N tokens */ + uint32_t jiffies; /* add an integral multiple of Y to this */ + + uint64_t ticks; /* token tick events */ + cpumask_t cpus_allowed; /* cpu mask for context */ +}; + +static inline void vx_info_init_sched(struct _vx_sched *sched) +{ + /* scheduling; hard code starting values as constants */ + sched->fill_rate = 1; + sched->interval = 4; + sched->tokens_min = HZ >> 4; + sched->tokens_max = HZ >> 1; + sched->jiffies = jiffies; + sched->tokens_lock = SPIN_LOCK_UNLOCKED; + + atomic_set(&sched->tokens, HZ >> 2); + sched->cpus_allowed = CPU_MASK_ALL; +} + +static inline void vx_info_exit_sched(struct _vx_sched *sched) +{ + return; +} + +static inline int vx_info_proc_sched(struct _vx_sched *sched, char *buffer) +{ + return sprintf(buffer, + "Ticks:\t%16lld\n" + "Token:\t\t%8d\n" + "FillRate:\t%8d\n" + "Interval:\t%8d\n" + "TokensMin:\t%8d\n" + "TokensMax:\t%8d\n" + ,sched->ticks + ,atomic_read(&sched->tokens) + ,sched->fill_rate + ,sched->interval + ,sched->tokens_min + ,sched->tokens_max + ); +} + + +#else /* _VX_INFO_DEF_ */ +#ifndef _VX_SCHED_H +#define _VX_SCHED_H + +#include "switch.h" + +/* 
sched vserver commands */ + +#define VCMD_set_sched VC_CMD(SCHED, 1, 2) + +struct vcmd_set_sched_v2 { + int32_t fill_rate; + int32_t interval; + int32_t tokens; + int32_t tokens_min; + int32_t tokens_max; + uint64_t cpu_mask; +}; + +#define SCHED_KEEP (-2) + +#ifdef __KERNEL__ + +extern int vc_set_sched_v1(uint32_t, void __user *); +extern int vc_set_sched(uint32_t, void __user *); + + +#define VAVAVOOM_RATIO 50 + +#include "context.h" + + +/* scheduling stuff */ + +int effective_vavavoom(struct task_struct *, int); + +int vx_tokens_recalc(struct vx_info *); + +/* new stuff ;) */ + +static inline int vx_tokens_avail(struct vx_info *vxi) +{ + return atomic_read(&vxi->sched.tokens); +} + +static inline void vx_consume_token(struct vx_info *vxi) +{ + atomic_dec(&vxi->sched.tokens); +} + +static inline int vx_need_resched(struct task_struct *p) +{ +#ifdef CONFIG_VSERVER_HARDCPU + struct vx_info *vxi = p->vx_info; + + if (vxi) { + int tokens; + + p->time_slice--; + if (atomic_read(&vxi->vx_refcount) < 1) + printk("need_resched: p=%p, s=%ld, ref=%d, id=%d/%d\n", + p, p->state, atomic_read(&vxi->vx_refcount), + vxi->vx_id, p->xid); + if ((tokens = vx_tokens_avail(vxi)) > 0) + vx_consume_token(vxi); + return ((p->time_slice == 0) || (tokens < 1)); + } +#endif + p->time_slice--; + return (p->time_slice == 0); +} + + +#endif /* __KERNEL__ */ + +#endif /* _VX_SCHED_H */ +#endif diff --git a/include/linux/vserver/signal.h b/include/linux/vserver/signal.h new file mode 100644 index 000000000..391112768 --- /dev/null +++ b/include/linux/vserver/signal.h @@ -0,0 +1,19 @@ +#ifndef _VX_SIGNAL_H +#define _VX_SIGNAL_H + +#include "switch.h" + +/* context signalling */ + +#define VCMD_ctx_kill VC_CMD(PROCTRL, 1, 0) + +struct vcmd_ctx_kill_v0 { + int32_t pid; + int32_t sig; +}; + +#ifdef __KERNEL__ +extern int vc_ctx_kill(uint32_t, void __user *); + +#endif /* __KERNEL__ */ +#endif /* _VX_SIGNAL_H */ diff --git a/include/linux/vserver/switch.h b/include/linux/vserver/switch.h new file 
/*
 * Layout of a vserver command word, as encoded by VC_CMD() below:
 *
 *   bits 24..29  category  (6 bit,  mask 0x3F)
 *   bits 16..23  command   (8 bit,  mask 0xFF)
 *   bits  0..11  version   (12 bit, mask 0xFFF)
 *
 * The three accessors extract one field each from a command word.
 */
#define VC_CATEGORY(c)		(((c) >> 24) & 0x3F)
#define VC_COMMAND(c)		(((c) >> 16) & 0xFF)
#define VC_VERSION(c)		((c) & 0xFFF)

/*
 * Build a command word.  'c' is the category name without its VC_CAT_
 * prefix (it is token-pasted onto VC_CAT_), 'i' is the command index
 * within that category and 'v' is the command version.
 */
#define VC_CMD(c,i,v)		((((VC_CAT_ ## c) & 0x3F) << 24) \
				| (((i) & 0xFF) << 16) | ((v) & 0xFFF))
VC_CAT_VSETUP 1 +#define VC_CAT_VHOST 2 + +#define VC_CAT_VPROC 9 +#define VC_CAT_PROCALT 10 +#define VC_CAT_PROCMIG 11 +#define VC_CAT_PROCTRL 12 + +#define VC_CAT_SCHED 14 + +#define VC_CAT_VNET 25 +#define VC_CAT_NETALT 26 +#define VC_CAT_NETMIG 27 +#define VC_CAT_NETCTRL 28 + +#define VC_CAT_INODE 38 + +#define VC_CAT_VINFO 46 + +#define VC_CAT_FLAGS 52 +#define VC_CAT_RLIMIT 60 + +#define VC_CAT_SYSTEST 61 +#define VC_CAT_COMPAT 63 + +/* interface version */ + +#define VCI_VERSION 0x00010016 + + +/* query version */ + +#define VCMD_get_version VC_CMD(VERSION, 0, 0) + + +#ifdef __KERNEL__ + +#include + +#define ENOTSUP -EOPNOTSUPP + +#else /* __KERNEL__ */ +#define __user +#endif /* __KERNEL__ */ + +#endif /* _VX_SWITCH_H */ diff --git a/include/linux/vserver/xid.h b/include/linux/vserver/xid.h new file mode 100644 index 000000000..ba52c2588 --- /dev/null +++ b/include/linux/vserver/xid.h @@ -0,0 +1,94 @@ +#ifndef _LINUX_XID_H_ +#define _LINUX_XID_H_ + +#ifdef CONFIG_INOXID_NONE + +#define MAX_UID 0xFFFFFFFF +#define MAX_GID 0xFFFFFFFF + +#define INOXID_XID(uid, gid, xid) (0) + +#define XIDINO_UID(uid, xid) (uid) +#define XIDINO_GID(gid, xid) (gid) + +#endif + + +#ifdef CONFIG_INOXID_GID16 + +#define MAX_UID 0xFFFFFFFF +#define MAX_GID 0x0000FFFF + +#define INOXID_XID(uid, gid, xid) (((gid) >> 16) & 0xFFFF) + +#define XIDINO_UID(uid, xid) (uid) +#define XIDINO_GID(gid, xid) (((gid) & 0xFFFF) | ((xid) << 16)) + + +#endif + + +#ifdef CONFIG_INOXID_GID24 + +#define MAX_UID 0x00FFFFFF +#define MAX_GID 0x00FFFFFF + +#define INOXID_XID(uid, gid, xid) ((((uid) >> 16) & 0xFF00) | (((gid) >> 24) & 0xFF)) + +#define XIDINO_UID(uid, xid) (((uid) & 0xFFFFFF) | (((xid) & 0xFF00) << 16)) +#define XIDINO_GID(gid, xid) (((gid) & 0xFFFFFF) | (((xid) & 0x00FF) << 24)) + +#endif + + +#ifdef CONFIG_INOXID_GID32 + +#define MAX_UID 0xFFFFFFFF +#define MAX_GID 0xFFFFFFFF + +#define INOXID_XID(uid, gid, xid) (xid) + +#define XIDINO_UID(uid, xid) (uid) +#define XIDINO_GID(gid, xid) 
(gid) + +#endif + + +#ifdef CONFIG_INOXID_RUNTIME + +#define MAX_UID 0xFFFFFFFF +#define MAX_GID 0xFFFFFFFF + +#define INOXID_XID(uid, gid, xid) (0) + +#define XIDINO_UID(uid, xid) (uid) +#define XIDINO_GID(gid, xid) (gid) + +#endif + + +#define INOXID_UID(uid, gid) ((uid) & MAX_UID) +#define INOXID_GID(uid, gid) ((gid) & MAX_GID) + +static inline uid_t vx_map_uid(uid_t uid) +{ + if ((uid > MAX_UID) && (uid != -1)) + uid = -2; + return (uid & MAX_UID); +} + +static inline gid_t vx_map_gid(gid_t gid) +{ + if ((gid > MAX_GID) && (gid != -1)) + gid = -2; + return (gid & MAX_GID); +} + + +#ifdef CONFIG_VSERVER_LEGACY +#define FIOC_GETXID _IOR('x', 1, long) +#define FIOC_SETXID _IOW('x', 2, long) +#define FIOC_SETXIDJ _IOW('x', 3, long) +#endif + +#endif /* _LINUX_XID_H_ */ diff --git a/kernel/vserver/Kconfig b/kernel/vserver/Kconfig new file mode 100644 index 000000000..635d8d488 --- /dev/null +++ b/kernel/vserver/Kconfig @@ -0,0 +1,72 @@ +# +# Linux VServer configuration +# + +menu "Linux VServer" + +config VSERVER_LEGACY + bool "Enable Legacy Kernel API" + default y + help + This enables the legacy API used in vs1.xx, which allows + to use older tools (for migration purposes). + +config PROC_SECURE + bool "Enable Proc Security" + depends on PROC_FS + default y + help + Hide proc entries by default for xid>1 + +config VSERVER_HARDCPU + bool "Enable Hard CPU Limits" + depends on EXPERIMENTAL + default n + help + Activate the Hard CPU Limits + +choice + prompt "Persistent Inode Context Tagging" + default INOXID_GID24 + help + This adds persistent context information to filesystems + mounted with the tagxid option. Tagging is a requirement + for per context disk limits and per context quota. + + +config INOXID_NONE + bool "Disabled" + help + no context information is store for inodes + +config INOXID_GID16 + bool "UID32/GID16" + help + reduces GID to 16 bit, but leaves UID at 32 bit. 
# The symbol is called INOXID_RUNTIME because include/linux/vserver/xid.h
# selects this mode with "#ifdef CONFIG_INOXID_RUNTIME"; with any other
# name the "Runtime" choice leaves MAX_UID/MAX_GID undefined.
# NOTE(review): confirm that nothing else tests CONFIG_INOXID_MAGIC.
config	INOXID_RUNTIME
	bool	"Runtime"
	depends on EXPERIMENTAL
	help
	  inodes are tagged when first accessed, this doesn't
	  require any persistent information, but might give
	  funny results for mixed access.
vxdprintk("alloc_vx_info(%d)\n", id); + /* would this benefit from a slab cache? */ + new = kmalloc(sizeof(struct vx_info), GFP_KERNEL); + if (!new) + return 0; + + memset (new, 0, sizeof(struct vx_info)); + new->vx_id = id; + INIT_LIST_HEAD(&new->vx_list); + /* rest of init goes here */ + + vx_info_init_limit(&new->limit); + vx_info_init_sched(&new->sched); + vx_info_init_cvirt(&new->cvirt); + vx_info_init_cacct(&new->cacct); + + new->vx_flags = VXF_STATE_SETUP|VXF_STATE_INIT; + new->vx_bcaps = CAP_INIT_EFF_SET; + new->vx_ccaps = 0; + + vxdprintk("alloc_vx_info(%d) = %p\n", id, new); + return new; +} + +void free_vx_info(struct vx_info *vxi) +{ + vxdprintk("free_vx_info(%p)\n", vxi); + if (vxi->vx_namespace) + put_namespace(vxi->vx_namespace); + if (vxi->vx_fs) + put_fs_struct(vxi->vx_fs); + + vx_info_exit_limit(&vxi->limit); + vx_info_exit_sched(&vxi->sched); + vx_info_exit_cvirt(&vxi->cvirt); + vx_info_exit_cacct(&vxi->cacct); + + BUG_ON(atomic_read(&vxi->vx_refcount)); + vxi->vx_id = -1; + + kfree(vxi); +} + + +/* + * struct vx_info search by id + * assumes vxlist_lock is held + */ + +static __inline__ struct vx_info *__find_vx_info(int id) +{ + struct vx_info *vxi; + + list_for_each_entry(vxi, &vx_infos, vx_list) + if (vxi->vx_id == id) + return vxi; + return 0; +} + + +/* + * struct vx_info ref stuff + */ + +struct vx_info *find_vx_info(int id) +{ + struct vx_info *vxi; + + if (id < 0) { + vxi = current->vx_info; + get_vx_info(vxi); + } else { + spin_lock(&vxlist_lock); + if ((vxi = __find_vx_info(id))) + get_vx_info(vxi); + spin_unlock(&vxlist_lock); + } + return vxi; +} + +/* + * verify that id is a valid xid + */ + +int vx_info_id_valid(int id) +{ + int valid; + + spin_lock(&vxlist_lock); + valid = (__find_vx_info(id) != NULL); + spin_unlock(&vxlist_lock); + return valid; +} + + +/* + * dynamic context id ... 
+ */ + +static __inline__ xid_t __vx_dynamic_id(void) +{ + static xid_t seq = MAX_S_CONTEXT; + xid_t barrier = seq; + + do { + if (++seq > MAX_S_CONTEXT) + seq = MIN_D_CONTEXT; + if (!__find_vx_info(seq)) + return seq; + } while (barrier != seq); + return 0; +} + +static struct vx_info * __foc_vx_info(int id, int *err) +{ + struct vx_info *new, *vxi = NULL; + + vxdprintk("foc_vx_info(%d)\n", id); + if (!(new = alloc_vx_info(id))) { + *err = -ENOMEM; + return NULL; + } + + /* dirty hack until Spectator becomes a cap */ + if (id == 0 || id == 1) { + *err = -EBUSY; + return NULL; + } + + spin_lock(&vxlist_lock); + + /* dynamic context requested */ + if (id == VX_DYNAMIC_ID) { + id = __vx_dynamic_id(); + if (!id) { + printk(KERN_ERR "no dynamic context available.\n"); + goto out_unlock; + } + new->vx_id = id; + } + /* existing context requested */ + else if ((vxi = __find_vx_info(id))) { + /* context in setup is not available */ + if (vxi->vx_flags & VXF_STATE_SETUP) { + vxdprintk("foc_vx_info(%d) = %p (not available)\n", id, vxi); + vxi = NULL; + *err = -EBUSY; + } else { + vxdprintk("foc_vx_info(%d) = %p (found)\n", id, vxi); + get_vx_info(vxi); + *err = 0; + } + goto out_unlock; + } + + /* new context requested */ + vxdprintk("foc_vx_info(%d) = %p (new)\n", id, new); + atomic_set(&new->vx_refcount, 1); + list_add(&new->vx_list, &vx_infos); + vxi = new, new = NULL; + *err = 1; + +out_unlock: + spin_unlock(&vxlist_lock); + if (new) + free_vx_info(new); + return vxi; +} + + +struct vx_info *find_or_create_vx_info(int id) +{ + int err; + + return __foc_vx_info(id, &err); +} + + +int vx_migrate_user(struct task_struct *p, struct vx_info *vxi) +{ + struct user_struct *new_user, *old_user; + + if (!p || !vxi) + BUG(); + new_user = alloc_uid(vxi->vx_id, p->uid); + if (!new_user) + return -ENOMEM; + + old_user = p->user; + if (new_user != old_user) { + atomic_inc(&new_user->processes); + atomic_dec(&old_user->processes); + p->user = new_user; + } + free_uid(old_user); + 
return 0; +} + +void vx_mask_bcaps(struct task_struct *p) +{ + struct vx_info *vxi = p->vx_info; + + p->cap_effective &= vxi->vx_bcaps; + p->cap_inheritable &= vxi->vx_bcaps; + p->cap_permitted &= vxi->vx_bcaps; +} + + +#include + +static inline int vx_nofiles_task(struct task_struct *tsk) +{ + struct files_struct *files = tsk->files; + const unsigned long *obptr, *cbptr; + int count, total; + + spin_lock(&files->file_lock); + obptr = files->open_fds->fds_bits; + cbptr = files->close_on_exec->fds_bits; + count = files->max_fds / (sizeof(unsigned long) * 8); + for (total = 0; count > 0; count--) { + if (*obptr) + total += hweight_long(*obptr); + obptr++; + /* if (*cbptr) + total += hweight_long(*cbptr); + cbptr++; */ + } + spin_unlock(&files->file_lock); + return total; +} + +static inline int vx_openfd_task(struct task_struct *tsk) +{ + struct files_struct *files = tsk->files; + const unsigned long *bptr; + int count, total; + + spin_lock(&files->file_lock); + bptr = files->open_fds->fds_bits; + count = files->max_fds / (sizeof(unsigned long) * 8); + for (total = 0; count > 0; count--) { + if (*bptr) + total += hweight_long(*bptr); + bptr++; + } + spin_unlock(&files->file_lock); + return total; +} + +/* + * migrate task to new context + * gets vxi, puts old_vxi on change + */ + +int vx_migrate_task(struct task_struct *p, struct vx_info *vxi) +{ + struct vx_info *old_vxi = task_get_vx_info(p); + int ret = 0; + + if (!p || !vxi) + BUG(); + + vxdprintk("vx_migrate_task(%p,%p[#%d.%d)\n", p, vxi, + vxi->vx_id, atomic_read(&vxi->vx_refcount)); + if (old_vxi == vxi) + goto out; + + if (!(ret = vx_migrate_user(p, vxi))) { + task_lock(p); + if (old_vxi) { + atomic_dec(&old_vxi->cacct.nr_threads); + atomic_dec(&old_vxi->limit.res[RLIMIT_NPROC]); + } + atomic_inc(&vxi->cacct.nr_threads); + atomic_inc(&vxi->limit.res[RLIMIT_NPROC]); + atomic_add(vx_nofiles_task(p), &vxi->limit.res[RLIMIT_NOFILE]); + atomic_add(vx_openfd_task(p), &vxi->limit.res[RLIMIT_OPENFD]); + 
set_vx_info(&p->vx_info, vxi); + p->xid = vxi->vx_id; + vx_mask_bcaps(p); + task_unlock(p); + + put_vx_info(old_vxi); + } +out: + put_vx_info(old_vxi); + return ret; +} + +int vx_set_init(struct vx_info *vxi, struct task_struct *p) +{ + if (!vxi) + return -EINVAL; + if (vxi->vx_initpid) + return -EPERM; + + vxi->vx_initpid = p->tgid; + return 0; +} + + +/* vserver syscall commands below here */ + +/* taks xid and vx_info functions */ + +#include + + +int vc_task_xid(uint32_t id, void __user *data) +{ + xid_t xid; + + if (id) { + struct task_struct *tsk; + + if (!vx_check(0, VX_ADMIN|VX_WATCH)) + return -EPERM; + + read_lock(&tasklist_lock); + tsk = find_task_by_pid(id); + xid = (tsk) ? tsk->xid : -ESRCH; + read_unlock(&tasklist_lock); + } + else + xid = current->xid; + return xid; +} + + +int vc_vx_info(uint32_t id, void __user *data) +{ + struct vx_info *vxi; + struct vcmd_vx_info_v0 vc_data; + + if (!vx_check(0, VX_ADMIN)) + return -ENOSYS; + if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_RESOURCE)) + return -EPERM; + + vxi = find_vx_info(id); + if (!vxi) + return -ESRCH; + + vc_data.xid = vxi->vx_id; + vc_data.initpid = vxi->vx_initpid; + put_vx_info(vxi); + + if (copy_to_user (data, &vc_data, sizeof(vc_data))) + return -EFAULT; + return 0; +} + + +/* context functions */ + +int vc_ctx_create(uint32_t xid, void __user *data) +{ + // int ret = -ENOMEM; + struct vx_info *new_vxi; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if ((xid >= MIN_D_CONTEXT) && (xid != VX_DYNAMIC_ID)) + return -EINVAL; + + if (xid < 1) + return -EINVAL; + + new_vxi = __foc_vx_info(xid, &ret); + if (!new_vxi) + return ret; + if (!(new_vxi->vx_flags & VXF_STATE_SETUP)) { + ret = -EEXIST; + goto out_put; + } + + ret = new_vxi->vx_id; + vx_migrate_task(current, new_vxi); +out_put: + put_vx_info(new_vxi); + return ret; +} + + +int vc_ctx_migrate(uint32_t id, void __user *data) +{ + struct vx_info *vxi; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + /* dirty hack 
until Spectator becomes a cap */ + if (id == 1) { + current->xid = 1; + return 0; + } + + vxi = find_vx_info(id); + if (!vxi) + return -ESRCH; + vx_migrate_task(current, vxi); + put_vx_info(vxi); + return 0; +} + + +int vc_get_cflags(uint32_t id, void __user *data) +{ + struct vx_info *vxi; + struct vcmd_ctx_flags_v0 vc_data; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + vxi = find_vx_info(id); + if (!vxi) + return -ESRCH; + + vc_data.flagword = vxi->vx_flags; + + // vc_data.mask = ~0UL; + /* special STATE flag handling */ + vc_data.mask = vx_mask_flags(~0UL, vxi->vx_flags, VXF_ONE_TIME); + + put_vx_info(vxi); + + if (copy_to_user (data, &vc_data, sizeof(vc_data))) + return -EFAULT; + return 0; +} + +int vc_set_cflags(uint32_t id, void __user *data) +{ + struct vx_info *vxi; + struct vcmd_ctx_flags_v0 vc_data; + uint64_t mask, trigger; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (copy_from_user (&vc_data, data, sizeof(vc_data))) + return -EFAULT; + + vxi = find_vx_info(id); + if (!vxi) + return -ESRCH; + + /* special STATE flag handling */ + mask = vx_mask_mask(vc_data.mask, vxi->vx_flags, VXF_ONE_TIME); + trigger = (mask & vxi->vx_flags) ^ (mask & vc_data.flagword); + + if (trigger & VXF_STATE_SETUP) + vx_mask_bcaps(current); + if (trigger & VXF_STATE_INIT) + if (vxi == current->vx_info) + vx_set_init(vxi, current); + + vxi->vx_flags = vx_mask_flags(vxi->vx_flags, + vc_data.flagword, mask); + put_vx_info(vxi); + return 0; +} + +int vc_get_ccaps(uint32_t id, void __user *data) +{ + struct vx_info *vxi; + struct vcmd_ctx_caps_v0 vc_data; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + vxi = find_vx_info(id); + if (!vxi) + return -ESRCH; + + vc_data.bcaps = vxi->vx_bcaps; + vc_data.ccaps = vxi->vx_ccaps; + vc_data.cmask = ~0UL; + put_vx_info(vxi); + + if (copy_to_user (data, &vc_data, sizeof(vc_data))) + return -EFAULT; + return 0; +} + +int vc_set_ccaps(uint32_t id, void __user *data) +{ + struct vx_info *vxi; + struct vcmd_ctx_caps_v0 
vc_data; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (copy_from_user (&vc_data, data, sizeof(vc_data))) + return -EFAULT; + + vxi = find_vx_info(id); + if (!vxi) + return -ESRCH; + + vxi->vx_bcaps &= vc_data.bcaps; + vxi->vx_ccaps = vx_mask_flags(vxi->vx_ccaps, + vc_data.ccaps, vc_data.cmask); + put_vx_info(vxi); + return 0; +} + +#include + +EXPORT_SYMBOL_GPL(free_vx_info); +EXPORT_SYMBOL_GPL(vxlist_lock); + diff --git a/kernel/vserver/cvirt.c b/kernel/vserver/cvirt.c new file mode 100644 index 000000000..2b5c81e35 --- /dev/null +++ b/kernel/vserver/cvirt.c @@ -0,0 +1,41 @@ +/* + * linux/kernel/vserver/cvirt.c + * + * Virtual Server: Context Virtualization + * + * Copyright (C) 2004 Herbert Pötzl + * + * V0.01 broken out from limit.c + * + */ + +#include +#include +#include +#include +#include + +#include +#include + + +void vx_vsi_uptime(struct timespec *uptime, struct timespec *idle) +{ + struct vx_info *vxi = current->vx_info; + + set_normalized_timespec(uptime, + uptime->tv_sec - vxi->cvirt.bias_tp.tv_sec, + uptime->tv_nsec - vxi->cvirt.bias_tp.tv_nsec); + if (!idle) + return; + set_normalized_timespec(idle, + idle->tv_sec - vxi->cvirt.bias_idle.tv_sec, + idle->tv_nsec - vxi->cvirt.bias_idle.tv_nsec); + return; +} + +uint64_t vx_idle_jiffies() +{ + return init_task.utime + init_task.stime; +} + diff --git a/kernel/vserver/init.c b/kernel/vserver/init.c new file mode 100644 index 000000000..8afd1fc64 --- /dev/null +++ b/kernel/vserver/init.c @@ -0,0 +1,42 @@ +/* + * linux/kernel/init.c + * + * Virtual Server Init + * + * Copyright (C) 2004 Herbert Pötzl + * + * V0.01 basic structure + * + */ + +#include +#include +#include +// #include +#include +#include + +int vserver_register_sysctl(void); +void vserver_unregister_sysctl(void); + + +static int __init init_vserver(void) +{ + int ret = 0; + + vserver_register_sysctl(); + return ret; +} + + +static void __exit exit_vserver(void) +{ + + vserver_unregister_sysctl(); + return; +} + + 
+module_init(init_vserver); +module_exit(exit_vserver); + diff --git a/kernel/vserver/inode.c b/kernel/vserver/inode.c new file mode 100644 index 000000000..87e2849f3 --- /dev/null +++ b/kernel/vserver/inode.c @@ -0,0 +1,220 @@ +/* + * linux/kernel/vserver/inode.c + * + * Virtual Server: File System Support + * + * Copyright (C) 2004 Herbert Pötzl + * + * V0.01 separated from vcontext V0.05 + * + */ + +#include +#include +#include +#include +#include +#include + +#include +#include + + +static int __vc_get_iattr(struct inode *in, uint32_t *xid, uint32_t *flags, uint32_t *mask) +{ + if (!in || !in->i_sb) + return -ESRCH; + + *flags = IATTR_XID + | (IS_BARRIER(in) ? IATTR_BARRIER : 0) + | (IS_IUNLINK(in) ? IATTR_IUNLINK : 0) + | (IS_IMMUTABLE(in) ? IATTR_IMMUTABLE : 0); + *mask = IATTR_IUNLINK | IATTR_IMMUTABLE; + + if (S_ISDIR(in->i_mode)) + *mask |= IATTR_BARRIER; + + if (in->i_sb->s_flags & MS_TAGXID) { + *xid = in->i_xid; + *mask |= IATTR_XID; + } + + if (in->i_sb->s_magic == PROC_SUPER_MAGIC) { + struct proc_dir_entry *entry = PROC_I(in)->pde; + + // check for specific inodes ? 
+ if (entry) + *mask |= IATTR_FLAGS; + if (entry) + *flags |= (entry->vx_flags & IATTR_FLAGS); + else + *flags |= (PROC_I(in)->vx_flags & IATTR_FLAGS); + } + return 0; +} + +int vc_get_iattr(uint32_t id, void __user *data) +{ + struct nameidata nd; + struct vcmd_ctx_iattr_v1 vc_data; + int ret; + + if (!vx_check(0, VX_ADMIN)) + return -ENOSYS; + if (copy_from_user (&vc_data, data, sizeof(vc_data))) + return -EFAULT; + + ret = user_path_walk_link(vc_data.name, &nd); + if (!ret) { + ret = __vc_get_iattr(nd.dentry->d_inode, + &vc_data.xid, &vc_data.flags, &vc_data.mask); + path_release(&nd); + } + + if (copy_to_user (data, &vc_data, sizeof(vc_data))) + ret = -EFAULT; + return ret; +} + +static int __vc_set_iattr(struct dentry *de, uint32_t *xid, uint32_t *flags, uint32_t *mask) +{ + struct inode *in = de->d_inode; + int error = 0, is_proc = 0; + + if (!in || !in->i_sb) + return -ESRCH; + + is_proc = (in->i_sb->s_magic == PROC_SUPER_MAGIC); + if ((*mask & IATTR_FLAGS) && !is_proc) + return -EINVAL; + if ((*mask & IATTR_XID) && !(in->i_sb->s_flags & MS_TAGXID)) + return -EINVAL; + + down(&in->i_sem); + if (*mask & IATTR_XID) + in->i_xid = *xid; + + if (*mask & IATTR_FLAGS) { + struct proc_dir_entry *entry = PROC_I(in)->pde; + unsigned int iflags = PROC_I(in)->vx_flags; + + iflags = (iflags & ~(*mask & IATTR_FLAGS)) + | (*flags & IATTR_FLAGS); + PROC_I(in)->vx_flags = iflags; + if (entry) + entry->vx_flags = iflags; + } + + if (*mask & (IATTR_BARRIER | IATTR_IUNLINK | IATTR_IMMUTABLE)) { + struct iattr attr; + + attr.ia_valid = ATTR_ATTR_FLAG; + attr.ia_attr_flags = + (IS_IMMUTABLE(in) ? ATTR_FLAG_IMMUTABLE : 0) | + (IS_IUNLINK(in) ? ATTR_FLAG_IUNLINK : 0) | + (IS_BARRIER(in) ? 
ATTR_FLAG_BARRIER : 0); + + if (*mask & IATTR_IMMUTABLE) { + if (*flags & IATTR_IMMUTABLE) + attr.ia_attr_flags |= ATTR_FLAG_IMMUTABLE; + else + attr.ia_attr_flags &= ~ATTR_FLAG_IMMUTABLE; + } + if (*mask & IATTR_IUNLINK) { + if (*flags & IATTR_IUNLINK) + attr.ia_attr_flags |= ATTR_FLAG_IUNLINK; + else + attr.ia_attr_flags &= ~ATTR_FLAG_IUNLINK; + } + if (S_ISDIR(in->i_mode) && (*mask & IATTR_BARRIER)) { + if (*flags & IATTR_BARRIER) + attr.ia_attr_flags |= ATTR_FLAG_BARRIER; + else + attr.ia_attr_flags &= ~ATTR_FLAG_BARRIER; + } + if (in->i_op && in->i_op->setattr) + error = in->i_op->setattr(de, &attr); + else { + error = inode_change_ok(in, &attr); + if (!error) + error = inode_setattr(in, &attr); + } + } + + mark_inode_dirty(in); + up(&in->i_sem); + return 0; +} + +int vc_set_iattr(uint32_t id, void __user *data) +{ + struct nameidata nd; + struct vcmd_ctx_iattr_v1 vc_data; + int ret; + + if (!capable(CAP_SYS_ADMIN) || !capable(CAP_LINUX_IMMUTABLE)) + return -EPERM; + if (copy_from_user (&vc_data, data, sizeof(vc_data))) + return -EFAULT; + + ret = user_path_walk_link(vc_data.name, &nd); + if (!ret) { + ret = __vc_set_iattr(nd.dentry, + &vc_data.xid, &vc_data.flags, &vc_data.mask); + path_release(&nd); + } + + if (copy_to_user (data, &vc_data, sizeof(vc_data))) + ret = -EFAULT; + return ret; +} + + +#ifdef CONFIG_VSERVER_LEGACY +#include + +#define PROC_DYNAMIC_FIRST 0xF0000000UL + +int vx_proc_ioctl(struct inode * inode, struct file * filp, + unsigned int cmd, unsigned long arg) +{ + struct proc_dir_entry *entry; + int error = 0; + int flags; + + if (inode->i_ino < PROC_DYNAMIC_FIRST) + return -ENOTTY; + + entry = PROC_I(inode)->pde; + + switch(cmd) { + case FIOC_GETXFLG: { + /* fixme: if stealth, return -ENOTTY */ + error = -EPERM; + flags = entry->vx_flags; + if (capable(CAP_CONTEXT)) + error = put_user(flags, (int *) arg); + break; + } + case FIOC_SETXFLG: { + /* fixme: if stealth, return -ENOTTY */ + error = -EPERM; + if (!capable(CAP_CONTEXT)) + break; + 
error = -EROFS; + if (IS_RDONLY(inode)) + break; + error = -EFAULT; + if (get_user(flags, (int *) arg)) + break; + error = 0; + entry->vx_flags = flags; + break; + } + default: + return -ENOTTY; + } + return error; +} +#endif + diff --git a/kernel/vserver/legacy.c b/kernel/vserver/legacy.c new file mode 100644 index 000000000..a620ae3b5 --- /dev/null +++ b/kernel/vserver/legacy.c @@ -0,0 +1,161 @@ +/* + * linux/kernel/vserver/legacy.c + * + * Virtual Server: Legacy Funtions + * + * Copyright (C) 2001-2003 Jacques Gelinas + * Copyright (C) 2003-2004 Herbert Pötzl + * + * V0.01 broken out from vcontext.c V0.05 + * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + + + +static int vx_set_initpid(struct vx_info *vxi, int pid) +{ + if (vxi->vx_initpid) + return -EPERM; + + vxi->vx_initpid = pid; + return 0; +} + +int vc_new_s_context(uint32_t ctx, void __user *data) +{ + int ret = -ENOMEM; + struct vcmd_new_s_context_v1 vc_data; + struct vx_info *new_vxi; + + if (copy_from_user(&vc_data, data, sizeof(vc_data))) + return -EFAULT; + + /* legacy hack, will be removed soon */ + if (ctx == -2) { + /* assign flags and initpid */ + if (!current->vx_info) + return -EINVAL; + ret = 0; + if (vc_data.flags & VX_INFO_INIT) + ret = vx_set_initpid(current->vx_info, current->tgid); + if (ret == 0) { + /* We keep the same vx_id, but lower the capabilities */ + current->vx_info->vx_bcaps &= (~vc_data.remove_cap); + // current->cap_bset &= (~vc_data.remove_cap); + ret = vx_current_xid(); + current->vx_info->vx_flags |= vc_data.flags; + } + return ret; + } + + if (!vx_check(0, VX_ADMIN) || + !capable(CAP_SYS_ADMIN) || vx_flags(VX_INFO_LOCK, 0)) + return -EPERM; + + /* ugly hack for Spectator */ + if (ctx == 1) { + current->xid = 1; + return 0; + } + + if (((ctx > MAX_S_CONTEXT) && (ctx != VX_DYNAMIC_ID)) || + (ctx == 0)) + return -EINVAL; + + if ((ctx == VX_DYNAMIC_ID) || (ctx < MIN_D_CONTEXT)) + new_vxi = find_or_create_vx_info(ctx); + 
else + new_vxi = find_vx_info(ctx); + + if (!new_vxi) + return -EINVAL; + new_vxi->vx_flags &= ~(VXF_STATE_SETUP|VXF_STATE_INIT); + + ret = vx_migrate_task(current, new_vxi); + if (ret == 0) { + current->vx_info->vx_bcaps &= (~vc_data.remove_cap); + // current->cap_bset &= (~vc_data.remove_cap); + new_vxi->vx_flags |= vc_data.flags; + if (vc_data.flags & VX_INFO_INIT) + vx_set_initpid(new_vxi, current->tgid); + if (vc_data.flags & VX_INFO_NAMESPACE) + vx_set_namespace(new_vxi, + current->namespace, current->fs); + if (vc_data.flags & VX_INFO_NPROC) + new_vxi->limit.rlim[RLIMIT_NPROC] = + current->rlim[RLIMIT_NPROC].rlim_max; + ret = new_vxi->vx_id; + } + put_vx_info(new_vxi); + return ret; +} + + + +/* set ipv4 root (syscall) */ + +int vc_set_ipv4root(uint32_t nbip, void __user *data) +{ + int i, err = -EPERM; + struct vcmd_set_ipv4root_v3 vc_data; + struct nx_info *new_nxi, *nxi = current->nx_info; + + if (nbip < 0 || nbip > NB_IPV4ROOT) + return -EINVAL; + if (copy_from_user (&vc_data, data, sizeof(vc_data))) + return -EFAULT; + + if (!nxi || nxi->ipv4[0] == 0 || capable(CAP_NET_ADMIN)) + // We are allowed to change everything + err = 0; + else if (nxi) { + int found = 0; + + // We are allowed to select a subset of the currently + // installed IP numbers. 
No new one allowed + // We can't change the broadcast address though + for (i=0; inbipv4; j++) { + if (nxip == nxi->ipv4[j]) { + found++; + break; + } + } + } + if ((found == nbip) && + (vc_data.broadcast == nxi->v4_bcast)) + err = 0; + } + if (err) + return err; + + new_nxi = create_nx_info(); + if (!new_nxi) + return -EINVAL; + + new_nxi->nbipv4 = nbip; + for (i=0; iipv4[i] = vc_data.nx_mask_pair[i].ip; + new_nxi->mask[i] = vc_data.nx_mask_pair[i].mask; + } + new_nxi->v4_bcast = vc_data.broadcast; + current->nx_info = new_nxi; + current->nid = new_nxi->nx_id; + put_nx_info(nxi); + return 0; +} + + diff --git a/kernel/vserver/limit.c b/kernel/vserver/limit.c new file mode 100644 index 000000000..5bd2fdcb9 --- /dev/null +++ b/kernel/vserver/limit.c @@ -0,0 +1,149 @@ +/* + * linux/kernel/vserver/limit.c + * + * Virtual Server: Context Limits + * + * Copyright (C) 2004 Herbert Pötzl + * + * V0.01 broken out from vcontext V0.05 + * + */ + +#include +#include +#include +#include +#include + +#include +#include + + +static int is_valid_rlimit(int id) +{ + int valid = 0; + + switch (id) { + case RLIMIT_NPROC: + case RLIMIT_AS: + case RLIMIT_RSS: + case RLIMIT_MEMLOCK: + case RLIMIT_NOFILE: + valid = 1; + break; + } + return valid; +} + +static inline uint64_t vc_get_rlim(struct vx_info *vxi, int id) +{ + unsigned long limit; + + limit = vxi->limit.rlim[id]; + if (limit == RLIM_INFINITY) + return CRLIM_INFINITY; + return limit; +} + +int vc_get_rlimit(uint32_t id, void __user *data) +{ + struct vx_info *vxi; + struct vcmd_ctx_rlimit_v0 vc_data; + + if (copy_from_user (&vc_data, data, sizeof(vc_data))) + return -EFAULT; + if (!is_valid_rlimit(vc_data.id)) + return -ENOTSUPP; + + vxi = find_vx_info(id); + if (!vxi) + return -ESRCH; + + vc_data.maximum = vc_get_rlim(vxi, vc_data.id); + vc_data.minimum = CRLIM_UNSET; + vc_data.softlimit = CRLIM_UNSET; + put_vx_info(vxi); + + if (copy_to_user (data, &vc_data, sizeof(vc_data))) + return -EFAULT; + return 0; +} + +int 
vc_set_rlimit(uint32_t id, void __user *data)
+{
+	/*
+	 * Syscall command: set the maximum of one resource of context id.
+	 * Needs both CAP_SYS_ADMIN and CAP_SYS_RESOURCE.  A maximum of
+	 * CRLIM_KEEP leaves the stored limit untouched.
+	 * Returns 0, -EPERM, -EFAULT, -ENOTSUPP or -ESRCH.
+	 */
+	struct vx_info *vxi;
+	struct vcmd_ctx_rlimit_v0 vc_data;
+
+	if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_RESOURCE))
+		return -EPERM;
+	if (copy_from_user (&vc_data, data, sizeof(vc_data)))
+		return -EFAULT;
+	if (!is_valid_rlimit(vc_data.id))
+		return -ENOTSUPP;
+
+	vxi = find_vx_info(id);
+	if (!vxi)
+		return -ESRCH;
+
+	if (vc_data.maximum != CRLIM_KEEP)
+		vxi->limit.rlim[vc_data.id] = vc_data.maximum;
+	/* NOTE(review): unconditional printk without a log level, runs even for CRLIM_KEEP */
+	printk("setting [%d] = %d\n", vc_data.id, (int)vc_data.maximum);
+	put_vx_info(vxi);
+
+	return 0;
+}
+
+/*
+ * Syscall command: report which resources support which kind of limit.
+ * The answer is a constant bitmask; only "maximum" limits exist.
+ * The id argument is not used.  Returns 0, -EPERM or -EFAULT.
+ */
+int vc_get_rlimit_mask(uint32_t id, void __user *data)
+{
+	static struct vcmd_ctx_rlimit_mask_v0 mask = {
+			/* minimum */
+		0
+		,	/* softlimit */
+		0
+		,	/* maximum */
+		(1 << RLIMIT_NPROC) |
+		(1 << RLIMIT_NOFILE) |
+		(1 << RLIMIT_MEMLOCK) |
+		(1 << RLIMIT_AS) |
+		(1 << RLIMIT_RSS)
+		};
+
+	if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_RESOURCE))
+		return -EPERM;
+	if (copy_to_user(data, &mask, sizeof(mask)))
+		return -EFAULT;
+	return 0;
+}
+
+
+/*
+ * Clamps the memory figures in *val to the current context's RSS limit
+ * and usage.  Dereferences current->vx_info without a NULL check, so
+ * the caller must only use it for tasks that have a context.
+ */
+void vx_vsi_meminfo(struct sysinfo *val)
+{
+	struct vx_info *vxi = current->vx_info;
+	unsigned long v;
+
+	v = vxi->limit.rlim[RLIMIT_RSS];
+	if (v != RLIM_INFINITY)
+		val->totalram = min(val->totalram, v);
+	/* free = total minus the context's accounted RSS, never negative */
+	v = atomic_read(&vxi->limit.res[RLIMIT_RSS]);
+	val->freeram = (v < val->totalram) ? val->totalram - v : 0;
+	val->bufferram = 0;
+	val->totalhigh = 0;
+	val->freehigh = 0;
+	return;
+}
+
+/*
+ * Clamps the swap figures in *val: total swap is bounded by the AS
+ * limit minus the RSS limit (when set), free swap by the accounted AS
+ * usage.  Same NULL-context caveat as vx_vsi_meminfo().
+ */
+void vx_vsi_swapinfo(struct sysinfo *val)
+{
+	struct vx_info *vxi = current->vx_info;
+	unsigned long w,v;
+
+	v = vxi->limit.rlim[RLIMIT_RSS];
+	w = vxi->limit.rlim[RLIMIT_AS];
+	/* NOTE(review): w - v wraps around if the RSS limit exceeds the AS limit */
+	if (w != RLIM_INFINITY)
+		val->totalswap = min(val->totalswap, w -
+			((v != RLIM_INFINITY) ? v : 0));
+	w = atomic_read(&vxi->limit.res[RLIMIT_AS]);
+	val->freeswap = (w < val->totalswap) ?
val->totalswap - w : 0; + return; +} + diff --git a/kernel/vserver/namespace.c b/kernel/vserver/namespace.c new file mode 100644 index 000000000..2c76c6fb4 --- /dev/null +++ b/kernel/vserver/namespace.c @@ -0,0 +1,195 @@ +/* + * linux/kernel/vserver/namespace.c + * + * Virtual Server: Context Namespace Support + * + * Copyright (C) 2003-2004 Herbert Pötzl + * + * V0.01 broken out from context.c 0.07 + * V0.02 added task locking for namespace + * + */ + +#include +#include +#include +#include +#include +#include + +#include +#include + + +/* virtual host info names */ + +static char * vx_vhi_name(struct vx_info *vxi, int id) +{ + switch (id) { + case VHIN_CONTEXT: + return vxi->vx_name; + case VHIN_SYSNAME: + return vxi->cvirt.utsname.sysname; + case VHIN_NODENAME: + return vxi->cvirt.utsname.nodename; + case VHIN_RELEASE: + return vxi->cvirt.utsname.release; + case VHIN_VERSION: + return vxi->cvirt.utsname.version; + case VHIN_MACHINE: + return vxi->cvirt.utsname.machine; + case VHIN_DOMAINNAME: + return vxi->cvirt.utsname.domainname; + default: + return NULL; + } + return NULL; +} + +int vc_set_vhi_name(uint32_t id, void __user *data) +{ + struct vx_info *vxi; + struct vcmd_vx_vhi_name_v0 vc_data; + char *name; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (copy_from_user (&vc_data, data, sizeof(vc_data))) + return -EFAULT; + + vxi = find_vx_info(id); + if (!vxi) + return -ESRCH; + + name = vx_vhi_name(vxi, vc_data.field); + if (name) + memcpy(name, vc_data.name, 65); + put_vx_info(vxi); + return (name ? 
0 : -EFAULT); +} + +int vc_get_vhi_name(uint32_t id, void __user *data) +{ + struct vx_info *vxi; + struct vcmd_vx_vhi_name_v0 vc_data; + char *name; + + if (copy_from_user (&vc_data, data, sizeof(vc_data))) + return -EFAULT; + + vxi = find_vx_info(id); + if (!vxi) + return -ESRCH; + + name = vx_vhi_name(vxi, vc_data.field); + if (!name) + goto out_put; + + memcpy(vc_data.name, name, 65); + if (copy_to_user (data, &vc_data, sizeof(vc_data))) + return -EFAULT; +out_put: + put_vx_info(vxi); + return (name ? 0 : -EFAULT); +} + +/* namespace functions */ + +#include + +int vx_set_namespace(struct vx_info *vxi, struct namespace *ns, struct fs_struct *fs) +{ + struct fs_struct *fs_copy; + + if (vxi->vx_namespace) + return -EPERM; + if (!ns || !fs) + return -EINVAL; + + fs_copy = copy_fs_struct(fs); + if (!fs_copy) + return -ENOMEM; + + get_namespace(ns); + vxi->vx_namespace = ns; + vxi->vx_fs = fs_copy; + return 0; +} + +int vc_enter_namespace(uint32_t id, void *data) +{ + struct vx_info *vxi; + struct fs_struct *old_fs, *fs; + struct namespace *old_ns; + int ret = 0; + + if (!vx_check(0, VX_ADMIN)) + return -ENOSYS; + + vxi = find_vx_info(id); + if (!vxi) + return -ESRCH; + + ret = -EINVAL; + if (!vxi->vx_namespace) + goto out_put; + + ret = -ENOMEM; + fs = copy_fs_struct(vxi->vx_fs); + if (!fs) + goto out_put; + + ret = 0; + task_lock(current); + old_ns = current->namespace; + old_fs = current->fs; + get_namespace(vxi->vx_namespace); + current->namespace = vxi->vx_namespace; + current->fs = fs; + task_unlock(current); + + put_namespace(old_ns); + put_fs_struct(old_fs); +out_put: + put_vx_info(vxi); + return ret; +} + +int vc_cleanup_namespace(uint32_t id, void *data) +{ + down_write(¤t->namespace->sem); + // spin_lock(&dcache_lock); + spin_lock(&vfsmount_lock); + umount_unused(current->namespace->root, current->fs); + spin_unlock(&vfsmount_lock); + // spin_unlock(&dcache_lock); + up_write(¤t->namespace->sem); + return 0; +} + +int vc_set_namespace(uint32_t id, void 
__user *data)
+{
+	/*
+	 * Syscall command: store the caller's current namespace and fs
+	 * as the namespace of the caller's own context.  Refused with
+	 * -ENOSYS when vx_check(0, VX_ADMIN|VX_WATCH) succeeds; otherwise
+	 * returns whatever vx_set_namespace() returns.  Neither argument
+	 * is used.
+	 */
+	struct fs_struct *fs;
+	struct namespace *ns;
+	struct vx_info *vxi;
+	int ret;
+
+	if (vx_check(0, VX_ADMIN|VX_WATCH))
+		return -ENOSYS;
+
+	/* pin context, fs and namespace while the task is locked */
+	task_lock(current);
+	vxi = get_vx_info(current->vx_info);
+	fs = current->fs;
+	atomic_inc(&fs->count);
+	ns = current->namespace;
+	get_namespace(current->namespace);
+	task_unlock(current);
+
+	ret = vx_set_namespace(vxi, ns, fs);
+
+	/* drop the temporary pins taken above */
+	put_namespace(ns);
+	put_fs_struct(fs);
+	put_vx_info(vxi);
+	return ret;
+}
+
diff --git a/kernel/vserver/network.c b/kernel/vserver/network.c
new file mode 100644
index 000000000..479a19b47
--- /dev/null
+++ b/kernel/vserver/network.c
@@ -0,0 +1,513 @@
+/*
+ *  linux/kernel/vserver/network.c
+ *
+ *  Virtual Server: Network Support
+ *
+ *  Copyright (C) 2003-2004  Herbert Pötzl
+ *
+ *  V0.01  broken out from vcontext V0.05
+ *  V0.02  cleaned up implementation
+ *  V0.03  added equiv nx commands
+ *
+ */
+
+#include
+#include
+#include
+#include
+
+#include
+
+
+/* all live network contexts, protected by nxlist_lock */
+LIST_HEAD(nx_infos);
+
+spinlock_t nxlist_lock
+	__cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
+
+
+/*
+ *	struct nx_info allocation and deallocation
+ */
+
+/*
+ * Allocates a zero-filled nx_info with GFP_KERNEL.  The result is not
+ * on any list and has no id or reference count yet.  Returns NULL
+ * (written as 0) when the allocation fails.
+ */
+static struct nx_info *alloc_nx_info(void)
+{
+	struct nx_info *new = NULL;
+
+	nxdprintk("alloc_nx_info()\n");
+	/* would this benefit from a slab cache? */
+	new = kmalloc(sizeof(struct nx_info), GFP_KERNEL);
+	if (!new)
+		return 0;
+
+	memset (new, 0, sizeof(struct nx_info));
+	/* rest of init goes here */
+
+	nxdprintk("alloc_nx_info() = %p\n", new);
+	return new;
+}
+
+/* Releases the memory of nxi; list removal is the caller's business. */
+void free_nx_info(struct nx_info *nxi)
+{
+	nxdprintk("free_nx_info(%p)\n", nxi);
+	kfree(nxi);
+}
+
+/*
+ * Creates a network context with the next id from a private counter,
+ * one reference (owned by the caller) and links it into nx_infos.
+ * Returns NULL when the allocation fails.
+ */
+struct nx_info *create_nx_info(void)
+{
+	struct nx_info *new;
+	static int gnid = 1;
+
+	nxdprintk("create_nx_info()\n");
+	if (!(new = alloc_nx_info()))
+		return 0;
+
+	spin_lock(&nxlist_lock);
+
+	/* new ip info */
+	atomic_set(&new->nx_refcount, 1);
+	/* gnid is only touched with nxlist_lock held */
+	new->nx_id = gnid++;
+	list_add(&new->nx_list, &nx_infos);
+
+	spin_unlock(&nxlist_lock);
+	return new;
+}
+
+
+/*
+ *	struct nx_info search by id
+ *	assumes nxlist_lock is held
+ */
+
+/* Returns the list entry whose nx_id equals id, or NULL; takes no reference. */
+static __inline__ struct nx_info *__find_nx_info(int id)
+{
+	struct nx_info *nxi;
+
+	list_for_each_entry(nxi, &nx_infos, nx_list)
+		if (nxi->nx_id == id)
+			return nxi;
+	return 0;
+}
+
+
+/*
+ *	struct nx_info ref stuff
+ */
+
+/*
+ * Looks up a network context and takes a reference on it.  A negative
+ * id selects the calling task's own context (which may be NULL).
+ * Returns NULL when nothing matches; the caller must put_nx_info()
+ * a non-NULL result.
+ */
+struct nx_info *find_nx_info(int id)
+{
+	struct nx_info *nxi;
+
+	if (id < 0) {
+		nxi = current->nx_info;
+		get_nx_info(nxi);
+	} else {
+		spin_lock(&nxlist_lock);
+		if ((nxi = __find_nx_info(id)))
+			get_nx_info(nxi);
+		spin_unlock(&nxlist_lock);
+	}
+	return nxi;
+}
+
+/*
+ *	verify that id is a valid nid
+ */
+
+/* Returns 1 if a context with this id is currently listed, else 0. */
+int nx_info_id_valid(int id)
+{
+	int valid;
+
+	spin_lock(&nxlist_lock);
+	valid = (__find_nx_info(id) != NULL);
+	spin_unlock(&nxlist_lock);
+	return valid;
+}
+
+
+/*
+ *	dynamic context id ...
+ */
+
+/*
+ * Picks an unused id from MIN_D_CONTEXT..MAX_N_CONTEXT, continuing
+ * after the id handed out last.  Returns 0 when every id in the range
+ * is taken.  Calls __find_nx_info(), so nxlist_lock must be held.
+ */
+static __inline__ nid_t __nx_dynamic_id(void)
+{
+	static nid_t seq = MAX_N_CONTEXT;
+	nid_t barrier = seq;
+
+	do {
+		if (++seq > MAX_N_CONTEXT)
+			seq = MIN_D_CONTEXT;
+		if (!__find_nx_info(seq))
+			return seq;
+	} while (barrier != seq);
+	return 0;
+}
+
+/*
+ * Find-or-create: returns the context with the given id, creating it
+ * when it does not exist.  IP_DYNAMIC_ID asks for a new context with a
+ * free dynamic id.  The caller owns one reference on the result.
+ *
+ * *err is always set: 1 when a new context was created, 0 when an
+ * existing one was found, -EBUSY when the existing one is still in
+ * setup, -EAGAIN when no dynamic id is free, -ENOMEM on allocation
+ * failure.  NULL is returned for all negative outcomes.
+ */
+static struct nx_info * __foc_nx_info(int id, int *err)
+{
+	struct nx_info *new, *nxi = NULL;
+
+	nxdprintk("foc_nx_info(%d)\n", id);
+	// if (!(new = alloc_nx_info(id))) {
+	if (!(new = alloc_nx_info())) {
+		*err = -ENOMEM;
+		return NULL;
+	}
+
+	spin_lock(&nxlist_lock);
+
+	/* dynamic context requested */
+	if (id == IP_DYNAMIC_ID) {
+		id = __nx_dynamic_id();
+		if (!id) {
+			printk(KERN_ERR "no dynamic context available.\n");
+			/* callers return *err, so it must not stay unset */
+			*err = -EAGAIN;
+			goto out_unlock;
+		}
+	}
+	/* existing context requested */
+	else if ((nxi = __find_nx_info(id))) {
+		/* context in setup is not available */
+		if (nxi->nx_flags & VXF_STATE_SETUP) {
+			nxdprintk("foc_nx_info(%d) = %p (not available)\n", id, nxi);
+			nxi = NULL;
+			*err = -EBUSY;
+		} else {
+			nxdprintk("foc_nx_info(%d) = %p (found)\n", id, nxi);
+			get_nx_info(nxi);
+			*err = 0;
+		}
+		goto out_unlock;
+	}
+
+	/* new context requested */
+	nxdprintk("foc_nx_info(%d) = %p (new)\n", id, new);
+	atomic_set(&new->nx_refcount, 1);
+	/*
+	 * alloc_nx_info() hands back a zeroed struct, so the id has to
+	 * be stored for static ids as well, otherwise the new context
+	 * could never be found again by __find_nx_info().
+	 */
+	new->nx_id = id;
+	list_add(&new->nx_list, &nx_infos);
+	nxi = new, new = NULL;
+	*err = 1;
+
+out_unlock:
+	spin_unlock(&nxlist_lock);
+	/* the spare allocation is only kept when a context was created */
+	if (new)
+		free_nx_info(new);
+	return nxi;
+}
+
+
+/* Public wrapper around __foc_nx_info() that discards the status code. */
+struct nx_info *find_or_create_nx_info(int id)
+{
+	int err;
+
+	return __foc_nx_info(id, &err);
+}
+
+/*
+ *	migrate task to new network
+ */
+
+/*
+ * Makes nxi the network context of task p and updates p->nid.  The
+ * task gets its own reference on nxi; the reference it held on its
+ * previous context is dropped.  Both arguments must be non-NULL.
+ * Always returns 0.
+ */
+int nx_migrate_task(struct task_struct *p, struct nx_info *nxi)
+{
+	struct nx_info *old_nxi;
+	int ret = 0;
+
+	/* validate before task_get_nx_info() dereferences p */
+	if (!p || !nxi)
+		BUG();
+
+	old_nxi = task_get_nx_info(p);
+
+	nxdprintk("nx_migrate_task(%p,%p[#%d.%d)\n", p, nxi,
+		nxi->nx_id, atomic_read(&nxi->nx_refcount));
+	if (old_nxi == nxi)
+		goto out;
+
+	task_lock(p);
+	/*
+	 * set_nx_info() does BUG_ON(*nxp), so the slot has to be emptied
+	 * first.  clr_nx_info() also drops the reference the task held
+	 * on its old context; old_nxi keeps that context alive meanwhile.
+	 */
+	clr_nx_info(&p->nx_info);
+	set_nx_info(&p->nx_info, nxi);
+	p->nid = nxi->nx_id;
+	task_unlock(p);
+out:
+	/* drop the temporary reference from task_get_nx_info() */
+	put_nx_info(old_nxi);
+	return ret;
+}
+
+
+#include
+#include
+
+/*
+ * Returns 1 if addr is one of the IPv4 addresses of nxi, else 0.
+ * NOTE(review): the loop header was lost in this copy of the patch and
+ * is reconstructed from the surviving loop body.
+ */
+static inline int __addr_in_nx_info(u32 addr, struct nx_info *nxi)
+{
+	int i, nbip;
+
+	nbip = nxi->nbipv4;
+	for (i=0; i<nbip; i++)
+		if (nxi->ipv4[i] == addr)
+			return 1;
+	return 0;
+}
+
+/* Returns 1 if the interface address is visible in nxi; a NULL nxi sees all. */
+int ifa_in_nx_info(struct in_ifaddr *ifa, struct nx_info *nxi)
+{
+	if (!nxi)
+		return 1;
+
+	return __addr_in_nx_info(ifa->ifa_address, nxi);
+}
+
+/*
+ * Returns 1 if at least one address of dev belongs to nxi.  A NULL nxi
+ * sees every device; a device without IPv4 state is never visible.
+ */
+int dev_in_nx_info(struct net_device *dev, struct nx_info *nxi)
+{
+	struct in_device *in_dev = __in_dev_get(dev);
+	struct in_ifaddr **ifap = NULL;
+	struct in_ifaddr *ifa = NULL;
+
+	if (!nxi)
+		return 1;
+	if (!in_dev)
+		return 0;
+
+	for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
+		ifap = &ifa->ifa_next) {
+		if (__addr_in_nx_info(ifa->ifa_address, nxi))
+			return 1;
+	}
+	return 0;
+}
+
+
+
+
+/* vserver syscall commands below here */
+
+/* task nid and nx_info functions */
+
+#include
+
+
+/*
+ * Syscall command: return the nid of the task with pid id, or of the
+ * caller when id is 0.  Looking at another task needs
+ * vx_check(0, VX_ADMIN|VX_WATCH), else -EPERM; an unknown pid
+ * gives -ESRCH.
+ */
+int vc_task_nid(uint32_t id, void __user *data)
+{
+	nid_t nid;
+
+	if (id) {
+		struct task_struct *tsk;
+
+		if (!vx_check(0, VX_ADMIN|VX_WATCH))
+			return -EPERM;
+
+		read_lock(&tasklist_lock);
+		tsk = find_task_by_pid(id);
+		nid = (tsk) ?
tsk->nid : -ESRCH;
+		read_unlock(&tasklist_lock);
+	}
+	else
+		nid = current->nid;
+	return nid;
+}
+
+
+/*
+ * Syscall command: report the id of network context id to user space.
+ * Needs vx_check(0, VX_ADMIN) (else -ENOSYS) and both CAP_SYS_ADMIN and
+ * CAP_SYS_RESOURCE (else -EPERM).  Returns 0, -ESRCH or -EFAULT.
+ */
+int vc_nx_info(uint32_t id, void __user *data)
+{
+	struct nx_info *nxi;
+	struct vcmd_nx_info_v0 vc_data;
+
+	if (!vx_check(0, VX_ADMIN))
+		return -ENOSYS;
+	if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_RESOURCE))
+		return -EPERM;
+
+	nxi = find_nx_info(id);
+	if (!nxi)
+		return -ESRCH;
+
+	/*
+	 * Only .nid is filled in below; clear the rest so no stale
+	 * kernel stack bytes are copied out to user space.
+	 */
+	memset(&vc_data, 0, sizeof(vc_data));
+	vc_data.nid = nxi->nx_id;
+	put_nx_info(nxi);
+
+	if (copy_to_user (data, &vc_data, sizeof(vc_data)))
+		return -EFAULT;
+	return 0;
+}
+
+
+/* network functions */
+
+/*
+ * Syscall command: create network context nid (or a dynamic one for
+ * VX_DYNAMIC_ID) and migrate the caller into it.  Needs CAP_SYS_ADMIN.
+ * Returns the id of the context on success, -EPERM, -EINVAL for an id
+ * outside the static range, -EEXIST when the context returned by the
+ * lookup is not in setup state, or the error from __foc_nx_info().
+ */
+int vc_net_create(uint32_t nid, void __user *data)
+{
+	struct nx_info *new_nxi;
+	/* defined fallback in case the lookup fails without a status */
+	int ret = -ENOMEM;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if ((nid >= MIN_D_CONTEXT) && (nid != VX_DYNAMIC_ID))
+		return -EINVAL;
+
+	if (nid < 1)
+		return -EINVAL;
+
+	new_nxi = __foc_nx_info(nid, &ret);
+	if (!new_nxi)
+		return ret;
+	/* NOTE(review): nothing visible here ever sets VXF_STATE_SETUP on a fresh context */
+	if (!(new_nxi->nx_flags & VXF_STATE_SETUP)) {
+		ret = -EEXIST;
+		goto out_put;
+	}
+
+	ret = new_nxi->nx_id;
+	nx_migrate_task(current, new_nxi);
+out_put:
+	put_nx_info(new_nxi);
+	return ret;
+}
+
+
+/*
+ * Syscall command: migrate the caller into existing network context id.
+ * Needs CAP_SYS_ADMIN.  Returns 0, -EPERM or -ESRCH.
+ */
+int vc_net_migrate(uint32_t id, void __user *data)
+{
+	struct nx_info *nxi;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	nxi = find_nx_info(id);
+	if (!nxi)
+		return -ESRCH;
+	nx_migrate_task(current, nxi);
+	put_nx_info(nxi);
+	return 0;
+}
+
+/*
+ * Syscall command: add an address to network context id.  Currently a
+ * stub: it validates permissions, buffer and context, changes nothing.
+ * Returns 0, -EPERM, -EFAULT or -ESRCH.
+ */
+int vc_net_add(uint32_t id, void __user *data)
+{
+	struct nx_info *nxi;
+	struct vcmd_net_nx_v0 vc_data;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	if (copy_from_user (&vc_data, data, sizeof(vc_data)))
+		return -EFAULT;
+
+	nxi = find_nx_info(id);
+	if (!nxi)
+		return -ESRCH;
+
+	// add ip to net context here
+	put_nx_info(nxi);
+	return 0;
+}
+
+/*
+ * Syscall command: remove an address from network context id.  Stub
+ * with the same checks and return values as vc_net_add().
+ */
+int vc_net_remove(uint32_t id, void __user *data)
+{
+	struct nx_info *nxi;
+	struct vcmd_net_nx_v0 vc_data;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	if (copy_from_user (&vc_data, data, sizeof(vc_data)))
+		return -EFAULT;
+
+	nxi = find_nx_info(id);
+	if
(!nxi)
+		return -ESRCH;
+
+	// rem ip from net context here
+	put_nx_info(nxi);
+	return 0;
+}
+
+
+
+/*
+ * Syscall command: report the flag word of network context id plus the
+ * mask of flags that may still be changed, as computed by
+ * vx_mask_flags() with IPF_ONE_TIME.  Needs CAP_SYS_ADMIN.
+ * Returns 0, -EPERM, -ESRCH or -EFAULT.
+ */
+int vc_get_nflags(uint32_t id, void __user *data)
+{
+	struct nx_info *nxi;
+	struct vcmd_net_flags_v0 vc_data;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	nxi = find_nx_info(id);
+	if (!nxi)
+		return -ESRCH;
+
+	vc_data.flagword = nxi->nx_flags;
+
+	// vc_data.mask = ~0UL;
+	/* special STATE flag handling */
+	vc_data.mask = vx_mask_flags(~0UL, nxi->nx_flags, IPF_ONE_TIME);
+
+	put_nx_info(nxi);
+
+	if (copy_to_user (data, &vc_data, sizeof(vc_data)))
+		return -EFAULT;
+	return 0;
+}
+
+/*
+ * Syscall command: change the flags of network context id.  Only the
+ * bits selected by the (one-time adjusted) mask are taken from
+ * vc_data.flagword.  Needs CAP_SYS_ADMIN.
+ * Returns 0, -EPERM, -EFAULT or -ESRCH.
+ */
+int vc_set_nflags(uint32_t id, void __user *data)
+{
+	struct nx_info *nxi;
+	struct vcmd_net_flags_v0 vc_data;
+	uint64_t mask, trigger;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	if (copy_from_user (&vc_data, data, sizeof(vc_data)))
+		return -EFAULT;
+
+	nxi = find_nx_info(id);
+	if (!nxi)
+		return -ESRCH;
+
+	/* special STATE flag handling */
+	mask = vx_mask_mask(vc_data.mask, nxi->nx_flags, IPF_ONE_TIME);
+	/* bits that would change; computed but not acted upon yet */
+	trigger = (mask & nxi->nx_flags) ^ (mask & vc_data.flagword);
+	// if (trigger & IPF_STATE_SETUP)
+
+	nxi->nx_flags = vx_mask_flags(nxi->nx_flags,
+		vc_data.flagword, mask);
+	put_nx_info(nxi);
+	return 0;
+}
+
+/*
+ * Syscall command: report the network capabilities of context id with
+ * an all-ones change mask.  Needs CAP_SYS_ADMIN.
+ * Returns 0, -EPERM, -ESRCH or -EFAULT.
+ */
+int vc_get_ncaps(uint32_t id, void __user *data)
+{
+	struct nx_info *nxi;
+	struct vcmd_net_caps_v0 vc_data;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	nxi = find_nx_info(id);
+	if (!nxi)
+		return -ESRCH;
+
+	vc_data.ncaps = nxi->nx_ncaps;
+	vc_data.cmask = ~0UL;
+	put_nx_info(nxi);
+
+	if (copy_to_user (data, &vc_data, sizeof(vc_data)))
+		return -EFAULT;
+	return 0;
+}
+
+/*
+ * Syscall command: change the network capabilities of context id; only
+ * bits selected by vc_data.cmask are taken from vc_data.ncaps.
+ * Needs CAP_SYS_ADMIN.  Returns 0, -EPERM, -EFAULT or -ESRCH.
+ */
+int vc_set_ncaps(uint32_t id, void __user *data)
+{
+	struct nx_info *nxi;
+	struct vcmd_net_caps_v0 vc_data;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	if (copy_from_user (&vc_data, data, sizeof(vc_data)))
+		return -EFAULT;
+
+	nxi = find_nx_info(id);
+	if (!nxi)
+		return -ESRCH;
+
+	nxi->nx_ncaps = vx_mask_flags(nxi->nx_ncaps,
+		vc_data.ncaps, vc_data.cmask);
+	put_nx_info(nxi);
+	return 0;
+}
+
+
+#include
+
+EXPORT_SYMBOL_GPL(free_nx_info);
+EXPORT_SYMBOL_GPL(nxlist_lock);
+
diff --git a/kernel/vserver/proc.c b/kernel/vserver/proc.c
new file mode 100644
index 000000000..42bc18200
--- /dev/null
+++ b/kernel/vserver/proc.c
@@ -0,0 +1,905 @@
+/*
+ *  linux/kernel/vserver/proc.c
+ *
+ *  Virtual Context Support
+ *
+ *  Copyright (C) 2003-2004  Herbert Pötzl
+ *
+ *  V0.01  basic structure
+ *  V0.02  adaptation vs1.3.0
+ *  V0.03  proc permissions
+ *  V0.04  locking/generic
+ *  V0.05  next generation procfs
+ *  V0.06  inode validation
+ *  V0.07  generic rewrite vid
+ *
+ */
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+
+static struct proc_dir_entry *proc_virtual;
+
+static struct proc_dir_entry *proc_vnet;
+
+
+/*
+ * Entry types stored in the low 16 bits of the fake inode numbers.
+ * Context (xid) entries start at 32, network (nid) entries at 64, so
+ * PROC_VID_MASK (0x60) tells the two families apart.
+ */
+enum vid_directory_inos {
+	PROC_XID_INO = 32,
+	PROC_XID_INFO,
+	PROC_XID_STATUS,
+	PROC_XID_LIMIT,
+	PROC_XID_SCHED,
+	PROC_XID_CVIRT,
+	PROC_XID_CACCT,
+
+	PROC_NID_INO = 64,
+	PROC_NID_INFO,
+	PROC_NID_STATUS,
+};
+
+#define	PROC_VID_MASK	0x60
+
+
+/* first the actual feeds */
+
+
+/*
+ * Each feed formats its text into buffer and returns the number of
+ * bytes written.  The feeds that look up a context return 0 when the
+ * id is unknown.
+ */
+
+/* Interface version and syscall number; vid is unused. */
+static int proc_virtual_info(int vid, char *buffer)
+{
+	return sprintf(buffer,
+		"VCIVersion:\t%04x:%04x\n"
+		"VCISyscall:\t%d\n"
+		,VCI_VERSION >> 16
+		,VCI_VERSION & 0xFFFF
+		,__NR_vserver
+		);
+}
+
+
+/* Id, kernel address and init pid of context vid. */
+int proc_xid_info (int vid, char *buffer)
+{
+	struct vx_info *vxi;
+	int length;
+
+	vxi = find_vx_info(vid);
+	if (!vxi)
+		return 0;
+	length = sprintf(buffer,
+		"ID:\t%d\n"
+		"Info:\t%p\n"
+		"Init:\t%d\n"
+		,vxi->vx_id
+		,vxi
+		,vxi->vx_initpid
+		);
+	put_vx_info(vxi);
+	return length;
+}
+
+/* Reference count, flags, capability sets and tick counter of context vid. */
+int proc_xid_status (int vid, char *buffer)
+{
+	struct vx_info *vxi;
+	int length;
+
+	vxi = find_vx_info(vid);
+	if (!vxi)
+		return 0;
+	length = sprintf(buffer,
+		"RefC:\t%d\n"
+		"Flags:\t%016llx\n"
+		"BCaps:\t%016llx\n"
+		"CCaps:\t%016llx\n"
+		"Ticks:\t%d\n"
+		,atomic_read(&vxi->vx_refcount)
+		,vxi->vx_flags
+		,vxi->vx_bcaps
+		,vxi->vx_ccaps
+		,atomic_read(&vxi->limit.ticks)
+		);
+	put_vx_info(vxi);
+	return
length; +} + +int proc_xid_limit (int vid, char *buffer) +{ + struct vx_info *vxi; + int length; + + vxi = find_vx_info(vid); + if (!vxi) + return 0; + length = vx_info_proc_limit(&vxi->limit, buffer); + put_vx_info(vxi); + return length; +} + +int proc_xid_sched (int vid, char *buffer) +{ + struct vx_info *vxi; + int length; + + vxi = find_vx_info(vid); + if (!vxi) + return 0; + length = vx_info_proc_sched(&vxi->sched, buffer); + put_vx_info(vxi); + return length; +} + +int proc_xid_cvirt (int vid, char *buffer) +{ + struct vx_info *vxi; + int length; + + vxi = find_vx_info(vid); + if (!vxi) + return 0; + length = vx_info_proc_cvirt(&vxi->cvirt, buffer); + put_vx_info(vxi); + return length; +} + +int proc_xid_cacct (int vid, char *buffer) +{ + struct vx_info *vxi; + int length; + + vxi = find_vx_info(vid); + if (!vxi) + return 0; + length = vx_info_proc_cacct(&vxi->cacct, buffer); + put_vx_info(vxi); + return length; +} + + +static int proc_vnet_info(int vid, char *buffer) +{ + return sprintf(buffer, + "VCIVersion:\t%04x:%04x\n" + "VCISyscall:\t%d\n" + ,VCI_VERSION >> 16 + ,VCI_VERSION & 0xFFFF + ,__NR_vserver + ); +} + +#define atoquad(a) \ + (((a)>>0) & 0xff), (((a)>>8) & 0xff), \ + (((a)>>16) & 0xff), (((a)>>24) & 0xff) + +int proc_nid_info (int vid, char *buffer) +{ + struct nx_info *nxi; + int length, i; + + nxi = find_nx_info(vid); + if (!nxi) + return 0; + length = sprintf(buffer, + "ID:\t%d\n" + "Info:\t%p\n" + ,nxi->nx_id + ,nxi + ); + for (i=0; inbipv4; i++) { + length += sprintf(buffer + length, + "%d:\t%d.%d.%d.%d/%d.%d.%d.%d\n", i, + atoquad(nxi->ipv4[i]), + atoquad(nxi->mask[i])); + } + put_nx_info(nxi); + return length; +} + +int proc_nid_status (int vid, char *buffer) +{ + struct nx_info *nxi; + int length; + + nxi = find_nx_info(vid); + if (!nxi) + return 0; + length = sprintf(buffer, + "RefC:\t%d\n" + ,atomic_read(&nxi->nx_refcount) + ); + put_nx_info(nxi); + return length; +} + +/* here the inode helpers */ + + + +#define fake_ino(id,ino) 
(((id)<<16)|(ino)) + +#define inode_vid(i) ((i)->i_ino >> 16) +#define inode_type(i) ((i)->i_ino & 0xFFFF) + +#define MAX_MULBY10 ((~0U-9)/10) + + +static struct inode *proc_vid_make_inode(struct super_block * sb, + int vid, int ino) +{ + struct inode *inode = new_inode(sb); + + if (!inode) + goto out; + + inode->i_mtime = inode->i_atime = + inode->i_ctime = CURRENT_TIME; + inode->i_ino = fake_ino(vid, ino); + + inode->i_uid = 0; + inode->i_gid = 0; + // inode->i_xid = xid; +out: + return inode; +} + +static int proc_vid_revalidate(struct dentry * dentry, struct nameidata *nd) +{ + struct inode * inode = dentry->d_inode; + int vid, valid=0; + + vid = inode_vid(inode); + switch (inode_type(inode) & PROC_VID_MASK) { + case PROC_XID_INO: + valid = vx_info_id_valid(vid); + break; + case PROC_NID_INO: + valid = nx_info_id_valid(vid); + break; + } + if (valid) + return 1; + d_drop(dentry); + return 0; +} + +/* +static int proc_vid_delete_dentry(struct dentry * dentry) +{ + return 1; +} +*/ + + +#define PROC_BLOCK_SIZE (PAGE_SIZE - 1024) + +static ssize_t proc_vid_info_read(struct file * file, char * buf, + size_t count, loff_t *ppos) +{ + struct inode * inode = file->f_dentry->d_inode; + unsigned long page; + ssize_t length; + ssize_t end; + int vid; + + if (count > PROC_BLOCK_SIZE) + count = PROC_BLOCK_SIZE; + if (!(page = __get_free_page(GFP_KERNEL))) + return -ENOMEM; + + vid = inode_vid(inode); + length = PROC_I(inode)->op.proc_vid_read(vid, (char*)page); + + if (length < 0) { + free_page(page); + return length; + } + /* Static 4kB (or whatever) block capacity */ + if (*ppos >= length) { + free_page(page); + return 0; + } + if (count + *ppos > length) + count = length - *ppos; + end = count + *ppos; + copy_to_user(buf, (char *) page + *ppos, count); + *ppos = end; + free_page(page); + return count; +} + + + + + +/* here comes the lower level (vid) */ + +static struct file_operations proc_vid_info_file_operations = { + read: proc_vid_info_read, +}; + +static struct 
dentry_operations proc_vid_dentry_operations = {
+	d_revalidate:	proc_vid_revalidate,
+//	d_delete:	proc_vid_delete_dentry,
+};
+
+
+/* One file inside a per-context directory. */
+struct vid_entry {
+	int type;	/* PROC_XID_xxx / PROC_NID_xxx entry type */
+	int len;	/* length of name, without the terminating NUL */
+	char *name;
+	mode_t mode;
+};
+
+#define E(type,name,mode) {(type),sizeof(name)-1,(name),(mode)}
+
+/* files of a context directory; the all-zero entry terminates the table */
+static struct vid_entry vx_base_stuff[] = {
+	E(PROC_XID_INFO,	"info",		S_IFREG|S_IRUGO),
+	E(PROC_XID_STATUS,	"status",	S_IFREG|S_IRUGO),
+	E(PROC_XID_LIMIT,	"limit",	S_IFREG|S_IRUGO),
+	E(PROC_XID_SCHED,	"sched",	S_IFREG|S_IRUGO),
+	E(PROC_XID_CVIRT,	"cvirt",	S_IFREG|S_IRUGO),
+	E(PROC_XID_CACCT,	"cacct",	S_IFREG|S_IRUGO),
+	{0,0,NULL,0}
+};
+
+/* files of a network context directory; same termination rule */
+static struct vid_entry vn_base_stuff[] = {
+	E(PROC_NID_INFO,	"info",		S_IFREG|S_IRUGO),
+	E(PROC_NID_STATUS,	"status",	S_IFREG|S_IRUGO),
+	{0,0,NULL,0}
+};
+
+
+
+/*
+ * lookup() inside a per-context directory: finds the table entry named
+ * like dentry, creates its inode and attaches the matching feed
+ * function.  Returns NULL on success (ERR_PTR(0)), ERR_PTR(-ENOENT)
+ * for an unknown name or directory type, ERR_PTR(-EINVAL) when no
+ * inode could be made.
+ */
+static struct dentry *proc_vid_lookup(struct inode *dir,
+	struct dentry *dentry, struct nameidata *nd)
+{
+	struct inode *inode;
+	struct vid_entry *p;
+	int error;
+
+	error = -ENOENT;
+	inode = NULL;
+
+	/* the directory's entry type selects the table to search */
+	switch (inode_type(dir)) {
+		case PROC_XID_INO:
+			p = vx_base_stuff;
+			break;
+		case PROC_NID_INO:
+			p = vn_base_stuff;
+			break;
+		default:
+			goto out;
+	}
+
+	for (; p->name; p++) {
+		if (p->len != dentry->d_name.len)
+			continue;
+		if (!memcmp(dentry->d_name.name, p->name, p->len))
+			break;
+	}
+	if (!p->name)
+		goto out;
+
+	error = -EINVAL;
+	inode = proc_vid_make_inode(dir->i_sb, inode_vid(dir), p->type);
+	if (!inode)
+		goto out;
+
+	/* attach the feed that produces this file's contents */
+	switch(p->type) {
+		case PROC_XID_INFO:
+			PROC_I(inode)->op.proc_vid_read = proc_xid_info;
+			break;
+		case PROC_XID_STATUS:
+			PROC_I(inode)->op.proc_vid_read = proc_xid_status;
+			break;
+		case PROC_XID_LIMIT:
+			PROC_I(inode)->op.proc_vid_read = proc_xid_limit;
+			break;
+		case PROC_XID_SCHED:
+			PROC_I(inode)->op.proc_vid_read = proc_xid_sched;
+			break;
+		case PROC_XID_CVIRT:
+			PROC_I(inode)->op.proc_vid_read = proc_xid_cvirt;
+			break;
+		case PROC_XID_CACCT:
+			PROC_I(inode)->op.proc_vid_read = proc_xid_cacct;
+			break;
+
+		case PROC_NID_INFO:
+			
PROC_I(inode)->op.proc_vid_read = proc_nid_info; + break; + case PROC_NID_STATUS: + PROC_I(inode)->op.proc_vid_read = proc_nid_status; + break; + + default: + printk("procfs: impossible type (%d)",p->type); + iput(inode); + return ERR_PTR(-EINVAL); + } + inode->i_mode = p->mode; +// inode->i_op = &proc_vid_info_inode_operations; + inode->i_fop = &proc_vid_info_file_operations; + inode->i_nlink = 1; + inode->i_flags|=S_IMMUTABLE; + + dentry->d_op = &proc_vid_dentry_operations; + d_add(dentry, inode); + error = 0; +out: + return ERR_PTR(error); +} + + +static int proc_vid_readdir(struct file * filp, + void * dirent, filldir_t filldir) +{ + int i, size; + struct inode *inode = filp->f_dentry->d_inode; + struct vid_entry *p; + + i = filp->f_pos; + switch (i) { + case 0: + if (filldir(dirent, ".", 1, i, + inode->i_ino, DT_DIR) < 0) + return 0; + i++; + filp->f_pos++; + /* fall through */ + case 1: + if (filldir(dirent, "..", 2, i, + PROC_ROOT_INO, DT_DIR) < 0) + return 0; + i++; + filp->f_pos++; + /* fall through */ + default: + i -= 2; + switch (inode_type(inode)) { + case PROC_XID_INO: + size = sizeof(vx_base_stuff); + p = vx_base_stuff + i; + break; + case PROC_NID_INO: + size = sizeof(vn_base_stuff); + p = vn_base_stuff + i; + break; + default: + return 1; + } + if (i >= size/sizeof(struct vid_entry)) + return 1; + while (p->name) { + if (filldir(dirent, p->name, p->len, + filp->f_pos, fake_ino(inode_vid(inode), + p->type), p->mode >> 12) < 0) + return 0; + filp->f_pos++; + p++; + } + } + return 1; +} + + + + +/* now the upper level (virtual) */ + +static struct file_operations proc_vid_file_operations = { + read: generic_read_dir, + readdir: proc_vid_readdir, +}; + +static struct inode_operations proc_vid_inode_operations = { + lookup: proc_vid_lookup, +}; + + + +static __inline__ int atovid(const char *str, int len) +{ + int vid, c; + + vid = 0; + while (len-- > 0) { + c = *str - '0'; + str++; + if (c > 9) + return -1; + if (vid >= MAX_MULBY10) + return -1; + vid 
*= 10; + vid += c; + if (!vid) + return -1; + } + return vid; +} + + +struct dentry *proc_virtual_lookup(struct inode *dir, + struct dentry * dentry, struct nameidata *nd) +{ + int xid, len, ret; + struct vx_info *vxi; + const char *name; + struct inode *inode; + + name = dentry->d_name.name; + len = dentry->d_name.len; + ret = -ENOMEM; + + if (len == 7 && !memcmp(name, "current", 7)) { + inode = new_inode(dir->i_sb); + if (!inode) + goto out; + inode->i_mtime = inode->i_atime = + inode->i_ctime = CURRENT_TIME; + inode->i_ino = fake_ino(1, PROC_XID_INO); + inode->i_mode = S_IFLNK|S_IRWXUGO; + inode->i_uid = inode->i_gid = 0; + inode->i_size = 64; +// inode->i_op = &proc_current_inode_operations; + d_add(dentry, inode); + return NULL; + } + if (len == 4 && !memcmp(name, "info", 4)) { + inode = proc_vid_make_inode(dir->i_sb, 0, PROC_XID_INFO); + if (!inode) + goto out; + inode->i_fop = &proc_vid_info_file_operations; + PROC_I(inode)->op.proc_vid_read = proc_virtual_info; + inode->i_mode = S_IFREG|S_IRUGO; +// inode->i_size = 64; +// inode->i_op = &proc_current_inode_operations; + d_add(dentry, inode); + return NULL; + } + + ret = -ENOENT; + xid = atovid(name, len); + if (xid < 0) + goto out; + vxi = find_vx_info(xid); + if (!vxi) + goto out; + + inode = NULL; + if (vx_check(xid, VX_ADMIN|VX_WATCH|VX_IDENT)) + inode = proc_vid_make_inode(dir->i_sb, + vxi->vx_id, PROC_XID_INO); + if (!inode) + goto out_release; + + inode->i_mode = S_IFDIR|S_IRUGO; + inode->i_op = &proc_vid_inode_operations; + inode->i_fop = &proc_vid_file_operations; + inode->i_nlink = 2; + inode->i_flags|=S_IMMUTABLE; + + dentry->d_op = &proc_vid_dentry_operations; + d_add(dentry, inode); + ret = 0; + +out_release: + put_vx_info(vxi); +out: + return ERR_PTR(ret); +} + + +struct dentry *proc_vnet_lookup(struct inode *dir, + struct dentry * dentry, struct nameidata *nd) +{ + int nid, len, ret; + struct nx_info *nxi; + const char *name; + struct inode *inode; + + name = dentry->d_name.name; + len = 
dentry->d_name.len;
+	ret = -ENOMEM;
+	/* fixed name "current": inode is created as a symlink type, no inode ops attached */
+	if (len == 7 && !memcmp(name, "current", 7)) {
+		inode = new_inode(dir->i_sb);
+		if (!inode)
+			goto out;
+		inode->i_mtime = inode->i_atime =
+			inode->i_ctime = CURRENT_TIME;
+		inode->i_ino = fake_ino(1, PROC_NID_INO);
+		inode->i_mode = S_IFLNK|S_IRWXUGO;
+		inode->i_uid = inode->i_gid = 0;
+		inode->i_size = 64;
+//		inode->i_op = &proc_current_inode_operations;
+		d_add(dentry, inode);
+		return NULL;
+	}
+	/* fixed name "info": regular file fed by proc_vnet_info() */
+	if (len == 4 && !memcmp(name, "info", 4)) {
+		inode = proc_vid_make_inode(dir->i_sb, 0, PROC_NID_INFO);
+		if (!inode)
+			goto out;
+		inode->i_fop = &proc_vid_info_file_operations;
+		PROC_I(inode)->op.proc_vid_read = proc_vnet_info;
+		inode->i_mode = S_IFREG|S_IRUGO;
+//		inode->i_size = 64;
+//		inode->i_op = &proc_current_inode_operations;
+		d_add(dentry, inode);
+		return NULL;
+	}
+
+	/* anything else must be the decimal id of an existing network context */
+	ret = -ENOENT;
+	nid = atovid(name, len);
+	if (nid < 0)
+		goto out;
+	nxi = find_nx_info(nid);
+	if (!nxi)
+		goto out;
+
+	inode = NULL;
+	/* placeholder condition: every network context is visible */
+	if (1)
+		inode = proc_vid_make_inode(dir->i_sb,
+			nxi->nx_id, PROC_NID_INO);
+	if (!inode)
+		goto out_release;
+
+	inode->i_mode = S_IFDIR|S_IRUGO;
+	inode->i_op = &proc_vid_inode_operations;
+	inode->i_fop = &proc_vid_file_operations;
+	inode->i_nlink = 2;
+	inode->i_flags|=S_IMMUTABLE;
+
+	dentry->d_op = &proc_vid_dentry_operations;
+	d_add(dentry, inode);
+	ret = 0;
+
+out_release:
+	put_nx_info(nxi);
+out:
+	return ERR_PTR(ret);
+}
+
+
+
+
+/* size of the buffer used to format one decimal id */
+#define PROC_NUMBUF 10
+/* maximum number of ids fetched per readdir call */
+#define PROC_MAXVIDS 32
+
+
+/*
+ * Copies up to PROC_MAXVIDS context ids from vx_infos into xids,
+ * skipping the first index-1 list entries (none when index <= 1).
+ * Returns the number of ids stored.
+ */
+static int get_xid_list(int index, unsigned int *xids)
+{
+	struct vx_info *p;
+	int nr_xids = 0;
+
+	index--;
+	spin_lock(&vxlist_lock);
+	list_for_each_entry(p, &vx_infos, vx_list) {
+		int xid = p->vx_id;
+
+		if (--index >= 0)
+			continue;
+		xids[nr_xids] = xid;
+		if (++nr_xids >= PROC_MAXVIDS)
+			break;
+	}
+	spin_unlock(&vxlist_lock);
+	return nr_xids;
+}
+
+/*
+ * readdir() of the "virtual" directory: positions 0-3 are ".", "..",
+ * "info" and "current" (the latter only emitted when current->xid > 1),
+ * later positions are one directory per context id.
+ * Always returns 0.
+ */
+int proc_virtual_readdir(struct file * filp,
+	void * dirent, filldir_t filldir)
+{
+	unsigned int xid_array[PROC_MAXVIDS];
+	char buf[PROC_NUMBUF];
+	/* index into the context list; wraps for f_pos < 3, which get_xid_list() treats as "from the start" */
+	unsigned int nr = filp->f_pos-3;
+	unsigned int nr_xids, i;
+	ino_t ino;
+
+	switch ((long)filp->f_pos) {
+		case 0:
+			ino = fake_ino(0, PROC_XID_INO);
+			if (filldir(dirent, ".", 1,
+				filp->f_pos, ino, DT_DIR) < 0)
+				return 0;
+			filp->f_pos++;
+			/* fall through */
+		case 1:
+			ino = filp->f_dentry->d_parent->d_inode->i_ino;
+			if (filldir(dirent, "..", 2,
+				filp->f_pos, ino, DT_DIR) < 0)
+				return 0;
+			filp->f_pos++;
+			/* fall through */
+		case 2:
+			/* NOTE(review): reported as DT_LNK although lookup creates "info" as S_IFREG */
+			ino = fake_ino(0, PROC_XID_INFO);
+			if (filldir(dirent, "info", 4,
+				filp->f_pos, ino, DT_LNK) < 0)
+				return 0;
+			filp->f_pos++;
+			/* fall through */
+		case 3:
+			if (current->xid > 1) {
+				ino = fake_ino(1, PROC_XID_INO);
+				if (filldir(dirent, "current", 7,
+					filp->f_pos, ino, DT_LNK) < 0)
+					return 0;
+			}
+			filp->f_pos++;
+	}
+
+	nr_xids = get_xid_list(nr, xid_array);
+
+	for (i = 0; i < nr_xids; i++) {
+		int xid = xid_array[i];
+		ino_t ino = fake_ino(xid, PROC_XID_INO);
+		unsigned long j = PROC_NUMBUF;
+
+		/* format the id right-aligned in buf, least significant digit first */
+		do buf[--j] = '0' + (xid % 10); while (xid/=10);
+
+		if (filldir(dirent, buf+j, PROC_NUMBUF-j,
+			filp->f_pos, ino, DT_DIR) < 0)
+			break;
+		filp->f_pos++;
+	}
+	return 0;
+}
+
+
+static struct file_operations proc_virtual_dir_operations = {
+	read:		generic_read_dir,
+	readdir:	proc_virtual_readdir,
+};
+
+static struct inode_operations proc_virtual_dir_inode_operations = {
+	lookup:		proc_virtual_lookup,
+};
+
+
+
+/*
+ * Network counterpart of get_xid_list(): copies up to PROC_MAXVIDS
+ * ids from nx_infos into nids, skipping the first index-1 entries.
+ * Returns the number of ids stored.
+ */
+static int get_nid_list(int index, unsigned int *nids)
+{
+	struct nx_info *p;
+	int nr_nids = 0;
+
+	index--;
+	spin_lock(&nxlist_lock);
+	list_for_each_entry(p, &nx_infos, nx_list) {
+		int nid = p->nx_id;
+
+		if (--index >= 0)
+			continue;
+		nids[nr_nids] = nid;
+		if (++nr_nids >= PROC_MAXVIDS)
+			break;
+	}
+	spin_unlock(&nxlist_lock);
+	return nr_nids;
+}
+
+int proc_vnet_readdir(struct file * filp,
+	void * dirent, filldir_t filldir)
+{
+	unsigned int nid_array[PROC_MAXVIDS];
+	char buf[PROC_NUMBUF];
+	unsigned int nr = filp->f_pos-3;
+	unsigned int nr_nids, i;
+	ino_t ino;
+
+	switch ((long)filp->f_pos) {
+		case 0:
+			
ino = fake_ino(0, PROC_NID_INO); + if (filldir(dirent, ".", 1, + filp->f_pos, ino, DT_DIR) < 0) + return 0; + filp->f_pos++; + /* fall through */ + case 1: + ino = filp->f_dentry->d_parent->d_inode->i_ino; + if (filldir(dirent, "..", 2, + filp->f_pos, ino, DT_DIR) < 0) + return 0; + filp->f_pos++; + /* fall through */ + case 2: + ino = fake_ino(0, PROC_NID_INFO); + if (filldir(dirent, "info", 4, + filp->f_pos, ino, DT_LNK) < 0) + return 0; + filp->f_pos++; + /* fall through */ + case 3: + if (current->xid > 1) { + ino = fake_ino(1, PROC_NID_INO); + if (filldir(dirent, "current", 7, + filp->f_pos, ino, DT_LNK) < 0) + return 0; + } + filp->f_pos++; + } + + nr_nids = get_nid_list(nr, nid_array); + + for (i = 0; i < nr_nids; i++) { + int nid = nid_array[i]; + ino_t ino = fake_ino(nid, PROC_NID_INO); + unsigned long j = PROC_NUMBUF; + + do buf[--j] = '0' + (nid % 10); while (nid/=10); + + if (filldir(dirent, buf+j, PROC_NUMBUF-j, + filp->f_pos, ino, DT_DIR) < 0) + break; + filp->f_pos++; + } + return 0; +} + + +static struct file_operations proc_vnet_dir_operations = { + read: generic_read_dir, + readdir: proc_vnet_readdir, +}; + +static struct inode_operations proc_vnet_dir_inode_operations = { + lookup: proc_vnet_lookup, +}; + + + +void proc_vx_init(void) +{ + struct proc_dir_entry *ent; + + ent = proc_mkdir("virtual", 0); + if (ent) { + ent->proc_fops = &proc_virtual_dir_operations; + ent->proc_iops = &proc_virtual_dir_inode_operations; + } + proc_virtual = ent; + + ent = proc_mkdir("vnet", 0); + if (ent) { + ent->proc_fops = &proc_vnet_dir_operations; + ent->proc_iops = &proc_vnet_dir_inode_operations; + } + proc_vnet = ent; +} + + + + +/* per pid info */ + + +char *task_vx_info(struct task_struct *p, char *buffer) +{ + return buffer + sprintf(buffer, + "XID:\t%d\n" + ,p->xid); +} + +int proc_pid_vx_info(struct task_struct *p, char *buffer) +{ + char * orig = buffer; + + buffer = task_vx_info(p, buffer); + return buffer - orig; +} + +char *task_nx_info(struct 
/*
 * Write the network context info of task p into buffer by means of
 * task_nx_info() and return the number of bytes that were produced.
 */
int proc_pid_nx_info(struct task_struct *p, char *buffer)
{
	char *end = task_nx_info(p, buffer);

	return end - buffer;
}
+ /* we have some tokens left */ + return tokens; +} + +/* + * effective_prio - return the priority that is based on the static + * priority but is modified by bonuses/penalties. + * + * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] + * into a -4 ... 0 ... +4 bonus/penalty range. + * + * Additionally, we scale another amount based on the number of + * CPU tokens currently held by the context, if the process is + * part of a context (and the appropriate SCHED flag is set). + * This ranges from -5 ... 0 ... +15, quadratically. + * + * So, the total bonus is -9 .. 0 .. +19 + * We use ~50% of the full 0...39 priority range so that: + * + * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. + * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. + * unless that context is far exceeding its CPU allocation. + * + * Both properties are important to certain workloads. + */ +int effective_vavavoom(task_t *p, int max_prio) +{ + struct vx_info *vxi = p->vx_info; + int vavavoom, max; + + /* lots of tokens = lots of vavavoom + * no tokens = no vavavoom */ + if ((vavavoom = atomic_read(&vxi->sched.tokens)) >= 0) { + max = vxi->sched.tokens_max; + vavavoom = max - vavavoom; + max = max * max; + vavavoom = max_prio * VAVAVOOM_RATIO / 100 + * (vavavoom*vavavoom - (max >> 2)) / max; + /* alternative, geometric mapping + vavavoom = -( MAX_USER_PRIO*VAVAVOOM_RATIO/100 * vavavoom + / vxi->sched.tokens_max - + MAX_USER_PRIO*VAVAVOOM_RATIO/100/2); */ + } else + vavavoom = 0; + /* vavavoom = ( MAX_USER_PRIO*VAVAVOOM_RATIO/100*tokens_left(p) - + MAX_USER_PRIO*VAVAVOOM_RATIO/100/2); */ + + return vavavoom; +} + + +int vc_set_sched(uint32_t xid, void __user *data) +{ + struct vcmd_set_sched_v2 vc_data; + struct vx_info *vxi; + + if (copy_from_user (&vc_data, data, sizeof(vc_data))) + return -EFAULT; + + vxi = find_vx_info(xid); + if (!vxi) + return -EINVAL; + + spin_lock(&vxi->sched.tokens_lock); + + if (vc_data.interval != SCHED_KEEP) + vxi->sched.interval = 
vc_data.interval; + if (vc_data.fill_rate != SCHED_KEEP) + vxi->sched.fill_rate = vc_data.fill_rate; + if (vc_data.tokens_min != SCHED_KEEP) + vxi->sched.tokens_min = vc_data.tokens_min; + if (vc_data.tokens_max != SCHED_KEEP) + vxi->sched.tokens_max = vc_data.tokens_max; + if (vc_data.tokens != SCHED_KEEP) + atomic_set(&vxi->sched.tokens, vc_data.tokens); + + /* Sanity check the resultant values */ + if (vxi->sched.fill_rate <= 0) + vxi->sched.fill_rate = 1; + if (vxi->sched.interval <= 0) + vxi->sched.interval = HZ; + if (vxi->sched.tokens_max == 0) + vxi->sched.tokens_max = 1; + if (atomic_read(&vxi->sched.tokens) > vxi->sched.tokens_max) + atomic_set(&vxi->sched.tokens, vxi->sched.tokens_max); + if (vxi->sched.tokens_min > vxi->sched.tokens_max) + vxi->sched.tokens_min = vxi->sched.tokens_max; + + spin_unlock(&vxi->sched.tokens_lock); + put_vx_info(vxi); + return 0; +} + diff --git a/kernel/vserver/signal.c b/kernel/vserver/signal.c new file mode 100644 index 000000000..464ea1be4 --- /dev/null +++ b/kernel/vserver/signal.c @@ -0,0 +1,85 @@ +/* + * linux/kernel/vserver/signal.c + * + * Virtual Server: Signal Support + * + * Copyright (C) 2003-2004 Herbert Pötzl + * + * V0.01 broken out from vcontext V0.05 + * + */ + +#include +#include + +#include +#include + +#include +#include + + +int vc_ctx_kill(uint32_t id, void __user *data) +{ + int retval, count=0; + struct vcmd_ctx_kill_v0 vc_data; + struct siginfo info; + struct task_struct *p; + struct vx_info *vxi; + + if (!vx_check(0, VX_ADMIN)) + return -ENOSYS; + if (copy_from_user (&vc_data, data, sizeof(vc_data))) + return -EFAULT; + + info.si_signo = vc_data.sig; + info.si_errno = 0; + info.si_code = SI_USER; + info.si_pid = current->pid; + info.si_uid = current->uid; + + vxi = find_vx_info(id); + if (!vxi) + return -ESRCH; + + retval = -ESRCH; + read_lock(&tasklist_lock); + switch (vc_data.pid) { + case -1: + case 0: + for_each_process(p) { + int err = 0; + + if (vx_task_xid(p) != id || p->pid <= 1 || + 
(vc_data.pid && vxi->vx_initpid == p->pid) || + !thread_group_leader(p)) + continue; + + err = send_sig_info(vc_data.sig, &info, p); + ++count; + if (err != -EPERM) + retval = err; + } + break; + + default: + p = find_task_by_pid(vc_data.pid); + if (p) { + if (!thread_group_leader(p)) { + struct task_struct *tg; + + tg = find_task_by_pid(p->tgid); + if (tg) + p = tg; + } + if ((id == -1) || (vx_task_xid(p) == id)) + retval = send_sig_info(vc_data.sig, &info, p); + } + break; + } + read_unlock(&tasklist_lock); + put_vx_info(vxi); + return retval; +} + + diff --git a/kernel/vserver/switch.c b/kernel/vserver/switch.c new file mode 100644 index 000000000..90fee1412 --- /dev/null +++ b/kernel/vserver/switch.c @@ -0,0 +1,170 @@ +/* + * linux/kernel/vserver/switch.c + * + * Virtual Server: Syscall Switch + * + * Copyright (C) 2003-2004 Herbert Pötzl + * + * V0.01 syscall switch + * V0.02 added signal to context + * V0.03 added rlimit functions + * V0.04 added iattr, task/xid functions + * + */ + +#include +#include +#include + +#include +#include + + +static inline int +vc_get_version(uint32_t id) +{ + return VCI_VERSION; +} + + +#include +#include +#include +#include +#include +#include +#include +#include + + +extern unsigned int vx_debug_switch; + + +extern asmlinkage long +sys_vserver(uint32_t cmd, uint32_t id, void __user *data) +{ + + if (vx_debug_switch) + printk( "vc: VCMD_%02d_%d[%d], %d\n", + VC_CATEGORY(cmd), VC_COMMAND(cmd), + VC_VERSION(cmd), id); + + switch (cmd) { + case VCMD_get_version: + return vc_get_version(id); + +#ifdef CONFIG_VSERVER_LEGACY + case VCMD_new_s_context: + return vc_new_s_context(id, data); + case VCMD_set_ipv4root: + return vc_set_ipv4root(id, data); +#endif + + case VCMD_task_xid: + return vc_task_xid(id, data); + case VCMD_vx_info: + return vc_vx_info(id, data); + + case VCMD_task_nid: + return vc_task_nid(id, data); + case VCMD_nx_info: + return vc_nx_info(id, data); + + case VCMD_set_namespace: + return vc_set_namespace(id, data); 
+ case VCMD_cleanup_namespace: + return vc_cleanup_namespace(id, data); + } + + /* those are allowed while in setup too */ + if (!vx_check(0, VX_ADMIN|VX_WATCH) && + !vx_flags(VXF_STATE_SETUP,0)) + return -EPERM; + +#ifdef CONFIG_VSERVER_LEGACY + switch (cmd) { + case VCMD_set_cflags: + case VCMD_set_ccaps: + if (vx_check(0, VX_WATCH)) + return 0; + } +#endif + + switch (cmd) { + case VCMD_get_rlimit: + return vc_get_rlimit(id, data); + case VCMD_set_rlimit: + return vc_set_rlimit(id, data); + case VCMD_get_rlimit_mask: + return vc_get_rlimit_mask(id, data); + + case VCMD_vx_get_vhi_name: + return vc_get_vhi_name(id, data); + case VCMD_vx_set_vhi_name: + return vc_set_vhi_name(id, data); + + case VCMD_set_cflags: + return vc_set_cflags(id, data); + case VCMD_get_cflags: + return vc_get_cflags(id, data); + + case VCMD_set_ccaps: + return vc_set_ccaps(id, data); + case VCMD_get_ccaps: + return vc_get_ccaps(id, data); + + case VCMD_set_nflags: + return vc_set_nflags(id, data); + case VCMD_get_nflags: + return vc_get_nflags(id, data); + + case VCMD_set_ncaps: + return vc_set_ncaps(id, data); + case VCMD_get_ncaps: + return vc_get_ncaps(id, data); + + case VCMD_set_sched: + return vc_set_sched(id, data); + } + + /* below here only with VX_ADMIN */ + if (!vx_check(0, VX_ADMIN|VX_WATCH)) + return -EPERM; + + switch (cmd) { + case VCMD_ctx_kill: + return vc_ctx_kill(id, data); + +#ifdef CONFIG_VSERVER_LEGACY + case VCMD_create_context: + return vc_ctx_create(id, data); +#endif + + case VCMD_get_iattr: + return vc_get_iattr(id, data); + case VCMD_set_iattr: + return vc_set_iattr(id, data); + + case VCMD_enter_namespace: + return vc_enter_namespace(id, data); + + case VCMD_ctx_create: +#ifdef CONFIG_VSERVER_LEGACY + if (id == 1) { + current->xid = 1; + return 1; + } +#endif + return vc_ctx_create(id, data); + case VCMD_ctx_migrate: + return vc_ctx_migrate(id, data); + + case VCMD_net_create: + return vc_net_create(id, data); + case VCMD_net_migrate: + return 
vc_net_migrate(id, data); + + } + return -ENOSYS; +} + diff --git a/kernel/vserver/sysctl.c b/kernel/vserver/sysctl.c new file mode 100644 index 000000000..562fc0eab --- /dev/null +++ b/kernel/vserver/sysctl.c @@ -0,0 +1,150 @@ +/* + * linux/kernel/sysctl.c + * + * Virtual Context Support + * + * Copyright (C) 2004 Herbert Pötzl + * + * V0.01 basic structure + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + + +#define CTL_VSERVER 4242 /* unused? */ + +enum { + CTL_DEBUG_SWITCH = 1, + CTL_DEBUG_LIMIT, +}; + + +unsigned int vx_debug_switch = 0; +unsigned int vx_debug_limit = 0; + + +static struct ctl_table_header *vserver_table_header; +static ctl_table vserver_table[]; + + +void vserver_register_sysctl(void) +{ + if (!vserver_table_header) { + vserver_table_header = register_sysctl_table(vserver_table, 1); +#ifdef CONFIG_PROC_FS +// if (vserver_table[0].de) +// vserver_table[0].de->owner = THIS_MODULE; +#endif + } + +} + +void vserver_unregister_sysctl(void) +{ + if (vserver_table_header) { + unregister_sysctl_table(vserver_table_header); + vserver_table_header = NULL; + } +} + + +static int proc_dodebug(ctl_table *table, int write, + struct file *file, void *buffer, size_t *lenp) +{ + char tmpbuf[20], *p, c; + unsigned int value; + size_t left, len; + + if ((file->f_pos && !write) || !*lenp) { + *lenp = 0; + return 0; + } + + left = *lenp; + + if (write) { + if (!access_ok(VERIFY_READ, buffer, left)) + return -EFAULT; + p = (char *) buffer; + while (left && __get_user(c, p) >= 0 && isspace(c)) + left--, p++; + if (!left) + goto done; + + if (left > sizeof(tmpbuf) - 1) + return -EINVAL; + if (copy_from_user(tmpbuf, p, left)) + return -EFAULT; + tmpbuf[left] = '\0'; + + for (p = tmpbuf, value = 0; '0' <= *p && *p <= '9'; p++, left--) + value = 10 * value + (*p - '0'); + if (*p && !isspace(*p)) + return -EINVAL; + while (left && isspace(*p)) + left--, p++; + *(unsigned int *) table->data = value; + } 
else { + if (!access_ok(VERIFY_WRITE, buffer, left)) + return -EFAULT; + len = sprintf(tmpbuf, "%d", *(unsigned int *) table->data); + if (len > left) + len = left; + if (__copy_to_user(buffer, tmpbuf, len)) + return -EFAULT; + if ((left -= len) > 0) { + if (put_user('\n', (char *)buffer + len)) + return -EFAULT; + left--; + } + } + +done: + *lenp -= left; + file->f_pos += *lenp; + return 0; +} + + + +static ctl_table debug_table[] = { + { + .ctl_name = CTL_DEBUG_SWITCH, + .procname = "debug_switch", + .data = &vx_debug_switch, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dodebug + }, + { + .ctl_name = CTL_DEBUG_LIMIT, + .procname = "debug_limit", + .data = &vx_debug_limit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dodebug + }, + { .ctl_name = 0 } +}; + +static ctl_table vserver_table[] = { + { + .ctl_name = CTL_VSERVER, + .procname = "vserver", + .mode = 0555, + .child = debug_table + }, + { .ctl_name = 0 } +}; +