From: Marc Fiuczynski Date: Fri, 11 Feb 2005 05:34:41 +0000 (+0000) Subject: upgrade to vserver 1.9.3.17 X-Git-Tag: before-fedora-2_6_18-1_2239_FC5-vs2_0_2_2-rc6-merge~253 X-Git-Url: http://git.onelab.eu/?a=commitdiff_plain;h=887a12fc42875cae0e9bf095b812aa8e6992e1f3;p=linux-2.6.git upgrade to vserver 1.9.3.17 --- diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index 6880e9694..1e4f78c0a 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -330,14 +330,6 @@ config MVIAC3_2 endchoice -config X86_HZ - int "Clock Tick Rate" - default 1000 if !(M386 || M486 || M586 || M586TSC || M586MMX) - default 100 if (M386 || M486 || M586 || M586TSC || M586MMX) - help - Select the kernel clock tick rate in interrupts per second. - Slower processors should choose 100; everything else 1000. - config X86_GENERIC bool "Generic x86 support" help @@ -561,6 +553,14 @@ config X86_IO_APIC depends on !SMP && X86_UP_IOAPIC default y +config KERNEL_HZ + int "Timer Frequency (100-20000)" + range 100 20000 + default "1000" + help + This allows you to specify the frequency at which the + kernel timer interrupt will occur. 
+ config X86_TSC bool depends on (MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2) && !X86_NUMAQ diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 2c4351d73..adeaef605 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -53,6 +53,7 @@ #include #include +#include #include "mach_traps.h" @@ -306,6 +307,7 @@ void die(const char * str, struct pt_regs * regs, long err) }; static int die_counter; + vxh_throw_oops(); if (die.lock_owner != smp_processor_id()) { console_verbose(); spin_lock_irq(&die.lock); @@ -341,6 +343,7 @@ void die(const char * str, struct pt_regs * regs, long err) bust_spinlocks(0); die.lock_owner = -1; spin_unlock_irq(&die.lock); + vxh_dump_history(); if (in_interrupt()) panic("Fatal exception in interrupt"); diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index e3e7077db..82d068209 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -41,6 +41,8 @@ #include #include #include +#include +#include #include #include diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c index 25da1d4cf..8dce89454 100644 --- a/arch/ia64/mm/fault.c +++ b/arch/ia64/mm/fault.c @@ -44,10 +44,9 @@ expand_backing_store (struct vm_area_struct *vma, unsigned long address) vma->vm_end += PAGE_SIZE; // vma->vm_mm->total_vm += grow; vx_vmpages_add(vma->vm_mm, grow); - if (vma->vm_flags & VM_LOCKED) { + if (vma->vm_flags & VM_LOCKED) // vma->vm_mm->locked_vm += grow; vx_vmlocked_add(vma->vm_mm, grow); - } __vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file, grow); return 0; } diff --git a/arch/mips/kernel/syscall.c b/arch/mips/kernel/syscall.c index 84e2ee6ca..5d8ec65c3 100644 --- a/arch/mips/kernel/syscall.c +++ b/arch/mips/kernel/syscall.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include diff --git a/arch/ppc64/mm/hugetlbpage.c 
b/arch/ppc64/mm/hugetlbpage.c index 0c96be567..e9ecc72f1 100644 --- a/arch/ppc64/mm/hugetlbpage.c +++ b/arch/ppc64/mm/hugetlbpage.c @@ -154,7 +154,7 @@ static void set_huge_pte(struct mm_struct *mm, struct vm_area_struct *vma, pte_t entry; // mm->rss += (HPAGE_SIZE / PAGE_SIZE); - vx_rsspages_sub(mm, HPAGE_SIZE / PAGE_SIZE); + vx_rsspages_add(mm, HPAGE_SIZE / PAGE_SIZE); if (write_access) { entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); @@ -422,7 +422,8 @@ void unmap_hugepage_range(struct vm_area_struct *vma, put_page(page); } - mm->rss -= (end - start) >> PAGE_SHIFT; + // mm->rss -= (end - start) >> PAGE_SHIFT; + vx_rsspages_sub(mm, (end - start) >> PAGE_SHIFT); flush_tlb_pending(); } diff --git a/arch/sh64/mm/hugetlbpage.c b/arch/sh64/mm/hugetlbpage.c index 50b25735f..edbbc4317 100644 --- a/arch/sh64/mm/hugetlbpage.c +++ b/arch/sh64/mm/hugetlbpage.c @@ -62,8 +62,8 @@ static void set_huge_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long i; pte_t entry; - mm->rss += (HPAGE_SIZE / PAGE_SIZE); - + // mm->rss += (HPAGE_SIZE / PAGE_SIZE); + vx_rsspages_add(mm, HPAGE_SIZE / PAGE_SIZE); if (write_access) entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); @@ -115,7 +115,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, pte_val(entry) += PAGE_SIZE; dst_pte++; } - dst->rss += (HPAGE_SIZE / PAGE_SIZE); + // dst->rss += (HPAGE_SIZE / PAGE_SIZE); + vx_rsspages_add(dst, HPAGE_SIZE / PAGE_SIZE); addr += HPAGE_SIZE; } return 0; @@ -206,7 +207,8 @@ void unmap_hugepage_range(struct vm_area_struct *vma, pte++; } } - mm->rss -= (end - start) >> PAGE_SHIFT; + // mm->rss -= (end - start) >> PAGE_SHIFT; + vx_rsspages_sub(mm, (end - start) >> PAGE_SHIFT); flush_tlb_range(vma, start, end); } diff --git a/arch/um/kernel/process_kern.c b/arch/um/kernel/process_kern.c index b701cb293..bae4b7327 100644 --- a/arch/um/kernel/process_kern.c +++ b/arch/um/kernel/process_kern.c @@ -22,6 +22,8 @@ #include 
"linux/vs_cvirt.h" #include "linux/proc_fs.h" #include "linux/ptrace.h" +#include "linux/vs_cvirt.h" + #include "asm/unistd.h" #include "asm/mman.h" #include "asm/segment.h" diff --git a/fs/attr.c b/fs/attr.c index 5f78d75a8..fed119204 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -14,9 +14,9 @@ #include #include #include -#include #include #include +#include /* Taken over from the old code... */ @@ -64,22 +64,19 @@ int inode_change_ok(struct inode *inode, struct iattr *attr) goto fine; if (IS_BARRIER(inode)) { - printk(KERN_WARNING - "VSW: xid=%d messing with the barrier.\n", + vxwprintk(1, "xid=%d messing with the barrier.", vx_current_xid()); goto error; } switch (inode->i_sb->s_magic) { case PROC_SUPER_MAGIC: - printk(KERN_WARNING - "VSW: xid=%d messing with the procfs.\n", + vxwprintk(1, "xid=%d messing with the procfs.", vx_current_xid()); goto error; case DEVPTS_SUPER_MAGIC: if (vx_check(inode->i_xid, VX_IDENT)) goto fine; - printk(KERN_WARNING - "VSW: xid=%d messing with the devpts.\n", + vxwprintk(1, "xid=%d messing with the devpts.", vx_current_xid()); goto error; } diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 6fb3d1f5a..004d7ac2f 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -32,6 +32,25 @@ static struct xattr_handler *devpts_xattr_handlers[] = { NULL }; +static int devpts_permission(struct inode *inode, int mask, struct nameidata *nd) +{ + int ret = -EACCES; + + if (vx_check(inode->i_xid, VX_IDENT)) + ret = generic_permission(inode, mask, NULL); + return ret; +} + +struct inode_operations devpts_file_inode_operations = { +#ifdef CONFIG_DEVPTS_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = generic_listxattr, + .removexattr = generic_removexattr, +#endif + .permission = devpts_permission, +}; + static struct vfsmount *devpts_mnt; static struct dentry *devpts_root; @@ -208,26 +227,6 @@ static struct dentry *get_node(int num) return lookup_one_len(s, root, sprintf(s, "%d", num)); } -#ifdef 
CONFIG_DEVPTS_FS_XATTR -static int devpts_permission(struct inode *inode, int mask, struct nameidata *nd) -{ - int ret = -EACCES; - - if (vx_check(inode->i_xid, VX_IDENT)) - ret = generic_permission(inode, mask, NULL); - return ret; -} -#endif - -struct inode_operations devpts_file_inode_operations = { -#ifdef CONFIG_DEVPTS_FS_XATTR - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, - .listxattr = generic_listxattr, - .removexattr = generic_removexattr, - .permission = devpts_permission, -#endif -}; int devpts_pty_new(struct tty_struct *tty) { diff --git a/fs/exec.c b/fs/exec.c index b9888ba4e..b8b650a66 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -48,8 +48,8 @@ #include #include #include -#include #include +#include #include #include diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c index 5fbe1ca0b..2aa585001 100644 --- a/fs/ext2/balloc.c +++ b/fs/ext2/balloc.c @@ -16,7 +16,6 @@ #include #include #include -#include #include /* diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index 3d9fa57ca..3272b0225 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -18,7 +18,7 @@ #include #include #include -#include + #include #include "ext2.h" @@ -470,7 +470,7 @@ struct inode *ext2_new_inode(struct inode *dir, int mode) return ERR_PTR(-ENOMEM); if (sb->s_flags & MS_TAGXID) - inode->i_xid = current->xid; + inode->i_xid = vx_current_xid(); else inode->i_xid = 0; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index ba3cc9959..ffd30ed7a 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1191,7 +1191,7 @@ static int ext2_update_inode(struct inode * inode, int do_sync) raw_inode->i_uid_high = 0; raw_inode->i_gid_high = 0; } -#ifdef CONFIG_INOXID_GID32 +#ifdef CONFIG_INOXID_INTERN raw_inode->i_raw_xid = cpu_to_le16(inode->i_xid); #endif raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c index 594c16c80..96bfa89cf 100644 --- a/fs/ext2/ioctl.c +++ b/fs/ext2/ioctl.c @@ -50,11 +50,11 @@ int ext2_ioctl (struct inode * 
inode, struct file * filp, unsigned int cmd, * * This test looks nicer. Thanks to Pauline Middelink */ - if (((oldflags & EXT2_IMMUTABLE_FL) || - ((flags ^ oldflags) & - (EXT2_APPEND_FL | EXT2_IMMUTABLE_FL | EXT2_IUNLINK_FL))) - && !capable(CAP_LINUX_IMMUTABLE)) { - return -EPERM; + if ((oldflags & EXT2_IMMUTABLE_FL) || + ((flags ^ oldflags) & (EXT2_APPEND_FL | + EXT2_IMMUTABLE_FL | EXT2_IUNLINK_FL))) { + if (!capable(CAP_LINUX_IMMUTABLE)) + return -EPERM; } flags = flags & EXT2_FL_USER_MODIFIABLE; diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 4c616671c..bb62484e4 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -31,6 +31,7 @@ */ #include +#include #include "ext2.h" #include "xattr.h" #include "acl.h" @@ -81,6 +82,7 @@ static struct dentry *ext2_lookup(struct inode * dir, struct dentry *dentry, str inode = iget(dir->i_sb, ino); if (!inode) return ERR_PTR(-EACCES); + vx_propagate_xid(nd, inode); } if (inode) return d_splice_alias(inode, dentry); diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index 483913843..47fff3be5 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -19,7 +19,6 @@ #include #include #include -#include #include /* diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index 8c6456a32..b7d4e57fc 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -24,6 +24,7 @@ #include #include #include +#include #include @@ -447,7 +448,7 @@ struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, int mode) return ERR_PTR(-ENOMEM); if (sb->s_flags & MS_TAGXID) - inode->i_xid = current->xid; + inode->i_xid = vx_current_xid(); else inode->i_xid = 0; diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index fac1e98a0..2a452802f 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -2582,7 +2582,7 @@ static int ext3_do_update_inode(handle_t *handle, raw_inode->i_uid_high = 0; raw_inode->i_gid_high = 0; } -#ifdef CONFIG_INOXID_GID32 +#ifdef CONFIG_INOXID_INTERN raw_inode->i_raw_xid = cpu_to_le16(inode->i_xid); #endif raw_inode->i_links_count = 
cpu_to_le16(inode->i_nlink); diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c index a040edf93..aaf679cb4 100644 --- a/fs/ext3/ioctl.c +++ b/fs/ext3/ioctl.c @@ -60,11 +60,11 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, * * This test looks nicer. Thanks to Pauline Middelink */ - if (((oldflags & EXT3_IMMUTABLE_FL) || - ((flags ^ oldflags) & - (EXT3_APPEND_FL | EXT3_IMMUTABLE_FL | EXT3_IUNLINK_FL))) - && !capable(CAP_LINUX_IMMUTABLE)) { - return -EPERM; + if ((oldflags & EXT3_IMMUTABLE_FL) || + ((flags ^ oldflags) & (EXT3_APPEND_FL | + EXT3_IMMUTABLE_FL | EXT3_IUNLINK_FL))) { + if (!capable(CAP_LINUX_IMMUTABLE)) + return -EPERM; } /* @@ -156,38 +156,6 @@ flags_err: remove_wait_queue(&EXT3_SB(sb)->ro_wait_queue, &wait); return ret; } -#endif -#if defined(CONFIG_VSERVER_LEGACY) && !defined(CONFIG_INOXID_NONE) - case EXT3_IOC_SETXID: { - handle_t *handle; - struct ext3_iloc iloc; - int xid; - int err; - - /* fixme: if stealth, return -ENOTTY */ - if (!capable(CAP_CONTEXT)) - return -EPERM; - if (IS_RDONLY(inode)) - return -EROFS; - if (!(inode->i_sb->s_flags & MS_TAGXID)) - return -ENOSYS; - if (get_user(xid, (int *) arg)) - return -EFAULT; - - handle = ext3_journal_start(inode, 1); - if (IS_ERR(handle)) - return PTR_ERR(handle); - err = ext3_reserve_inode_write(handle, inode, &iloc); - if (err) - return err; - - inode->i_xid = (xid & 0xFFFF); - inode->i_ctime = CURRENT_TIME; - - err = ext3_mark_iloc_dirty(handle, inode, &iloc); - ext3_journal_stop(handle); - return err; - } #endif case EXT3_IOC_GETRSVSZ: if (test_opt(inode->i_sb, RESERVATION) && S_ISREG(inode->i_mode)) { @@ -256,6 +224,39 @@ flags_err: return err; } +#if defined(CONFIG_VSERVER_LEGACY) && !defined(CONFIG_INOXID_NONE) + case EXT3_IOC_SETXID: { + handle_t *handle; + struct ext3_iloc iloc; + int xid; + int err; + + /* fixme: if stealth, return -ENOTTY */ + if (!capable(CAP_CONTEXT)) + return -EPERM; + if (IS_RDONLY(inode)) + return -EROFS; + if (!(inode->i_sb->s_flags & 
MS_TAGXID)) + return -ENOSYS; + if (get_user(xid, (int *) arg)) + return -EFAULT; + + handle = ext3_journal_start(inode, 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + err = ext3_reserve_inode_write(handle, inode, &iloc); + if (err) + return err; + + inode->i_xid = (xid & 0xFFFF); + inode->i_ctime = CURRENT_TIME; + + err = ext3_mark_iloc_dirty(handle, inode, &iloc); + ext3_journal_stop(handle); + return err; + } +#endif + default: return -ENOTTY; } diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index bfaf8a414..b0b8e1091 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -36,6 +36,7 @@ #include #include #include +#include #include "xattr.h" #include "acl.h" @@ -989,6 +990,7 @@ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, str if (!inode) return ERR_PTR(-EACCES); + vx_propagate_xid(nd, inode); } if (inode) return d_splice_alias(inode, dentry); diff --git a/fs/file_table.c b/fs/file_table.c index 75c94a47f..d68ac3370 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -88,7 +88,7 @@ static int old_max; /* f->f_version: 0 */ INIT_LIST_HEAD(&f->f_list); // set_vx_info(&f->f_vx_info, current->vx_info); - f->f_xid = current->xid; + f->f_xid = vx_current_xid(); vx_files_inc(f); return f; } diff --git a/fs/inode.c b/fs/inode.c index a93f58c6a..471010b20 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -118,7 +118,7 @@ static struct inode *alloc_inode(struct super_block *sb) inode->i_sb = sb; // inode->i_dqh = dqhget(sb->s_dqh); - /* important because of inode slab reuse */ + /* essential because of inode slab reuse */ inode->i_xid = 0; inode->i_blkbits = sb->s_blocksize_bits; inode->i_flags = 0; diff --git a/fs/ioctl.c b/fs/ioctl.c index 6af7a74c8..19e902dc3 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -174,19 +174,6 @@ asmlinkage long sys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg) error = vx_proc_ioctl(filp->f_dentry->d_inode, filp, cmd, arg); break; #endif - case FIOC_SETIATTR: - case FIOC_GETIATTR: - /* - * Verify 
that this filp is a file object, - * not (say) a socket. - */ - error = -ENOTTY; - if (S_ISREG(filp->f_dentry->d_inode->i_mode) || - S_ISDIR(filp->f_dentry->d_inode->i_mode)) - error = vc_iattr_ioctl(filp->f_dentry, - cmd, arg); - break; - default: error = -ENOTTY; if (S_ISREG(filp->f_dentry->d_inode->i_mode)) diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index 9c483a635..efba306ae 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -46,7 +46,6 @@ #include #include #include -#include #include "jfs_incore.h" #include "jfs_filsys.h" diff --git a/fs/namei.c b/fs/namei.c index 6fb8c2532..6e7463615 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -28,7 +28,9 @@ #include #include #include -#include +#include +#include +#include #include #include @@ -230,6 +232,24 @@ int generic_permission(struct inode *inode, int mask, return -EACCES; } +static inline int xid_permission(struct inode *inode, int mask, struct nameidata *nd) +{ + if (IS_BARRIER(inode) && !vx_check(0, VX_ADMIN)) { + vxwprintk(1, "xid=%d did hit the barrier.", + vx_current_xid()); + return -EACCES; + } + if (inode->i_xid == 0) + return 0; + if (vx_check(inode->i_xid, VX_ADMIN|VX_WATCH|VX_IDENT)) + return 0; + + vxwprintk(1, "xid=%d denied access to %p[#%d,%lu] »%s«.", + vx_current_xid(), inode, inode->i_xid, inode->i_ino, + vxd_path(nd->dentry, nd->mnt)); + return -EACCES; +} + int permission(struct inode * inode,int mask, struct nameidata *nd) { int retval; @@ -243,6 +263,9 @@ int permission(struct inode * inode,int mask, struct nameidata *nd) (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) return -EROFS; + if ((retval = xid_permission(inode, mask, nd))) + return retval; + if (inode->i_op && inode->i_op->permission) retval = inode->i_op->permission(inode, submask, nd); else @@ -645,15 +668,33 @@ static int do_lookup(struct nameidata *nd, struct qstr *name, { struct vfsmount *mnt = nd->mnt; struct dentry *dentry = __d_lookup(nd->dentry, name); + struct inode *inode; if (!dentry) goto need_lookup; if 
(dentry->d_op && dentry->d_op->d_revalidate) goto need_revalidate; + inode = dentry->d_inode; + if (!inode) + goto done; + if (!vx_check(inode->i_xid, VX_WATCH|VX_HOSTID|VX_IDENT)) + goto hidden; + if (inode->i_sb->s_magic == PROC_SUPER_MAGIC) { + struct proc_dir_entry *de = PDE(inode); + + if (de && !vx_hide_check(0, de->vx_flags)) + goto hidden; + } done: path->mnt = mnt; path->dentry = dentry; return 0; +hidden: + vxwprintk(1, "xid=%d did lookup hidden %p[#%d,%lu] »%s«.", + vx_current_xid(), inode, inode->i_xid, inode->i_ino, + vxd_path(dentry, mnt)); + dput(dentry); + return -ENOENT; need_lookup: if (atomic) diff --git a/fs/namespace.c b/fs/namespace.c index ed977ebbc..da22d9350 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -22,8 +22,8 @@ #include #include #include -#include #include +#include #include #include @@ -164,6 +164,7 @@ clone_mnt(struct vfsmount *old, struct dentry *root) mnt->mnt_mountpoint = mnt->mnt_root; mnt->mnt_parent = mnt; mnt->mnt_namespace = old->mnt_namespace; + mnt->mnt_xid = old->mnt_xid; /* stick the duplicate mount on the same expiry list * as the original if that was on one */ @@ -244,6 +245,11 @@ static int show_vfsmnt(struct seq_file *m, void *v) unsigned long s_flags = mnt->mnt_sb->s_flags; int mnt_flags = mnt->mnt_flags; + if (vx_flags(VXF_HIDE_MOUNT, 0)) + return 0; + if (!vx_check_vfsmount(current->vx_info, mnt)) + return 0; + if (vx_flags(VXF_HIDE_MOUNT, 0)) return 0; if (!vx_check_vfsmount(current->vx_info, mnt)) @@ -264,6 +270,8 @@ static int show_vfsmnt(struct seq_file *m, void *v) seq_puts(m, p->unset_str); } } + if (mnt->mnt_flags & MNT_XID) + seq_printf(m, ",xid=%d", mnt->mnt_xid); if (mnt->mnt_sb->s_op->show_options) err = mnt->mnt_sb->s_op->show_options(m, mnt); seq_puts(m, " 0 0\n"); @@ -349,8 +357,10 @@ int may_umount(struct vfsmount *mnt) EXPORT_SYMBOL(may_umount); -static inline void __umount_tree(struct vfsmount *mnt, struct list_head *kill) +static inline void __umount_list(struct list_head *kill) { + struct 
vfsmount *mnt; + while (!list_empty(kill)) { mnt = list_entry(kill->next, struct vfsmount, mnt_list); list_del_init(&mnt->mnt_list); @@ -377,7 +387,7 @@ void umount_tree(struct vfsmount *mnt) list_del(&p->mnt_list); list_add(&p->mnt_list, &kill); } - __umount_tree(mnt, &kill); + __umount_list(&kill); } void umount_unused(struct vfsmount *mnt, struct fs_struct *fs) @@ -391,7 +401,7 @@ void umount_unused(struct vfsmount *mnt, struct fs_struct *fs) list_del(&p->mnt_list); list_add(&p->mnt_list, &kill); } - __umount_tree(mnt, &kill); + __umount_list(&kill); } static int do_umount(struct vfsmount *mnt, int flags) @@ -650,7 +660,7 @@ out_unlock: /* * do loopback mount. */ -static int do_loopback(struct nameidata *nd, char *old_name, unsigned long flags, int mnt_flags) +static int do_loopback(struct nameidata *nd, char *old_name, xid_t xid, unsigned long flags, int mnt_flags) { struct nameidata old_nd; struct vfsmount *mnt = NULL; @@ -681,6 +691,10 @@ static int do_loopback(struct nameidata *nd, char *old_name, unsigned long flags list_del_init(&mnt->mnt_fslink); spin_unlock(&vfsmount_lock); + if (flags & MS_XID) { + mnt->mnt_xid = xid; + mnt->mnt_flags |= MNT_XID; + } err = graft_tree(mnt, nd); if (err) { spin_lock(&vfsmount_lock); @@ -703,7 +717,7 @@ static int do_loopback(struct nameidata *nd, char *old_name, unsigned long flags */ static int do_remount(struct nameidata *nd, int flags, int mnt_flags, - void *data) + void *data, xid_t xid) { int err; struct super_block * sb = nd->mnt->mnt_sb; @@ -721,8 +735,11 @@ static int do_remount(struct nameidata *nd, int flags, int mnt_flags, mnt_flags |= MNT_NODEV; down_write(&sb->s_umount); err = do_remount_sb(sb, flags, data, 0); - if (!err) + if (!err) { nd->mnt->mnt_flags=mnt_flags; + if (flags & MS_XID) + nd->mnt->mnt_xid = xid; + } up_write(&sb->s_umount); if (!err) security_sb_post_remount(nd->mnt, flags, data); @@ -1048,6 +1065,7 @@ long do_mount(char * dev_name, char * dir_name, char *type_page, struct nameidata nd; int 
retval = 0; int mnt_flags = 0; + xid_t xid = 0; /* Discard magic */ if ((flags & MS_MGC_MSK) == MS_MGC_VAL) @@ -1063,6 +1081,14 @@ long do_mount(char * dev_name, char * dir_name, char *type_page, if (data_page) ((char *)data_page)[PAGE_SIZE - 1] = 0; + retval = vx_parse_xid(data_page, &xid, 1); + if (retval) { + mnt_flags |= MNT_XID; + /* bind and re-mounts get xid flag */ + if (flags & (MS_BIND|MS_REMOUNT)) + flags |= MS_XID; + } + /* Separate the per-mountpoint flags */ if (flags & MS_RDONLY) mnt_flags |= MNT_RDONLY; @@ -1092,9 +1118,10 @@ long do_mount(char * dev_name, char * dir_name, char *type_page, if (flags & MS_REMOUNT) retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags, - data_page); + data_page, xid); else if (flags & MS_BIND) - retval = do_loopback(&nd, dev_name, flags, mnt_flags); + retval = do_loopback(&nd, dev_name, xid, flags, mnt_flags); + else if (flags & MS_MOVE) retval = do_move_mount(&nd, dev_name); else diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 34a3c1f74..0547efd1d 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -31,6 +31,7 @@ #include #include #include +#include #include "delegation.h" @@ -759,6 +760,7 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru inode = nfs_fhget(dentry->d_sb, &fhandle, &fattr); if (!inode) goto out_unlock; + vx_propagate_xid(nd, inode); no_entry: error = 0; d_add(dentry, inode); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 60b307478..38318ceae 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -723,7 +723,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) out: return inode; -/* +/* FIXME fail_dlim: make_bad_inode(inode); iput(inode); diff --git a/fs/open.c b/fs/open.c index 39b0d45a0..f09f6488f 100644 --- a/fs/open.c +++ b/fs/open.c @@ -27,6 +27,9 @@ #include #include #include +#include +#include +#include #include diff --git a/fs/proc/array.c b/fs/proc/array.c index a29937cf0..909c8ab39 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c 
@@ -73,7 +73,6 @@ #include #include #include -#include #include #include #include @@ -146,8 +145,8 @@ static inline const char * get_task_state(struct task_struct *tsk) TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE | TASK_STOPPED | - TASK_TRACED | - TASK_ONHOLD)) | + TASK_TRACED | + TASK_ONHOLD)) | (tsk->exit_state & (EXIT_ZOMBIE | EXIT_DEAD)); const char **p = &task_state_array[0]; @@ -163,12 +162,12 @@ static inline char * task_state(struct task_struct *p, char *buffer) { struct group_info *group_info; int g; - pid_t pid, ppid, tppid, tgid; + pid_t pid, ptgid, tppid, tgid; read_lock(&tasklist_lock); tgid = vx_map_tgid(p->tgid); pid = vx_map_pid(p->pid); - ppid = vx_map_pid(p->real_parent->pid); + ptgid = vx_map_pid(p->group_leader->real_parent->tgid); tppid = vx_map_pid(p->parent->pid); buffer += sprintf(buffer, "State:\t%s\n" @@ -181,8 +180,8 @@ static inline char * task_state(struct task_struct *p, char *buffer) "Gid:\t%d\t%d\t%d\t%d\n", get_task_state(p), (p->sleep_avg/1024)*100/(1020000000/1024), - tgid, pid, (pid > 1) ? ppid : 0, - p->pid && p->ptrace ? tppid : 0, + tgid, pid, (pid > 1) ? ptgid : 0, + pid_alive(p) && p->ptrace ? tppid : 0, p->uid, p->euid, p->suid, p->fsuid, p->gid, p->egid, p->sgid, p->fsgid); read_unlock(&tasklist_lock); @@ -418,10 +417,11 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole) stime += task->signal->stime; } } - if (task_vx_flags(task, VXF_VIRT_UPTIME, 0)) { - bias_uptime = task->vx_info->cvirt.bias_uptime.tv_sec * NSEC_PER_SEC - + task->vx_info->cvirt.bias_uptime.tv_nsec; - } + pid = vx_info_map_pid(task->vx_info, pid_alive(task) ? task->pid : 0); + ppid = (!(pid > 1)) ? 
0 : vx_info_map_tgid(task->vx_info, + task->group_leader->real_parent->tgid); + pgid = vx_info_map_pid(task->vx_info, pgid); + read_unlock(&tasklist_lock); if (!whole || num_threads<2) { @@ -453,9 +453,21 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole) /* convert timespec -> nsec*/ start_time = (unsigned long long)task->start_time.tv_sec * NSEC_PER_SEC + task->start_time.tv_nsec; + /* convert nsec -> ticks */ start_time = nsec_to_clock_t(start_time - bias_uptime); + /* fixup start time for virt uptime */ + if (vx_flags(VXF_VIRT_UPTIME, 0)) { + unsigned long long bias = + current->vx_info->cvirt.bias_clock; + + if (start_time > bias) + start_time -= bias; + else + start_time = 0; + } + res = sprintf(buffer,"%d (%s) %c %d %d %d %d %d %lu %lu \ %lu %lu %lu %lu %lu %ld %ld %ld %ld %d %ld %llu %lu %ld %lu %lu %lu %lu %lu \ %lu %lu %lu %lu %lu %lu %lu %lu %d %d %lu %lu\n", diff --git a/fs/proc/base.c b/fs/proc/base.c index 0a5916c2b..a4caaae4f 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1268,6 +1268,9 @@ static struct file_operations proc_tgid_attr_operations; static struct inode_operations proc_tgid_attr_inode_operations; #endif +extern int proc_pid_vx_info(struct task_struct *, char *); +extern int proc_pid_nx_info(struct task_struct *, char *); + /* SMP-safe */ static struct dentry *proc_pident_lookup(struct inode *dir, struct dentry *dentry, @@ -1530,14 +1533,14 @@ static int proc_self_readlink(struct dentry *dentry, char __user *buffer, int buflen) { char tmp[30]; - sprintf(tmp, "%d", vx_map_pid(current->tgid)); + sprintf(tmp, "%d", vx_map_tgid(current->tgid)); return vfs_readlink(dentry,buffer,buflen,tmp); } static int proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) { char tmp[30]; - sprintf(tmp, "%d", vx_map_pid(current->tgid)); + sprintf(tmp, "%d", vx_map_tgid(current->tgid)); return vfs_follow_link(nd,tmp); } diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 97e6b98b3..f42a81260 100644 --- 
a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -21,6 +21,7 @@ #include #include #include +#include #include static ssize_t proc_file_read(struct file *file, char __user *buf, @@ -388,7 +389,8 @@ struct dentry *proc_lookup(struct inode * dir, struct dentry *dentry, struct nam error = -EINVAL; inode = proc_get_inode(dir->i_sb, ino, de); - inode->i_xid = vx_current_xid(); + /* generic proc entries belong to the host */ + inode->i_xid = 0; break; } } diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index dbe1fa788..e042c2083 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -53,6 +53,8 @@ #include #include +#include + #define LOAD_INT(x) ((x) >> FSHIFT) #define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) /* diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 30e19a145..de207dc09 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -19,6 +19,7 @@ #include #include #include +#include #define INC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) { i->i_nlink++; if (i->i_nlink >= REISERFS_LINK_MAX) i->i_nlink=1; } #define DEC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) i->i_nlink--; @@ -350,6 +351,7 @@ static struct dentry * reiserfs_lookup (struct inode * dir, struct dentry * dent reiserfs_write_unlock(dir->i_sb); return ERR_PTR(-EACCES); } + vx_propagate_xid(nd, inode); /* Propogate the priv_object flag so we know we're in the priv tree */ if (is_reiserfs_priv_object (dir)) diff --git a/fs/super.c b/fs/super.c index 47d461a9c..035abec62 100644 --- a/fs/super.c +++ b/fs/super.c @@ -39,6 +39,8 @@ #include #include #include +#include +#include #include diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 57b699176..07a29a26b 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -11,8 +11,6 @@ #include "sysfs.h" -/* Random magic number */ -#define SYSFS_MAGIC 0x62656572 struct vfsmount *sysfs_mount; struct super_block * sysfs_sb = NULL; @@ -36,7 +34,7 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_blocksize 
= PAGE_CACHE_SIZE; sb->s_blocksize_bits = PAGE_CACHE_SHIFT; - sb->s_magic = SYSFS_MAGIC; + sb->s_magic = SYSFS_SUPER_MAGIC; sb->s_op = &sysfs_ops; sysfs_sb = sb; diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index 17debc1b9..4c9a2fb40 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -1013,7 +1013,7 @@ xfs_ioc_fsgeometry( #define LINUX_XFLAG_NODUMP 0x00000040 /* do not dump file */ #define LINUX_XFLAG_NOATIME 0x00000080 /* do not update atime */ #define LINUX_XFLAG_BARRIER 0x00004000 /* chroot() barrier */ -#define LINUX_XFLAG_IUNLINK 0x00008000 /* Immutable unlink */ +#define LINUX_XFLAG_IUNLINK 0x00008000 /* immutable unlink */ STATIC unsigned int xfs_merge_ioc_xflags( @@ -1056,6 +1056,8 @@ xfs_di2lxflags( flags |= LINUX_XFLAG_IMMUTABLE; if (di_flags & XFS_DIFLAG_IUNLINK) flags |= LINUX_XFLAG_IUNLINK; + if (di_flags & XFS_DIFLAG_BARRIER) + flags |= LINUX_XFLAG_BARRIER; if (di_flags & XFS_DIFLAG_APPEND) flags |= LINUX_XFLAG_APPEND; if (di_flags & XFS_DIFLAG_SYNC) diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h index 1a46def0a..425dafdd6 100644 --- a/fs/xfs/xfs_dinode.h +++ b/fs/xfs/xfs_dinode.h @@ -460,7 +460,7 @@ xfs_dinode_t *xfs_buf_to_dinode(struct xfs_buf *bp); #define XFS_DIFLAG_PROJINHERIT_BIT 9 /* create with parents projid */ #define XFS_DIFLAG_NOSYMLINKS_BIT 10 /* disallow symlink creation */ #define XFS_DIFLAG_BARRIER_BIT 12 /* chroot() barrier */ -#define XFS_DIFLAG_IUNLINK_BIT 13 /* inode has iunlink */ +#define XFS_DIFLAG_IUNLINK_BIT 13 /* immutable unlink */ #define XFS_DIFLAG_REALTIME (1 << XFS_DIFLAG_REALTIME_BIT) #define XFS_DIFLAG_PREALLOC (1 << XFS_DIFLAG_PREALLOC_BIT) @@ -476,6 +476,7 @@ xfs_dinode_t *xfs_buf_to_dinode(struct xfs_buf *bp); #define XFS_DIFLAG_BARRIER (1 << XFS_DIFLAG_BARRIER_BIT) #define XFS_DIFLAG_IUNLINK (1 << XFS_DIFLAG_IUNLINK_BIT) + #define XFS_DIFLAG_ANY \ (XFS_DIFLAG_REALTIME | XFS_DIFLAG_PREALLOC | XFS_DIFLAG_NEWRTBM | \ XFS_DIFLAG_IMMUTABLE | 
XFS_DIFLAG_APPEND | XFS_DIFLAG_SYNC | \ diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h index 8290ea7fb..94a596a87 100644 --- a/fs/xfs/xfs_fs.h +++ b/fs/xfs/xfs_fs.h @@ -80,7 +80,7 @@ struct fsxattr { #define XFS_XFLAG_PROJINHERIT 0x00000200 /* create with parents projid */ #define XFS_XFLAG_NOSYMLINKS 0x00000400 /* disallow symlink creation */ #define XFS_XFLAG_BARRIER 0x00004000 /* chroot() barrier */ -#define XFS_XFLAG_IUNLINK 0x00008000 /* Immutable unlink */ +#define XFS_XFLAG_IUNLINK 0x00008000 /* immutable unlink */ #define XFS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */ /* diff --git a/include/asm-i386/param.h b/include/asm-i386/param.h index 01e616e9a..209cda185 100644 --- a/include/asm-i386/param.h +++ b/include/asm-i386/param.h @@ -4,7 +4,21 @@ #include #ifdef __KERNEL__ -# define HZ (CONFIG_X86_HZ) + +#if defined(CONFIG_X86_HZ) && defined(CONFIG_KERNEL_HZ) +#error MEF: fix up CONFIG to only use one of these +#endif + +#ifdef CONFIG_X86_HZ +# define HZ CONFIG_X86_HZ +#else +# ifdef CONFIG_KERNEL_HZ +# define HZ CONFIG_KERNEL_HZ +# else +# define HZ 1000 /* Internal kernel timer frequency */ +# endif +#endif + # define USER_HZ 100 /* .. 
some user interfaces are in "ticks" */ # define CLOCKS_PER_SEC (USER_HZ) /* like times() */ #endif diff --git a/include/asm-parisc/unistd.h b/include/asm-parisc/unistd.h index 80c2db1ce..9fe32c447 100644 --- a/include/asm-parisc/unistd.h +++ b/include/asm-parisc/unistd.h @@ -756,7 +756,7 @@ #define __NR_get_mempolicy (__NR_Linux + 261) #define __NR_set_mempolicy (__NR_Linux + 262) #define __NR_vserver (__NR_Linux + 273) -#define __NR_Linux_syscalls 274 + #define HPUX_GATEWAY_ADDR 0xC0000004 #define LINUX_GATEWAY_ADDR 0x100 diff --git a/include/asm-sparc64/tlb.h b/include/asm-sparc64/tlb.h index d224b21bc..87eff2fc1 100644 --- a/include/asm-sparc64/tlb.h +++ b/include/asm-sparc64/tlb.h @@ -86,7 +86,8 @@ static inline void tlb_finish_mmu(struct mmu_gather *mp, unsigned long start, un if (rss < freed) freed = rss; - mm->rss = rss - freed; + // mm->rss = rss - freed; + vx_rsspages_sub(mm, freed); tlb_flush_mmu(mp); diff --git a/include/linux/devpts_fs.h b/include/linux/devpts_fs.h index 5f8269936..907c3c612 100644 --- a/include/linux/devpts_fs.h +++ b/include/linux/devpts_fs.h @@ -30,7 +30,6 @@ static inline void devpts_pty_kill(int number) { } #endif -#define DEVPTS_SUPER_MAGIC 0x1cd1 - +#define DEVPTS_SUPER_MAGIC 0x1cd1 #endif /* _LINUX_DEVPTS_FS_H */ diff --git a/include/linux/ext2_fs.h b/include/linux/ext2_fs.h index c2bd10f1f..a9858024b 100644 --- a/include/linux/ext2_fs.h +++ b/include/linux/ext2_fs.h @@ -320,7 +320,7 @@ struct ext2_inode { #define EXT2_MOUNT_NO_UID32 0x0200 /* Disable 32-bit UIDs */ #define EXT2_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */ #define EXT2_MOUNT_POSIX_ACL 0x8000 /* POSIX Access Control Lists */ -#define EXT2_MOUNT_TAG_XID (1<<16) /* Enable Context Tags */ +#define EXT2_MOUNT_TAG_XID (1<<24) /* Enable Context Tags */ #define clear_opt(o, opt) o &= ~EXT2_MOUNT_##opt #define set_opt(o, opt) o |= EXT2_MOUNT_##opt diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h index d11f5d116..f2d1cd9fa 100644 --- 
a/include/linux/ext3_fs.h +++ b/include/linux/ext3_fs.h @@ -196,6 +196,9 @@ struct ext3_group_desc #define EXT3_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */ #define EXT3_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ #endif +#ifdef CONFIG_VSERVER_LEGACY +#define EXT3_IOC_SETXID FIOC_SETXIDJ +#endif /* * Inode dynamic state flags @@ -366,7 +369,7 @@ struct ext3_inode { #define EXT3_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ #define EXT3_MOUNT_RESERVATION 0x10000 /* Preallocation */ #define EXT3_MOUNT_BARRIER 0x20000 /* Use block barriers */ -#define EXT3_MOUNT_TAG_XID 0x40000 /* Enable Context Tags */ +#define EXT3_MOUNT_TAG_XID (1<<24) /* Enable Context Tags */ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ #ifndef _LINUX_EXT2_FS_H diff --git a/include/linux/fs.h b/include/linux/fs.h index 667bf7345..93a6a10d5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -126,6 +126,7 @@ extern int dir_notify_enable; #define MS_POSIXACL (1<<16) /* VFS does not apply the umask */ #define MS_ONE_SECOND (1<<17) /* fs has 1 sec a/m/ctime resolution */ #define MS_TAGXID (1<<24) /* tag inodes with context information */ +#define MS_XID (1<<25) /* use specific xid for this mount */ #define MS_ACTIVE (1<<30) #define MS_NOUSER (1<<31) @@ -152,8 +153,8 @@ extern int dir_notify_enable; #define S_DIRSYNC 64 /* Directory modifications are synchronous */ #define S_NOCMTIME 128 /* Do not update file c/mtime */ #define S_SWAPFILE 256 /* Do not truncate: swapon got its bmaps */ -#define S_BARRIER 512 /* Barrier for chroot() */ -#define S_IUNLINK 1024 /* Immutable unlink */ +#define S_BARRIER 1024 /* Barrier for chroot() */ +#define S_IUNLINK 2048 /* Immutable unlink */ /* * Note that nosuid etc flags are inode-specific: setting some file-system diff --git a/include/linux/mount.h b/include/linux/mount.h index 8821af0e7..03c6f6d7b 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -22,6 +22,7 @@ #define 
MNT_RDONLY 8 #define MNT_NOATIME 16 #define MNT_NODIRATIME 32 +#define MNT_XID 256 struct vfsmount { @@ -39,6 +40,7 @@ struct vfsmount struct list_head mnt_list; struct list_head mnt_fslink; /* link in fs-specific expiry list */ struct namespace *mnt_namespace; /* containing namespace */ + xid_t mnt_xid; /* xid tagging used for vfsmount */ }; #define MNT_IS_RDONLY(m) ((m) && ((m)->mnt_flags & MNT_RDONLY)) diff --git a/include/linux/reiserfs_fs_sb.h b/include/linux/reiserfs_fs_sb.h index 553a79945..9446bc5e8 100644 --- a/include/linux/reiserfs_fs_sb.h +++ b/include/linux/reiserfs_fs_sb.h @@ -458,6 +458,7 @@ enum reiserfs_mount_options { REISERFS_BARRIER_NONE, REISERFS_BARRIER_FLUSH, REISERFS_TAGXID, + /* Actions on error */ REISERFS_ERROR_PANIC, REISERFS_ERROR_RO, diff --git a/include/linux/sched.h b/include/linux/sched.h index 96b615cc6..9cb07d16b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -30,6 +30,7 @@ #include #include #include +#include struct exec_domain; extern int exec_shield; @@ -949,15 +950,28 @@ static inline int sas_ss_flags(unsigned long sp) #ifdef CONFIG_SECURITY /* code is in security.c */ extern int capable(int cap); +extern int vx_capable(int cap, int ccap); #else static inline int capable(int cap) { + if (vx_check_bit(VXC_CAP_MASK, cap) && !vx_mcaps(1L << cap)) + return 0; if (cap_raised(current->cap_effective, cap)) { current->flags |= PF_SUPERPRIV; return 1; } return 0; } + +static inline int vx_capable(int cap, int ccap) +{ + if (cap_raised(current->cap_effective, cap) && + vx_ccaps(ccap)) { + current->flags |= PF_SUPERPRIV; + return 1; + } + return 0; +} #endif diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index a7f776ee3..cf93d31f7 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -134,8 +134,8 @@ enum KERN_SPARC_SCONS_PWROFF=64, /* int: serial console power-off halt */ KERN_HZ_TIMER=65, /* int: hz timer on or off */ KERN_UNKNOWN_NMI_PANIC=66, /* int: unknown nmi panic flag */ - 
KERN_SETUID_DUMPABLE=67, /* int: behaviour of dumps for setuid core */ - KERN_VSHELPER=68, /* string: path to vshelper policy agent */ + KERN_VSHELPER=67, /* string: path to vshelper policy agent */ + KERN_SETUID_DUMPABLE=68, /* int: behaviour of dumps for setuid core */ KERN_DUMP=69, /* dir: dump parameters */ }; diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index d12ee2b1e..acb39e268 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -11,6 +11,8 @@ #include +#define SYSFS_SUPER_MAGIC 0x62656572 + struct kobject; struct module; diff --git a/include/linux/vs_base.h b/include/linux/vs_base.h index 4f04513ff..a1d34b6ff 100644 --- a/include/linux/vs_base.h +++ b/include/linux/vs_base.h @@ -1,16 +1,8 @@ #ifndef _VX_VS_BASE_H #define _VX_VS_BASE_H -#include "vserver/context.h" - -// #define VX_DEBUG - -#if defined(VX_DEBUG) -#define vxdprintk(x...) printk("vxd: " x) -#else -#define vxdprintk(x...) -#endif +#include "vserver/context.h" #define vx_task_xid(t) ((t)->xid) @@ -26,7 +18,7 @@ * check current context for ADMIN/WATCH and * optionally agains supplied argument */ -static __inline__ int __vx_check(xid_t cid, xid_t id, unsigned int mode) +static inline int __vx_check(xid_t cid, xid_t id, unsigned int mode) { if (mode & VX_ARG_MASK) { if ((mode & VX_IDENT) && @@ -43,36 +35,70 @@ static __inline__ int __vx_check(xid_t cid, xid_t id, unsigned int mode) return 1; } return (((mode & VX_ADMIN) && (cid == 0)) || - ((mode & VX_WATCH) && (cid == 1))); + ((mode & VX_WATCH) && (cid == 1)) || + ((mode & VX_HOSTID) && (id == 0))); } -#define __vx_flags(v,m,f) (((v) & (m)) ^ (f)) +#define __vx_state(v) ((v) ? 
((v)->vx_state) : 0) + +#define vx_info_state(v,m) (__vx_state(v) & (m)) + + +/* generic flag merging */ + +#define vx_check_flags(v,m,f) (((v) & (m)) ^ (f)) + +#define vx_mask_flags(v,f,m) (((v) & ~(m)) | ((f) & (m))) + +#define vx_mask_mask(v,f,m) (((v) & ~(m)) | ((v) & (f) & (m))) + +#define vx_check_bit(v,n) ((v) & (1LL << (n))) + -#define __vx_task_flags(t,m,f) \ - (((t) && ((t)->vx_info)) ? \ - __vx_flags((t)->vx_info->vx_flags,(m),(f)) : 0) +/* context flags */ -#define vx_current_flags() \ - ((current->vx_info) ? current->vx_info->vx_flags : 0) +#define __vx_flags(v) ((v) ? (v)->vx_flags : 0) -#define vx_flags(m,f) __vx_flags(vx_current_flags(),(m),(f)) +#define vx_current_flags() __vx_flags(current->vx_info) +#define vx_info_flags(v,m,f) \ + vx_check_flags(__vx_flags(v),(m),(f)) -#define vx_current_ccaps() \ - ((current->vx_info) ? current->vx_info->vx_ccaps : 0) +#define task_vx_flags(t,m,f) \ + ((t) && vx_info_flags((t)->vx_info, (m), (f))) + +#define vx_flags(m,f) vx_info_flags(current->vx_info,(m),(f)) + + +/* context caps */ + +#define __vx_ccaps(v) ((v) ? (v)->vx_ccaps : 0) + +#define vx_current_ccaps() __vx_ccaps(current->vx_info) + +#define vx_info_ccaps(v,c) (__vx_ccaps(v) & (c)) + +#define vx_ccaps(c) vx_info_ccaps(current->vx_info,(c)) + + +#define __vx_mcaps(v) ((v) ? (v)->vx_ccaps >> 32UL : ~0 ) + +#define vx_info_mcaps(v,c) (__vx_mcaps(v) & (c)) + +#define vx_mcaps(c) vx_info_mcaps(current->vx_info,(c)) -#define vx_ccaps(c) (vx_current_ccaps() & (c)) #define vx_current_bcaps() \ (((current->vx_info) && !vx_flags(VXF_STATE_SETUP, 0)) ? 
\ current->vx_info->vx_bcaps : cap_bset) -/* generic flag merging */ - -#define vx_mask_flags(v,f,m) (((v) & ~(m)) | ((f) & (m))) +#define vx_current_initpid(n) \ + (current->vx_info && \ + (current->vx_info->vx_initpid == (n))) -#define vx_mask_mask(v,f,m) (((v) & ~(m)) | ((v) & (f) & (m))) +#else +#warning duplicate inclusion #endif diff --git a/include/linux/vs_context.h b/include/linux/vs_context.h index 9d119cdc2..cc41014f1 100644 --- a/include/linux/vs_context.h +++ b/include/linux/vs_context.h @@ -3,16 +3,9 @@ #include -#include -#include - -#include "vserver/context.h" #include "vserver/debug.h" -extern int proc_pid_vx_info(struct task_struct *, char *); - - #define get_vx_info(i) __get_vx_info(i,__FILE__,__LINE__) static inline struct vx_info *__get_vx_info(struct vx_info *vxi, @@ -20,25 +13,28 @@ static inline struct vx_info *__get_vx_info(struct vx_info *vxi, { if (!vxi) return NULL; + vxlprintk(VXD_CBIT(xid, 2), "get_vx_info(%p[#%d.%d])", vxi, vxi?vxi->vx_id:0, vxi?atomic_read(&vxi->vx_usecnt):0, _file, _line); + vxh_get_vx_info(vxi); + atomic_inc(&vxi->vx_usecnt); return vxi; } - -extern void free_vx_info(struct vx_info *); - #define put_vx_info(i) __put_vx_info(i,__FILE__,__LINE__) static inline void __put_vx_info(struct vx_info *vxi, const char *_file, int _line) { if (!vxi) return; + vxlprintk(VXD_CBIT(xid, 2), "put_vx_info(%p[#%d.%d])", vxi, vxi?vxi->vx_id:0, vxi?atomic_read(&vxi->vx_usecnt):0, _file, _line); + vxh_put_vx_info(vxi); + if (atomic_dec_and_test(&vxi->vx_usecnt)) free_vx_info(vxi); } @@ -58,6 +54,7 @@ static inline void __set_vx_info(struct vx_info **vxp, struct vx_info *vxi, vxi?atomic_read(&vxi->vx_usecnt):0, vxi?atomic_read(&vxi->vx_refcnt):0, _file, _line); + vxh_set_vx_info(vxi, vxp); atomic_inc(&vxi->vx_refcnt); vxo = xchg(vxp, __get_vx_info(vxi, _file, _line)); @@ -80,6 +77,7 @@ static inline void __clr_vx_info(struct vx_info **vxp, vxo?atomic_read(&vxo->vx_usecnt):0, vxo?atomic_read(&vxo->vx_refcnt):0, _file, _line); + 
vxh_clr_vx_info(vxo, vxp); if (atomic_dec_and_test(&vxo->vx_refcnt)) unhash_vx_info(vxo); @@ -87,7 +85,7 @@ static inline void __clr_vx_info(struct vx_info **vxp, } -#define task_get_vx_info(i) __task_get_vx_info(i,__FILE__,__LINE__) +#define task_get_vx_info(p) __task_get_vx_info(p,__FILE__,__LINE__) static __inline__ struct vx_info *__task_get_vx_info(struct task_struct *p, const char *_file, int _line) diff --git a/include/linux/vs_cvirt.h b/include/linux/vs_cvirt.h index 65f430362..64b38c2f8 100644 --- a/include/linux/vs_cvirt.h +++ b/include/linux/vs_cvirt.h @@ -2,16 +2,8 @@ #define _VX_VS_CVIRT_H -// #define VX_DEBUG - #include "vserver/cvirt.h" -#include "vs_base.h" - -#if defined(VX_DEBUG) -#define vxdprintk(x...) printk("vxd: " x) -#else -#define vxdprintk(x...) -#endif +#include "vserver/debug.h" /* utsname virtualization */ @@ -29,42 +21,88 @@ static inline struct new_utsname *vx_new_utsname(void) /* pid faking stuff */ -#define vx_map_tgid(v,p) \ - __vx_map_tgid((v), (p), __FILE__, __LINE__) +#define vx_info_map_pid(v,p) \ + __vx_info_map_pid((v), (p), __FUNC__, __FILE__, __LINE__) +#define vx_info_map_tgid(v,p) vx_info_map_pid(v,p) +#define vx_map_pid(p) vx_info_map_pid(current->vx_info, p) +#define vx_map_tgid(p) vx_map_pid(p) -static inline int __vx_map_tgid(struct vx_info *vxi, int pid, - char *file, int line) +static inline int __vx_info_map_pid(struct vx_info *vxi, int pid, + const char *func, const char *file, int line) { - if (vxi && __vx_flags(vxi->vx_flags, VXF_INFO_INIT, 0)) { - vxdprintk("vx_map_tgid: %p/%llx: %d -> %d in %s:%d\n", - vxi, vxi->vx_flags, pid, - (pid == vxi->vx_initpid)?1:pid, - file, line); + if (vx_info_flags(vxi, VXF_INFO_INIT, 0)) { + vxfprintk(VXD_CBIT(cvirt, 2), + "vx_map_tgid: %p/%llx: %d -> %d", + vxi, (long long)vxi->vx_flags, pid, + (pid && pid == vxi->vx_initpid)?1:pid, + func, file, line); + if (pid == 0) + return 0; if (pid == vxi->vx_initpid) return 1; } return pid; } -#define vx_rmap_tgid(v,p) \ - 
__vx_rmap_tgid((v), (p), __FILE__, __LINE__) +#define vx_info_rmap_pid(v,p) \ + __vx_info_rmap_pid((v), (p), __FUNC__, __FILE__, __LINE__) +#define vx_rmap_pid(p) vx_info_rmap_pid(current->vx_info, p) +#define vx_rmap_tgid(p) vx_rmap_pid(p) -static inline int __vx_rmap_tgid(struct vx_info *vxi, int pid, - char *file, int line) +static inline int __vx_info_rmap_pid(struct vx_info *vxi, int pid, + const char *func, const char *file, int line) { - if (vxi && __vx_flags(vxi->vx_flags, VXF_INFO_INIT, 0)) { - vxdprintk("vx_rmap_tgid: %p/%llx: %d -> %d in %s:%d\n", - vxi, vxi->vx_flags, pid, + if (vx_info_flags(vxi, VXF_INFO_INIT, 0)) { + vxfprintk(VXD_CBIT(cvirt, 2), + "vx_rmap_tgid: %p/%llx: %d -> %d", + vxi, (long long)vxi->vx_flags, pid, (pid == 1)?vxi->vx_initpid:pid, - file, line); + func, file, line); if ((pid == 1) && vxi->vx_initpid) return vxi->vx_initpid; + if (pid == vxi->vx_initpid) + return ~0U; } return pid; } -#undef vxdprintk -#define vxdprintk(x...) + +static inline void vx_activate_task(struct task_struct *p) +{ + struct vx_info *vxi; + + if ((vxi = p->vx_info)) { + vx_update_load(vxi); + atomic_inc(&vxi->cvirt.nr_running); + } +} + +static inline void vx_deactivate_task(struct task_struct *p) +{ + struct vx_info *vxi; + + if ((vxi = p->vx_info)) { + vx_update_load(vxi); + atomic_dec(&vxi->cvirt.nr_running); + } +} + +static inline void vx_uninterruptible_inc(struct task_struct *p) +{ + struct vx_info *vxi; + + if ((vxi = p->vx_info)) + atomic_inc(&vxi->cvirt.nr_uninterruptible); +} + +static inline void vx_uninterruptible_dec(struct task_struct *p) +{ + struct vx_info *vxi; + + if ((vxi = p->vx_info)) + atomic_dec(&vxi->cvirt.nr_uninterruptible); +} + #else #warning duplicate inclusion diff --git a/include/linux/vs_dlimit.h b/include/linux/vs_dlimit.h index 805c25748..b92768724 100644 --- a/include/linux/vs_dlimit.h +++ b/include/linux/vs_dlimit.h @@ -1,11 +1,7 @@ #ifndef _VX_VS_DLIMIT_H #define _VX_VS_DLIMIT_H -#include -#include -#include -#include 
"vserver/context.h" #include "vserver/dlimit.h" #include "vserver/debug.h" @@ -112,7 +108,7 @@ static inline int __dl_alloc_inode(struct super_block *sb, dli->dl_inodes_used++; #if 0 else - printk("VSW: DLIMIT hit (%p,#%d), inode %d>=%d @ %s:%d\n", + vxwprintk("DLIMIT hit (%p,#%d), inode %d>=%d @ %s:%d", sb, xid, dli->dl_inodes_used, dli->dl_inodes_total, file, line); diff --git a/include/linux/vs_limit.h b/include/linux/vs_limit.h index 82e8de4ec..561df5a70 100644 --- a/include/linux/vs_limit.h +++ b/include/linux/vs_limit.h @@ -2,97 +2,78 @@ #define _VX_VS_LIMIT_H -// #define VX_DEBUG - -#include -#include -#include - -#include "vserver/context.h" #include "vserver/limit.h" +#include "vserver/debug.h" /* file limits */ -#define VX_DEBUG_ACC_FILE 0 -#define VX_DEBUG_ACC_OPENFD 0 - -#if (VX_DEBUG_ACC_FILE) || (VX_DEBUG_ACC_OPENFD) -#define vxdprintk(x...) printk("vxd: " x) -#else -#define vxdprintk(x...) -#endif - - -#define vx_acc_cres(v,d,r) \ - __vx_acc_cres((v), (r), (d), __FILE__, __LINE__) static inline void __vx_acc_cres(struct vx_info *vxi, - int res, int dir, char *file, int line) + int res, int dir, void *_data, char *_file, int _line) { - if (vxi) { - if ((res == RLIMIT_NOFILE && VX_DEBUG_ACC_FILE) || - (res == RLIMIT_OPENFD && VX_DEBUG_ACC_OPENFD)) - printk("vx_acc_cres[%5d,%2d]: %5d%s in %s:%d\n", - (vxi?vxi->vx_id:-1), res, - (vxi?atomic_read(&vxi->limit.rcur[res]):0), - (dir>0)?"++":"--", file, line); - if (dir > 0) - atomic_inc(&vxi->limit.rcur[res]); - else - atomic_dec(&vxi->limit.rcur[res]); - } + if (VXD_RLIMIT(res, RLIMIT_NOFILE) || + VXD_RLIMIT(res, RLIMIT_NPROC) || + VXD_RLIMIT(res, VLIMIT_NSOCK)) + vxlprintk(1, "vx_acc_cres[%5d,%s,%2d]: %5d%s (%p)", + (vxi?vxi->vx_id:-1), vlimit_name[res], res, + (vxi?atomic_read(&vxi->limit.rcur[res]):0), + (dir>0)?"++":"--", _data, _file, _line); + if (vxi) { + if (dir > 0) + atomic_inc(&vxi->limit.rcur[res]); + else + atomic_dec(&vxi->limit.rcur[res]); + } } -#define vx_nproc_inc(p) 
vx_acc_cres(current->vx_info, 1, RLIMIT_NPROC) -#define vx_nproc_dec(p) vx_acc_cres(current->vx_info,-1, RLIMIT_NPROC) +#define vx_acc_cres(v,d,p,r) \ + __vx_acc_cres((v), (r), (d), (p), __FILE__, __LINE__) + +#define vx_acc_cres_cond(x,d,p,r) \ + __vx_acc_cres(((x) == vx_current_xid()) ? current->vx_info : 0,\ + (r), (d), (p), __FILE__, __LINE__) + +#define vx_nproc_inc(p) \ + vx_acc_cres((p)->vx_info, 1, (p), RLIMIT_NPROC) -#define vx_files_inc(f) vx_acc_cres(current->vx_info, 1, RLIMIT_NOFILE) -#define vx_files_dec(f) vx_acc_cres(current->vx_info,-1, RLIMIT_NOFILE) +#define vx_nproc_dec(p) \ + vx_acc_cres((p)->vx_info,-1, (p), RLIMIT_NPROC) -#define vx_openfd_inc(f) vx_acc_cres(current->vx_info, 1, RLIMIT_OPENFD) -#define vx_openfd_dec(f) vx_acc_cres(current->vx_info,-1, RLIMIT_OPENFD) +#define vx_files_inc(f) \ + vx_acc_cres_cond((f)->f_xid, 1, (f), RLIMIT_NOFILE) -/* -#define vx_openfd_inc(f) do { \ - vx_acc_cres(current->vx_info, 1, RLIMIT_OPENFD); \ - printk("vx_openfd_inc: %d[#%d] in %s:%d\n", \ - f, current->xid, __FILE__, __LINE__); \ - } while (0) +#define vx_files_dec(f) \ + vx_acc_cres_cond((f)->f_xid,-1, (f), RLIMIT_NOFILE) -#define vx_openfd_dec(f) do { \ - vx_acc_cres(current->vx_info,-1, RLIMIT_OPENFD); \ - printk("vx_openfd_dec: %d[#%d] in %s:%d\n", \ - f, current->xid, __FILE__, __LINE__); \ - } while (0) -*/ #define vx_cres_avail(v,n,r) \ - __vx_cres_avail((v), (r), (n), __FILE__, __LINE__) + __vx_cres_avail((v), (r), (n), __FILE__, __LINE__) static inline int __vx_cres_avail(struct vx_info *vxi, - int res, int num, char *file, int line) + int res, int num, char *_file, int _line) { unsigned long value; - if ((res == RLIMIT_NOFILE && VX_DEBUG_ACC_FILE) || - (res == RLIMIT_OPENFD && VX_DEBUG_ACC_OPENFD)) - printk("vx_cres_avail[%5d,%2d]: %5ld > %5d + %5d in %s:%d\n", - (vxi?vxi->vx_id:-1), res, + if (VXD_RLIMIT(res, RLIMIT_NOFILE) || + VXD_RLIMIT(res, RLIMIT_NPROC) || + VXD_RLIMIT(res, VLIMIT_NSOCK)) + vxlprintk(1, "vx_cres_avail[%5d,%s,%2d]: 
%5ld > %5d + %5d", + (vxi?vxi->vx_id:-1), vlimit_name[res], res, (vxi?vxi->limit.rlim[res]:1), - (vxi?atomic_read(&vxi->limit.rcur[res]):0), - num, file, line); - if (!vxi) - return 1; - value = atomic_read(&vxi->limit.rcur[res]); + (vxi?atomic_read(&vxi->limit.rcur[res]):0), + num, _file, _line); + if (!vxi) + return 1; + value = atomic_read(&vxi->limit.rcur[res]); if (value > vxi->limit.rmax[res]) vxi->limit.rmax[res] = value; - if (vxi->limit.rlim[res] == RLIM_INFINITY) - return 1; - if (value + num <= vxi->limit.rlim[res]) - return 1; + if (vxi->limit.rlim[res] == RLIM_INFINITY) + return 1; + if (value + num <= vxi->limit.rlim[res]) + return 1; atomic_inc(&vxi->limit.lhit[res]); - return 0; + return 0; } #define vx_nproc_avail(n) \ @@ -101,18 +82,16 @@ static inline int __vx_cres_avail(struct vx_info *vxi, #define vx_files_avail(n) \ vx_cres_avail(current->vx_info, (n), RLIMIT_NOFILE) -#define vx_openfd_avail(n) \ - vx_cres_avail(current->vx_info, (n), RLIMIT_OPENFD) - /* socket limits */ -#define vx_sock_inc(f) vx_acc_cres(current->vx_info, 1, VLIMIT_SOCK) -#define vx_sock_dec(f) vx_acc_cres(current->vx_info,-1, VLIMIT_SOCK) +#define vx_sock_inc(s) \ + vx_acc_cres((s)->sk_vx_info, 1, (s), VLIMIT_NSOCK) +#define vx_sock_dec(s) \ + vx_acc_cres((s)->sk_vx_info,-1, (s), VLIMIT_NSOCK) #define vx_sock_avail(n) \ - vx_cres_avail(current->vx_info, (n), VLIMIT_SOCK) - + vx_cres_avail(current->vx_info, (n), VLIMIT_NSOCK) #else #warning duplicate inclusion diff --git a/include/linux/vs_memory.h b/include/linux/vs_memory.h index 2fe9c0809..2509432f0 100644 --- a/include/linux/vs_memory.h +++ b/include/linux/vs_memory.h @@ -2,44 +2,35 @@ #define _VX_VS_MEMORY_H -// #define VX_DEBUG - -#include -#include -#include - -#include "vserver/context.h" #include "vserver/limit.h" +#include "vserver/debug.h" -#define VX_DEBUG_ACC_RSS 0 -#define VX_DEBUG_ACC_VM 0 -#define VX_DEBUG_ACC_VML 0 - -#if (VX_DEBUG_ACC_RSS) || (VX_DEBUG_ACC_VM) || (VX_DEBUG_ACC_VML) -#define vxdprintk(x...) 
printk("vxd: " x) -#else -#define vxdprintk(x...) -#endif - #define vx_acc_page(m, d, v, r) \ __vx_acc_page(&(m->v), m->mm_vx_info, r, d, __FILE__, __LINE__) static inline void __vx_acc_page(unsigned long *v, struct vx_info *vxi, - int res, int dir, char *file, int line) + int res, int dir, char *file, int line) { - if (v) { - if (dir > 0) - ++(*v); - else - --(*v); - } - if (vxi) { - if (dir > 0) - atomic_inc(&vxi->limit.rcur[res]); - else - atomic_dec(&vxi->limit.rcur[res]); - } + if (VXD_RLIMIT(res, RLIMIT_RSS) || + VXD_RLIMIT(res, RLIMIT_AS) || + VXD_RLIMIT(res, RLIMIT_MEMLOCK)) + vxlprintk(1, "vx_acc_page[%5d,%s,%2d]: %5d%s", + (vxi?vxi->vx_id:-1), vlimit_name[res], res, + (vxi?atomic_read(&vxi->limit.rcur[res]):0), + (dir?"++":"--"), file, line); + if (v) { + if (dir > 0) + ++(*v); + else + --(*v); + } + if (vxi) { + if (dir > 0) + atomic_inc(&vxi->limit.rcur[res]); + else + atomic_dec(&vxi->limit.rcur[res]); + } } @@ -47,85 +38,85 @@ static inline void __vx_acc_page(unsigned long *v, struct vx_info *vxi, __vx_acc_pages(&(m->v), m->mm_vx_info, r, p, __FILE__, __LINE__) static inline void __vx_acc_pages(unsigned long *v, struct vx_info *vxi, - int res, int pages, char *file, int line) + int res, int pages, char *_file, int _line) { - if ((res == RLIMIT_RSS && VX_DEBUG_ACC_RSS) || - (res == RLIMIT_AS && VX_DEBUG_ACC_VM) || - (res == RLIMIT_MEMLOCK && VX_DEBUG_ACC_VML)) - vxdprintk("vx_acc_pages [%5d,%2d]: %5d += %5d in %s:%d\n", - (vxi?vxi->vx_id:-1), res, - (vxi?atomic_read(&vxi->limit.res[res]):0), - pages, file, line); - if (pages == 0) - return; - if (v) - *v += pages; - if (vxi) - atomic_add(pages, &vxi->limit.rcur[res]); + if (VXD_RLIMIT(res, RLIMIT_RSS) || + VXD_RLIMIT(res, RLIMIT_AS) || + VXD_RLIMIT(res, RLIMIT_MEMLOCK)) + vxlprintk(1, "vx_acc_pages[%5d,%s,%2d]: %5d += %5d", + (vxi?vxi->vx_id:-1), vlimit_name[res], res, + (vxi?atomic_read(&vxi->limit.rcur[res]):0), + pages, _file, _line); + if (pages == 0) + return; + if (v) + *v += pages; + if (vxi) + 
atomic_add(pages, &vxi->limit.rcur[res]); } -#define vx_acc_vmpage(m,d) vx_acc_page(m, d, total_vm, RLIMIT_AS) -#define vx_acc_vmlpage(m,d) vx_acc_page(m, d, locked_vm, RLIMIT_MEMLOCK) -#define vx_acc_rsspage(m,d) vx_acc_page(m, d, rss, RLIMIT_RSS) +#define vx_acc_vmpage(m,d) vx_acc_page(m, d, total_vm, RLIMIT_AS) +#define vx_acc_vmlpage(m,d) vx_acc_page(m, d, locked_vm, RLIMIT_MEMLOCK) +#define vx_acc_rsspage(m,d) vx_acc_page(m, d, rss, RLIMIT_RSS) -#define vx_acc_vmpages(m,p) vx_acc_pages(m, p, total_vm, RLIMIT_AS) -#define vx_acc_vmlpages(m,p) vx_acc_pages(m, p, locked_vm, RLIMIT_MEMLOCK) -#define vx_acc_rsspages(m,p) vx_acc_pages(m, p, rss, RLIMIT_RSS) +#define vx_acc_vmpages(m,p) vx_acc_pages(m, p, total_vm, RLIMIT_AS) +#define vx_acc_vmlpages(m,p) vx_acc_pages(m, p, locked_vm, RLIMIT_MEMLOCK) +#define vx_acc_rsspages(m,p) vx_acc_pages(m, p, rss, RLIMIT_RSS) -#define vx_pages_add(s,r,p) __vx_acc_pages(0, s, r, p, __FILE__, __LINE__) -#define vx_pages_sub(s,r,p) __vx_pages_add(s, r, -(p)) +#define vx_pages_add(s,r,p) __vx_acc_pages(0, s, r, p, __FILE__, __LINE__) +#define vx_pages_sub(s,r,p) vx_pages_add(s, r, -(p)) -#define vx_vmpages_inc(m) vx_acc_vmpage(m, 1) -#define vx_vmpages_dec(m) vx_acc_vmpage(m,-1) -#define vx_vmpages_add(m,p) vx_acc_vmpages(m, p) -#define vx_vmpages_sub(m,p) vx_acc_vmpages(m,-(p)) +#define vx_vmpages_inc(m) vx_acc_vmpage(m, 1) +#define vx_vmpages_dec(m) vx_acc_vmpage(m,-1) +#define vx_vmpages_add(m,p) vx_acc_vmpages(m, p) +#define vx_vmpages_sub(m,p) vx_acc_vmpages(m,-(p)) -#define vx_vmlocked_inc(m) vx_acc_vmlpage(m, 1) -#define vx_vmlocked_dec(m) vx_acc_vmlpage(m,-1) -#define vx_vmlocked_add(m,p) vx_acc_vmlpages(m, p) -#define vx_vmlocked_sub(m,p) vx_acc_vmlpages(m,-(p)) +#define vx_vmlocked_inc(m) vx_acc_vmlpage(m, 1) +#define vx_vmlocked_dec(m) vx_acc_vmlpage(m,-1) +#define vx_vmlocked_add(m,p) vx_acc_vmlpages(m, p) +#define vx_vmlocked_sub(m,p) vx_acc_vmlpages(m,-(p)) -#define vx_rsspages_inc(m) vx_acc_rsspage(m, 1) -#define 
vx_rsspages_dec(m) vx_acc_rsspage(m,-1) -#define vx_rsspages_add(m,p) vx_acc_rsspages(m, p) -#define vx_rsspages_sub(m,p) vx_acc_rsspages(m,-(p)) +#define vx_rsspages_inc(m) vx_acc_rsspage(m, 1) +#define vx_rsspages_dec(m) vx_acc_rsspage(m,-1) +#define vx_rsspages_add(m,p) vx_acc_rsspages(m, p) +#define vx_rsspages_sub(m,p) vx_acc_rsspages(m,-(p)) #define vx_pages_avail(m, p, r) \ - __vx_pages_avail((m)->mm_vx_info, (r), (p), __FILE__, __LINE__) + __vx_pages_avail((m)->mm_vx_info, (r), (p), __FILE__, __LINE__) static inline int __vx_pages_avail(struct vx_info *vxi, - int res, int pages, char *file, int line) + int res, int pages, char *_file, int _line) { unsigned long value; - if ((res == RLIMIT_RSS && VX_DEBUG_ACC_RSS) || - (res == RLIMIT_AS && VX_DEBUG_ACC_VM) || - (res == RLIMIT_MEMLOCK && VX_DEBUG_ACC_VML)) - printk("vx_pages_avail[%5d,%2d]: %5ld > %5d + %5d in %s:%d\n", - (vxi?vxi->vx_id:-1), res, + if (VXD_RLIMIT(res, RLIMIT_RSS) || + VXD_RLIMIT(res, RLIMIT_AS) || + VXD_RLIMIT(res, RLIMIT_MEMLOCK)) + vxlprintk(1, "vx_pages_avail[%5d,%s,%2d]: %5ld > %5d + %5d", + (vxi?vxi->vx_id:-1), vlimit_name[res], res, (vxi?vxi->limit.rlim[res]:1), - (vxi?atomic_read(&vxi->limit.rcur[res]):0), - pages, file, line); - if (!vxi) - return 1; - value = atomic_read(&vxi->limit.rcur[res]); + (vxi?atomic_read(&vxi->limit.rcur[res]):0), + pages, _file, _line); + if (!vxi) + return 1; + value = atomic_read(&vxi->limit.rcur[res]); if (value > vxi->limit.rmax[res]) vxi->limit.rmax[res] = value; - if (vxi->limit.rlim[res] == RLIM_INFINITY) - return 1; - if (value + pages <= vxi->limit.rlim[res]) - return 1; + if (vxi->limit.rlim[res] == RLIM_INFINITY) + return 1; + if (value + pages <= vxi->limit.rlim[res]) + return 1; atomic_inc(&vxi->limit.lhit[res]); - return 0; + return 0; } -#define vx_vmpages_avail(m,p) vx_pages_avail(m, p, RLIMIT_AS) -#define vx_vmlocked_avail(m,p) vx_pages_avail(m, p, RLIMIT_MEMLOCK) -#define vx_rsspages_avail(m,p) vx_pages_avail(m, p, RLIMIT_RSS) +#define 
vx_vmpages_avail(m,p) vx_pages_avail(m, p, RLIMIT_AS) +#define vx_vmlocked_avail(m,p) vx_pages_avail(m, p, RLIMIT_MEMLOCK) +#define vx_rsspages_avail(m,p) vx_pages_avail(m, p, RLIMIT_RSS) #else #warning duplicate inclusion diff --git a/include/linux/vs_network.h b/include/linux/vs_network.h index 4bbf92368..9461b8647 100644 --- a/include/linux/vs_network.h +++ b/include/linux/vs_network.h @@ -1,17 +1,11 @@ #ifndef _NX_VS_NETWORK_H #define _NX_VS_NETWORK_H -#include -#include -#include #include "vserver/network.h" #include "vserver/debug.h" -extern int proc_pid_nx_info(struct task_struct *, char *); - - #define get_nx_info(i) __get_nx_info(i,__FILE__,__LINE__) static inline struct nx_info *__get_nx_info(struct nx_info *nxi, @@ -26,10 +20,6 @@ static inline struct nx_info *__get_nx_info(struct nx_info *nxi, return nxi; } - -#define free_nx_info(i) \ - call_rcu(&i->nx_rcu, rcu_free_nx_info); - #define put_nx_info(i) __put_nx_info(i,__FILE__,__LINE__) static inline void __put_nx_info(struct nx_info *nxi, const char *_file, int _line) diff --git a/include/linux/vs_sched.h b/include/linux/vs_sched.h new file mode 100644 index 000000000..0eb1ee6ad --- /dev/null +++ b/include/linux/vs_sched.h @@ -0,0 +1,73 @@ +#ifndef _VX_VS_SCHED_H +#define _VX_VS_SCHED_H + + +#include "vserver/sched.h" + + +#define VAVAVOOM_RATIO 50 + +#define MAX_PRIO_BIAS 20 +#define MIN_PRIO_BIAS -20 + + +static inline int vx_tokens_avail(struct vx_info *vxi) +{ + return atomic_read(&vxi->sched.tokens); +} + +static inline void vx_consume_token(struct vx_info *vxi) +{ + atomic_dec(&vxi->sched.tokens); +} + +static inline int vx_need_resched(struct task_struct *p) +{ +#ifdef CONFIG_VSERVER_HARDCPU + struct vx_info *vxi = p->vx_info; +#endif + int slice = --p->time_slice; + +#ifdef CONFIG_VSERVER_HARDCPU + if (vxi) { + int tokens; + + if ((tokens = vx_tokens_avail(vxi)) > 0) + vx_consume_token(vxi); + /* for tokens > 0, one token was consumed */ + if (tokens < 2) + return 1; + } +#endif + return (slice 
== 0); +} + + +static inline void vx_onhold_inc(struct vx_info *vxi) +{ + int onhold = atomic_read(&vxi->cvirt.nr_onhold); + + atomic_inc(&vxi->cvirt.nr_onhold); + if (!onhold) + vxi->cvirt.onhold_last = jiffies; +} + +static inline void __vx_onhold_update(struct vx_info *vxi) +{ + int cpu = smp_processor_id(); + uint32_t now = jiffies; + uint32_t delta = now - vxi->cvirt.onhold_last; + + vxi->cvirt.onhold_last = now; + vxi->sched.cpu[cpu].hold_ticks += delta; +} + +static inline void vx_onhold_dec(struct vx_info *vxi) +{ + if (atomic_dec_and_test(&vxi->cvirt.nr_onhold)) + __vx_onhold_update(vxi); +} + +#else +#warning duplicate inclusion +#endif diff --git a/include/linux/vs_socket.h b/include/linux/vs_socket.h index 499245822..d5505c561 100644 --- a/include/linux/vs_socket.h +++ b/include/linux/vs_socket.h @@ -1,15 +1,8 @@ -#ifndef _VX_VS_LIMIT_H -#define _VX_VS_LIMIT_H +#ifndef _VX_VS_SOCKET_H +#define _VX_VS_SOCKET_H -// #define VX_DEBUG - -#include -#include -#include - -#include "vserver/context.h" -#include "vserver/network.h" +#include "vserver/debug.h" /* socket accounting */ @@ -33,12 +26,12 @@ static inline int vx_sock_type(int family) static inline void __vx_acc_sock(struct vx_info *vxi, int family, int pos, int size, char *file, int line) { - if (vxi) { + if (vxi) { int type = vx_sock_type(family); atomic_inc(&vxi->cacct.sock[type][pos].count); atomic_add(size, &vxi->cacct.sock[type][pos].total); - } + } } #define vx_sock_recv(sk,s) \ @@ -49,12 +42,12 @@ static inline void __vx_acc_sock(struct vx_info *vxi, vx_acc_sock((sk)->sk_vx_info, (sk)->sk_family, 2, (s)) -#define sock_vx_init(s) do { \ +#define sock_vx_init(s) do { \ (s)->sk_xid = 0; \ (s)->sk_vx_info = NULL; \ } while (0) -#define sock_nx_init(s) do { \ +#define sock_nx_init(s) do { \ (s)->sk_nid = 0; \ (s)->sk_nx_info = NULL; \ } while (0) diff --git a/include/linux/vserver.h b/include/linux/vserver.h deleted file mode 100644 index 2c39ebbe0..000000000 --- a/include/linux/vserver.h +++ 
/dev/null @@ -1,9 +0,0 @@ -#ifndef _LINUX_VSERVER_H -#define _LINUX_VSERVER_H - -#include -#include -#include -#include - -#endif diff --git a/include/linux/vserver/context.h b/include/linux/vserver/context.h index 4061e7b45..1fe76e749 100644 --- a/include/linux/vserver/context.h +++ b/include/linux/vserver/context.h @@ -3,22 +3,75 @@ #include + #define MAX_S_CONTEXT 65535 /* Arbitrary limit */ #define MIN_D_CONTEXT 49152 /* dynamic contexts start here */ #define VX_DYNAMIC_ID ((uint32_t)-1) /* id for dynamic context */ +/* context flags */ + +#define VXF_INFO_LOCK 0x00000001 +#define VXF_INFO_SCHED 0x00000002 +#define VXF_INFO_NPROC 0x00000004 +#define VXF_INFO_PRIVATE 0x00000008 + +#define VXF_INFO_INIT 0x00000010 +#define VXF_INFO_HIDE 0x00000020 +#define VXF_INFO_ULIMIT 0x00000040 +#define VXF_INFO_NSPACE 0x00000080 + +#define VXF_SCHED_HARD 0x00000100 +#define VXF_SCHED_PRIO 0x00000200 +#define VXF_SCHED_PAUSE 0x00000400 + +#define VXF_VIRT_MEM 0x00010000 +#define VXF_VIRT_UPTIME 0x00020000 +#define VXF_VIRT_CPU 0x00040000 +#define VXF_VIRT_LOAD 0x00080000 + +#define VXF_HIDE_MOUNT 0x01000000 +#define VXF_HIDE_NETIF 0x02000000 + +#define VXF_STATE_SETUP (1ULL<<32) +#define VXF_STATE_INIT (1ULL<<33) + +#define VXF_FORK_RSS (1ULL<<48) +#define VXF_PROLIFIC (1ULL<<49) + +#define VXF_IGNEG_NICE (1ULL<<52) + +#define VXF_ONE_TIME (0x0003ULL<<32) + + +/* context caps */ + +#define VXC_CAP_MASK 0x00000000 + +#define VXC_SET_UTSNAME 0x00000001 +#define VXC_SET_RLIMIT 0x00000002 + +#define VXC_RAW_ICMP 0x00000100 + +#define VXC_SECURE_MOUNT 0x00010000 +#define VXC_SECURE_REMOUNT 0x00020000 + + +/* vshelper sync commands */ + +#define VS_CONTEXT_CREATED 1 +#define VS_CONTEXT_DESTROY 2 + + #ifdef __KERNEL__ #include #include #include -#define _VX_INFO_DEF_ -#include "cvirt.h" -#include "limit.h" -#include "sched.h" -#undef _VX_INFO_DEF_ +#include "limit_def.h" +#include "sched_def.h" +#include "cvirt_def.h" struct vx_info { struct hlist_node vx_hlist; /* linked list of 
contexts */ @@ -48,6 +101,7 @@ struct vx_info { char vx_name[65]; /* vserver name */ }; + /* status flags */ #define VXS_HASHED 0x0001 @@ -61,7 +115,8 @@ struct vx_info { #define VX_ADMIN 0x0001 #define VX_WATCH 0x0002 -#define VX_DUMMY 0x0008 +#define VX_HIDE 0x0004 +#define VX_HOSTID 0x0008 #define VX_IDENT 0x0010 #define VX_EQUIV 0x0020 @@ -78,9 +133,10 @@ struct vx_info { struct rcu_head; -// extern void rcu_free_vx_info(struct rcu_head *); extern void unhash_vx_info(struct vx_info *); +extern void free_vx_info(struct vx_info *); + extern struct vx_info *locate_vx_info(int); extern struct vx_info *locate_or_create_vx_info(int); @@ -89,111 +145,11 @@ extern int vx_info_is_hashed(xid_t); extern int vx_migrate_task(struct task_struct *, struct vx_info *); -#endif /* __KERNEL__ */ - -#include "switch.h" - -/* vinfo commands */ - -#define VCMD_task_xid VC_CMD(VINFO, 1, 0) -#define VCMD_task_nid VC_CMD(VINFO, 2, 0) - -#ifdef __KERNEL__ -extern int vc_task_xid(uint32_t, void __user *); - -#endif /* __KERNEL__ */ - -#define VCMD_vx_info VC_CMD(VINFO, 5, 0) -#define VCMD_nx_info VC_CMD(VINFO, 6, 0) - -struct vcmd_vx_info_v0 { - uint32_t xid; - uint32_t initpid; - /* more to come */ -}; - -#ifdef __KERNEL__ -extern int vc_vx_info(uint32_t, void __user *); - -#endif /* __KERNEL__ */ - -#define VCMD_ctx_create VC_CMD(VPROC, 1, 0) -#define VCMD_ctx_migrate VC_CMD(PROCMIG, 1, 0) - -#ifdef __KERNEL__ -extern int vc_ctx_create(uint32_t, void __user *); -extern int vc_ctx_migrate(uint32_t, void __user *); - -#endif /* __KERNEL__ */ - -#define VCMD_get_cflags VC_CMD(FLAGS, 1, 0) -#define VCMD_set_cflags VC_CMD(FLAGS, 2, 0) - -struct vcmd_ctx_flags_v0 { - uint64_t flagword; - uint64_t mask; -}; - -#ifdef __KERNEL__ -extern int vc_get_cflags(uint32_t, void __user *); -extern int vc_set_cflags(uint32_t, void __user *); - -#endif /* __KERNEL__ */ - -#define VXF_INFO_LOCK 0x00000001 -#define VXF_INFO_SCHED 0x00000002 -#define VXF_INFO_NPROC 0x00000004 -#define VXF_INFO_PRIVATE 
0x00000008 - -#define VXF_INFO_INIT 0x00000010 -#define VXF_INFO_HIDE 0x00000020 -#define VXF_INFO_ULIMIT 0x00000040 -#define VXF_INFO_NSPACE 0x00000080 - -#define VXF_SCHED_HARD 0x00000100 -#define VXF_SCHED_PRIO 0x00000200 -#define VXF_SCHED_PAUSE 0x00000400 - -#define VXF_VIRT_MEM 0x00010000 -#define VXF_VIRT_UPTIME 0x00020000 -#define VXF_VIRT_CPU 0x00040000 -#define VXF_VIRT_LOAD 0x00080000 - -#define VXF_HIDE_MOUNT 0x01000000 -#define VXF_HIDE_NETIF 0x02000000 - -#define VXF_STATE_SETUP (1ULL<<32) -#define VXF_STATE_INIT (1ULL<<33) - -#define VXF_FORK_RSS (1ULL<<48) -#define VXF_PROLIFIC (1ULL<<49) - -#define VXF_IGNEG_NICE (1ULL<<52) +// extern int proc_pid_vx_info(struct task_struct *, char *); -#define VXF_ONE_TIME (0x0003ULL<<32) - -#define VCMD_get_ccaps VC_CMD(FLAGS, 3, 0) -#define VCMD_set_ccaps VC_CMD(FLAGS, 4, 0) - -struct vcmd_ctx_caps_v0 { - uint64_t bcaps; - uint64_t ccaps; - uint64_t cmask; -}; - -#ifdef __KERNEL__ -extern int vc_get_ccaps(uint32_t, void __user *); -extern int vc_set_ccaps(uint32_t, void __user *); +extern long vs_context_state(unsigned int); #endif /* __KERNEL__ */ - -#define VXC_SET_UTSNAME 0x00000001 -#define VXC_SET_RLIMIT 0x00000002 - -#define VXC_RAW_ICMP 0x00000100 - -#define VXC_SECURE_MOUNT 0x00010000 -#define VXC_SECURE_REMOUNT 0x00020000 - - +#else /* _VX_CONTEXT_H */ +#warning duplicate inclusion #endif /* _VX_CONTEXT_H */ diff --git a/include/linux/vserver/context_cmd.h b/include/linux/vserver/context_cmd.h new file mode 100644 index 000000000..637a0d88c --- /dev/null +++ b/include/linux/vserver/context_cmd.h @@ -0,0 +1,73 @@ +#ifndef _VX_CONTEXT_CMD_H +#define _VX_CONTEXT_CMD_H + + +/* vinfo commands */ + +#define VCMD_task_xid VC_CMD(VINFO, 1, 0) + +#ifdef __KERNEL__ +extern int vc_task_xid(uint32_t, void __user *); + +#endif /* __KERNEL__ */ + +#define VCMD_vx_info VC_CMD(VINFO, 5, 0) + +struct vcmd_vx_info_v0 { + uint32_t xid; + uint32_t initpid; + /* more to come */ +}; + +#ifdef __KERNEL__ +extern int 
vc_vx_info(uint32_t, void __user *); + +#endif /* __KERNEL__ */ + + +/* context commands */ + +#define VCMD_ctx_create VC_CMD(VPROC, 1, 0) +#define VCMD_ctx_migrate VC_CMD(PROCMIG, 1, 0) + +#ifdef __KERNEL__ +extern int vc_ctx_create(uint32_t, void __user *); +extern int vc_ctx_migrate(uint32_t, void __user *); + +#endif /* __KERNEL__ */ + + +/* flag commands */ + +#define VCMD_get_cflags VC_CMD(FLAGS, 1, 0) +#define VCMD_set_cflags VC_CMD(FLAGS, 2, 0) + +struct vcmd_ctx_flags_v0 { + uint64_t flagword; + uint64_t mask; +}; + +#ifdef __KERNEL__ +extern int vc_get_cflags(uint32_t, void __user *); +extern int vc_set_cflags(uint32_t, void __user *); + +#endif /* __KERNEL__ */ + + +/* context caps commands */ + +#define VCMD_get_ccaps VC_CMD(FLAGS, 3, 0) +#define VCMD_set_ccaps VC_CMD(FLAGS, 4, 0) + +struct vcmd_ctx_caps_v0 { + uint64_t bcaps; + uint64_t ccaps; + uint64_t cmask; +}; + +#ifdef __KERNEL__ +extern int vc_get_ccaps(uint32_t, void __user *); +extern int vc_set_ccaps(uint32_t, void __user *); + +#endif /* __KERNEL__ */ +#endif /* _VX_CONTEXT_CMD_H */ diff --git a/include/linux/vserver/cvirt.h b/include/linux/vserver/cvirt.h index ba3a25356..31c47a72a 100644 --- a/include/linux/vserver/cvirt.h +++ b/include/linux/vserver/cvirt.h @@ -1,133 +1,18 @@ -#if defined(__KERNEL__) && defined(_VX_INFO_DEF_) - -#include -#include -#include -#include -#include - -/* context sub struct */ - -struct _vx_cvirt { - int max_threads; - - unsigned int bias_cswtch; - struct timespec bias_idle; - struct timespec bias_tp; - uint64_t bias_jiffies; - - struct new_utsname utsname; -}; - -struct sock_acc { - atomic_t count; - atomic_t total; -}; - -struct _vx_cacct { - atomic_t nr_threads; - int nr_running; - - unsigned long total_forks; - - struct sock_acc sock[5][3]; -}; - - -static inline long vx_sock_count(struct _vx_cacct *cacct, int type, int pos) -{ - return atomic_read(&cacct->sock[type][pos].count); -} - - -static inline long vx_sock_total(struct _vx_cacct *cacct, int type, 
int pos) -{ - return atomic_read(&cacct->sock[type][pos].total); -} - - -extern uint64_t vx_idle_jiffies(void); - -static inline void vx_info_init_cvirt(struct _vx_cvirt *cvirt) -{ - uint64_t idle_jiffies = vx_idle_jiffies(); - - // new->virt.bias_cswtch = kstat.context_swtch; - cvirt->bias_jiffies = get_jiffies_64(); - - jiffies_to_timespec(idle_jiffies, &cvirt->bias_idle); - do_posix_clock_monotonic_gettime(&cvirt->bias_tp); - - down_read(&uts_sem); - cvirt->utsname = system_utsname; - up_read(&uts_sem); -} - -static inline void vx_info_exit_cvirt(struct _vx_cvirt *cvirt) -{ - return; -} - -static inline void vx_info_init_cacct(struct _vx_cacct *cacct) -{ - int i,j; - - atomic_set(&cacct->nr_threads, 1); - for (i=0; i<5; i++) { - for (j=0; j<3; j++) { - atomic_set(&cacct->sock[i][j].count, 0); - atomic_set(&cacct->sock[i][j].total, 0); - } - } -} - -static inline void vx_info_exit_cacct(struct _vx_cacct *cacct) -{ - return; -} - -static inline int vx_info_proc_cvirt(struct _vx_cvirt *cvirt, char *buffer) -{ - int length = 0; - return length; -} - -static inline int vx_info_proc_cacct(struct _vx_cacct *cacct, char *buffer) -{ - int i,j, length = 0; - static char *type[] = { "UNSPEC", "UNIX", "INET", "INET6", "OTHER" }; - - for (i=0; i<5; i++) { - length += sprintf(buffer + length, - "%s:", type[i]); - for (j=0; j<3; j++) { - length += sprintf(buffer + length, - "\t%12lu/%-12lu" - ,vx_sock_count(cacct, i, j) - ,vx_sock_total(cacct, i, j) - ); - } - buffer[length++] = '\n'; - } - return length; -} - -#else /* _VX_INFO_DEF_ */ #ifndef _VX_CVIRT_H #define _VX_CVIRT_H -#include "switch.h" +#ifdef __KERNEL__ -/* cvirt vserver commands */ +struct timespec; +void vx_vsi_uptime(struct timespec *, struct timespec *); -#ifdef __KERNEL__ -struct timespec; +struct vx_info; -void vx_vsi_uptime(struct timespec *uptime, struct timespec *idle); +void vx_update_load(struct vx_info *); #endif /* __KERNEL__ */ - +#else /* _VX_CVIRT_H */ +#warning duplicate inclusion #endif /* 
_VX_CVIRT_H */ -#endif diff --git a/include/linux/vserver/cvirt_cmd.h b/include/linux/vserver/cvirt_cmd.h new file mode 100644 index 000000000..368f52732 --- /dev/null +++ b/include/linux/vserver/cvirt_cmd.h @@ -0,0 +1,7 @@ +#ifndef _VX_CVIRT_CMD_H +#define _VX_CVIRT_CMD_H + +/* cvirt vserver commands */ + + +#endif /* _VX_CVIRT_CMD_H */ diff --git a/include/linux/vserver/cvirt_def.h b/include/linux/vserver/cvirt_def.h new file mode 100644 index 000000000..bf4bd848a --- /dev/null +++ b/include/linux/vserver/cvirt_def.h @@ -0,0 +1,59 @@ +#ifndef _VX_CVIRT_DEF_H +#define _VX_CVIRT_DEF_H + +#include +#include +#include +#include +#include + + +struct _vx_usage_stat { + uint64_t user; + uint64_t nice; + uint64_t system; + uint64_t softirq; + uint64_t irq; + uint64_t idle; + uint64_t iowait; +}; + +/* context sub struct */ + +struct _vx_cvirt { + int max_threads; /* maximum allowed threads */ + atomic_t nr_threads; /* number of current threads */ + atomic_t nr_running; /* number of running threads */ + atomic_t nr_uninterruptible; /* number of uninterruptible threads */ + + atomic_t nr_onhold; /* processes on hold */ + uint32_t onhold_last; /* jiffies when put on hold */ + + struct timespec bias_idle; + struct timespec bias_uptime; /* context creation point */ + uint64_t bias_clock; /* offset in clock_t */ + + struct new_utsname utsname; + + spinlock_t load_lock; /* lock for the load averages */ + atomic_t load_updates; /* nr of load updates done so far */ + uint32_t load_last; /* last time load was calculated */ + uint32_t load[3]; /* load averages 1,5,15 */ + + struct _vx_usage_stat cpustat[NR_CPUS]; +}; + +struct _vx_sock_acc { + atomic_t count; + atomic_t total; +}; + +/* context sub struct */ + +struct _vx_cacct { + unsigned long total_forks; + + struct _vx_sock_acc sock[5][3]; +}; + +#endif /* _VX_CVIRT_DEF_H */ diff --git a/include/linux/vserver/debug.h b/include/linux/vserver/debug.h index 15b52c930..f6b27cfd3 100644 --- a/include/linux/vserver/debug.h +++
b/include/linux/vserver/debug.h @@ -2,6 +2,19 @@ #define _VX_DEBUG_H +#define VXD_CBIT(n,m) (vx_debug_ ## n & (1 << (m))) +#define VXD_CMIN(n,m) (vx_debug_ ## n > (m)) +#define VXD_MASK(n,m) (vx_debug_ ## n & (m)) + +#define VXD_QPOS(v,p) (((uint32_t)(v) >> ((p)*8)) & 0xFF) +#define VXD_QUAD(v) VXD_QPOS(v,0), VXD_QPOS(v,1), \ + VXD_QPOS(v,2), VXD_QPOS(v,3) + +#define __FUNC__ __func__ + + +#ifdef CONFIG_VSERVER_DEBUG + extern unsigned int vx_debug_switch; extern unsigned int vx_debug_xid; extern unsigned int vx_debug_nid; @@ -11,36 +24,245 @@ extern unsigned int vx_debug_dlim; extern unsigned int vx_debug_cvirt; -#define VXD_CBIT(n,m) (vx_debug_ ## n & (1 << (m))) -#define VXD_CMIN(n,m) (vx_debug_ ## n > (m)) -#define VXD_MASK(n,m) (vx_debug_ ## n & (m)) - -// #define VXD_HERE __FILE__, __LINE__ - - -#ifdef CONFIG_VSERVER_DEBUG - -#define VX_LOGLEVEL "vxD: " +#define VX_LOGLEVEL "vxD: " +#define VX_WARNLEVEL KERN_WARNING "vxW: " #define vxdprintk(c,f,x...) \ do { \ if (c) \ - printk(VX_LOGLEVEL f "\n", x); \ - } while (0) + printk(VX_LOGLEVEL f "\n" , ##x); \ + } while (0) #define vxlprintk(c,f,x...) \ do { \ if (c) \ printk(VX_LOGLEVEL f " @%s:%d\n", x); \ - } while (0) + } while (0) + +#define vxfprintk(c,f,x...) \ + do { \ + if (c) \ + printk(VX_LOGLEVEL f " %s@%s:%d\n", x); \ + } while (0) + + +#define vxwprintk(c,f,x...) \ + do { \ + if (c) \ + printk(VX_WARNLEVEL f "\n" , ##x); \ + } while (0) + + +#define vxd_path(d,m) \ + ({ static char _buffer[PATH_MAX]; \ + d_path((d), (m), _buffer, sizeof(_buffer)); }) + +#else /* CONFIG_VSERVER_DEBUG */ + +#define vx_debug_switch 0 +#define vx_debug_xid 0 +#define vx_debug_nid 0 +#define vx_debug_net 0 +#define vx_debug_limit 0 +#define vx_debug_dlim 0 +#define vx_debug_cvirt 0 + +#define vxdprintk(x...) do { } while (0) +#define vxlprintk(x...) do { } while (0) +#define vxfprintk(x...) do { } while (0) +#define vxwprintk(x...) 
do { } while (0) + +#define vxd_path "" + +#endif /* CONFIG_VSERVER_DEBUG */ + + +/* history stuff */ + +#ifdef CONFIG_VSERVER_HISTORY + + +extern unsigned volatile int vxh_active; + +struct _vxhe_vxi { + struct vx_info *ptr; + unsigned xid; + unsigned usecnt; + unsigned refcnt; +}; + +struct _vxhe_set_clr { + void *data; +}; + +struct _vxhe_loc_lookup { + unsigned arg; +}; + +enum { + VXH_UNUSED=0, + VXH_THROW_OOPS=1, + + VXH_GET_VX_INFO, + VXH_PUT_VX_INFO, + VXH_SET_VX_INFO, + VXH_CLR_VX_INFO, + VXH_ALLOC_VX_INFO, + VXH_DEALLOC_VX_INFO, + VXH_HASH_VX_INFO, + VXH_UNHASH_VX_INFO, + VXH_LOC_VX_INFO, + VXH_LOOKUP_VX_INFO, +}; + +struct _vx_hist_entry { + void *loc; + unsigned short seq; + unsigned short type; + struct _vxhe_vxi vxi; + union { + struct _vxhe_set_clr sc; + struct _vxhe_loc_lookup ll; + }; +}; + +struct _vx_hist_entry *vxh_advance(void *loc); + +#define VXH_HERE() \ + ({ __label__ here; \ + here:; \ + &&here; }) + + + +static inline void __vxh_copy_vxi(struct _vx_hist_entry *entry, struct vx_info *vxi) +{ + entry->vxi.ptr = vxi; + if (vxi) { + entry->vxi.usecnt = atomic_read(&vxi->vx_usecnt); + entry->vxi.refcnt = atomic_read(&vxi->vx_refcnt); + entry->vxi.xid = vxi->vx_id; + } +} + +static inline void vxh_throw_oops(void) +{ + struct _vx_hist_entry *entry = vxh_advance(VXH_HERE()); + + entry->type = VXH_THROW_OOPS; + + /* prevent further acquisition */ + vxh_active = 0; +} + +static inline void vxh_get_vx_info(struct vx_info *vxi) +{ + struct _vx_hist_entry *entry = vxh_advance(VXH_HERE()); + + __vxh_copy_vxi(entry, vxi); + entry->type = VXH_GET_VX_INFO; +} + +static inline void vxh_put_vx_info(struct vx_info *vxi) +{ + struct _vx_hist_entry *entry = vxh_advance(VXH_HERE()); + + __vxh_copy_vxi(entry, vxi); + entry->type = VXH_PUT_VX_INFO; +} + +static inline void vxh_set_vx_info(struct vx_info *vxi, void *data) +{ + struct _vx_hist_entry *entry = vxh_advance(VXH_HERE()); + + __vxh_copy_vxi(entry, vxi); + entry->sc.data = data; + entry->type = 
VXH_SET_VX_INFO; +} + +static inline void vxh_clr_vx_info(struct vx_info *vxi, void *data) +{ + struct _vx_hist_entry *entry = vxh_advance(VXH_HERE()); + + __vxh_copy_vxi(entry, vxi); + entry->sc.data = data; + entry->type = VXH_CLR_VX_INFO; +} + +static inline void vxh_alloc_vx_info(struct vx_info *vxi) +{ + struct _vx_hist_entry *entry = vxh_advance(VXH_HERE()); + + __vxh_copy_vxi(entry, vxi); + entry->type = VXH_ALLOC_VX_INFO; +} + +static inline void vxh_dealloc_vx_info(struct vx_info *vxi) +{ + struct _vx_hist_entry *entry = vxh_advance(VXH_HERE()); + + __vxh_copy_vxi(entry, vxi); + entry->type = VXH_DEALLOC_VX_INFO; +} + +static inline void vxh_hash_vx_info(struct vx_info *vxi) +{ + struct _vx_hist_entry *entry = vxh_advance(VXH_HERE()); + + __vxh_copy_vxi(entry, vxi); + entry->type = VXH_HASH_VX_INFO; +} + +static inline void vxh_unhash_vx_info(struct vx_info *vxi) +{ + struct _vx_hist_entry *entry = vxh_advance(VXH_HERE()); + + __vxh_copy_vxi(entry, vxi); + entry->type = VXH_UNHASH_VX_INFO; +} + +static inline void vxh_loc_vx_info(unsigned arg, struct vx_info *vxi) +{ + struct _vx_hist_entry *entry = vxh_advance(VXH_HERE()); + + __vxh_copy_vxi(entry, vxi); + entry->ll.arg = arg; + entry->type = VXH_LOC_VX_INFO; +} + +static inline void vxh_lookup_vx_info(unsigned arg, struct vx_info *vxi) +{ + struct _vx_hist_entry *entry = vxh_advance(VXH_HERE()); + + __vxh_copy_vxi(entry, vxi); + entry->ll.arg = arg; + entry->type = VXH_LOOKUP_VX_INFO; +} + +extern void vxh_dump_history(void); + +#else /* CONFIG_VSERVER_HISTORY */ + +#define vxh_throw_oops() do { } while (0) + +#define vxh_get_vx_info(v) do { } while (0) +#define vxh_put_vx_info(v) do { } while (0) + +#define vxh_set_vx_info(v,d) do { } while (0) +#define vxh_clr_vx_info(v,d) do { } while (0) + +#define vxh_alloc_vx_info(v) do { } while (0) +#define vxh_dealloc_vx_info(v) do { } while (0) -#else +#define vxh_hash_vx_info(v) do { } while (0) +#define vxh_unhash_vx_info(v) do { } while (0) -#define 
vxdprintk(x...) do { } while (0) -#define vxlprintk(x...) do { } while (0) +#define vxh_loc_vx_info(a,v) do { } while (0) +#define vxh_lookup_vx_info(a,v) do { } while (0) -#endif +#define vxh_dump_history() do { } while (0) +#endif /* CONFIG_VSERVER_HISTORY */ #endif /* _VX_DEBUG_H */ diff --git a/include/linux/vserver/debug_cmd.h b/include/linux/vserver/debug_cmd.h new file mode 100644 index 000000000..c0cbd0845 --- /dev/null +++ b/include/linux/vserver/debug_cmd.h @@ -0,0 +1,14 @@ +#ifndef _VX_DEBUG_CMD_H +#define _VX_DEBUG_CMD_H + + +/* debug commands */ + +#define VCMD_dump_history VC_CMD(DEBUG, 1, 0) + +#ifdef __KERNEL__ + +extern int vc_dump_history(uint32_t); + +#endif /* __KERNEL__ */ +#endif /* _VX_DEBUG_CMD_H */ diff --git a/include/linux/vserver/dlimit.h b/include/linux/vserver/dlimit.h index 14a68fd04..0c6587eb4 100644 --- a/include/linux/vserver/dlimit.h +++ b/include/linux/vserver/dlimit.h @@ -79,7 +79,7 @@ extern int vc_get_dlimit(uint32_t, void __user *); typedef uint64_t dlsize_t; - #endif /* __KERNEL__ */ - +#else /* _VX_DLIMIT_H */ +#warning duplicate inclusion #endif /* _VX_DLIMIT_H */ diff --git a/include/linux/vserver/inode.h b/include/linux/vserver/inode.h index dac07ea48..a1054e831 100644 --- a/include/linux/vserver/inode.h +++ b/include/linux/vserver/inode.h @@ -57,10 +57,6 @@ extern int vc_set_iattr_v0(uint32_t, void __user *); extern int vc_get_iattr(uint32_t, void __user *); extern int vc_set_iattr(uint32_t, void __user *); -extern int vc_iattr_ioctl(struct dentry *de, - unsigned int cmd, - unsigned long arg); - #endif /* __KERNEL__ */ /* inode ioctls */ @@ -68,7 +64,6 @@ extern int vc_iattr_ioctl(struct dentry *de, #define FIOC_GETXFLG _IOR('x', 5, long) #define FIOC_SETXFLG _IOW('x', 6, long) -#define FIOC_GETIATTR _IOR('x', 7, long) -#define FIOC_SETIATTR _IOR('x', 8, long) - +#else /* _VX_INODE_H */ +#warning duplicate inclusion #endif /* _VX_INODE_H */ diff --git a/include/linux/vserver/legacy.h b/include/linux/vserver/legacy.h 
index 1372c0fa6..d5b8a3f21 100644 --- a/include/linux/vserver/legacy.h +++ b/include/linux/vserver/legacy.h @@ -2,7 +2,6 @@ #define _VX_LEGACY_H #include "switch.h" -#include "network.h" /* compatibiliy vserver commands */ @@ -13,12 +12,12 @@ /* compatibiliy vserver arguments */ -struct vcmd_new_s_context_v1 { +struct vcmd_new_s_context_v1 { uint32_t remove_cap; uint32_t flags; }; -struct vcmd_set_ipv4root_v3 { +struct vcmd_set_ipv4root_v3 { /* number of pairs in id */ uint32_t broadcast; struct { @@ -40,7 +39,7 @@ struct vcmd_set_ipv4root_v3 { /* of the context */ #define VX_INFO_NAMESPACE 128 /* save private namespace */ - + #define NB_S_CONTEXT 16 #define NB_IPV4ROOT 16 diff --git a/include/linux/vserver/limit.h b/include/linux/vserver/limit.h index 27496c1f2..0ed0e2b27 100644 --- a/include/linux/vserver/limit.h +++ b/include/linux/vserver/limit.h @@ -1,117 +1,20 @@ -#if defined(__KERNEL__) && defined(_VX_INFO_DEF_) - -#include -#include - -/* context sub struct */ - -#define RLIMIT_OPENFD 12 - -#define NUM_RLIMITS 16 - -#define VLIMIT_SOCK 16 - - -struct _vx_limit { - atomic_t ticks; - - unsigned long rlim[NUM_RLIMITS]; /* Per context limit */ - atomic_t res[NUM_RLIMITS]; /* Current value */ -}; - -static inline void vx_info_init_limit(struct _vx_limit *limit) -{ - int lim; - - for (lim=0; limrlim[lim] = RLIM_INFINITY; - atomic_set(&limit->res[lim], 0); - } -} - -extern unsigned int vx_debug_limit; - -static inline void vx_info_exit_limit(struct _vx_limit *limit) -{ - int lim, value; - - for (lim=0; limres[lim]); - if (value && vx_debug_limit) - printk("!!! 
limit: %p[%d] = %d on exit.\n", - limit, lim, value); - } -} - - -static inline int vx_info_proc_limit(struct _vx_limit *limit, char *buffer) -{ - return sprintf(buffer, - "PROC:\t%8d/%ld\n" - "VM:\t%8d/%ld\n" - "VML:\t%8d/%ld\n" - "RSS:\t%8d/%ld\n" - "FILES:\t%8d/%ld\n" - "OFD:\t%8d/%ld\n" - ,atomic_read(&limit->res[RLIMIT_NPROC]) - ,limit->rlim[RLIMIT_NPROC] - ,atomic_read(&limit->res[RLIMIT_AS]) - ,limit->rlim[RLIMIT_AS] - ,atomic_read(&limit->res[RLIMIT_MEMLOCK]) - ,limit->rlim[RLIMIT_MEMLOCK] - ,atomic_read(&limit->res[RLIMIT_RSS]) - ,limit->rlim[RLIMIT_RSS] - ,atomic_read(&limit->res[RLIMIT_NOFILE]) - ,limit->rlim[RLIMIT_NOFILE] - ,atomic_read(&limit->res[RLIMIT_OPENFD]) - ,limit->rlim[RLIMIT_OPENFD] - ); -} - -#else /* _VX_INFO_DEF_ */ #ifndef _VX_LIMIT_H #define _VX_LIMIT_H -#include "switch.h" - -/* rlimit vserver commands */ - -#define VCMD_get_rlimit VC_CMD(RLIMIT, 1, 0) -#define VCMD_set_rlimit VC_CMD(RLIMIT, 2, 0) -#define VCMD_get_rlimit_mask VC_CMD(RLIMIT, 3, 0) - -struct vcmd_ctx_rlimit_v0 { - uint32_t id; - uint64_t minimum; - uint64_t softlimit; - uint64_t maximum; -}; - -struct vcmd_ctx_rlimit_mask_v0 { - uint32_t minimum; - uint32_t softlimit; - uint32_t maximum; -}; - -#define CRLIM_UNSET (0ULL) -#define CRLIM_INFINITY (~0ULL) -#define CRLIM_KEEP (~1ULL) - #ifdef __KERNEL__ -#include - -extern int vc_get_rlimit(uint32_t, void __user *); -extern int vc_set_rlimit(uint32_t, void __user *); -extern int vc_get_rlimit_mask(uint32_t, void __user *); - struct sysinfo; void vx_vsi_meminfo(struct sysinfo *); void vx_vsi_swapinfo(struct sysinfo *); +#define VXD_RLIMIT(r,l) (VXD_CBIT(limit, (l)) && ((r) == (l))) -#endif /* __KERNEL__ */ +#define NUM_LIMITS 20 +#define VLIMIT_NSOCK 16 + +extern const char *vlimit_name[NUM_LIMITS]; + +#endif /* __KERNEL__ */ #endif /* _VX_LIMIT_H */ -#endif diff --git a/include/linux/vserver/limit_cmd.h b/include/linux/vserver/limit_cmd.h new file mode 100644 index 000000000..a994d02ea --- /dev/null +++ 
b/include/linux/vserver/limit_cmd.h @@ -0,0 +1,36 @@ +#ifndef _VX_LIMIT_CMD_H +#define _VX_LIMIT_CMD_H + +/* rlimit vserver commands */ + +#define VCMD_get_rlimit VC_CMD(RLIMIT, 1, 0) +#define VCMD_set_rlimit VC_CMD(RLIMIT, 2, 0) +#define VCMD_get_rlimit_mask VC_CMD(RLIMIT, 3, 0) + +struct vcmd_ctx_rlimit_v0 { + uint32_t id; + uint64_t minimum; + uint64_t softlimit; + uint64_t maximum; +}; + +struct vcmd_ctx_rlimit_mask_v0 { + uint32_t minimum; + uint32_t softlimit; + uint32_t maximum; +}; + +#define CRLIM_UNSET (0ULL) +#define CRLIM_INFINITY (~0ULL) +#define CRLIM_KEEP (~1ULL) + +#ifdef __KERNEL__ + +#include + +extern int vc_get_rlimit(uint32_t, void __user *); +extern int vc_set_rlimit(uint32_t, void __user *); +extern int vc_get_rlimit_mask(uint32_t, void __user *); + +#endif /* __KERNEL__ */ +#endif /* _VX_LIMIT_CMD_H */ diff --git a/include/linux/vserver/limit_def.h b/include/linux/vserver/limit_def.h new file mode 100644 index 000000000..bab1def7f --- /dev/null +++ b/include/linux/vserver/limit_def.h @@ -0,0 +1,21 @@ +#ifndef _VX_LIMIT_DEF_H +#define _VX_LIMIT_DEF_H + +#include +#include + +#include "limit.h" + +/* context sub struct */ + +struct _vx_limit { + atomic_t ticks; + + unsigned long rlim[NUM_LIMITS]; /* Context limit */ + unsigned long rmax[NUM_LIMITS]; /* Context maximum */ + atomic_t rcur[NUM_LIMITS]; /* Current value */ + atomic_t lhit[NUM_LIMITS]; /* Limit hits */ +}; + + +#endif /* _VX_LIMIT_DEF_H */ diff --git a/include/linux/vserver/namespace.h b/include/linux/vserver/namespace.h index 140fc79f2..72a51f6df 100644 --- a/include/linux/vserver/namespace.h +++ b/include/linux/vserver/namespace.h @@ -3,13 +3,13 @@ #include - + /* virtual host info names */ #define VCMD_vx_set_vhi_name VC_CMD(VHOST, 1, 0) #define VCMD_vx_get_vhi_name VC_CMD(VHOST, 2, 0) -struct vcmd_vx_vhi_name_v0 { +struct vcmd_vx_vhi_name_v0 { uint32_t field; char name[65]; }; @@ -44,6 +44,9 @@ extern int vc_get_vhi_name(uint32_t, void __user *); struct vx_info; struct 
namespace; struct fs_struct; +struct vfsmount; + +extern int vx_check_vfsmount(struct vx_info *, struct vfsmount *); extern int vx_set_namespace(struct vx_info *, struct namespace *, struct fs_struct *); @@ -52,4 +55,6 @@ extern int vc_cleanup_namespace(uint32_t, void __user *); extern int vc_set_namespace(uint32_t, void __user *); #endif /* __KERNEL__ */ +#else /* _VX_NAMESPACE_H */ +#warning duplicate inclusion #endif /* _VX_NAMESPACE_H */ diff --git a/include/linux/vserver/network.h b/include/linux/vserver/network.h index e77866b5f..b1ccb9aa1 100644 --- a/include/linux/vserver/network.h +++ b/include/linux/vserver/network.h @@ -1,19 +1,21 @@ #ifndef _VX_NETWORK_H #define _VX_NETWORK_H +#include + + #define MAX_N_CONTEXT 65535 /* Arbitrary limit */ #define NX_DYNAMIC_ID ((uint32_t)-1) /* id for dynamic context */ #define NB_IPV4ROOT 16 + #ifdef __KERNEL__ #include #include -#include #include -#include #include @@ -43,9 +45,10 @@ struct nx_info { struct rcu_head; -extern void rcu_free_nx_info(struct rcu_head *); extern void unhash_nx_info(struct nx_info *); +extern void free_nx_info(struct nx_info *); + extern struct nx_info *locate_nx_info(int); extern struct nx_info *locate_or_create_nx_info(int); @@ -64,89 +67,7 @@ struct sock; int nx_addr_conflict(struct nx_info *, uint32_t, struct sock *); - #endif /* __KERNEL__ */ - -#include "switch.h" - -/* vinfo commands */ - -#define VCMD_task_nid VC_CMD(VINFO, 2, 0) - -#ifdef __KERNEL__ -extern int vc_task_nid(uint32_t, void __user *); - -#endif /* __KERNEL__ */ - -#define VCMD_nx_info VC_CMD(VINFO, 6, 0) - -struct vcmd_nx_info_v0 { - uint32_t nid; - /* more to come */ -}; - -#ifdef __KERNEL__ -extern int vc_nx_info(uint32_t, void __user *); - -#endif /* __KERNEL__ */ - -#define VCMD_net_create VC_CMD(VNET, 1, 0) -#define VCMD_net_migrate VC_CMD(NETMIG, 1, 0) - -#define VCMD_net_add VC_CMD(NETALT, 1, 0) -#define VCMD_net_remove VC_CMD(NETALT, 2, 0) - -struct vcmd_net_nx_v0 { - uint16_t type; - uint16_t count; - uint32_t 
ip[4]; - uint32_t mask[4]; - /* more to come */ -}; - -// IPN_TYPE_IPV4 - - -#ifdef __KERNEL__ -extern int vc_net_create(uint32_t, void __user *); -extern int vc_net_migrate(uint32_t, void __user *); - -#endif /* __KERNEL__ */ - -#define VCMD_get_nflags VC_CMD(FLAGS, 5, 0) -#define VCMD_set_nflags VC_CMD(FLAGS, 6, 0) - -struct vcmd_net_flags_v0 { - uint64_t flagword; - uint64_t mask; -}; - -#ifdef __KERNEL__ -extern int vc_get_nflags(uint32_t, void __user *); -extern int vc_set_nflags(uint32_t, void __user *); - -#endif /* __KERNEL__ */ - -#define IPF_STATE_SETUP (1ULL<<32) - - -#define IPF_ONE_TIME (0x0001ULL<<32) - -#define VCMD_get_ncaps VC_CMD(FLAGS, 7, 0) -#define VCMD_set_ncaps VC_CMD(FLAGS, 8, 0) - -struct vcmd_net_caps_v0 { - uint64_t ncaps; - uint64_t cmask; -}; - -#ifdef __KERNEL__ -extern int vc_get_ncaps(uint32_t, void __user *); -extern int vc_set_ncaps(uint32_t, void __user *); - -#endif /* __KERNEL__ */ - -#define IPC_WOSSNAME 0x00000001 - - +#else /* _VX_NETWORK_H */ +#warning duplicate inclusion #endif /* _VX_NETWORK_H */ diff --git a/include/linux/vserver/network_cmd.h b/include/linux/vserver/network_cmd.h new file mode 100644 index 000000000..4403f549f --- /dev/null +++ b/include/linux/vserver/network_cmd.h @@ -0,0 +1,81 @@ +#ifndef _VX_NETWORK_CMD_H +#define _VX_NETWORK_CMD_H + + +/* vinfo commands */ + +#define VCMD_task_nid VC_CMD(VINFO, 2, 0) + +#ifdef __KERNEL__ +extern int vc_task_nid(uint32_t, void __user *); + +#endif /* __KERNEL__ */ + +#define VCMD_nx_info VC_CMD(VINFO, 6, 0) + +struct vcmd_nx_info_v0 { + uint32_t nid; + /* more to come */ +}; + +#ifdef __KERNEL__ +extern int vc_nx_info(uint32_t, void __user *); + +#endif /* __KERNEL__ */ + +#define VCMD_net_create VC_CMD(VNET, 1, 0) +#define VCMD_net_migrate VC_CMD(NETMIG, 1, 0) + +#define VCMD_net_add VC_CMD(NETALT, 1, 0) +#define VCMD_net_remove VC_CMD(NETALT, 2, 0) + +struct vcmd_net_nx_v0 { + uint16_t type; + uint16_t count; + uint32_t ip[4]; + uint32_t mask[4]; + /* more to come 
*/ +}; + +// IPN_TYPE_IPV4 + + +#ifdef __KERNEL__ +extern int vc_net_create(uint32_t, void __user *); +extern int vc_net_migrate(uint32_t, void __user *); + +#endif /* __KERNEL__ */ + +#define VCMD_get_nflags VC_CMD(FLAGS, 5, 0) +#define VCMD_set_nflags VC_CMD(FLAGS, 6, 0) + +struct vcmd_net_flags_v0 { + uint64_t flagword; + uint64_t mask; +}; + +#ifdef __KERNEL__ +extern int vc_get_nflags(uint32_t, void __user *); +extern int vc_set_nflags(uint32_t, void __user *); + +#endif /* __KERNEL__ */ + +#define IPF_STATE_SETUP (1ULL<<32) + + +#define IPF_ONE_TIME (0x0001ULL<<32) + +#define VCMD_get_ncaps VC_CMD(FLAGS, 7, 0) +#define VCMD_set_ncaps VC_CMD(FLAGS, 8, 0) + +struct vcmd_net_caps_v0 { + uint64_t ncaps; + uint64_t cmask; +}; + +#ifdef __KERNEL__ +extern int vc_get_ncaps(uint32_t, void __user *); +extern int vc_set_ncaps(uint32_t, void __user *); + +#endif /* __KERNEL__ */ +#endif /* _VX_NETWORK_CMD_H */ diff --git a/include/linux/vserver/sched.h b/include/linux/vserver/sched.h index f5982bb4e..e527b4492 100644 --- a/include/linux/vserver/sched.h +++ b/include/linux/vserver/sched.h @@ -1,221 +1,25 @@ -/* _VX_SCHED_H defined below */ - -#if defined(__KERNEL__) && defined(_VX_INFO_DEF_) - -#include -#include -#include -#include -#include - -struct _vx_ticks { - uint64_t user_ticks; /* token tick events */ - uint64_t sys_ticks; /* token tick events */ - uint64_t hold_ticks; /* token ticks paused */ - uint64_t unused[5]; /* cacheline ? */ -}; - -/* context sub struct */ - -struct _vx_sched { - atomic_t tokens; /* number of CPU tokens */ - spinlock_t tokens_lock; /* lock for token bucket */ - - int fill_rate; /* Fill rate: add X tokens...
*/ - int interval; /* Divisor: per Y jiffies */ - int tokens_min; /* Limit: minimum for unhold */ - int tokens_max; /* Limit: no more than N tokens */ - uint32_t jiffies; /* last time accounted */ - - int priority_bias; /* bias offset for priority */ - cpumask_t cpus_allowed; /* cpu mask for context */ - - struct _vx_ticks cpu[NR_CPUS]; -}; - -static inline void vx_info_init_sched(struct _vx_sched *sched) -{ - int i; - - /* scheduling; hard code starting values as constants */ - sched->fill_rate = 1; - sched->interval = 4; - sched->tokens_min = HZ >> 4; - sched->tokens_max = HZ >> 1; - sched->jiffies = jiffies; - sched->tokens_lock = SPIN_LOCK_UNLOCKED; - - atomic_set(&sched->tokens, HZ >> 2); - sched->cpus_allowed = CPU_MASK_ALL; - sched->priority_bias = 0; - - for_each_cpu(i) { - sched->cpu[i].user_ticks = 0; - sched->cpu[i].sys_ticks = 0; - sched->cpu[i].hold_ticks = 0; - } -} - -static inline void vx_info_exit_sched(struct _vx_sched *sched) -{ - return; -} - -static inline int vx_info_proc_sched(struct _vx_sched *sched, char *buffer) -{ - int length = 0; - int i; - - length += sprintf(buffer, - "Token:\t\t%8d\n" - "FillRate:\t%8d\n" - "Interval:\t%8d\n" - "TokensMin:\t%8d\n" - "TokensMax:\t%8d\n" - "PrioBias:\t%8d\n" - ,atomic_read(&sched->tokens) - ,sched->fill_rate - ,sched->interval - ,sched->tokens_min - ,sched->tokens_max - ,sched->priority_bias - ); - - for_each_online_cpu(i) { - length += sprintf(buffer + length, - "cpu %d: %lld %lld %lld\n" - ,i - ,(long long)sched->cpu[i].user_ticks - ,(long long)sched->cpu[i].sys_ticks - ,(long long)sched->cpu[i].hold_ticks - ); - } - - return length; -} - - -#else /* _VX_INFO_DEF_ */ #ifndef _VX_SCHED_H #define _VX_SCHED_H -#include "switch.h" - -/* sched vserver commands */ - -#define VCMD_set_sched_v2 VC_CMD(SCHED, 1, 2) -#define VCMD_set_sched VC_CMD(SCHED, 1, 3) - -struct vcmd_set_sched_v2 { - int32_t fill_rate; - int32_t interval; - int32_t tokens; - int32_t tokens_min; - int32_t tokens_max; - uint64_t cpu_mask; 
-}; - -struct vcmd_set_sched_v3 { - uint32_t set_mask; - int32_t fill_rate; - int32_t interval; - int32_t tokens; - int32_t tokens_min; - int32_t tokens_max; - int32_t priority_bias; -}; - - -#define VXSM_FILL_RATE 0x0001 -#define VXSM_INTERVAL 0x0002 -#define VXSM_TOKENS 0x0010 -#define VXSM_TOKENS_MIN 0x0020 -#define VXSM_TOKENS_MAX 0x0040 -#define VXSM_PRIO_BIAS 0x0100 - -#define SCHED_KEEP (-2) - #ifdef __KERNEL__ -extern int vc_set_sched_v1(uint32_t, void __user *); -extern int vc_set_sched_v2(uint32_t, void __user *); -extern int vc_set_sched(uint32_t, void __user *); +struct timespec; +void vx_vsi_uptime(struct timespec *, struct timespec *); -#define VAVAVOOM_RATIO 50 -#define MAX_PRIO_BIAS 20 -#define MIN_PRIO_BIAS -20 +struct vx_info; -#include "context.h" +void vx_update_load(struct vx_info *); -/* scheduling stuff */ +struct task_struct; int effective_vavavoom(struct task_struct *, int); int vx_tokens_recalc(struct vx_info *); -/* new stuff ;) */ - -static inline int vx_tokens_avail(struct vx_info *vxi) -{ - return atomic_read(&vxi->sched.tokens); -} - -static inline void vx_consume_token(struct vx_info *vxi) -{ - atomic_dec(&vxi->sched.tokens); -} - -static inline int vx_need_resched(struct task_struct *p) -{ -#ifdef CONFIG_VSERVER_HARDCPU - struct vx_info *vxi = p->vx_info; -#endif - int slice = --p->time_slice; - -#ifdef CONFIG_VSERVER_HARDCPU - if (vxi) { - int tokens; - - if ((tokens = vx_tokens_avail(vxi)) > 0) - vx_consume_token(vxi); - /* for tokens > 0, one token was consumed */ - if (tokens < 2) - return 1; - } -#endif - return (slice == 0); -} - - -static inline void vx_onhold_inc(struct vx_info *vxi) -{ - int onhold = atomic_read(&vxi->cvirt.nr_onhold); - - atomic_inc(&vxi->cvirt.nr_onhold); - if (!onhold) - vxi->cvirt.onhold_last = jiffies; -} - -static inline void __vx_onhold_update(struct vx_info *vxi) -{ - int cpu = smp_processor_id(); - uint32_t now = jiffies; - uint32_t delta = now - vxi->cvirt.onhold_last; - - vxi->cvirt.onhold_last = 
now; - vxi->sched.cpu[cpu].hold_ticks += delta; -} - -static inline void vx_onhold_dec(struct vx_info *vxi) -{ - if (atomic_dec_and_test(&vxi->cvirt.nr_onhold)) - __vx_onhold_update(vxi); -} - #endif /* __KERNEL__ */ - +#else /* _VX_SCHED_H */ +#warning duplicate inclusion #endif /* _VX_SCHED_H */ -#endif diff --git a/include/linux/vserver/sched_cmd.h b/include/linux/vserver/sched_cmd.h new file mode 100644 index 000000000..2a6f55bda --- /dev/null +++ b/include/linux/vserver/sched_cmd.h @@ -0,0 +1,47 @@ +#ifndef _VX_SCHED_CMD_H +#define _VX_SCHED_CMD_H + +/* sched vserver commands */ + +#define VCMD_set_sched_v2 VC_CMD(SCHED, 1, 2) +#define VCMD_set_sched VC_CMD(SCHED, 1, 3) + +struct vcmd_set_sched_v2 { + int32_t fill_rate; + int32_t interval; + int32_t tokens; + int32_t tokens_min; + int32_t tokens_max; + uint64_t cpu_mask; +}; + +struct vcmd_set_sched_v3 { + uint32_t set_mask; + int32_t fill_rate; + int32_t interval; + int32_t tokens; + int32_t tokens_min; + int32_t tokens_max; + int32_t priority_bias; +}; + + +#define VXSM_FILL_RATE 0x0001 +#define VXSM_INTERVAL 0x0002 +#define VXSM_TOKENS 0x0010 +#define VXSM_TOKENS_MIN 0x0020 +#define VXSM_TOKENS_MAX 0x0040 +#define VXSM_PRIO_BIAS 0x0100 + +#define SCHED_KEEP (-2) + +#ifdef __KERNEL__ + +#include + +extern int vc_set_sched_v1(uint32_t, void __user *); +extern int vc_set_sched_v2(uint32_t, void __user *); +extern int vc_set_sched(uint32_t, void __user *); + +#endif /* __KERNEL__ */ +#endif /* _VX_SCHED_CMD_H */ diff --git a/include/linux/vserver/sched_def.h b/include/linux/vserver/sched_def.h new file mode 100644 index 000000000..e85c09ff7 --- /dev/null +++ b/include/linux/vserver/sched_def.h @@ -0,0 +1,36 @@ +#ifndef _VX_SCHED_DEF_H +#define _VX_SCHED_DEF_H + +#include +#include +#include +#include +#include + + +struct _vx_ticks { + uint64_t user_ticks; /* token tick events */ + uint64_t sys_ticks; /* token tick events */ + uint64_t hold_ticks; /* token ticks paused */ + uint64_t unused[5]; /* cacheline ? 
*/ +}; + +/* context sub struct */ + +struct _vx_sched { + atomic_t tokens; /* number of CPU tokens */ + spinlock_t tokens_lock; /* lock for token bucket */ + + int fill_rate; /* Fill rate: add X tokens... */ + int interval; /* Divisor: per Y jiffies */ + int tokens_min; /* Limit: minimum for unhold */ + int tokens_max; /* Limit: no more than N tokens */ + uint32_t jiffies; /* last time accounted */ + + int priority_bias; /* bias offset for priority */ + cpumask_t cpus_allowed; /* cpu mask for context */ + + struct _vx_ticks cpu[NR_CPUS]; +}; + +#endif /* _VX_SCHED_DEF_H */ diff --git a/include/linux/vserver/switch.h b/include/linux/vserver/switch.h index 5fef6907b..81f5c2398 100644 --- a/include/linux/vserver/switch.h +++ b/include/linux/vserver/switch.h @@ -12,11 +12,11 @@ /* - Syscall Matrix V2.6 + Syscall Matrix V2.8 - |VERSION|CREATE |MODIFY |MIGRATE|CONTROL|EXPERIM| |SPECIAL|SPECIAL| - |STATS |DESTROY|ALTER |CHANGE |LIMIT |TEST | | | | - |INFO |SETUP | |MOVE | | | | | | + |VERSION|CREATE |MODIFY |MIGRATE|CONTROL|EXPERIM| |SPECIAL|SPECIAL| + |STATS |DESTROY|ALTER |CHANGE |LIMIT |TEST | | | | + |INFO |SETUP | |MOVE | | | | | | -------+-------+-------+-------+-------+-------+-------+ +-------+-------+ SYSTEM |VERSION|VSETUP |VHOST | | | | |DEVICES| | HOST | 00| 01| 02| 03| 04| 05| | 06| 07| @@ -25,22 +25,22 @@ PROCESS| 08| 09| 10| 11| 12| 13| | 14| 15| -------+-------+-------+-------+-------+-------+-------+ +-------+-------+ MEMORY | | | | | | | |SWAP | | - | 16| 17| 18| 19| 20| 21| | 22| 23| + | 16| 17| 18| 19| 20| 21| | 22| 23| -------+-------+-------+-------+-------+-------+-------+ +-------+-------+ NETWORK| |VNET |NETALT |NETMIG |NETCTL | | |SERIAL | | - | 24| 25| 26| 27| 28| 29| | 30| 31| + | 24| 25| 26| 27| 28| 29| | 30| 31| -------+-------+-------+-------+-------+-------+-------+ +-------+-------+ - DISK | | | | | | | |INODE | | + DISK | | | | |DLIMIT | | |INODE | | VFS | 32| 33| 34| 35| 36| 37| | 38| 39| 
-------+-------+-------+-------+-------+-------+-------+ +-------+-------+ OTHER | | | | | | | |VINFO | | - | 40| 41| 42| 43| 44| 45| | 46| 47| + | 40| 41| 42| 43| 44| 45| | 46| 47| =======+=======+=======+=======+=======+=======+=======+ +=======+=======+ - SPECIAL| | | | |FLAGS | | | | | - | 48| 49| 50| 51| 52| 53| | 54| 55| + SPECIAL|EVENT | | | |FLAGS | | | | | + | 48| 49| 50| 51| 52| 53| | 54| 55| -------+-------+-------+-------+-------+-------+-------+ +-------+-------+ - SPECIAL| | | | |RLIMIT |SYSCALL| | |COMPAT | - | 56| 57| 58| 59| 60|TEST 61| | 62| 63| + SPECIAL|DEBUG | | | |RLIMIT |SYSCALL| | |COMPAT | + | 56| 57| 58| 59| 60|TEST 61| | 62| 63| -------+-------+-------+-------+-------+-------+-------+ +-------+-------+ */ @@ -49,7 +49,7 @@ #define VC_CAT_VSETUP 1 #define VC_CAT_VHOST 2 - + #define VC_CAT_VPROC 9 #define VC_CAT_PROCALT 10 #define VC_CAT_PROCMIG 11 @@ -62,19 +62,22 @@ #define VC_CAT_NETMIG 27 #define VC_CAT_NETCTRL 28 +#define VC_CAT_DLIMIT 36 #define VC_CAT_INODE 38 #define VC_CAT_VINFO 46 +#define VC_CAT_EVENT 48 #define VC_CAT_FLAGS 52 +#define VC_CAT_DEBUG 56 #define VC_CAT_RLIMIT 60 #define VC_CAT_SYSTEST 61 #define VC_CAT_COMPAT 63 - + /* interface version */ -#define VCI_VERSION 0x00010016 +#define VCI_VERSION 0x00010025 /* query version */ @@ -86,7 +89,6 @@ #include -#define ENOTSUP -EOPNOTSUPP #else /* __KERNEL__ */ #define __user diff --git a/include/linux/vserver/xid.h b/include/linux/vserver/xid.h index ba52c2588..91e28de5e 100644 --- a/include/linux/vserver/xid.h +++ b/include/linux/vserver/xid.h @@ -1,15 +1,21 @@ -#ifndef _LINUX_XID_H_ -#define _LINUX_XID_H_ +#ifndef _VX_XID_H +#define _VX_XID_H + + +#define XID_TAG(in) (!(in) || \ + (((struct inode *)in)->i_sb && \ + (((struct inode *)in)->i_sb->s_flags & MS_TAGXID))) + #ifdef CONFIG_INOXID_NONE #define MAX_UID 0xFFFFFFFF #define MAX_GID 0xFFFFFFFF -#define INOXID_XID(uid, gid, xid) (0) +#define INOXID_XID(tag, uid, gid, xid) (0) -#define XIDINO_UID(uid, xid) (uid) -#define 
XIDINO_GID(gid, xid) (gid) +#define XIDINO_UID(tag, uid, xid) (uid) +#define XIDINO_GID(tag, gid, xid) (gid) #endif @@ -19,37 +25,57 @@ #define MAX_UID 0xFFFFFFFF #define MAX_GID 0x0000FFFF -#define INOXID_XID(uid, gid, xid) (((gid) >> 16) & 0xFFFF) - -#define XIDINO_UID(uid, xid) (uid) -#define XIDINO_GID(gid, xid) (((gid) & 0xFFFF) | ((xid) << 16)) +#define INOXID_XID(tag, uid, gid, xid) \ + ((tag) ? (((gid) >> 16) & 0xFFFF) : 0) +#define XIDINO_UID(tag, uid, xid) (uid) +#define XIDINO_GID(tag, gid, xid) \ + ((tag) ? (((gid) & 0xFFFF) | ((xid) << 16)) : (gid)) #endif -#ifdef CONFIG_INOXID_GID24 +#ifdef CONFIG_INOXID_UGID24 #define MAX_UID 0x00FFFFFF #define MAX_GID 0x00FFFFFF -#define INOXID_XID(uid, gid, xid) ((((uid) >> 16) & 0xFF00) | (((gid) >> 24) & 0xFF)) +#define INOXID_XID(tag, uid, gid, xid) \ + ((tag) ? ((((uid) >> 16) & 0xFF00) | (((gid) >> 24) & 0xFF)) : 0) + +#define XIDINO_UID(tag, uid, xid) \ + ((tag) ? (((uid) & 0xFFFFFF) | (((xid) & 0xFF00) << 16)) : (uid)) +#define XIDINO_GID(tag, gid, xid) \ + ((tag) ? (((gid) & 0xFFFFFF) | (((xid) & 0x00FF) << 24)) : (gid)) + +#endif + + +#ifdef CONFIG_INOXID_UID16 + +#define MAX_UID 0x0000FFFF +#define MAX_GID 0xFFFFFFFF -#define XIDINO_UID(uid, xid) (((uid) & 0xFFFFFF) | (((xid) & 0xFF00) << 16)) -#define XIDINO_GID(gid, xid) (((gid) & 0xFFFFFF) | (((xid) & 0x00FF) << 24)) +#define INOXID_XID(tag, uid, gid, xid) \ + ((tag) ? ((uid) >> 16) & 0xFFFF) : 0) + +#define XIDINO_UID(tag, uid, xid) \ + ((tag) ? (((uid) & 0xFFFF) | ((xid) << 16)) : (uid)) +#define XIDINO_GID(tag, gid, xid) (gid) #endif -#ifdef CONFIG_INOXID_GID32 +#ifdef CONFIG_INOXID_INTERN #define MAX_UID 0xFFFFFFFF #define MAX_GID 0xFFFFFFFF -#define INOXID_XID(uid, gid, xid) (xid) +#define INOXID_XID(tag, uid, gid, xid) \ + ((tag) ? 
(xid) : 0) -#define XIDINO_UID(uid, xid) (uid) -#define XIDINO_GID(gid, xid) (gid) +#define XIDINO_UID(tag, uid, xid) (uid) +#define XIDINO_GID(tag, gid, xid) (gid) #endif @@ -59,16 +85,19 @@ #define MAX_UID 0xFFFFFFFF #define MAX_GID 0xFFFFFFFF -#define INOXID_XID(uid, gid, xid) (0) +#define INOXID_XID(tag, uid, gid, xid) (0) -#define XIDINO_UID(uid, xid) (uid) -#define XIDINO_GID(gid, xid) (gid) +#define XIDINO_UID(tag, uid, xid) (uid) +#define XIDINO_GID(tag, gid, xid) (gid) #endif -#define INOXID_UID(uid, gid) ((uid) & MAX_UID) -#define INOXID_GID(uid, gid) ((gid) & MAX_GID) +#define INOXID_UID(tag, uid, gid) \ + ((tag) ? ((uid) & MAX_UID) : (uid)) +#define INOXID_GID(tag, uid, gid) \ + ((tag) ? ((gid) & MAX_GID) : (gid)) + static inline uid_t vx_map_uid(uid_t uid) { @@ -85,10 +114,13 @@ static inline gid_t vx_map_gid(gid_t gid) } -#ifdef CONFIG_VSERVER_LEGACY +#ifdef CONFIG_VSERVER_LEGACY #define FIOC_GETXID _IOR('x', 1, long) #define FIOC_SETXID _IOW('x', 2, long) #define FIOC_SETXIDJ _IOW('x', 3, long) #endif -#endif /* _LINUX_XID_H_ */ +int vx_parse_xid(char *string, xid_t *xid, int remove); +void vx_propagate_xid(struct nameidata *nd, struct inode *inode); + +#endif /* _VX_XID_H */ diff --git a/include/net/route.h b/include/net/route.h index c5d47b24b..9ed04d9f6 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -33,7 +33,6 @@ #include #include #include -#include #include #include @@ -146,6 +145,59 @@ static inline char rt_tos2priority(u8 tos) return ip_tos2prio[IPTOS_TOS(tos)>>1]; } +#define IPI_LOOPBACK 0x0100007f + +static inline int ip_find_src(struct nx_info *nxi, struct rtable **rp, struct flowi *fl) +{ + int err; + int i, n = nxi->nbipv4; + u32 ipv4root = nxi->ipv4[0]; + + if (ipv4root == 0) + return 0; + + if (fl->fl4_src == 0) { + if (n > 1) { + u32 foundsrc; + + err = __ip_route_output_key(rp, fl); + if (err) { + fl->fl4_src = ipv4root; + err = __ip_route_output_key(rp, fl); + } + if (err) + return err; + + foundsrc = (*rp)->rt_src; + 
ip_rt_put(*rp); + + for (i=0; i<n; i++) { + u32 mask = nxi->mask[i]; + u32 ipv4 = nxi->ipv4[i]; + u32 net4 = ipv4 & mask; + + if (foundsrc == ipv4) { + fl->fl4_src = ipv4; + break; + } + if (!fl->fl4_src && (foundsrc & mask) == net4) + fl->fl4_src = ipv4; + } + } + if (fl->fl4_src == 0) + fl->fl4_src = (fl->fl4_dst == IPI_LOOPBACK) + ? IPI_LOOPBACK : ipv4root; + } else { + for (i=0; i<n; i++) { + if (nxi->ipv4[i] == fl->fl4_src) + break; + } + if (i == n) + return -EPERM; + } + return 0; +} + static inline int ip_route_connect(struct rtable **rp, u32 dst, u32 src, u32 tos, int oif, u8 protocol, u16 sport, u16 dport, struct sock *sk) @@ -160,7 +212,23 @@ static inline int ip_route_connect(struct rtable **rp, u32 dst, .dport = dport } } }; int err; - if (!dst || !src) { + struct nx_info *nx_info = current->nx_info; + + if (sk) + nx_info = sk->sk_nx_info; + vxdprintk(VXD_CBIT(net, 4), + "ip_route_connect(%p) %p,%p;%lx", + sk, nx_info, sk->sk_socket, + (sk->sk_socket?sk->sk_socket->flags:0)); + + if (nx_info) { + err = ip_find_src(nx_info, rp, &fl); + if (err) + return err; + if (fl.fl4_dst == IPI_LOOPBACK && !vx_check(0, VX_ADMIN)) + fl.fl4_dst = nx_info->ipv4[0]; + } + if (!fl.fl4_dst || !fl.fl4_src) { err = __ip_route_output_key(rp, &fl); if (err) return err; diff --git a/ipc/msg.c b/ipc/msg.c index 796440603..62aead43e 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -99,7 +99,7 @@ static int newque (key_t key, int msgflg) msq->q_perm.mode = (msgflg & S_IRWXUGO); msq->q_perm.key = key; - msq->q_perm.xid = current->xid; + msq->q_perm.xid = vx_current_xid(); msq->q_perm.security = NULL; retval = security_msg_queue_alloc(msq); diff --git a/ipc/sem.c b/ipc/sem.c index 3960ddb24..d33f2adc9 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -178,7 +178,7 @@ static int newary (key_t key, int nsems, int semflg) sma->sem_perm.mode = (semflg & S_IRWXUGO); sma->sem_perm.key = key; - sma->sem_perm.xid = current->xid; + sma->sem_perm.xid = vx_current_xid(); sma->sem_perm.security = NULL; retval = security_sem_alloc(sma); diff --git a/ipc/shm.c 
b/ipc/shm.c index d7bb539e0..fa14c369b 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -195,7 +195,7 @@ static int newseg (key_t key, int shmflg, size_t size) return -ENOMEM; shp->shm_perm.key = key; - shp->shm_perm.xid = current->xid; + shp->shm_perm.xid = vx_current_xid(); shp->shm_flags = (shmflg & S_IRWXUGO); shp->mlock_user = NULL; diff --git a/kernel/Makefile b/kernel/Makefile index 3d32576e4..23dc38fa1 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -14,6 +14,9 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ subdir-y += vserver obj-y += vserver/vserver.o +subdir-y += vserver +obj-y += vserver/vserver.o + obj-$(CONFIG_FUTEX) += futex.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += cpu.o spinlock.o diff --git a/kernel/capability.c b/kernel/capability.c index a4bf68d2f..649a9ce34 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -12,6 +12,8 @@ #include #include #include +#include + #include unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */ diff --git a/kernel/exit.c b/kernel/exit.c index 764c1ad06..ebcc1b63c 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -382,6 +383,7 @@ static inline void close_files(struct files_struct * files) struct file * file = xchg(&files->fd[i], NULL); if (file) filp_close(file, files); + // vx_openfd_dec(i); } i++; set >>= 1; @@ -611,6 +613,7 @@ static inline void forget_original_parent(struct task_struct * father, struct task_struct *p, *reaper = father; struct list_head *_p, *_n; + /* FIXME handle vchild_reaper/initpid */ do { reaper = next_thread(reaper); if (reaper == father) { diff --git a/kernel/fork.c b/kernel/fork.c index d19d14e91..a44ced0d2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -39,12 +39,12 @@ #include #include #include -#include -#include -#include #include #include #include +#include +#include +#include #include #include @@ -354,7 +354,6 @@ void fastcall 
__mmdrop(struct mm_struct *mm) BUG_ON(mm == &init_mm); mm_free_pgd(mm); destroy_context(mm); - clr_vx_info(&mm->mm_vx_info); #ifdef CONFIG_CKRM_RES_MEM /* class can be null and mm's tasklist can be empty here */ if (mm->memclass) { @@ -362,6 +361,7 @@ void fastcall __mmdrop(struct mm_struct *mm) mm->memclass = NULL; } #endif + clr_vx_info(&mm->mm_vx_info); free_mm(mm); } @@ -869,6 +869,23 @@ static task_t *copy_process(unsigned long clone_flags, goto bad_fork_cleanup_vm; } + p->vx_info = NULL; + set_vx_info(&p->vx_info, current->vx_info); + p->nx_info = NULL; + set_nx_info(&p->nx_info, current->nx_info); + + /* check vserver memory */ + if (p->mm && !(clone_flags & CLONE_VM)) { + if (vx_vmpages_avail(p->mm, p->mm->total_vm)) + vx_pages_add(p->mm->mm_vx_info, RLIMIT_AS, p->mm->total_vm); + else + goto bad_fork_free; + } + if (p->mm && vx_flags(VXF_FORK_RSS, 0)) { + if (!vx_rsspages_avail(p->mm, p->mm->rss)) + goto bad_fork_cleanup_vm; + } + retval = -EAGAIN; if (!vx_nproc_avail(1)) goto bad_fork_cleanup_vm; diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 2d04567ea..663980afb 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -46,6 +46,7 @@ #include #include #include +#include #ifndef div_long_long_rem #include diff --git a/kernel/printk.c b/kernel/printk.c index 31e1731df..2b80f441c 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -539,6 +539,8 @@ asmlinkage int printk(const char *fmt, ...) 
return r; } +static volatile int printk_cpu = -1; + asmlinkage int vprintk(const char *fmt, va_list args) { unsigned long flags; @@ -547,11 +549,12 @@ asmlinkage int vprintk(const char *fmt, va_list args) static char printk_buf[1024]; static int log_level_unknown = 1; - if (unlikely(oops_in_progress)) + if (unlikely(oops_in_progress && printk_cpu == smp_processor_id())) zap_locks(); /* This stops the holder of console_sem just where we want him */ spin_lock_irqsave(&logbuf_lock, flags); + printk_cpu = smp_processor_id(); /* Emit the output into the temporary buffer */ printed_len = vscnprintf(printk_buf, sizeof(printk_buf), fmt, args); diff --git a/kernel/sched.c b/kernel/sched.c index f609197c8..855a39a81 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -50,6 +50,9 @@ #include #include +#include +#include +#include #ifdef CONFIG_NUMA #define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu)) @@ -262,6 +265,10 @@ struct runqueue { task_t *migration_thread; struct list_head migration_queue; #endif +#ifdef CONFIG_VSERVER_HARDCPU + struct list_head hold_queue; + int idle_tokens; +#endif #ifdef CONFIG_VSERVER_HARDCPU struct list_head hold_queue; @@ -738,12 +745,10 @@ static int effective_prio(task_t *p) bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; prio = p->static_prio - bonus; - #ifdef CONFIG_VSERVER_HARDCPU if (task_vx_flags(p, VXF_SCHED_PRIO, 0)) prio += effective_vavavoom(p, MAX_USER_PRIO); #endif - if (prio < MAX_RT_PRIO) prio = MAX_RT_PRIO; if (prio > MAX_PRIO-1) @@ -904,10 +909,11 @@ static void __deactivate_task(struct task_struct *p, runqueue_t *rq) p->array = NULL; } -static void deactivate_task(struct task_struct *p, runqueue_t *rq) +static inline +void deactivate_task(struct task_struct *p, runqueue_t *rq) { - __deactivate_task(p, rq); vx_deactivate_task(p); + __deactivate_task(p, rq); } /* @@ -1244,6 +1250,9 @@ out_activate: * to be considered on this CPU.) 
*/ activate_task(p, rq, cpu == this_cpu); + /* this is to get the accounting behind the load update */ + if (old_state == TASK_UNINTERRUPTIBLE) + vx_uninterruptible_dec(p); if (!sync || cpu != this_cpu) { if (TASK_PREEMPTS_CURR(p, rq)) resched_task(rq->curr); @@ -2886,7 +2895,6 @@ void scheduler_tick(int user_ticks, int sys_ticks) if (rcu_pending(cpu)) rcu_check_callbacks(cpu, user_ticks); - if (vxi) { vxi->sched.cpu[cpu].user_ticks += user_ticks; vxi->sched.cpu[cpu].sys_ticks += sys_ticks; @@ -2911,6 +2919,7 @@ void scheduler_tick(int user_ticks, int sys_ticks) if (wake_priority_sleeper(rq)) goto out; + ckrm_sched_tick(jiffies,cpu,rq_ckrm_load(rq)); #ifdef CONFIG_VSERVER_HARDCPU_IDLE @@ -2955,6 +2964,7 @@ void scheduler_tick(int user_ticks, int sys_ticks) } goto out_unlock; } +#warning MEF: vx_need_resched incorpates standard kernel code, which it should not. if (vx_need_resched(p)) { #ifdef CONFIG_CKRM_CPU_SCHEDULE /* Hubertus ... we can abstract this out */ @@ -3158,11 +3168,11 @@ asmlinkage void __sched schedule(void) prio_array_t *array; unsigned long long now; unsigned long run_time; - int cpu; #ifdef CONFIG_VSERVER_HARDCPU struct vx_info *vxi; int maxidle = -HZ; #endif + int cpu; /* * If crash dump is in progress, this other cpu's @@ -3173,7 +3183,6 @@ asmlinkage void __sched schedule(void) if (unlikely(dump_oncpu)) goto dump_scheduling_disabled; - /* * Test if we are atomic. Since do_exit() needs to call into * schedule() atomically, we ignore that path for now. 
@@ -3249,8 +3258,10 @@ need_resched_nonpreemptible: unlikely(signal_pending(prev)))) prev->state = TASK_RUNNING; else { - if (prev->state == TASK_UNINTERRUPTIBLE) + if (prev->state == TASK_UNINTERRUPTIBLE) { rq->nr_uninterruptible++; + vx_uninterruptible_inc(prev); + } deactivate_task(prev, rq); } } @@ -3330,6 +3341,26 @@ go_idle: */ next = rq_get_next_task(rq); +#ifdef CONFIG_VSERVER_HARDCPU + vxi = next->vx_info; + if (vx_info_flags(vxi, VXF_SCHED_PAUSE|VXF_SCHED_HARD, 0)) { + int ret = vx_tokens_recalc(vxi); + + if (unlikely(ret <= 0)) { + if (ret && (rq->idle_tokens > -ret)) + rq->idle_tokens = -ret; + __deactivate_task(next, rq); + recalc_task_prio(next, now); + // a new one on hold + vx_onhold_inc(vxi); + next->state |= TASK_ONHOLD; + list_add_tail(&next->run_list, &rq->hold_queue); + //printk("··· %8lu hold %p [%d]\n", jiffies, next, next->prio); + goto pick_next; + } + } +#endif + #ifdef CONFIG_VSERVER_HARDCPU vxi = next->vx_info; if (vx_info_flags(vxi, VXF_SCHED_PAUSE|VXF_SCHED_HARD, 0)) { diff --git a/kernel/signal.c b/kernel/signal.c index a56f3d984..e74c8211a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -626,7 +626,6 @@ static int check_kill_permission(int sig, struct siginfo *info, if (sig < 0 || sig > _NSIG) return error; - user = (!info || (info != SEND_SIG_PRIV && info != SEND_SIG_FORCED && diff --git a/kernel/sys.c b/kernel/sys.c index 37923408e..fee92cce1 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -25,10 +25,8 @@ #include #include #include -#include -#include #include - +#include #include #include diff --git a/kernel/vserver/Kconfig b/kernel/vserver/Kconfig index 635d8d488..22f8f2315 100644 --- a/kernel/vserver/Kconfig +++ b/kernel/vserver/Kconfig @@ -11,7 +11,7 @@ config VSERVER_LEGACY This enables the legacy API used in vs1.xx, which allows to use older tools (for migration purposes). 
-config PROC_SECURE +config VSERVER_PROC_SECURE bool "Enable Proc Security" depends on PROC_FS default y @@ -25,9 +25,19 @@ config VSERVER_HARDCPU help Activate the Hard CPU Limits +config VSERVER_HARDCPU_IDLE + bool "Limit the IDLE task" + depends on VSERVER_HARDCPU + default n + help + Limit the idle slices, so the next context + will be scheduled as soon as possible. + might improve interactivity/latency but + increases scheduling overhead. + choice prompt "Persistent Inode Context Tagging" - default INOXID_GID24 + default INOXID_UGID24 help This adds persistent context information to filesystems mounted with the tagxid option. Tagging is a requirement @@ -39,26 +49,31 @@ config INOXID_NONE help no context information is store for inodes +config INOXID_UID16 + bool "UID16/GID32" + help + reduces UID to 16 bit, but leaves GID at 32 bit. + config INOXID_GID16 bool "UID32/GID16" help reduces GID to 16 bit, but leaves UID at 32 bit. -config INOXID_GID24 +config INOXID_UGID24 bool "UID24/GID24" help uses the upper 8bit from UID and GID for XID tagging which leaves 24bit for UID/GID each, which should be more than sufficient for normal use. -config INOXID_GID32 +config INOXID_INTERN bool "UID32/GID32" help this uses otherwise reserved inode fields in the on disk representation, which limits the use to a few filesystems (currently ext2 and ext3) -config INOXID_MAGIC +config INOXID_RUNTIME bool "Runtime" depends on EXPERIMENTAL help @@ -68,5 +83,32 @@ config INOXID_MAGIC endchoice +config VSERVER_DEBUG + bool "Compile Debugging Code" + default n + help + Set this to yes if you want to be able to activate + debugging output at runtime. It adds a probably small + overhead (~ ??%) to all vserver related functions and + increases the kernel size by about 20k. 
+ +config VSERVER_HISTORY + bool "Compile History Tracing" + depends on VSERVER_DEBUG + default n + help + Set this to yes if you want to record the history of + linux-vserver activities, so they can be replayed on + a kernel panic (oops) + +config VSERVER_HISTORY_SIZE + int "Per CPU History Size (32-65536)" + depends on VSERVER_HISTORY + range 32 65536 + default 64 + help + This allows you to specify the number of entries in + the per CPU history buffer. + endmenu diff --git a/kernel/vserver/Makefile b/kernel/vserver/Makefile index c035a77cd..1cee3de70 100644 --- a/kernel/vserver/Makefile +++ b/kernel/vserver/Makefile @@ -6,7 +6,9 @@ obj-y += vserver.o vserver-y := switch.o context.o namespace.o sched.o network.o inode.o \ - limit.o cvirt.o signal.o proc.o sysctl.o init.o + limit.o cvirt.o signal.o proc.o helper.o init.o dlimit.o +vserver-$(CONFIG_VSERVER_DEBUG) += sysctl.o vserver-$(CONFIG_VSERVER_LEGACY) += legacy.o +vserver-$(CONFIG_VSERVER_HISTORY) += history.o diff --git a/kernel/vserver/context.c b/kernel/vserver/context.c index d56d362a2..57481cbc6 100644 --- a/kernel/vserver/context.c +++ b/kernel/vserver/context.c @@ -3,7 +3,7 @@ * * Virtual Server: Context Support * - * Copyright (C) 2003-2004 Herbert Pötzl + * Copyright (C) 2003-2005 Herbert Pötzl * * V0.01 context helper * V0.02 vx_ctx_kill syscall command @@ -13,26 +13,31 @@ * V0.06 task_xid and info commands * V0.07 context flags and caps * V0.08 switch to RCU based hash + * V0.09 revert to non RCU for now + * V0.10 and back to working RCU hash * */ #include #include -#include -#include -#include -#include -#include +#include #include -#include -#define CKRM_VSERVER_INTEGRATION -#ifdef CKRM_VSERVER_INTEGRATION -#include -#endif //CKRM_VSERVER_INTEGRATION +#include +#include +#include +#include +#include +#include +#include +#include /* needed for ckrm_cb_xid() */ #include +#include "cvirt_init.h" +#include "limit_init.h" +#include "sched_init.h" + /* __alloc_vx_info() @@ -74,6 +79,7 @@ static struct 
vx_info *__alloc_vx_info(xid_t xid) vxdprintk(VXD_CBIT(xid, 0), "alloc_vx_info(%d) = %p", xid, new); + vxh_alloc_vx_info(new); return new; } @@ -85,6 +91,7 @@ static void __dealloc_vx_info(struct vx_info *vxi) { vxdprintk(VXD_CBIT(xid, 0), "dealloc_vx_info(%p)", vxi); + vxh_dealloc_vx_info(vxi); vxi->vx_hlist.next = LIST_POISON1; vxi->vx_id = -1; @@ -122,40 +129,47 @@ static inline int __free_vx_info(struct vx_info *vxi) return usecnt; } -#if 0 - -static void __rcu_free_vx_info(struct rcu_head *head) +static void __rcu_put_vx_info(struct rcu_head *head) { struct vx_info *vxi = container_of(head, struct vx_info, vx_rcu); - BUG_ON(!head); vxdprintk(VXD_CBIT(xid, 3), - "rcu_free_vx_info(%p): uc=%d", vxi, - atomic_read(&vxi->vx_usecnt)); - - __free_vx_info(vxi); + "__rcu_put_vx_info(%p[#%d]): %d,%d", + vxi, vxi->vx_id, + atomic_read(&vxi->vx_usecnt), + atomic_read(&vxi->vx_refcnt)); + put_vx_info(vxi); } -#endif - -void free_vx_info(struct vx_info *vxi) +void __shutdown_vx_info(struct vx_info *vxi) { struct namespace *namespace; struct fs_struct *fs; - /* context shutdown is mandatory */ - // BUG_ON(vxi->vx_state != VXS_SHUTDOWN); + might_sleep(); namespace = xchg(&vxi->vx_namespace, NULL); - fs = xchg(&vxi->vx_fs, NULL); - if (namespace) put_namespace(namespace); + + fs = xchg(&vxi->vx_fs, NULL); if (fs) put_fs_struct(fs); +} + +/* exported stuff */ + +void free_vx_info(struct vx_info *vxi) +{ + /* context shutdown is mandatory */ + // BUG_ON(vxi->vx_state != VXS_SHUTDOWN); + + BUG_ON(vxi->vx_state & VXS_HASHED); + + BUG_ON(vxi->vx_namespace); + BUG_ON(vxi->vx_fs); BUG_ON(__free_vx_info(vxi)); - // call_rcu(&i->vx_rcu, __rcu_free_vx_info); } @@ -186,6 +200,8 @@ static inline void __hash_vx_info(struct vx_info *vxi) vxdprintk(VXD_CBIT(xid, 4), "__hash_vx_info: %p[#%d]", vxi, vxi->vx_id); + vxh_hash_vx_info(vxi); + get_vx_info(vxi); vxi->vx_state |= VXS_HASHED; head = &vx_info_hash[__hashval(vxi->vx_id)]; @@ -201,9 +217,12 @@ static inline void __unhash_vx_info(struct 
vx_info *vxi) { vxdprintk(VXD_CBIT(xid, 4), "__unhash_vx_info: %p[#%d]", vxi, vxi->vx_id); + vxh_unhash_vx_info(vxi); + vxi->vx_state &= ~VXS_HASHED; hlist_del_rcu(&vxi->vx_hlist); - put_vx_info(vxi); + + call_rcu(&vxi->vx_rcu, __rcu_put_vx_info); } @@ -216,22 +235,29 @@ static inline struct vx_info *__lookup_vx_info(xid_t xid) { struct hlist_head *head = &vx_info_hash[__hashval(xid)]; struct hlist_node *pos; + struct vx_info *vxi; hlist_for_each_rcu(pos, head) { - struct vx_info *vxi = - hlist_entry(pos, struct vx_info, vx_hlist); + vxi = hlist_entry(pos, struct vx_info, vx_hlist); if ((vxi->vx_id == xid) && vx_info_state(vxi, VXS_HASHED)) - return vxi; + goto found; } - return NULL; + vxi = NULL; +found: + vxdprintk(VXD_CBIT(xid, 0), + "__lookup_vx_info(#%u): %p[#%u]", + xid, vxi, vxi?vxi->vx_id:0); + vxh_lookup_vx_info(xid, vxi); + return vxi; } /* __vx_dynamic_id() * find unused dynamic xid + * requires the rcu_read_lock() * requires the hash_lock to be held */ static inline xid_t __vx_dynamic_id(void) @@ -267,6 +293,9 @@ static struct vx_info * __loc_vx_info(int id, int *err) return NULL; } + /* FIXME is this required at all ? 
*/ + rcu_read_lock(); + /* required to make dynamic xids unique */ spin_lock(&vx_info_hash_lock); /* dynamic context requested */ @@ -304,6 +333,8 @@ static struct vx_info * __loc_vx_info(int id, int *err) out_unlock: spin_unlock(&vx_info_hash_lock); + rcu_read_unlock(); + vxh_loc_vx_info(id, vxi); if (new) __dealloc_vx_info(new); return vxi; @@ -316,6 +347,7 @@ out_unlock: void unhash_vx_info(struct vx_info *vxi) { + __shutdown_vx_info(vxi); spin_lock(&vx_info_hash_lock); __unhash_vx_info(vxi); spin_unlock(&vx_info_hash_lock); @@ -534,12 +566,7 @@ int vx_migrate_task(struct task_struct *p, struct vx_info *vxi) out: -#ifdef CKRM_VSERVER_INTEGRATION - do { - ckrm_cb_xid(p); - } while (0); -#endif //CKRM_VSERVER_INTEGRATION - + ckrm_cb_xid(p); put_vx_info(old_vxi); return ret; @@ -584,7 +611,7 @@ int vc_task_xid(uint32_t id, void __user *data) read_unlock(&tasklist_lock); } else - xid = current->xid; + xid = vx_current_xid(); return xid; } @@ -768,8 +795,6 @@ int vc_set_ccaps(uint32_t id, void __user *data) #include -// EXPORT_SYMBOL_GPL(rcu_free_vx_info); EXPORT_SYMBOL_GPL(free_vx_info); -EXPORT_SYMBOL_GPL(vx_info_hash_lock); EXPORT_SYMBOL_GPL(unhash_vx_info); diff --git a/kernel/vserver/cvirt.c b/kernel/vserver/cvirt.c index 2b5c81e35..1cb3eda97 100644 --- a/kernel/vserver/cvirt.c +++ b/kernel/vserver/cvirt.c @@ -10,10 +10,11 @@ */ #include -#include -#include +#include +#include +#include +#include #include -#include #include #include @@ -24,8 +25,8 @@ void vx_vsi_uptime(struct timespec *uptime, struct timespec *idle) struct vx_info *vxi = current->vx_info; set_normalized_timespec(uptime, - uptime->tv_sec - vxi->cvirt.bias_tp.tv_sec, - uptime->tv_nsec - vxi->cvirt.bias_tp.tv_nsec); + uptime->tv_sec - vxi->cvirt.bias_uptime.tv_sec, + uptime->tv_nsec - vxi->cvirt.bias_uptime.tv_nsec); if (!idle) return; set_normalized_timespec(idle, @@ -34,8 +35,63 @@ void vx_vsi_uptime(struct timespec *uptime, struct timespec *idle) return; } -uint64_t vx_idle_jiffies() +uint64_t 
vx_idle_jiffies(void) { return init_task.utime + init_task.stime; } + + +static inline uint32_t __update_loadavg(uint32_t load, + int wsize, int delta, int n) +{ + unsigned long long calc, prev; + + /* just set it to n */ + if (unlikely(delta >= wsize)) + return (n << FSHIFT); + + calc = delta * n; + calc <<= FSHIFT; + prev = (wsize - delta); + prev *= load; + calc += prev; + do_div(calc, wsize); + return calc; +} + + +void vx_update_load(struct vx_info *vxi) +{ + uint32_t now, last, delta; + unsigned int nr_running, nr_uninterruptible; + unsigned int total; + + spin_lock(&vxi->cvirt.load_lock); + + now = jiffies; + last = vxi->cvirt.load_last; + delta = now - last; + + if (delta < 5*HZ) + goto out; + + nr_running = atomic_read(&vxi->cvirt.nr_running); + nr_uninterruptible = atomic_read(&vxi->cvirt.nr_uninterruptible); + total = nr_running + nr_uninterruptible; + + vxi->cvirt.load[0] = __update_loadavg(vxi->cvirt.load[0], + 60*HZ, delta, total); + vxi->cvirt.load[1] = __update_loadavg(vxi->cvirt.load[1], + 5*60*HZ, delta, total); + vxi->cvirt.load[2] = __update_loadavg(vxi->cvirt.load[2], + 15*60*HZ, delta, total); + + vxi->cvirt.load_last = now; +out: + atomic_inc(&vxi->cvirt.load_updates); + spin_unlock(&vxi->cvirt.load_lock); +} + + + diff --git a/kernel/vserver/cvirt_init.h b/kernel/vserver/cvirt_init.h new file mode 100644 index 000000000..ecc34e1da --- /dev/null +++ b/kernel/vserver/cvirt_init.h @@ -0,0 +1,66 @@ + +extern uint64_t vx_idle_jiffies(void); + +static inline void vx_info_init_cvirt(struct _vx_cvirt *cvirt) +{ + uint64_t idle_jiffies = vx_idle_jiffies(); + uint64_t nsuptime; + + do_posix_clock_monotonic_gettime(&cvirt->bias_uptime); + nsuptime = (unsigned long long)cvirt->bias_uptime.tv_sec + * NSEC_PER_SEC + cvirt->bias_uptime.tv_nsec; + cvirt->bias_clock = nsec_to_clock_t(nsuptime); + + jiffies_to_timespec(idle_jiffies, &cvirt->bias_idle); + atomic_set(&cvirt->nr_threads, 0); + atomic_set(&cvirt->nr_running, 0); + 
atomic_set(&cvirt->nr_uninterruptible, 0); + atomic_set(&cvirt->nr_onhold, 0); + + down_read(&uts_sem); + cvirt->utsname = system_utsname; + up_read(&uts_sem); + + spin_lock_init(&cvirt->load_lock); + cvirt->load_last = jiffies; + atomic_set(&cvirt->load_updates, 0); + cvirt->load[0] = 0; + cvirt->load[1] = 0; + cvirt->load[2] = 0; +} + +static inline void vx_info_exit_cvirt(struct _vx_cvirt *cvirt) +{ +#ifdef CONFIG_VSERVER_DEBUG + int value; + + vxwprintk((value = atomic_read(&cvirt->nr_threads)), + "!!! cvirt: %p[nr_threads] = %d on exit.", + cvirt, value); + vxwprintk((value = atomic_read(&cvirt->nr_running)), + "!!! cvirt: %p[nr_running] = %d on exit.", + cvirt, value); + vxwprintk((value = atomic_read(&cvirt->nr_uninterruptible)), + "!!! cvirt: %p[nr_uninterruptible] = %d on exit.", + cvirt, value); +#endif + return; +} + +static inline void vx_info_init_cacct(struct _vx_cacct *cacct) +{ + int i,j; + + for (i=0; i<5; i++) { + for (j=0; j<3; j++) { + atomic_set(&cacct->sock[i][j].count, 0); + atomic_set(&cacct->sock[i][j].total, 0); + } + } +} + +static inline void vx_info_exit_cacct(struct _vx_cacct *cacct) +{ + return; +} + diff --git a/kernel/vserver/cvirt_proc.h b/kernel/vserver/cvirt_proc.h new file mode 100644 index 000000000..ac67f989c --- /dev/null +++ b/kernel/vserver/cvirt_proc.h @@ -0,0 +1,90 @@ +#ifndef _VX_CVIRT_PROC_H +#define _VX_CVIRT_PROC_H + +#include + + +#define LOAD_INT(x) ((x) >> FSHIFT) +#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) + +static inline int vx_info_proc_cvirt(struct _vx_cvirt *cvirt, char *buffer) +{ + int length = 0; + int a, b, c; + + length += sprintf(buffer + length, + "BiasUptime:\t%lu.%02lu\n", + (unsigned long)cvirt->bias_uptime.tv_sec, + (cvirt->bias_uptime.tv_nsec / (NSEC_PER_SEC / 100))); + length += sprintf(buffer + length, + "SysName:\t%.*s\n" + "NodeName:\t%.*s\n" + "Release:\t%.*s\n" + "Version:\t%.*s\n" + "Machine:\t%.*s\n" + "DomainName:\t%.*s\n" + ,__NEW_UTS_LEN, cvirt->utsname.sysname + 
,__NEW_UTS_LEN, cvirt->utsname.nodename + ,__NEW_UTS_LEN, cvirt->utsname.release + ,__NEW_UTS_LEN, cvirt->utsname.version + ,__NEW_UTS_LEN, cvirt->utsname.machine + ,__NEW_UTS_LEN, cvirt->utsname.domainname + ); + + a = cvirt->load[0] + (FIXED_1/200); + b = cvirt->load[1] + (FIXED_1/200); + c = cvirt->load[2] + (FIXED_1/200); + length += sprintf(buffer + length, + "nr_threads:\t%d\n" + "nr_running:\t%d\n" + "nr_unintr:\t%d\n" + "nr_onhold:\t%d\n" + "load_updates:\t%d\n" + "loadavg:\t%d.%02d %d.%02d %d.%02d\n" + ,atomic_read(&cvirt->nr_threads) + ,atomic_read(&cvirt->nr_running) + ,atomic_read(&cvirt->nr_uninterruptible) + ,atomic_read(&cvirt->nr_onhold) + ,atomic_read(&cvirt->load_updates) + ,LOAD_INT(a), LOAD_FRAC(a) + ,LOAD_INT(b), LOAD_FRAC(b) + ,LOAD_INT(c), LOAD_FRAC(c) + ); + return length; +} + + +static inline long vx_sock_count(struct _vx_cacct *cacct, int type, int pos) +{ + return atomic_read(&cacct->sock[type][pos].count); +} + + +static inline long vx_sock_total(struct _vx_cacct *cacct, int type, int pos) +{ + return atomic_read(&cacct->sock[type][pos].total); +} + +static inline int vx_info_proc_cacct(struct _vx_cacct *cacct, char *buffer) +{ + int i,j, length = 0; + static char *type[] = { "UNSPEC", "UNIX", "INET", "INET6", "OTHER" }; + + for (i=0; i<5; i++) { + length += sprintf(buffer + length, + "%s:", type[i]); + for (j=0; j<3; j++) { + length += sprintf(buffer + length, + "\t%12lu/%-12lu" + ,vx_sock_count(cacct, i, j) + ,vx_sock_total(cacct, i, j) + ); + } + buffer[length++] = '\n'; + } + length += sprintf(buffer + length, + "forks:\t%lu\n", cacct->total_forks); + return length; +} + +#endif /* _VX_CVIRT_PROC_H */ diff --git a/kernel/vserver/dlimit.c b/kernel/vserver/dlimit.c index 11da06d49..6b1449416 100644 --- a/kernel/vserver/dlimit.c +++ b/kernel/vserver/dlimit.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include @@ -389,7 +388,7 @@ void vx_vsi_statfs(struct super_block *sb, struct kstatfs *buf) __u64 blimit, bfree, 
bavail; __u32 ifree; - dli = locate_dl_info(sb, current->xid); + dli = locate_dl_info(sb, vx_current_xid()); if (!dli) return; diff --git a/kernel/vserver/helper.c b/kernel/vserver/helper.c index 880b84335..ce8f9710a 100644 --- a/kernel/vserver/helper.c +++ b/kernel/vserver/helper.c @@ -13,8 +13,7 @@ #include #include #include -#include -#include +#include #include #include @@ -57,19 +56,19 @@ long vs_reboot(unsigned int cmd, void * arg) switch (cmd) { case LINUX_REBOOT_CMD_RESTART: argv[1] = "restart"; - break; + break; case LINUX_REBOOT_CMD_HALT: argv[1] = "halt"; - break; + break; case LINUX_REBOOT_CMD_POWER_OFF: argv[1] = "poweroff"; - break; + break; case LINUX_REBOOT_CMD_SW_SUSPEND: argv[1] = "swsusp"; - break; + break; case LINUX_REBOOT_CMD_RESTART2: if (strncpy_from_user(&buffer[0], (char *)arg, sizeof(buffer) - 1) < 0) @@ -77,7 +76,7 @@ long vs_reboot(unsigned int cmd, void * arg) argv[3] = buffer; default: argv[1] = "restart2"; - break; + break; } /* maybe we should wait ? */ @@ -90,3 +89,34 @@ long vs_reboot(unsigned int cmd, void * arg) return 0; } +long vs_context_state(unsigned int cmd) +{ + char id_buf[8], cmd_buf[32]; + + char *argv[] = {vshelper_path, NULL, id_buf, NULL, 0}; + char *envp[] = {"HOME=/", "TERM=linux", + "PATH=/sbin:/usr/sbin:/bin:/usr/bin", cmd_buf, 0}; + + snprintf(id_buf, sizeof(id_buf)-1, "%d", vx_current_xid()); + snprintf(cmd_buf, sizeof(cmd_buf)-1, "VS_CMD=%08x", cmd); + + switch (cmd) { + case VS_CONTEXT_CREATED: + argv[1] = "startup"; + break; + case VS_CONTEXT_DESTROY: + argv[1] = "shutdown"; + break; + default: + return 0; + } + + if (call_usermodehelper(*argv, argv, envp, 1)) { + printk( KERN_WARNING + "vs_context_state(): failed to exec (%s %s %s %s)\n", + vshelper_path, argv[1], argv[2], argv[3]); + return 0; + } + return 0; +} + diff --git a/kernel/vserver/init.c b/kernel/vserver/init.c index 8afd1fc64..8c44b3313 100644 --- a/kernel/vserver/init.c +++ b/kernel/vserver/init.c @@ -11,8 +11,6 @@ #include #include -#include 
-// #include #include #include @@ -24,7 +22,9 @@ static int __init init_vserver(void) { int ret = 0; +#ifdef CONFIG_VSERVER_DEBUG vserver_register_sysctl(); +#endif return ret; } @@ -32,7 +32,9 @@ static int __init init_vserver(void) static void __exit exit_vserver(void) { +#ifdef CONFIG_VSERVER_DEBUG vserver_unregister_sysctl(); +#endif return; } diff --git a/kernel/vserver/inode.c b/kernel/vserver/inode.c index 60e6fe1fd..8fdd30c62 100644 --- a/kernel/vserver/inode.c +++ b/kernel/vserver/inode.c @@ -10,12 +10,15 @@ */ #include -#include +#include #include -#include #include +#include #include +#include +#include #include +#include #include #include @@ -23,6 +26,8 @@ static int __vc_get_iattr(struct inode *in, uint32_t *xid, uint32_t *flags, uint32_t *mask) { + struct proc_dir_entry *entry; + if (!in || !in->i_sb) return -ESRCH; @@ -40,8 +45,9 @@ static int __vc_get_iattr(struct inode *in, uint32_t *xid, uint32_t *flags, uint *mask |= IATTR_XID; } - if (in->i_sb->s_magic == PROC_SUPER_MAGIC) { - struct proc_dir_entry *entry = PROC_I(in)->pde; + switch (in->i_sb->s_magic) { + case PROC_SUPER_MAGIC: + entry = PROC_I(in)->pde; // check for specific inodes ? 
if (entry) @@ -50,6 +56,15 @@ static int __vc_get_iattr(struct inode *in, uint32_t *xid, uint32_t *flags, uint *flags |= (entry->vx_flags & IATTR_FLAGS); else *flags |= (PROC_I(in)->vx_flags & IATTR_FLAGS); + break; + + case DEVPTS_SUPER_MAGIC: + *xid = in->i_xid; + *mask |= IATTR_XID; + break; + + default: + break; } return 0; } @@ -57,7 +72,7 @@ static int __vc_get_iattr(struct inode *in, uint32_t *xid, uint32_t *flags, uint int vc_get_iattr(uint32_t id, void __user *data) { struct nameidata nd; - struct vcmd_ctx_iattr_v1 vc_data; + struct vcmd_ctx_iattr_v1 vc_data = { .xid = -1 }; int ret; if (!vx_check(0, VX_ADMIN)) @@ -80,7 +95,7 @@ int vc_get_iattr(uint32_t id, void __user *data) static int __vc_set_iattr(struct dentry *de, uint32_t *xid, uint32_t *flags, uint32_t *mask) { struct inode *in = de->d_inode; - int error = 0, is_proc = 0; + int error = 0, is_proc = 0, has_xid = 0; if (!in || !in->i_sb) return -ESRCH; @@ -88,7 +103,10 @@ static int __vc_set_iattr(struct dentry *de, uint32_t *xid, uint32_t *flags, uin is_proc = (in->i_sb->s_magic == PROC_SUPER_MAGIC); if ((*mask & IATTR_FLAGS) && !is_proc) return -EINVAL; - if ((*mask & IATTR_XID) && !(in->i_sb->s_flags & MS_TAGXID)) + + has_xid = (in->i_sb->s_flags & MS_TAGXID) || + (in->i_sb->s_magic == DEVPTS_SUPER_MAGIC); + if ((*mask & IATTR_XID) && !has_xid) return -EINVAL; down(&in->i_sem); @@ -170,40 +188,8 @@ int vc_set_iattr(uint32_t id, void __user *data) return ret; } -int vc_iattr_ioctl(struct dentry *de, unsigned int cmd, unsigned long arg) -{ - void __user *data = (void __user *)arg; - struct vcmd_ctx_iattr_v1 vc_data; - int ret; - - /* - * I don't think we need any dget/dput pairs in here as long as - * this function is always called from sys_ioctl i.e., de is - * a field of a struct file that is guaranteed not to be freed. 
- */ - if (cmd == FIOC_SETIATTR) { - if (!capable(CAP_SYS_ADMIN) || !capable(CAP_LINUX_IMMUTABLE)) - return -EPERM; - if (copy_from_user (&vc_data, data, sizeof(vc_data))) - return -EFAULT; - ret = __vc_set_iattr(de, - &vc_data.xid, &vc_data.flags, &vc_data.mask); - } - else { - if (!vx_check(0, VX_ADMIN)) - return -ENOSYS; - ret = __vc_get_iattr(de->d_inode, - &vc_data.xid, &vc_data.flags, &vc_data.mask); - } - - if (!ret && copy_to_user (data, &vc_data, sizeof(vc_data))) - ret = -EFAULT; - return ret; -} - #ifdef CONFIG_VSERVER_LEGACY -#include #define PROC_DYNAMIC_FIRST 0xF0000000UL @@ -252,3 +238,69 @@ int vx_proc_ioctl(struct inode * inode, struct file * filp, } #endif + +int vx_parse_xid(char *string, xid_t *xid, int remove) +{ + static match_table_t tokens = { + {1, "xid=%u"}, + {0, NULL} + }; + substring_t args[MAX_OPT_ARGS]; + int token, option = 0; + + if (!string) + return 0; + + token = match_token(string, tokens, args); + if (token && xid && !match_int(args, &option)) + *xid = option; + + vxdprintk(VXD_CBIT(xid, 7), + "vx_parse_xid(»%s«): %d:#%d", + string, token, option); + + if (token && remove) { + char *p = strstr(string, "xid="); + char *q = p; + + if (p) { + while (*q != '\0' && *q != ',') + q++; + while (*q) + *p++ = *q++; + while (*p) + *p++ = '\0'; + } + } + return token; +} + +void vx_propagate_xid(struct nameidata *nd, struct inode *inode) +{ + xid_t new_xid = 0; + struct vfsmount *mnt; + int propagate; + + if (!nd) + return; + mnt = nd->mnt; + if (!mnt) + return; + + propagate = (mnt->mnt_flags & MNT_XID); + if (propagate) + new_xid = mnt->mnt_xid; + + vxdprintk(VXD_CBIT(xid, 7), + "vx_propagate_xid(%p[#%lu.%d]): %d,%d", + inode, inode->i_ino, inode->i_xid, + new_xid, (propagate)?1:0); + + if (propagate) + inode->i_xid = new_xid; +} + +#include + +EXPORT_SYMBOL_GPL(vx_propagate_xid); + diff --git a/kernel/vserver/legacy.c b/kernel/vserver/legacy.c index e76065340..fe4c66d61 100644 --- a/kernel/vserver/legacy.c +++ b/kernel/vserver/legacy.c 
@@ -12,13 +12,11 @@ #include #include -#include -#include -#include -#include -#include #include #include +#include +#include +#include #include #include @@ -61,8 +59,9 @@ int vc_new_s_context(uint32_t ctx, void __user *data) return ret; } - if (!vx_check(0, VX_ADMIN) || - !capable(CAP_SYS_ADMIN) || vx_flags(VX_INFO_PRIVATE, 0)) + if (!vx_check(0, VX_ADMIN) || !capable(CAP_SYS_ADMIN) + /* might make sense in the future, or not ... */ + || vx_flags(VX_INFO_LOCK, 0)) return -EPERM; /* ugly hack for Spectator */ @@ -82,6 +81,12 @@ int vc_new_s_context(uint32_t ctx, void __user *data) if (!new_vxi) return -EINVAL; + + ret = -EPERM; + if (!vx_info_flags(new_vxi, VXF_STATE_SETUP, 0) && + vx_info_flags(new_vxi, VX_INFO_PRIVATE, 0)) + goto out_put; + new_vxi->vx_flags &= ~(VXF_STATE_SETUP|VXF_STATE_INIT); ret = vx_migrate_task(current, new_vxi); @@ -99,6 +104,7 @@ int vc_new_s_context(uint32_t ctx, void __user *data) current->signal->rlim[RLIMIT_NPROC].rlim_max; ret = new_vxi->vx_id; } +out_put: put_vx_info(new_vxi); return ret; } diff --git a/kernel/vserver/limit.c b/kernel/vserver/limit.c index 5bd2fdcb9..a1497be36 100644 --- a/kernel/vserver/limit.c +++ b/kernel/vserver/limit.c @@ -10,15 +10,32 @@ */ #include +#include +#include +#include #include -#include #include -#include +#include #include #include +const char *vlimit_name[NUM_LIMITS] = { + [RLIMIT_CPU] = "CPU", + [RLIMIT_RSS] = "RSS", + [RLIMIT_NPROC] = "NPROC", + [RLIMIT_NOFILE] = "NOFILE", + [RLIMIT_MEMLOCK] = "VML", + [RLIMIT_AS] = "VM", + [RLIMIT_LOCKS] = "LOCKS", + [RLIMIT_MSGQUEUE] = "MSGQ", + [VLIMIT_NSOCK] = "NSOCK", +}; + +EXPORT_SYMBOL_GPL(vlimit_name); + + static int is_valid_rlimit(int id) { int valid = 0; @@ -42,7 +59,7 @@ static inline uint64_t vc_get_rlim(struct vx_info *vxi, int id) limit = vxi->limit.rlim[id]; if (limit == RLIM_INFINITY) return CRLIM_INFINITY; - return limit; + return limit; } int vc_get_rlimit(uint32_t id, void __user *data) @@ -54,8 +71,8 @@ int vc_get_rlimit(uint32_t id, void 
__user *data) return -EFAULT; if (!is_valid_rlimit(vc_data.id)) return -ENOTSUPP; - - vxi = find_vx_info(id); + + vxi = locate_vx_info(id); if (!vxi) return -ESRCH; @@ -81,13 +98,12 @@ int vc_set_rlimit(uint32_t id, void __user *data) if (!is_valid_rlimit(vc_data.id)) return -ENOTSUPP; - vxi = find_vx_info(id); + vxi = locate_vx_info(id); if (!vxi) return -ESRCH; if (vc_data.maximum != CRLIM_KEEP) vxi->limit.rlim[vc_data.id] = vc_data.maximum; - printk("setting [%d] = %d\n", vc_data.id, (int)vc_data.maximum); put_vx_info(vxi); return 0; @@ -111,7 +127,7 @@ int vc_get_rlimit_mask(uint32_t id, void __user *data) if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_RESOURCE)) return -EPERM; if (copy_to_user(data, &mask, sizeof(mask))) - return -EFAULT; + return -EFAULT; return 0; } @@ -124,25 +140,25 @@ void vx_vsi_meminfo(struct sysinfo *val) v = vxi->limit.rlim[RLIMIT_RSS]; if (v != RLIM_INFINITY) val->totalram = min(val->totalram, v); - v = atomic_read(&vxi->limit.res[RLIMIT_RSS]); + v = atomic_read(&vxi->limit.rcur[RLIMIT_RSS]); val->freeram = (v < val->totalram) ? val->totalram - v : 0; val->bufferram = 0; - val->totalhigh = 0; - val->freehigh = 0; + val->totalhigh = 0; + val->freehigh = 0; return; } void vx_vsi_swapinfo(struct sysinfo *val) { struct vx_info *vxi = current->vx_info; - unsigned long w,v; + unsigned long v, w; v = vxi->limit.rlim[RLIMIT_RSS]; w = vxi->limit.rlim[RLIMIT_AS]; if (w != RLIM_INFINITY) val->totalswap = min(val->totalswap, w - ((v != RLIM_INFINITY) ? v : 0)); - w = atomic_read(&vxi->limit.res[RLIMIT_AS]); + w = atomic_read(&vxi->limit.rcur[RLIMIT_AS]); val->freeswap = (w < val->totalswap) ? 
val->totalswap - w : 0; return; } diff --git a/kernel/vserver/limit_init.h b/kernel/vserver/limit_init.h new file mode 100644 index 000000000..0a9dcf44b --- /dev/null +++ b/kernel/vserver/limit_init.h @@ -0,0 +1,28 @@ + +static inline void vx_info_init_limit(struct _vx_limit *limit) +{ + int lim; + + for (lim=0; limrlim[lim] = RLIM_INFINITY; + limit->rmax[lim] = 0; + atomic_set(&limit->rcur[lim], 0); + atomic_set(&limit->lhit[lim], 0); + } +} + +static inline void vx_info_exit_limit(struct _vx_limit *limit) +{ +#ifdef CONFIG_VSERVER_DEBUG + unsigned long value; + unsigned int lim; + + for (lim=0; limrcur[lim]); + vxwprintk(value, + "!!! limit: %p[%s,%d] = %ld on exit.", + limit, vlimit_name[lim], lim, value); + } +#endif +} + diff --git a/kernel/vserver/limit_proc.h b/kernel/vserver/limit_proc.h new file mode 100644 index 000000000..97696e9a4 --- /dev/null +++ b/kernel/vserver/limit_proc.h @@ -0,0 +1,48 @@ +#ifndef _VX_LIMIT_PROC_H +#define _VX_LIMIT_PROC_H + + +static inline void vx_limit_fixup(struct _vx_limit *limit) +{ + unsigned long value; + unsigned int lim; + + for (lim=0; limrcur[lim]); + if (value > limit->rmax[lim]) + limit->rmax[lim] = value; + if (limit->rmax[lim] > limit->rlim[lim]) + limit->rmax[lim] = limit->rlim[lim]; + } +} + +#define VX_LIMIT_FMT ":\t%10d\t%10ld\t%10ld\t%6d\n" + +#define VX_LIMIT_ARG(r) \ + ,atomic_read(&limit->rcur[r]) \ + ,limit->rmax[r] \ + ,limit->rlim[r] \ + ,atomic_read(&limit->lhit[r]) + +static inline int vx_info_proc_limit(struct _vx_limit *limit, char *buffer) +{ + vx_limit_fixup(limit); + return sprintf(buffer, + "PROC" VX_LIMIT_FMT + "VM" VX_LIMIT_FMT + "VML" VX_LIMIT_FMT + "RSS" VX_LIMIT_FMT + "FILES" VX_LIMIT_FMT + "SOCK" VX_LIMIT_FMT + VX_LIMIT_ARG(RLIMIT_NPROC) + VX_LIMIT_ARG(RLIMIT_AS) + VX_LIMIT_ARG(RLIMIT_MEMLOCK) + VX_LIMIT_ARG(RLIMIT_RSS) + VX_LIMIT_ARG(RLIMIT_NOFILE) + VX_LIMIT_ARG(VLIMIT_NSOCK) + ); +} + +#endif /* _VX_LIMIT_PROC_H */ + + diff --git a/kernel/vserver/namespace.c b/kernel/vserver/namespace.c 
index 2c76c6fb4..668516102 100644 --- a/kernel/vserver/namespace.c +++ b/kernel/vserver/namespace.c @@ -12,15 +12,62 @@ #include #include +#include +#include #include -#include -#include #include +#include +#include #include #include +int vx_check_vfsmount(struct vx_info *vxi, struct vfsmount *mnt) +{ + struct vfsmount *root_mnt, *altroot_mnt; + struct dentry *root, *altroot, *point; + int r1, r2, s1, s2, ret = 0; + + if (!vxi || !mnt) + return 1; + + spin_lock(&dcache_lock); + altroot_mnt = current->fs->rootmnt; + altroot = current->fs->root; + point = altroot; + + if (vxi->vx_fs) { + root_mnt = vxi->vx_fs->rootmnt; + root = vxi->vx_fs->root; + } else { + root_mnt = altroot_mnt; + root = altroot; + } + /* printk("··· %p:%p/%p:%p ", + root_mnt, root, altroot_mnt, altroot); */ + + while ((mnt != mnt->mnt_parent) && + (mnt != root_mnt) && (mnt != altroot_mnt)) { + point = mnt->mnt_mountpoint; + mnt = mnt->mnt_parent; + } + + r1 = (mnt == root_mnt); + s1 = is_subdir(point, root); + r2 = (mnt == altroot_mnt); + s2 = is_subdir(point, altroot); + + ret = (((mnt == root_mnt) && is_subdir(point, root)) || + ((mnt == altroot_mnt) && is_subdir(point, altroot))); + /* printk("··· for %p:%p -> %d:%d/%d:%d = %d\n", + mnt, point, r1, s1, r2, s2, ret); */ + spin_unlock(&dcache_lock); + + return (r2 && s2); +} + + /* virtual host info names */ static char * vx_vhi_name(struct vx_info *vxi, int id) @@ -56,11 +103,11 @@ int vc_set_vhi_name(uint32_t id, void __user *data) return -EPERM; if (copy_from_user (&vc_data, data, sizeof(vc_data))) return -EFAULT; - - vxi = find_vx_info(id); + + vxi = locate_vx_info(id); if (!vxi) return -ESRCH; - + name = vx_vhi_name(vxi, vc_data.field); if (name) memcpy(name, vc_data.name, 65); @@ -77,14 +124,14 @@ int vc_get_vhi_name(uint32_t id, void __user *data) if (copy_from_user (&vc_data, data, sizeof(vc_data))) return -EFAULT; - vxi = find_vx_info(id); + vxi = locate_vx_info(id); if (!vxi) return -ESRCH; name = vx_vhi_name(vxi, vc_data.field); if 
(!name) goto out_put; - + memcpy(vc_data.name, name, 65); if (copy_to_user (data, &vc_data, sizeof(vc_data))) return -EFAULT; @@ -126,7 +173,7 @@ int vc_enter_namespace(uint32_t id, void *data) if (!vx_check(0, VX_ADMIN)) return -ENOSYS; - vxi = find_vx_info(id); + vxi = locate_vx_info(id); if (!vxi) return -ESRCH; @@ -144,7 +191,7 @@ int vc_enter_namespace(uint32_t id, void *data) old_ns = current->namespace; old_fs = current->fs; get_namespace(vxi->vx_namespace); - current->namespace = vxi->vx_namespace; + current->namespace = vxi->vx_namespace; current->fs = fs; task_unlock(current); @@ -158,11 +205,9 @@ out_put: int vc_cleanup_namespace(uint32_t id, void *data) { down_write(¤t->namespace->sem); - // spin_lock(&dcache_lock); spin_lock(&vfsmount_lock); umount_unused(current->namespace->root, current->fs); spin_unlock(&vfsmount_lock); - // spin_unlock(&dcache_lock); up_write(¤t->namespace->sem); return 0; } diff --git a/kernel/vserver/network.c b/kernel/vserver/network.c index e87c8b617..f1a110ba6 100644 --- a/kernel/vserver/network.c +++ b/kernel/vserver/network.c @@ -14,8 +14,7 @@ #include #include -#include -#include +#include #include #include @@ -70,6 +69,35 @@ static void __dealloc_nx_info(struct nx_info *nxi) kfree(nxi); } +static inline int __free_nx_info(struct nx_info *nxi) +{ + int usecnt, refcnt; + + BUG_ON(!nxi); + + usecnt = atomic_read(&nxi->nx_usecnt); + BUG_ON(usecnt < 0); + + refcnt = atomic_read(&nxi->nx_refcnt); + BUG_ON(refcnt < 0); + + if (!usecnt) + __dealloc_nx_info(nxi); + return usecnt; +} + +static void __rcu_put_nx_info(struct rcu_head *head) +{ + struct nx_info *nxi = container_of(head, struct nx_info, nx_rcu); + + vxdprintk(VXD_CBIT(nid, 3), + "__rcu_put_nx_info(%p[#%d]): %d,%d", + nxi, nxi->nx_id, + atomic_read(&nxi->nx_usecnt), + atomic_read(&nxi->nx_refcnt)); + put_nx_info(nxi); +} + /* hash table for nx_info hash */ @@ -113,7 +141,7 @@ static inline void __unhash_nx_info(struct nx_info *nxi) vxdprintk(VXD_CBIT(nid, 4), 
"__unhash_nx_info: %p[#%d]", nxi, nxi->nx_id); hlist_del_rcu(&nxi->nx_hlist); - put_nx_info(nxi); + call_rcu(&nxi->nx_rcu, __rcu_put_nx_info); } @@ -142,6 +170,7 @@ static inline struct nx_info *__lookup_nx_info(nid_t nid) /* __nx_dynamic_id() * find unused dynamic nid + * requires the rcu_read_lock() * requires the hash_lock to be held */ static inline nid_t __nx_dynamic_id(void) @@ -177,6 +206,9 @@ static struct nx_info * __loc_nx_info(int id, int *err) return NULL; } + /* FIXME is this required at all ? */ + rcu_read_lock(); + /* required to make dynamic xids unique */ spin_lock(&nx_info_hash_lock); /* dynamic context requested */ @@ -214,6 +246,7 @@ static struct nx_info * __loc_nx_info(int id, int *err) out_unlock: spin_unlock(&nx_info_hash_lock); + rcu_read_unlock(); if (new) __dealloc_nx_info(new); return nxi; @@ -223,28 +256,9 @@ out_unlock: /* exported stuff */ - - - -void rcu_free_nx_info(struct rcu_head *head) +void free_nx_info(struct nx_info *nxi) { - struct nx_info *nxi = container_of(head, struct nx_info, nx_rcu); - int usecnt, refcnt; - - BUG_ON(!nxi || !head); - - usecnt = atomic_read(&nxi->nx_usecnt); - BUG_ON(usecnt < 0); - - refcnt = atomic_read(&nxi->nx_refcnt); - BUG_ON(refcnt < 0); - - vxdprintk(VXD_CBIT(nid, 3), - "rcu_free_nx_info(%p): uc=%d", nxi, usecnt); - if (!usecnt) - __dealloc_nx_info(nxi); - else - printk("!!! 
rcu didn't free\n"); + BUG_ON(__free_nx_info(nxi)); } void unhash_nx_info(struct nx_info *nxi) @@ -696,7 +710,6 @@ int vc_set_ncaps(uint32_t id, void __user *data) #include -EXPORT_SYMBOL_GPL(rcu_free_nx_info); -EXPORT_SYMBOL_GPL(nx_info_hash_lock); +EXPORT_SYMBOL_GPL(free_nx_info); EXPORT_SYMBOL_GPL(unhash_nx_info); diff --git a/kernel/vserver/proc.c b/kernel/vserver/proc.c index 42bc18200..823226be4 100644 --- a/kernel/vserver/proc.c +++ b/kernel/vserver/proc.c @@ -18,11 +18,19 @@ #include #include #include -#include +#include +#include +#include +#include + +#include #include #include +#include "cvirt_proc.h" +#include "limit_proc.h" +#include "sched_proc.h" static struct proc_dir_entry *proc_virtual; @@ -43,7 +51,7 @@ enum vid_directory_inos { PROC_NID_STATUS, }; -#define PROC_VID_MASK 0x60 +#define PROC_VID_MASK 0x60 /* first the actual feeds */ @@ -66,7 +74,7 @@ int proc_xid_info (int vid, char *buffer) struct vx_info *vxi; int length; - vxi = find_vx_info(vid); + vxi = locate_vx_info(vid); if (!vxi) return 0; length = sprintf(buffer, @@ -86,19 +94,21 @@ int proc_xid_status (int vid, char *buffer) struct vx_info *vxi; int length; - vxi = find_vx_info(vid); + vxi = locate_vx_info(vid); if (!vxi) return 0; length = sprintf(buffer, - "RefC:\t%d\n" + "UseCnt:\t%d\n" + "RefCnt:\t%d\n" "Flags:\t%016llx\n" "BCaps:\t%016llx\n" "CCaps:\t%016llx\n" - "Ticks:\t%d\n" - ,atomic_read(&vxi->vx_refcount) - ,vxi->vx_flags - ,vxi->vx_bcaps - ,vxi->vx_ccaps + "Ticks:\t%d\n" + ,atomic_read(&vxi->vx_usecnt) + ,atomic_read(&vxi->vx_refcnt) + ,(unsigned long long)vxi->vx_flags + ,(unsigned long long)vxi->vx_bcaps + ,(unsigned long long)vxi->vx_ccaps ,atomic_read(&vxi->limit.ticks) ); put_vx_info(vxi); @@ -110,7 +120,7 @@ int proc_xid_limit (int vid, char *buffer) struct vx_info *vxi; int length; - vxi = find_vx_info(vid); + vxi = locate_vx_info(vid); if (!vxi) return 0; length = vx_info_proc_limit(&vxi->limit, buffer); @@ -123,7 +133,7 @@ int proc_xid_sched (int vid, char *buffer) 
struct vx_info *vxi; int length; - vxi = find_vx_info(vid); + vxi = locate_vx_info(vid); if (!vxi) return 0; length = vx_info_proc_sched(&vxi->sched, buffer); @@ -136,9 +146,10 @@ int proc_xid_cvirt (int vid, char *buffer) struct vx_info *vxi; int length; - vxi = find_vx_info(vid); + vxi = locate_vx_info(vid); if (!vxi) return 0; + vx_update_load(vxi); length = vx_info_proc_cvirt(&vxi->cvirt, buffer); put_vx_info(vxi); return length; @@ -149,7 +160,7 @@ int proc_xid_cacct (int vid, char *buffer) struct vx_info *vxi; int length; - vxi = find_vx_info(vid); + vxi = locate_vx_info(vid); if (!vxi) return 0; length = vx_info_proc_cacct(&vxi->cacct, buffer); @@ -169,7 +180,7 @@ static int proc_vnet_info(int vid, char *buffer) ); } -#define atoquad(a) \ +#define atoquad(a) \ (((a)>>0) & 0xff), (((a)>>8) & 0xff), \ (((a)>>16) & 0xff), (((a)>>24) & 0xff) @@ -178,7 +189,7 @@ int proc_nid_info (int vid, char *buffer) struct nx_info *nxi; int length, i; - nxi = find_nx_info(vid); + nxi = locate_nx_info(vid); if (!nxi) return 0; length = sprintf(buffer, @@ -202,12 +213,14 @@ int proc_nid_status (int vid, char *buffer) struct nx_info *nxi; int length; - nxi = find_nx_info(vid); + nxi = locate_nx_info(vid); if (!nxi) return 0; length = sprintf(buffer, - "RefC:\t%d\n" - ,atomic_read(&nxi->nx_refcount) + "UseCnt:\t%d\n" + "RefCnt:\t%d\n" + ,atomic_read(&nxi->nx_usecnt) + ,atomic_read(&nxi->nx_refcnt) ); put_nx_info(nxi); return length; @@ -216,11 +229,11 @@ int proc_nid_status (int vid, char *buffer) /* here the inode helpers */ +#define fake_ino(id,nr) (((nr) & 0xFFFF) | \ + (((id) & 0xFFFF) << 16)) -#define fake_ino(id,ino) (((id)<<16)|(ino)) - -#define inode_vid(i) ((i)->i_ino >> 16) -#define inode_type(i) ((i)->i_ino & 0xFFFF) +#define inode_vid(i) (((i)->i_ino >> 16) & 0xFFFF) +#define inode_type(i) ((i)->i_ino & 0xFFFF) #define MAX_MULBY10 ((~0U-9)/10) @@ -247,18 +260,18 @@ out: static int proc_vid_revalidate(struct dentry * dentry, struct nameidata *nd) { struct inode * inode 
= dentry->d_inode; - int vid, valid=0; + int vid, hashed=0; vid = inode_vid(inode); switch (inode_type(inode) & PROC_VID_MASK) { case PROC_XID_INO: - valid = vx_info_id_valid(vid); + hashed = vx_info_is_hashed(vid); break; case PROC_NID_INO: - valid = nx_info_id_valid(vid); + hashed = nx_info_is_hashed(vid); break; - } - if (valid) + } + if (hashed) return 1; d_drop(dentry); return 0; @@ -267,7 +280,7 @@ static int proc_vid_revalidate(struct dentry * dentry, struct nameidata *nd) /* static int proc_vid_delete_dentry(struct dentry * dentry) { - return 1; + return 1; } */ @@ -320,7 +333,7 @@ static struct file_operations proc_vid_info_file_operations = { }; static struct dentry_operations proc_vid_dentry_operations = { - d_revalidate: proc_vid_revalidate, + d_revalidate: proc_vid_revalidate, // d_delete: proc_vid_delete_dentry, }; @@ -364,10 +377,10 @@ static struct dentry *proc_vid_lookup(struct inode *dir, switch (inode_type(dir)) { case PROC_XID_INO: - p = vx_base_stuff; + p = vx_base_stuff; break; case PROC_NID_INO: - p = vn_base_stuff; + p = vn_base_stuff; break; default: goto out; @@ -413,7 +426,7 @@ static struct dentry *proc_vid_lookup(struct inode *dir, case PROC_NID_STATUS: PROC_I(inode)->op.proc_vid_read = proc_nid_status; break; - + default: printk("procfs: impossible type (%d)",p->type); iput(inode); @@ -424,7 +437,7 @@ static struct dentry *proc_vid_lookup(struct inode *dir, inode->i_fop = &proc_vid_info_file_operations; inode->i_nlink = 1; inode->i_flags|=S_IMMUTABLE; - + dentry->d_op = &proc_vid_dentry_operations; d_add(dentry, inode); error = 0; @@ -439,7 +452,7 @@ static int proc_vid_readdir(struct file * filp, int i, size; struct inode *inode = filp->f_dentry->d_inode; struct vid_entry *p; - + i = filp->f_pos; switch (i) { case 0: @@ -461,11 +474,11 @@ static int proc_vid_readdir(struct file * filp, switch (inode_type(inode)) { case PROC_XID_INO: size = sizeof(vx_base_stuff); - p = vx_base_stuff + i; + p = vx_base_stuff + i; break; case 
PROC_NID_INO: size = sizeof(vn_base_stuff); - p = vn_base_stuff + i; + p = vn_base_stuff + i; break; default: return 1; @@ -564,7 +577,7 @@ struct dentry *proc_virtual_lookup(struct inode *dir, xid = atovid(name, len); if (xid < 0) goto out; - vxi = find_vx_info(xid); + vxi = locate_vx_info(xid); if (!vxi) goto out; @@ -584,7 +597,7 @@ struct dentry *proc_virtual_lookup(struct inode *dir, dentry->d_op = &proc_vid_dentry_operations; d_add(dentry, inode); ret = 0; - + out_release: put_vx_info(vxi); out: @@ -634,7 +647,7 @@ struct dentry *proc_vnet_lookup(struct inode *dir, nid = atovid(name, len); if (nid < 0) goto out; - nxi = find_nx_info(nid); + nxi = locate_nx_info(nid); if (!nxi) goto out; @@ -654,7 +667,7 @@ struct dentry *proc_vnet_lookup(struct inode *dir, dentry->d_op = &proc_vid_dentry_operations; d_add(dentry, inode); ret = 0; - + out_release: put_nx_info(nxi); out: @@ -667,27 +680,6 @@ out: #define PROC_NUMBUF 10 #define PROC_MAXVIDS 32 - -static int get_xid_list(int index, unsigned int *xids) -{ - struct vx_info *p; - int nr_xids = 0; - - index--; - spin_lock(&vxlist_lock); - list_for_each_entry(p, &vx_infos, vx_list) { - int xid = p->vx_id; - - if (--index >= 0) - continue; - xids[nr_xids] = xid; - if (++nr_xids >= PROC_MAXVIDS) - break; - } - spin_unlock(&vxlist_lock); - return nr_xids; -} - int proc_virtual_readdir(struct file * filp, void * dirent, filldir_t filldir) { @@ -720,7 +712,7 @@ int proc_virtual_readdir(struct file * filp, filp->f_pos++; /* fall through */ case 3: - if (current->xid > 1) { + if (vx_current_xid() > 1) { ino = fake_ino(1, PROC_XID_INO); if (filldir(dirent, "current", 7, filp->f_pos, ino, DT_LNK) < 0) @@ -729,12 +721,11 @@ int proc_virtual_readdir(struct file * filp, filp->f_pos++; } - nr_xids = get_xid_list(nr, xid_array); - + nr_xids = get_xid_list(nr, xid_array, PROC_MAXVIDS); for (i = 0; i < nr_xids; i++) { int xid = xid_array[i]; ino_t ino = fake_ino(xid, PROC_XID_INO); - unsigned long j = PROC_NUMBUF; + unsigned int j = 
PROC_NUMBUF; do buf[--j] = '0' + (xid % 10); while (xid/=10); @@ -757,27 +748,6 @@ static struct inode_operations proc_virtual_dir_inode_operations = { }; - -static int get_nid_list(int index, unsigned int *nids) -{ - struct nx_info *p; - int nr_nids = 0; - - index--; - spin_lock(&nxlist_lock); - list_for_each_entry(p, &nx_infos, nx_list) { - int nid = p->nx_id; - - if (--index >= 0) - continue; - nids[nr_nids] = nid; - if (++nr_nids >= PROC_MAXVIDS) - break; - } - spin_unlock(&nxlist_lock); - return nr_nids; -} - int proc_vnet_readdir(struct file * filp, void * dirent, filldir_t filldir) { @@ -810,7 +780,7 @@ int proc_vnet_readdir(struct file * filp, filp->f_pos++; /* fall through */ case 3: - if (current->xid > 1) { + if (vx_current_xid() > 1) { ino = fake_ino(1, PROC_NID_INO); if (filldir(dirent, "current", 7, filp->f_pos, ino, DT_LNK) < 0) @@ -819,8 +789,7 @@ int proc_vnet_readdir(struct file * filp, filp->f_pos++; } - nr_nids = get_nid_list(nr, nid_array); - + nr_nids = get_nid_list(nr, nid_array, PROC_MAXVIDS); for (i = 0; i < nr_nids; i++) { int nid = nid_array[i]; ino_t ino = fake_ino(nid, PROC_NID_INO); @@ -859,7 +828,7 @@ void proc_vx_init(void) } proc_virtual = ent; - ent = proc_mkdir("vnet", 0); + ent = proc_mkdir("virtnet", 0); if (ent) { ent->proc_fops = &proc_vnet_dir_operations; ent->proc_iops = &proc_vnet_dir_inode_operations; @@ -875,9 +844,22 @@ void proc_vx_init(void) char *task_vx_info(struct task_struct *p, char *buffer) { - return buffer + sprintf(buffer, - "XID:\t%d\n" - ,p->xid); + struct vx_info *vxi; + + buffer += sprintf (buffer,"XID:\t%d\n", vx_task_xid(p)); + vxi = task_get_vx_info(p); + if (vxi && !vx_flags(VXF_INFO_HIDE, 0)) { + buffer += sprintf (buffer,"BCaps:\t%016llx\n" + ,(unsigned long long)vxi->vx_bcaps); + buffer += sprintf (buffer,"CCaps:\t%016llx\n" + ,(unsigned long long)vxi->vx_ccaps); + buffer += sprintf (buffer,"CFlags:\t%016llx\n" + ,(unsigned long long)vxi->vx_flags); + buffer += sprintf (buffer,"CIPid:\t%d\n" + 
,vxi->vx_initpid); + } + put_vx_info(vxi); + return buffer; } int proc_pid_vx_info(struct task_struct *p, char *buffer) @@ -890,9 +872,25 @@ int proc_pid_vx_info(struct task_struct *p, char *buffer) char *task_nx_info(struct task_struct *p, char *buffer) { - return buffer + sprintf(buffer, - "NID:\t%d\n" - ,p->nid); + struct nx_info *nxi; + + buffer += sprintf (buffer,"NID:\t%d\n", nx_task_nid(p)); + nxi = task_get_nx_info(p); + if (nxi && !vx_flags(VXF_INFO_HIDE, 0)) { + int i; + + for (i=0; inbipv4; i++){ + buffer += sprintf (buffer, + "V4Root[%d]:\t%d.%d.%d.%d/%d.%d.%d.%d\n", i + ,NIPQUAD(nxi->ipv4[i]) + ,NIPQUAD(nxi->mask[i])); + } + buffer += sprintf (buffer, + "V4Root[bcast]:\t%d.%d.%d.%d\n" + ,NIPQUAD(nxi->v4_bcast)); + } + put_nx_info(nxi); + return buffer; } int proc_pid_nx_info(struct task_struct *p, char *buffer) diff --git a/kernel/vserver/sched.c b/kernel/vserver/sched.c index a75195a19..70e964e5e 100644 --- a/kernel/vserver/sched.c +++ b/kernel/vserver/sched.c @@ -12,9 +12,10 @@ #include #include -#include -#include -#include +// #include +#include +#include +#include #include #include @@ -32,7 +33,7 @@ int vx_tokens_recalc(struct vx_info *vxi) { long delta, tokens = 0; - if (__vx_flags(vxi->vx_flags, VXF_SCHED_PAUSE, 0)) + if (vx_info_flags(vxi, VXF_SCHED_PAUSE, 0)) /* we are paused */ return 0; @@ -51,7 +52,7 @@ int vx_tokens_recalc(struct vx_info *vxi) atomic_add(tokens, &vxi->sched.tokens); vxi->sched.jiffies += delta; tokens = atomic_read(&vxi->sched.tokens); - + if (tokens > vxi->sched.tokens_max) { tokens = vxi->sched.tokens_max; atomic_set(&vxi->sched.tokens, tokens); @@ -59,7 +60,10 @@ int vx_tokens_recalc(struct vx_info *vxi) spin_unlock(&vxi->sched.tokens_lock); } else { /* no new tokens */ - if ((tokens = vx_tokens_avail(vxi)) < vxi->sched.tokens_min) { + tokens = vx_tokens_avail(vxi); + if (tokens <= 0) + vxi->vx_state |= VXS_ONHOLD; + if (tokens < vxi->sched.tokens_min) { /* enough tokens will be available in */ if (vxi->sched.tokens_min 
== 0) return delta - vxi->sched.interval; @@ -67,7 +71,14 @@ int vx_tokens_recalc(struct vx_info *vxi) vxi->sched.tokens_min / vxi->sched.fill_rate; } } + /* we have some tokens left */ + if (vx_info_state(vxi, VXS_ONHOLD) && + (tokens >= vxi->sched.tokens_min)) + vxi->vx_state &= ~VXS_ONHOLD; + if (vx_info_state(vxi, VXS_ONHOLD)) + tokens -= vxi->sched.tokens_min; + return tokens; } @@ -118,15 +129,15 @@ int effective_vavavoom(task_t *p, int max_prio) } -int vc_set_sched(uint32_t xid, void __user *data) +int vc_set_sched_v2(uint32_t xid, void __user *data) { struct vcmd_set_sched_v2 vc_data; struct vx_info *vxi; if (copy_from_user (&vc_data, data, sizeof(vc_data))) return -EFAULT; - - vxi = find_vx_info(xid); + + vxi = locate_vx_info(xid); if (!vxi) return -EINVAL; @@ -160,3 +171,55 @@ int vc_set_sched(uint32_t xid, void __user *data) return 0; } + +int vc_set_sched(uint32_t xid, void __user *data) +{ + struct vcmd_set_sched_v3 vc_data; + struct vx_info *vxi; + unsigned int set_mask; + + if (copy_from_user (&vc_data, data, sizeof(vc_data))) + return -EFAULT; + + vxi = locate_vx_info(xid); + if (!vxi) + return -EINVAL; + + set_mask = vc_data.set_mask; + + spin_lock(&vxi->sched.tokens_lock); + + if (set_mask & VXSM_FILL_RATE) + vxi->sched.fill_rate = vc_data.fill_rate; + if (set_mask & VXSM_INTERVAL) + vxi->sched.interval = vc_data.interval; + if (set_mask & VXSM_TOKENS) + atomic_set(&vxi->sched.tokens, vc_data.tokens); + if (set_mask & VXSM_TOKENS_MIN) + vxi->sched.tokens_min = vc_data.tokens_min; + if (set_mask & VXSM_TOKENS_MAX) + vxi->sched.tokens_max = vc_data.tokens_max; + if (set_mask & VXSM_PRIO_BIAS) + vxi->sched.priority_bias = vc_data.priority_bias; + + /* Sanity check the resultant values */ + if (vxi->sched.fill_rate <= 0) + vxi->sched.fill_rate = 1; + if (vxi->sched.interval <= 0) + vxi->sched.interval = HZ; + if (vxi->sched.tokens_max == 0) + vxi->sched.tokens_max = 1; + if (atomic_read(&vxi->sched.tokens) > vxi->sched.tokens_max) + 
atomic_set(&vxi->sched.tokens, vxi->sched.tokens_max); + if (vxi->sched.tokens_min > vxi->sched.tokens_max) + vxi->sched.tokens_min = vxi->sched.tokens_max; + if (vxi->sched.priority_bias > MAX_PRIO_BIAS) + vxi->sched.priority_bias = MAX_PRIO_BIAS; + if (vxi->sched.priority_bias < MIN_PRIO_BIAS) + vxi->sched.priority_bias = MIN_PRIO_BIAS; + + spin_unlock(&vxi->sched.tokens_lock); + put_vx_info(vxi); + return 0; +} + diff --git a/kernel/vserver/sched_init.h b/kernel/vserver/sched_init.h new file mode 100644 index 000000000..3fbab7cdd --- /dev/null +++ b/kernel/vserver/sched_init.h @@ -0,0 +1,29 @@ + +static inline void vx_info_init_sched(struct _vx_sched *sched) +{ + int i; + + /* scheduling; hard code starting values as constants */ + sched->fill_rate = 1; + sched->interval = 4; + sched->tokens_min = HZ >> 4; + sched->tokens_max = HZ >> 1; + sched->jiffies = jiffies; + sched->tokens_lock = SPIN_LOCK_UNLOCKED; + + atomic_set(&sched->tokens, HZ >> 2); + sched->cpus_allowed = CPU_MASK_ALL; + sched->priority_bias = 0; + + for_each_cpu(i) { + sched->cpu[i].user_ticks = 0; + sched->cpu[i].sys_ticks = 0; + sched->cpu[i].hold_ticks = 0; + } +} + +static inline void vx_info_exit_sched(struct _vx_sched *sched) +{ + return; +} + diff --git a/kernel/vserver/sched_proc.h b/kernel/vserver/sched_proc.h new file mode 100644 index 000000000..1da5fa379 --- /dev/null +++ b/kernel/vserver/sched_proc.h @@ -0,0 +1,38 @@ +#ifndef _VX_SCHED_PROC_H +#define _VX_SCHED_PROC_H + + +static inline int vx_info_proc_sched(struct _vx_sched *sched, char *buffer) +{ + int length = 0; + int i; + + length += sprintf(buffer, + "Token:\t\t%8d\n" + "FillRate:\t%8d\n" + "Interval:\t%8d\n" + "TokensMin:\t%8d\n" + "TokensMax:\t%8d\n" + "PrioBias:\t%8d\n" + ,atomic_read(&sched->tokens) + ,sched->fill_rate + ,sched->interval + ,sched->tokens_min + ,sched->tokens_max + ,sched->priority_bias + ); + + for_each_online_cpu(i) { + length += sprintf(buffer + length, + "cpu %d: %lld %lld %lld\n" + ,i + ,(long 
long)sched->cpu[i].user_ticks + ,(long long)sched->cpu[i].sys_ticks + ,(long long)sched->cpu[i].hold_ticks + ); + } + + return length; +} + +#endif /* _VX_SCHED_PROC_H */ diff --git a/kernel/vserver/signal.c b/kernel/vserver/signal.c index 464ea1be4..bdf3c2264 100644 --- a/kernel/vserver/signal.c +++ b/kernel/vserver/signal.c @@ -15,7 +15,7 @@ #include #include -#include +#include #include @@ -31,14 +31,14 @@ int vc_ctx_kill(uint32_t id, void __user *data) return -ENOSYS; if (copy_from_user (&vc_data, data, sizeof(vc_data))) return -EFAULT; - + info.si_signo = vc_data.sig; info.si_errno = 0; info.si_code = SI_USER; info.si_pid = current->pid; info.si_uid = current->uid; - vxi = find_vx_info(id); + vxi = locate_vx_info(id); if (!vxi) return -ESRCH; @@ -61,14 +61,14 @@ int vc_ctx_kill(uint32_t id, void __user *data) retval = err; } break; - + default: - p = find_task_by_pid(vc_data.pid); + p = find_task_by_real_pid(vc_data.pid); if (p) { if (!thread_group_leader(p)) { struct task_struct *tg; - - tg = find_task_by_pid(p->tgid); + + tg = find_task_by_real_pid(p->tgid); if (tg) p = tg; } @@ -83,3 +83,46 @@ int vc_ctx_kill(uint32_t id, void __user *data) } +/* Sleep interruptibly on vxi->vx_exit until vx_info_state(vxi, VXS_DEFUNCT) holds; returns 0 in that case, or -ERESTARTSYS when a signal is pending first. */ +static int __wait_exit(struct vx_info *vxi) +{ + DECLARE_WAITQUEUE(wait, current); + int ret = 0; + + add_wait_queue(&vxi->vx_exit, &wait); + +wait: + /* re-arm the sleep state before every check: schedule() hands the task back runnable, so arming it only once would turn later passes into a busy loop */ + set_current_state(TASK_INTERRUPTIBLE); + if (vx_info_state(vxi, VXS_DEFUNCT)) + goto out; + if (signal_pending(current)) { + ret = -ERESTARTSYS; + goto out; + } + schedule(); + goto wait; + +out: + set_current_state(TASK_RUNNING); + remove_wait_queue(&vxi->vx_exit, &wait); + return ret; +} + + + +int vc_wait_exit(uint32_t id, void __user *data) +{ +// struct vcmd_wait_exit_v0 vc_data; + struct vx_info *vxi; + int ret; + + vxi = locate_vx_info(id); + if (!vxi) + return -ESRCH; + + ret = __wait_exit(vxi); + put_vx_info(vxi); + return ret; +} + diff --git a/kernel/vserver/switch.c b/kernel/vserver/switch.c index 90fee1412..271f63074 100644 --- a/kernel/vserver/switch.c +++ 
b/kernel/vserver/switch.c @@ -3,21 +3,24 @@ * * Virtual Server: Syscall Switch * - * Copyright (C) 2003-2004 Herbert Pötzl + * Copyright (C) 2003-2005 Herbert Pötzl * * V0.01 syscall switch * V0.02 added signal to context * V0.03 added rlimit functions * V0.04 added iattr, task/xid functions + * V0.05 added debug/history stuff * */ #include #include +#include #include +#include #include -#include +#include static inline int @@ -26,34 +29,50 @@ vc_get_version(uint32_t id) return VCI_VERSION; } +#include +#include +#include +#include +#include +#include #include -#include -#include #include -#include -#include #include #include - - -extern unsigned int vx_debug_switch; +#include extern asmlinkage long sys_vserver(uint32_t cmd, uint32_t id, void __user *data) { + vxdprintk(VXD_CBIT(switch, 0), + "vc: VCMD_%02d_%d[%d], %d", + VC_CATEGORY(cmd), VC_COMMAND(cmd), + VC_VERSION(cmd), id); - if (vx_debug_switch) - printk( "vc: VCMD_%02d_%d[%d], %d\n", - VC_CATEGORY(cmd), VC_COMMAND(cmd), - VC_VERSION(cmd), id); +#ifdef CONFIG_VSERVER_LEGACY + if (!capable(CAP_CONTEXT) && + /* dirty hack for capremove */ + !(cmd==VCMD_new_s_context && id==-2)) + return -EPERM; +#else + if (!capable(CAP_CONTEXT)) + return -EPERM; +#endif switch (cmd) { case VCMD_get_version: return vc_get_version(id); -#ifdef CONFIG_VSERVER_LEGACY + case VCMD_dump_history: +#ifdef CONFIG_VSERVER_HISTORY + return vc_dump_history(id); +#else + return -ENOSYS; +#endif + +#ifdef CONFIG_VSERVER_LEGACY case VCMD_new_s_context: return vc_new_s_context(id, data); case VCMD_set_ipv4root: @@ -97,7 +116,7 @@ sys_vserver(uint32_t cmd, uint32_t id, void __user *data) return vc_set_rlimit(id, data); case VCMD_get_rlimit_mask: return vc_get_rlimit_mask(id, data); - + case VCMD_vx_get_vhi_name: return vc_get_vhi_name(id, data); case VCMD_vx_set_vhi_name: @@ -123,8 +142,20 @@ sys_vserver(uint32_t cmd, uint32_t id, void __user *data) case VCMD_get_ncaps: return vc_get_ncaps(id, data); + case VCMD_set_sched_v2: + return 
vc_set_sched_v2(id, data); + /* this is version 3 */ case VCMD_set_sched: return vc_set_sched(id, data); + + case VCMD_add_dlimit: + return vc_add_dlimit(id, data); + case VCMD_rem_dlimit: + return vc_rem_dlimit(id, data); + case VCMD_set_dlimit: + return vc_set_dlimit(id, data); + case VCMD_get_dlimit: + return vc_get_dlimit(id, data); } /* below here only with VX_ADMIN */ @@ -135,9 +166,14 @@ sys_vserver(uint32_t cmd, uint32_t id, void __user *data) case VCMD_ctx_kill: return vc_ctx_kill(id, data); -#ifdef CONFIG_VSERVER_LEGACY + case VCMD_wait_exit: + return vc_wait_exit(id, data); + case VCMD_create_context: +#ifdef CONFIG_VSERVER_LEGACY return vc_ctx_create(id, data); +#else + return -ENOSYS; #endif case VCMD_get_iattr: @@ -149,7 +185,7 @@ sys_vserver(uint32_t cmd, uint32_t id, void __user *data) return vc_enter_namespace(id, data); case VCMD_ctx_create: -#ifdef CONFIG_VSERVER_LEGACY +#ifdef CONFIG_VSERVER_LEGACY if (id == 1) { current->xid = 1; return 1; diff --git a/kernel/vserver/sysctl.c b/kernel/vserver/sysctl.c index fffc0dd46..6a9006738 100644 --- a/kernel/vserver/sysctl.c +++ b/kernel/vserver/sysctl.c @@ -11,7 +11,6 @@ #include #include -#include #include #include #include @@ -52,10 +51,6 @@ void vserver_register_sysctl(void) { if (!vserver_table_header) { vserver_table_header = register_sysctl_table(vserver_table, 1); -#ifdef CONFIG_PROC_FS -// if (vserver_table[0].de) -// vserver_table[0].de->owner = THIS_MODULE; -#endif } } diff --git a/mm/fremap.c b/mm/fremap.c index 2362ba24b..b7f0f91f0 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include diff --git a/mm/memory.c b/mm/memory.c index 9a4f6959e..3a911dda5 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1584,9 +1584,9 @@ retry: */ /* Only go through if we didn't race with anybody else... 
*/ if (pte_none(*page_table)) { - if (!PageReserved(new_page)) - //++mm->rss; - vx_rsspages_inc(mm); + if (!PageReserved(new_page)) + // ++mm->rss; + vx_rsspages_inc(mm); flush_icache_page(vma, new_page); entry = mk_pte(new_page, vma->vm_page_prot); if (write_access) diff --git a/mm/mlock.c b/mm/mlock.c index 3be348d48..fb3a1cf45 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -9,6 +9,7 @@ #include #include #include +#include static int mlock_fixup(struct vm_area_struct * vma, diff --git a/mm/mmap.c b/mm/mmap.c index c17c39e71..5fc8e0128 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1469,10 +1469,9 @@ int expand_stack(struct vm_area_struct * vma, unsigned long address) vma->vm_end = address; // vma->vm_mm->total_vm += grow; vx_vmpages_add(vma->vm_mm, grow); - if (vma->vm_flags & VM_LOCKED) { + if (vma->vm_flags & VM_LOCKED) // vma->vm_mm->locked_vm += grow; vx_vmlocked_add(vma->vm_mm, grow); - } __vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file, grow); anon_vma_unlock(vma); return 0; @@ -1548,10 +1547,9 @@ int expand_stack(struct vm_area_struct *vma, unsigned long address) vma->vm_pgoff -= grow; // vma->vm_mm->total_vm += grow; vx_vmpages_add(vma->vm_mm, grow); - if (vma->vm_flags & VM_LOCKED) { + if (vma->vm_flags & VM_LOCKED) // vma->vm_mm->locked_vm += grow; vx_vmlocked_add(vma->vm_mm, grow); - } __vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file, grow); anon_vma_unlock(vma); return 0; @@ -1657,11 +1655,10 @@ static void unmap_vma(struct mm_struct *mm, struct vm_area_struct *area) // area->vm_mm->total_vm -= len >> PAGE_SHIFT; vx_vmpages_sub(area->vm_mm, len >> PAGE_SHIFT); - - if (area->vm_flags & VM_LOCKED) { + + if (area->vm_flags & VM_LOCKED) // area->vm_mm->locked_vm -= len >> PAGE_SHIFT; vx_vmlocked_sub(area->vm_mm, len >> PAGE_SHIFT); - } vm_stat_unaccount(area); area->vm_mm->unmap_area(area); remove_vm_struct(area); @@ -2007,7 +2004,6 @@ void exit_mmap(struct mm_struct *mm) vx_vmpages_sub(mm, mm->total_vm); // mm->locked_vm = 0; 
vx_vmlocked_sub(mm, mm->locked_vm); - arch_flush_exec_range(mm); spin_unlock(&mm->page_table_lock); diff --git a/mm/mremap.c b/mm/mremap.c index 8ad4f7744..b9bc4871d 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include diff --git a/mm/nommu.c b/mm/nommu.c index 1e780d5c5..834a36467 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -440,7 +440,8 @@ unsigned long do_mmap_pgoff( tblock->next = current->mm->context.tblock.next; current->mm->context.tblock.next = tblock; - current->mm->total_vm += len >> PAGE_SHIFT; + // current->mm->total_vm += len >> PAGE_SHIFT; + vx_vmpages_add(current->mm, len >> PAGE_SHIFT); #ifdef DEBUG printk("do_mmap:\n"); @@ -494,7 +495,8 @@ int do_munmap(struct mm_struct * mm, unsigned long addr, size_t len) realalloc -= kobjsize(tblock); askedalloc -= sizeof(struct mm_tblock_struct); kfree(tblock); - mm->total_vm -= len >> PAGE_SHIFT; + // mm->total_vm -= len >> PAGE_SHIFT; + vx_vmpages_sub(mm, len >> PAGE_SHIFT); #ifdef DEBUG show_process_blocks(); @@ -507,7 +509,9 @@ int do_munmap(struct mm_struct * mm, unsigned long addr, size_t len) void exit_mmap(struct mm_struct * mm) { struct mm_tblock_struct *tmp; - mm->total_vm = 0; if (!mm) return; + /* drop the page accounting only after the NULL check above: the argument itself reads mm->total_vm */ + // mm->total_vm = 0; + vx_vmpages_sub(mm, mm->total_vm); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index abc73e046..35e10840f 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -55,7 +55,7 @@ static unsigned long badness(struct task_struct *p, unsigned long uptime) * The memory size of the process is the basis for the badness. 
*/ points = p->mm->total_vm; - /* add vserver badness ;) */ + /* FIXME add vserver badness ;) */ /* * CPU time is in tens of seconds and run time is in thousands diff --git a/mm/page_alloc.c b/mm/page_alloc.c index be1d6dc9d..71e5a7dce 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -35,6 +35,7 @@ #include #include #include +#include #include diff --git a/mm/swapfile.c b/mm/swapfile.c index 42288bbba..1b4dae696 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -30,7 +30,6 @@ #include #include #include -#include #include spinlock_t swaplock = SPIN_LOCK_UNLOCKED; diff --git a/mm/vmscan.c b/mm/vmscan.c index ba42ce745..451347268 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -41,7 +41,7 @@ #include #ifndef AT_LIMIT_SUPPORT -#warning "ckrm_at_limit disabled due to problems with memory hog tests -- seting ckrm_shrink_list_empty to true" +#warning "ckrm_at_limit disabled due to problems with memory hog tests -- setting ckrm_shrink_list_empty to true" #undef ckrm_shrink_list_empty #define ckrm_shrink_list_empty() (1) #endif diff --git a/net/core/dev.c b/net/core/dev.c index 9227745b2..65aedf888 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -112,6 +112,7 @@ #include /* Note : will define WIRELESS_EXT */ #include #endif /* CONFIG_NET_RADIO */ +#include #include #include @@ -1894,6 +1895,9 @@ static int dev_ifconf(char __user *arg) total = 0; for (dev = dev_base; dev; dev = dev->next) { + if (vx_flags(VXF_HIDE_NETIF, 0) && + !dev_in_nx_info(dev, current->nx_info)) + continue; for (i = 0; i < NPROTO; i++) { if (gifconf_list[i]) { int done; @@ -1954,6 +1958,10 @@ void dev_seq_stop(struct seq_file *seq, void *v) static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev) { + struct nx_info *nxi = current->nx_info; + + if (vx_flags(VXF_HIDE_NETIF, 0) && !dev_in_nx_info(dev, nxi)) + return; if (dev->get_stats) { struct net_device_stats *stats = dev->get_stats(dev); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 52641b0a5..2a8e28941 
100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -251,6 +251,9 @@ int rtnetlink_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) { if (idx < s_idx) continue; + if (vx_info_flags(skb->sk->sk_vx_info, VXF_HIDE_NETIF, 0) && + !dev_in_nx_info(dev, skb->sk->sk_nx_info)) + continue; if (rtnetlink_fill_ifinfo(skb, dev, RTM_NEWLINK, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, 0) <= 0) break; } @@ -416,6 +419,9 @@ void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change) sizeof(struct rtnl_link_ifmap) + sizeof(struct rtnl_link_stats) + 128); + if (vx_flags(VXF_HIDE_NETIF, 0) && + !dev_in_nx_info(dev, current->nx_info)) + return; skb = alloc_skb(size, GFP_KERNEL); if (!skb) return; diff --git a/net/socket.c b/net/socket.c index f030e0f15..b5d42a252 100644 --- a/net/socket.c +++ b/net/socket.c @@ -94,7 +94,6 @@ #include #include -#include #include static int sock_no_open(struct inode *irrelevant, struct file *dontcare); diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index 2fd2975c8..697cdb147 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -262,7 +262,7 @@ rpcauth_lookupcred(struct rpc_auth *auth, int taskflags) get_group_info(current->group_info); acred.uid = current->fsuid; acred.gid = current->fsgid; - acred.xid = current->xid; + acred.xid = vx_current_xid(); acred.group_info = current->group_info; dprintk("RPC: looking up %s cred\n", @@ -282,7 +282,7 @@ rpcauth_bindcred(struct rpc_task *task) get_group_info(current->group_info); acred.uid = current->fsuid; acred.gid = current->fsgid; - acred.xid = current->xid; + acred.xid = vx_current_xid(); acred.group_info = current->group_info; dprintk("RPC: %4d looking up %s cred\n", diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c index 294875e44..19f17f74c 100644 --- a/net/sunrpc/auth_unix.c +++ b/net/sunrpc/auth_unix.c @@ -83,7 +83,7 @@ unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) 
if (flags & RPC_TASK_ROOTCREDS) { cred->uc_uid = cred->uc_puid = 0; cred->uc_gid = cred->uc_pgid = 0; - cred->uc_xid = cred->uc_pxid = current->xid; + cred->uc_xid = cred->uc_pxid = vx_current_xid(); cred->uc_gids[0] = NOGROUP; } else { int groups = acred->group_info->ngroups; @@ -95,7 +95,7 @@ unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) cred->uc_xid = acred->xid; cred->uc_puid = current->uid; cred->uc_pgid = current->gid; - cred->uc_pxid = current->xid; + cred->uc_pxid = vx_current_xid(); for (i = 0; i < groups; i++) cred->uc_gids[i] = GROUP_AT(acred->group_info, i); if (i < NFS_NGROUPS) @@ -131,7 +131,7 @@ unx_match(struct auth_cred *acred, struct rpc_cred *rcred, int taskflags) || cred->uc_xid != acred->xid || cred->uc_puid != current->uid || cred->uc_pgid != current->gid - || cred->uc_pxid != current->xid) + || cred->uc_pxid != vx_current_xid()) return 0; groups = acred->group_info->ngroups; diff --git a/security/security.c b/security/security.c index e8e79c3cf..4e9c19874 100644 --- a/security/security.c +++ b/security/security.c @@ -185,6 +185,8 @@ int mod_unreg_security(const char *name, struct security_operations *ops) */ int capable(int cap) { + if (vx_check_bit(VXC_CAP_MASK, cap) && !vx_mcaps(1L << cap)) + return 0; if (security_ops->capable(current, cap)) { /* capability denied */ return 0; @@ -195,9 +197,24 @@ int capable(int cap) return 1; } +int vx_capable(int cap, int ccap) +{ + if (security_ops->capable(current, cap)) { + /* capability denied */ + return 0; + } + if (!vx_ccaps(ccap)) + return 0; + + /* capability granted */ + current->flags |= PF_SUPERPRIV; + return 1; +} + EXPORT_SYMBOL_GPL(register_security); EXPORT_SYMBOL_GPL(unregister_security); EXPORT_SYMBOL_GPL(mod_reg_security); EXPORT_SYMBOL_GPL(mod_unreg_security); EXPORT_SYMBOL(capable); +EXPORT_SYMBOL(vx_capable); EXPORT_SYMBOL(security_ops);