vserver 1.9.3
[linux-2.6.git] / fs / exec.c
index f73d2c4..121b5a4 100644 (file)
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -46,9 +46,9 @@
 #include <linux/security.h>
 #include <linux/syscalls.h>
 #include <linux/rmap.h>
+#include <linux/vs_memory.h>
 
 #include <asm/uaccess.h>
-#include <asm/pgalloc.h>
 #include <asm/mmu_context.h>
 
 #ifdef CONFIG_KMOD
@@ -293,53 +293,49 @@ EXPORT_SYMBOL(copy_strings_kernel);
  * This routine is used to map in a page into an address space: needed by
  * execve() for the initial stack and environment pages.
  *
- * tsk->mmap_sem is held for writing.
+ * vma->vm_mm->mmap_sem is held for writing.
  */
-void put_dirty_page(struct task_struct *tsk, struct page *page,
-                       unsigned long address, pgprot_t prot)
+void install_arg_page(struct vm_area_struct *vma,
+                       struct page *page, unsigned long address)
 {
+       struct mm_struct *mm = vma->vm_mm;      /* the vma's address space */
        pgd_t * pgd;
        pmd_t * pmd;
        pte_t * pte;
-       struct pte_chain *pte_chain;
 
-       if (page_count(page) != 1)
-               printk(KERN_ERR "mem_map disagrees with %p at %08lx\n",
-                               page, address);
-
-       pgd = pgd_offset(tsk->mm, address);
-       pte_chain = pte_chain_alloc(GFP_KERNEL);
-       if (!pte_chain)
+       if (unlikely(anon_vma_prepare(vma)))    /* non-zero: bail out via out_sig */
                goto out_sig;
-       spin_lock(&tsk->mm->page_table_lock);
-       pmd = pmd_alloc(tsk->mm, pgd, address);
+
+       flush_dcache_page(page);        /* now done before page_table_lock is taken */
+       pgd = pgd_offset(mm, address);
+
+       spin_lock(&mm->page_table_lock);
+       pmd = pmd_alloc(mm, pgd, address);
        if (!pmd)
                goto out;
-       pte = pte_alloc_map(tsk->mm, pmd, address);
+       pte = pte_alloc_map(mm, pmd, address);
        if (!pte)
                goto out;
        if (!pte_none(*pte)) {
                pte_unmap(pte);
                goto out;
        }
+       /* the call below takes the place of the removed tsk->mm->rss++ */
+       vx_rsspages_inc(mm);
        lru_cache_add_active(page);
-       flush_dcache_page(page);
-       set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, prot))));
-       pte_chain = page_add_rmap(page, pte, pte_chain);
+       set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(
+                                       page, vma->vm_page_prot))));
+       page_add_anon_rmap(page, vma, address);
        pte_unmap(pte);
-       tsk->mm->rss++;
-       spin_unlock(&tsk->mm->page_table_lock);
+       spin_unlock(&mm->page_table_lock);
 
        /* no need for flush_tlb */
-       pte_chain_free(pte_chain);
        return;
 out:
-       spin_unlock(&tsk->mm->page_table_lock);
+       spin_unlock(&mm->page_table_lock);
 out_sig:
        __free_page(page);
-       force_sig(SIGKILL, tsk);
-       pte_chain_free(pte_chain);
-       return;
+       force_sig(SIGKILL, current);    /* tsk parameter is gone; its caller passed current */
 }
 
 int setup_arg_pages(struct linux_binprm *bprm, int executable_stack)
@@ -409,11 +405,14 @@ int setup_arg_pages(struct linux_binprm *bprm, int executable_stack)
        if (!mpnt)
                return -ENOMEM;
 
-       if (security_vm_enough_memory(arg_size >> PAGE_SHIFT)) {
+       if (security_vm_enough_memory(arg_size >> PAGE_SHIFT) ||
+               !vx_vmpages_avail(mm, arg_size >> PAGE_SHIFT)) {
                kmem_cache_free(vm_area_cachep, mpnt);
                return -ENOMEM;
        }
 
+       memset(mpnt, 0, sizeof(*mpnt));
+
        down_write(&mm->mmap_sem);
        {
                mpnt->vm_mm = mm;
@@ -434,22 +433,19 @@ int setup_arg_pages(struct linux_binprm *bprm, int executable_stack)
                        mpnt->vm_flags = VM_STACK_FLAGS & ~VM_EXEC;
                else
                        mpnt->vm_flags = VM_STACK_FLAGS;
+               mpnt->vm_flags |= mm->def_flags;
                mpnt->vm_page_prot = protection_map[mpnt->vm_flags & 0x7];
-               mpnt->vm_ops = NULL;
-               mpnt->vm_pgoff = 0;
-               mpnt->vm_file = NULL;
-               INIT_LIST_HEAD(&mpnt->shared);
-               mpnt->vm_private_data = (void *) 0;
                insert_vm_struct(mm, mpnt);
-               mm->total_vm = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
+               // mm->stack_vm = mm->total_vm = vma_pages(mpnt);
+               vx_vmpages_sub(mm, mm->total_vm - vma_pages(mpnt));
+               mm->stack_vm = mm->total_vm;
        }
 
        for (i = 0 ; i < MAX_ARG_PAGES ; i++) {
                struct page *page = bprm->page[i];
                if (page) {
                        bprm->page[i] = NULL;
-                       put_dirty_page(current, page, stack_base,
-                                       mpnt->vm_page_prot);
+                       install_arg_page(mpnt, page, stack_base);
                }
                stack_base += PAGE_SIZE;
        }
@@ -555,6 +551,7 @@ static int exec_mmap(struct mm_struct *mm)
        tsk->active_mm = mm;
        activate_mm(active_mm, mm);
        task_unlock(tsk);
+       arch_pick_mmap_layout(mm);
        if (old_mm) {
                if (active_mm != old_mm) BUG();
                mmput(old_mm);
@@ -749,8 +746,10 @@ no_thread_group:
        spin_unlock(&oldsighand->siglock);
        write_unlock_irq(&tasklist_lock);
 
-       if (newsig && atomic_dec_and_test(&oldsig->count))
+       if (newsig && atomic_dec_and_test(&oldsig->count)) {
+               exit_itimers(oldsig);
                kmem_cache_free(signal_cachep, oldsig);
+       }
 
        if (atomic_dec_and_test(&oldsighand->count))
                kmem_cache_free(sighand_cachep, oldsighand);
@@ -795,11 +794,27 @@ static inline void flush_old_files(struct files_struct * files)
        spin_unlock(&files->file_lock);
 }
 
+void get_task_comm(char *buf, struct task_struct *tsk)
+{
+       /* copy tsk->comm out under task_lock(); buf needs sizeof(tsk->comm) bytes */
+       task_lock(tsk);
+       memcpy(buf, tsk->comm, sizeof(tsk->comm));
+       task_unlock(tsk);
+}
+
+void set_task_comm(struct task_struct *tsk, char *buf)
+{
+       task_lock(tsk);         /* same lock get_task_comm() holds while reading comm */
+       strlcpy(tsk->comm, buf, sizeof(tsk->comm));     /* copy bounded by comm's size */
+       task_unlock(tsk);
+}
+
 int flush_old_exec(struct linux_binprm * bprm)
 {
        char * name;
        int i, ch, retval;
        struct files_struct *files;
+       char tcomm[sizeof(current->comm)];
 
        /*
         * Make sure we have a private signal table and that
@@ -840,15 +855,17 @@ int flush_old_exec(struct linux_binprm * bprm)
                if (ch == '/')
                        i = 0;
                else
-                       if (i < 15)
-                               current->comm[i++] = ch;
+                       if (i < (sizeof(tcomm) - 1))
+                               tcomm[i++] = ch;
        }
-       current->comm[i] = '\0';
+       tcomm[i] = '\0';
+       set_task_comm(current, tcomm);
 
        flush_thread();
 
        if (bprm->e_uid != current->euid || bprm->e_gid != current->egid || 
-           permission(bprm->file->f_dentry->d_inode,MAY_READ, NULL))
+           permission(bprm->file->f_dentry->d_inode,MAY_READ, NULL) ||
+           (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP))
                current->mm->dumpable = 0;
 
        /* An exec changes our domain. We are no longer part of the thread
@@ -895,8 +912,10 @@ int prepare_binprm(struct linux_binprm *bprm)
 
        if(!(bprm->file->f_vfsmnt->mnt_flags & MNT_NOSUID)) {
                /* Set-uid? */
-               if (mode & S_ISUID)
+               if (mode & S_ISUID) {
+                       current->personality &= ~PER_CLEAR_ON_SETID;
                        bprm->e_uid = inode->i_uid;
+               }
 
                /* Set-gid? */
                /*
@@ -904,8 +923,10 @@ int prepare_binprm(struct linux_binprm *bprm)
                 * is a candidate for mandatory locking, not a setgid
                 * executable.
                 */
-               if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
+               if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
+                       current->personality &= ~PER_CLEAR_ON_SETID;
                        bprm->e_gid = inode->i_gid;
+               }
        }
 
        /* fill in binprm security blob */
@@ -1003,7 +1024,7 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
                        return retval;
 
                /* Remember if the application is TASO.  */
-               bprm->sh_bang = eh->ah.entry < 0x100000000;
+               bprm->sh_bang = eh->ah.entry < 0x100000000UL;
 
                bprm->file = file;
                bprm->loader = loader;
@@ -1078,96 +1099,100 @@ int do_execve(char * filename,
        char __user *__user *envp,
        struct pt_regs * regs)
 {
-       struct linux_binprm bprm;
+       struct linux_binprm *bprm;
        struct file *file;
        int retval;
        int i;
 
-       sched_balance_exec();
+       retval = -ENOMEM;       /* bprm first: nothing to undo if kmalloc fails */
+       bprm = kmalloc(sizeof(*bprm), GFP_KERNEL);
+       if (!bprm)
+               goto out_ret;
+       memset(bprm, 0, sizeof(*bprm));
 
        file = open_exec(filename);
-
        retval = PTR_ERR(file);
        if (IS_ERR(file))
-               return retval;
+               goto out_kfree; /* no file was opened: only bprm to release */
 
-       bprm.p = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *);
-       memset(bprm.page, 0, MAX_ARG_PAGES*sizeof(bprm.page[0]));
-
-       bprm.file = file;
-       bprm.filename = filename;
-       bprm.interp = filename;
-       bprm.sh_bang = 0;
-       bprm.loader = 0;
-       bprm.exec = 0;
-       bprm.security = NULL;
-       bprm.mm = mm_alloc();
+       sched_exec();
+
+       bprm->p = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *);
+
+       bprm->file = file;      /* from here on out_file releases the file */
+       bprm->filename = filename;
+       bprm->interp = filename;
+       bprm->mm = mm_alloc();
        retval = -ENOMEM;
-       if (!bprm.mm)
+       if (!bprm->mm)
                goto out_file;
 
-       retval = init_new_context(current, bprm.mm);
+       retval = init_new_context(current, bprm->mm);
        if (retval < 0)
                goto out_mm;
 
-       bprm.argc = count(argv, bprm.p / sizeof(void *));
-       if ((retval = bprm.argc) < 0)
+       bprm->argc = count(argv, bprm->p / sizeof(void *));
+       if ((retval = bprm->argc) < 0)
                goto out_mm;
 
-       bprm.envc = count(envp, bprm.p / sizeof(void *));
-       if ((retval = bprm.envc) < 0)
+       bprm->envc = count(envp, bprm->p / sizeof(void *));
+       if ((retval = bprm->envc) < 0)
                goto out_mm;
 
-       retval = security_bprm_alloc(&bprm);
+       retval = security_bprm_alloc(bprm);
        if (retval)
                goto out;
 
-       retval = prepare_binprm(&bprm);
+       retval = prepare_binprm(bprm);
        if (retval < 0)
                goto out;
 
-       retval = copy_strings_kernel(1, &bprm.filename, &bprm);
+       retval = copy_strings_kernel(1, &bprm->filename, bprm);
        if (retval < 0)
                goto out;
 
-       bprm.exec = bprm.p;
-       retval = copy_strings(bprm.envc, envp, &bprm);
+       bprm->exec = bprm->p;
+       retval = copy_strings(bprm->envc, envp, bprm);
        if (retval < 0)
                goto out;
 
-       retval = copy_strings(bprm.argc, argv, &bprm);
+       retval = copy_strings(bprm->argc, argv, bprm);
        if (retval < 0)
                goto out;
 
-       retval = search_binary_handler(&bprm,regs);
+       retval = search_binary_handler(bprm,regs);
        if (retval >= 0) {
-               free_arg_pages(&bprm);
+               free_arg_pages(bprm);
 
                /* execve success */
-               security_bprm_free(&bprm);
+               security_bprm_free(bprm);
+               kfree(bprm);
                return retval;
        }
 
 out:
        /* Something went wrong, return the inode and free the argument pages*/
        for (i = 0 ; i < MAX_ARG_PAGES ; i++) {
-               struct page * page = bprm.page[i];
+               struct page * page = bprm->page[i];
                if (page)
                        __free_page(page);
        }
 
-       if (bprm.security)
-               security_bprm_free(&bprm);
+       if (bprm->security)
+               security_bprm_free(bprm);
 
 out_mm:
-       if (bprm.mm)
-               mmdrop(bprm.mm);
+       if (bprm->mm)
+               mmdrop(bprm->mm);
 
 out_file:
-       if (bprm.file) {
-               allow_write_access(bprm.file);
-               fput(bprm.file);
+       if (bprm->file) {
+               allow_write_access(bprm->file);
+               fput(bprm->file);
        }
+out_kfree:     /* also entered directly when open_exec() failed */
+       kfree(bprm);
+out_ret:
        return retval;
 }
 
 
@@ -1195,7 +1220,7 @@ EXPORT_SYMBOL(set_binfmt);
  * name into corename, which must have space for at least
  * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
  */
-void format_corename(char *corename, const char *pattern, long signr)
+static void format_corename(char *corename, const char *pattern, long signr)
 {
        const char *pat_ptr = pattern;
        char *out_ptr = corename;
@@ -1359,7 +1384,6 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
        struct file * file;
        int retval = 0;
 
-       lock_kernel();
        binfmt = current->binfmt;
        if (!binfmt || !binfmt->core_dump)
                goto fail;
@@ -1377,7 +1401,13 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
        if (current->rlim[RLIMIT_CORE].rlim_cur < binfmt->min_coredump)
                goto fail_unlock;
 
-       format_corename(corename, core_pattern, signr);
+       /*
+        * Hold lock_kernel() around format_corename(): the core_pattern it
+        * reads is changed via sysctl, which also uses lock_kernel().
+        */
+       lock_kernel();
+       format_corename(corename, core_pattern, signr);
+       unlock_kernel();
        file = filp_open(corename, O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE, 0600);
        if (IS_ERR(file))
                goto fail_unlock;
@@ -1404,6 +1434,5 @@ close_fail:
 fail_unlock:
        complete_all(&mm->core_done);
 fail:
-       unlock_kernel();
        return retval;
 }