diff --git a/kernel/fork.c b/kernel/fork.c
index bfb4ad667..246075943 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -11,7 +11,6 @@
  * management can be a bitch. See 'mm/memory.c': 'copy_page_range()'
  */
 
-#include 
 #include 
 #include 
 #include 
@@ -19,25 +18,43 @@
 #include 
 #include 
 #include 
-#include 
+#include 
 #include 
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
+#include 
+#include 
 #include 
+#include 
 #include 
+#include 
 #include 
 #include 
 #include 
-#include 
+#include 
+#include 
+#include 
 #include 
 #include 
+#include 
 #include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
 #include 
 #include 
+#include 
+#include 
 #include 
 #include 
 #include 
@@ -46,19 +63,17 @@
 #include 
 #include 
 
-/* The idle threads do not count..
- * Protected by write_lock_irq(&tasklist_lock)
+/*
+ * Protected counters by write_lock_irq(&tasklist_lock)
  */
-int nr_threads;
-
-int max_threads;
 unsigned long total_forks;	/* Handle normal Linux uptimes. */
+int nr_threads; 		/* The idle threads do not count.. */
 
-DEFINE_PER_CPU(unsigned long, process_counts) = 0;
+int max_threads;		/* tunable limit on nr_threads */
 
-rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED;  /* outer */
+DEFINE_PER_CPU(unsigned long, process_counts) = 0;
 
-EXPORT_SYMBOL(tasklist_lock);
+__cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */
 
 int nr_processes(void)
 {
@@ -74,147 +89,52 @@ int nr_processes(void)
 #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
 # define alloc_task_struct()	kmem_cache_alloc(task_struct_cachep, GFP_KERNEL)
 # define free_task_struct(tsk)	kmem_cache_free(task_struct_cachep, (tsk))
-static kmem_cache_t *task_struct_cachep;
+static struct kmem_cache *task_struct_cachep;
 #endif
 
-static void free_task(struct task_struct *tsk)
+/* SLAB cache for signal_struct structures (tsk->signal) */
+static struct kmem_cache *signal_cachep;
+
+/* SLAB cache for sighand_struct structures (tsk->sighand) */
+struct kmem_cache *sighand_cachep;
+
+/* SLAB cache for files_struct structures (tsk->files) */
+struct kmem_cache *files_cachep;
+
+/* SLAB cache for fs_struct structures (tsk->fs) */
+struct kmem_cache *fs_cachep;
+
+/* SLAB cache for vm_area_struct structures */
+struct kmem_cache *vm_area_cachep;
+
+/* SLAB cache for mm_struct structures (tsk->mm) */
+static struct kmem_cache *mm_cachep;
+
+void free_task(struct task_struct *tsk)
 {
 	free_thread_info(tsk->thread_info);
-	vxdprintk("freeing up task %p\n", tsk);
+	rt_mutex_debug_task_free(tsk);
 	clr_vx_info(&tsk->vx_info);
 	clr_nx_info(&tsk->nx_info);
 	free_task_struct(tsk);
 }
+EXPORT_SYMBOL(free_task);
 
 void __put_task_struct(struct task_struct *tsk)
 {
-	WARN_ON(!(tsk->state & (TASK_DEAD | TASK_ZOMBIE)));
+	WARN_ON(!(tsk->exit_state & (EXIT_DEAD | EXIT_ZOMBIE)));
 	WARN_ON(atomic_read(&tsk->usage));
 	WARN_ON(tsk == current);
 
-	if (unlikely(tsk->audit_context))
-		audit_free(tsk);
 	security_task_free(tsk);
 	free_uid(tsk->user);
 	put_group_info(tsk->group_info);
-	free_task(tsk);
-}
-
-void fastcall add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)
-{
-	unsigned long flags;
-
-	wait->flags &= ~WQ_FLAG_EXCLUSIVE;
-	spin_lock_irqsave(&q->lock, flags);
-	__add_wait_queue(q, wait);
-	spin_unlock_irqrestore(&q->lock, flags);
-}
-
-EXPORT_SYMBOL(add_wait_queue);
-
-void fastcall 
add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait) -{ - unsigned long flags; - - wait->flags |= WQ_FLAG_EXCLUSIVE; - spin_lock_irqsave(&q->lock, flags); - __add_wait_queue_tail(q, wait); - spin_unlock_irqrestore(&q->lock, flags); -} - -EXPORT_SYMBOL(add_wait_queue_exclusive); + delayacct_tsk_free(tsk); -void fastcall remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait) -{ - unsigned long flags; - - spin_lock_irqsave(&q->lock, flags); - __remove_wait_queue(q, wait); - spin_unlock_irqrestore(&q->lock, flags); + if (!profile_handoff_task(tsk)) + free_task(tsk); } -EXPORT_SYMBOL(remove_wait_queue); - - -/* - * Note: we use "set_current_state()" _after_ the wait-queue add, - * because we need a memory barrier there on SMP, so that any - * wake-function that tests for the wait-queue being active - * will be guaranteed to see waitqueue addition _or_ subsequent - * tests in this thread will see the wakeup having taken place. - * - * The spin_unlock() itself is semi-permeable and only protects - * one way (it only protects stuff inside the critical region and - * stops them from bleeding out - it would still allow subsequent - * loads to move into the the critical region). - */ -void fastcall prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state) -{ - unsigned long flags; - - wait->flags &= ~WQ_FLAG_EXCLUSIVE; - spin_lock_irqsave(&q->lock, flags); - if (list_empty(&wait->task_list)) - __add_wait_queue(q, wait); - set_current_state(state); - spin_unlock_irqrestore(&q->lock, flags); -} - -EXPORT_SYMBOL(prepare_to_wait); - -void fastcall -prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) -{ - unsigned long flags; - - wait->flags |= WQ_FLAG_EXCLUSIVE; - spin_lock_irqsave(&q->lock, flags); - if (list_empty(&wait->task_list)) - __add_wait_queue_tail(q, wait); - set_current_state(state); - spin_unlock_irqrestore(&q->lock, flags); -} - -EXPORT_SYMBOL(prepare_to_wait_exclusive); - -void fastcall finish_wait(wait_queue_head_t *q, wait_queue_t *wait) -{ - unsigned long flags; - - __set_current_state(TASK_RUNNING); - /* - * We can check for list emptiness outside the lock - * IFF: - * - we use the "careful" check that verifies both - * the next and prev pointers, so that there cannot - * be any half-pending updates in progress on other - * CPU's that we haven't seen yet (and that might - * still change the stack area. - * and - * - all other users take the lock (ie we can only - * have _one_ other CPU that looks at or modifies - * the list). - */ - if (!list_empty_careful(&wait->task_list)) { - spin_lock_irqsave(&q->lock, flags); - list_del_init(&wait->task_list); - spin_unlock_irqrestore(&q->lock, flags); - } -} - -EXPORT_SYMBOL(finish_wait); - -int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) -{ - int ret = default_wake_function(wait, mode, sync, key); - - if (ret) - list_del_init(&wait->task_list); - return ret; -} - -EXPORT_SYMBOL(autoremove_wake_function); - void __init fork_init(unsigned long mempages) { #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR @@ -232,15 +152,18 @@ void __init fork_init(unsigned long mempages) * value: the thread structures can take up at most half * of memory. 
*/ - max_threads = mempages / (THREAD_SIZE/PAGE_SIZE) / 8; + max_threads = mempages / (8 * THREAD_SIZE / PAGE_SIZE); + /* * we need to allow at least 20 threads to boot a system */ if(max_threads < 20) max_threads = 20; - init_task.rlim[RLIMIT_NPROC].rlim_cur = max_threads/2; - init_task.rlim[RLIMIT_NPROC].rlim_max = max_threads/2; + init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2; + init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2; + init_task.signal->rlim[RLIMIT_SIGPENDING] = + init_task.signal->rlim[RLIMIT_NPROC]; } static struct task_struct *dup_task_struct(struct task_struct *orig) @@ -260,55 +183,64 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) return NULL; } - *ti = *orig->thread_info; *tsk = *orig; tsk->thread_info = ti; - ti->task = tsk; + setup_thread_stack(tsk, orig); + +#ifdef CONFIG_CC_STACKPROTECTOR + tsk->stack_canary = get_random_int(); +#endif /* One for us, one for whoever does the "release_task()" (usually parent) */ atomic_set(&tsk->usage,2); + atomic_set(&tsk->fs_excl, 0); +#ifdef CONFIG_BLK_DEV_IO_TRACE + tsk->btrace_seq = 0; +#endif + tsk->splice_pipe = NULL; return tsk; } #ifdef CONFIG_MMU -static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) +static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) { - struct vm_area_struct * mpnt, *tmp, **pprev; + struct vm_area_struct *mpnt, *tmp, **pprev; struct rb_node **rb_link, *rb_parent; int retval; unsigned long charge; struct mempolicy *pol; down_write(&oldmm->mmap_sem); - flush_cache_mm(current->mm); + flush_cache_dup_mm(oldmm); + /* + * Not linked in yet - no deadlock potential: + */ + down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); + mm->locked_vm = 0; mm->mmap = NULL; mm->mmap_cache = NULL; - mm->free_area_cache = TASK_UNMAPPED_BASE; + mm->free_area_cache = oldmm->mmap_base; + mm->cached_hole_size = ~0UL; mm->map_count = 0; - mm->rss = 0; + __set_mm_counter(mm, file_rss, 0); + __set_mm_counter(mm, anon_rss, 0); cpus_clear(mm->cpu_vm_mask); mm->mm_rb = RB_ROOT; rb_link = &mm->mm_rb.rb_node; rb_parent = NULL; pprev = &mm->mmap; - /* - * Add it to the mmlist after the parent. - * Doing it this way means that we can order the list, - * and fork() won't mess up the ordering significantly. - * Add it first so that swapoff can see any swap entries. 
- */
-	spin_lock(&mmlist_lock);
-	list_add(&mm->mmlist, &current->mm->mmlist);
-	mmlist_nr++;
-	spin_unlock(&mmlist_lock);
-
-	for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) {
+	for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
 		struct file *file;
 
-		if(mpnt->vm_flags & VM_DONTCOPY)
+		if (mpnt->vm_flags & VM_DONTCOPY) {
+			long pages = vma_pages(mpnt);
+			vx_vmpages_sub(mm, pages);
+			vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
+								-pages);
 			continue;
+		}
 		charge = 0;
 		if (mpnt->vm_flags & VM_ACCOUNT) {
 			unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
@@ -316,7 +248,7 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
 			goto fail_nomem;
 			charge = len;
 		}
-		tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+		tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
 		if (!tmp)
 			goto fail_nomem;
 		*tmp = *mpnt;
@@ -329,16 +261,16 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
 		tmp->vm_mm = mm;
 		tmp->vm_next = NULL;
 		anon_vma_link(tmp);
-		vma_prio_tree_init(tmp);
 		file = tmp->vm_file;
 		if (file) {
-			struct inode *inode = file->f_dentry->d_inode;
+			struct inode *inode = file->f_path.dentry->d_inode;
 			get_file(file);
 			if (tmp->vm_flags & VM_DENYWRITE)
 				atomic_dec(&inode->i_writecount);
 
 			/* insert tmp into the share list, just after mpnt */
 			spin_lock(&file->f_mapping->i_mmap_lock);
+			tmp->vm_truncate_count = mpnt->vm_truncate_count;
 			flush_dcache_mmap_lock(file->f_mapping);
 			vma_prio_tree_add(tmp, mpnt);
 			flush_dcache_mmap_unlock(file->f_mapping);
@@ -346,11 +278,8 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
 		}
 
 		/*
-		 * Link in the new vma and copy the page table entries:
-		 * link in first so that swapoff can see swap entries,
-		 * and try_to_unmap_one's find_vma find the new vma.
+		 * Link in the new vma and copy the page table entries.
 		 
*/ - spin_lock(&mm->page_table_lock); *pprev = tmp; pprev = &tmp->vm_next; @@ -359,8 +288,7 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) rb_parent = &tmp->vm_rb; mm->map_count++; - retval = copy_page_range(mm, current->mm, tmp); - spin_unlock(&mm->page_table_lock); + retval = copy_page_range(mm, oldmm, mpnt); if (tmp->vm_ops && tmp->vm_ops->open) tmp->vm_ops->open(tmp); @@ -368,10 +296,13 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) if (retval) goto out; } +#ifdef arch_dup_mmap + arch_dup_mmap(mm, oldmm); +#endif retval = 0; - out: - flush_tlb_mm(current->mm); + up_write(&mm->mmap_sem); + flush_tlb_mm(oldmm); up_write(&oldmm->mmap_sem); return retval; fail_nomem_policy: @@ -400,10 +331,9 @@ static inline void mm_free_pgd(struct mm_struct * mm) #define mm_free_pgd(mm) #endif /* CONFIG_MMU */ -spinlock_t mmlist_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; -int mmlist_nr; + __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock); -#define allocate_mm() (kmem_cache_alloc(mm_cachep, SLAB_KERNEL)) +#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL)) #define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) #include @@ -413,12 +343,16 @@ static struct mm_struct * mm_init(struct mm_struct * mm) atomic_set(&mm->mm_users, 1); atomic_set(&mm->mm_count, 1); init_rwsem(&mm->mmap_sem); + INIT_LIST_HEAD(&mm->mmlist); mm->core_waiters = 0; - mm->page_table_lock = SPIN_LOCK_UNLOCKED; - mm->ioctx_list_lock = RW_LOCK_UNLOCKED; + mm->nr_ptes = 0; + __set_mm_counter(mm, file_rss, 0); + __set_mm_counter(mm, anon_rss, 0); + spin_lock_init(&mm->page_table_lock); + rwlock_init(&mm->ioctx_list_lock); mm->ioctx_list = NULL; - mm->default_kioctx = (struct kioctx)INIT_KIOCTX(mm->default_kioctx, *mm); mm->free_area_cache = TASK_UNMAPPED_BASE; + mm->cached_hole_size = ~0UL; if (likely(!mm_alloc_pgd(mm))) { mm->def_flags = 0; @@ -463,32 +397,47 @@ void fastcall __mmdrop(struct mm_struct *mm) */ void mmput(struct mm_struct *mm) { - if (atomic_dec_and_lock(&mm->mm_users, &mmlist_lock)) { - list_del(&mm->mmlist); - mmlist_nr--; - spin_unlock(&mmlist_lock); + might_sleep(); + + if (atomic_dec_and_test(&mm->mm_users)) { exit_aio(mm); exit_mmap(mm); + if (!list_empty(&mm->mmlist)) { + spin_lock(&mmlist_lock); + list_del(&mm->mmlist); + spin_unlock(&mmlist_lock); + } + put_swap_token(mm); mmdrop(mm); } } +EXPORT_SYMBOL_GPL(mmput); -/* - * Checks if the use count of an mm is non-zero and if so - * returns a reference to it after bumping up the use count. - * If the use count is zero, it means this mm is going away, - * so return NULL. +/** + * get_task_mm - acquire a reference to the task's mm + * + * Returns %NULL if the task has no mm. Checks PF_BORROWED_MM (meaning + * this kernel workthread has transiently adopted a user mm with use_mm, + * to do its AIO) is not set and if so returns a reference to it, after + * bumping up the use count. User must release the mm via mmput() + * after use. Typically used by /proc and ptrace. 
*/ -struct mm_struct *mmgrab(struct mm_struct *mm) +struct mm_struct *get_task_mm(struct task_struct *task) { - spin_lock(&mmlist_lock); - if (!atomic_read(&mm->mm_users)) - mm = NULL; - else - atomic_inc(&mm->mm_users); - spin_unlock(&mmlist_lock); + struct mm_struct *mm; + + task_lock(task); + mm = task->mm; + if (mm) { + if (task->flags & PF_BORROWED_MM) + mm = NULL; + else + atomic_inc(&mm->mm_users); + } + task_unlock(task); return mm; } +EXPORT_SYMBOL_GPL(get_task_mm); /* Please note the differences between mmput and mm_release. * mmput is called whenever we stop holding onto a mm_struct, @@ -515,7 +464,16 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) tsk->vfork_done = NULL; complete(vfork_done); } - if (tsk->clear_child_tid && atomic_read(&mm->mm_users) > 1) { + + /* + * If we're exiting normally, clear a user-space tid field if + * requested. We leave this alone when dying by signal, to leave + * the value intact in a core dump, and to save the unnecessary + * trouble otherwise. Userland only wants this done for a sys_exit. + */ + if (tsk->clear_child_tid + && !(tsk->flags & PF_SIGNALED) + && atomic_read(&mm->mm_users) > 1) { u32 __user * tidptr = tsk->clear_child_tid; tsk->clear_child_tid = NULL; @@ -528,14 +486,68 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) } } +/* + * Allocate a new mm structure and copy contents from the + * mm structure of the passed in task structure. + */ +static struct mm_struct *dup_mm(struct task_struct *tsk) +{ + struct mm_struct *mm, *oldmm = current->mm; + int err; + + if (!oldmm) + return NULL; + + mm = allocate_mm(); + if (!mm) + goto fail_nomem; + + memcpy(mm, oldmm, sizeof(*mm)); + mm->mm_vx_info = NULL; + + /* Initializing for Swap token stuff */ + mm->token_priority = 0; + mm->last_interval = 0; + + if (!mm_init(mm)) + goto fail_nomem; + + if (init_new_context(tsk, mm)) + goto fail_nocontext; + + err = dup_mmap(mm, oldmm); + if (err) + goto free_pt; + + mm->hiwater_rss = get_mm_rss(mm); + mm->hiwater_vm = mm->total_vm; + + return mm; + +free_pt: + mmput(mm); + +fail_nomem: + return NULL; + +fail_nocontext: + /* + * If init_new_context() failed, we cannot use mmput() to free the mm + * because it calls destroy_context() + */ + clr_vx_info(&mm->mm_vx_info); + mm_free_pgd(mm); + free_mm(mm); + return NULL; +} + static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) { struct mm_struct * mm, *oldmm; int retval; tsk->min_flt = tsk->maj_flt = 0; - tsk->cmin_flt = tsk->cmaj_flt = 0; - tsk->nvcsw = tsk->nivcsw = tsk->cnvcsw = tsk->cnivcsw = 0; + tsk->nvcsw = tsk->nivcsw = 0; tsk->mm = NULL; tsk->active_mm = NULL; @@ -552,52 +564,25 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) if (clone_flags & CLONE_VM) { atomic_inc(&oldmm->mm_users); mm = oldmm; - /* - * There are cases where the PTL is held to ensure no - * new threads start up in user mode using an mm, which - * allows optimizing out ipis; the tlb_gather_mmu code - * is an example. - */ - spin_unlock_wait(&oldmm->page_table_lock); goto good_mm; } retval = -ENOMEM; - mm = allocate_mm(); + mm = dup_mm(tsk); if (!mm) goto fail_nomem; - /* Copy the current MM stuff.. 
*/ - memcpy(mm, oldmm, sizeof(*mm)); - mm->mm_vx_info = NULL; - if (!mm_init(mm)) - goto fail_nomem; - - if (init_new_context(tsk,mm)) - goto fail_nocontext; - - retval = dup_mmap(mm, oldmm); - if (retval) - goto free_pt; - good_mm: + /* Initializing for Swap token stuff */ + mm->token_priority = 0; + mm->last_interval = 0; + tsk->mm = mm; tsk->active_mm = mm; return 0; -free_pt: - mmput(mm); fail_nomem: return retval; - -fail_nocontext: - /* - * If init_new_context() failed, we cannot use mmput() to free the mm - * because it calls destroy_context() - */ - mm_free_pgd(mm); - free_mm(mm); - return retval; } static inline struct fs_struct *__copy_fs_struct(struct fs_struct *old) @@ -606,7 +591,7 @@ static inline struct fs_struct *__copy_fs_struct(struct fs_struct *old) /* We don't need to lock fs - think why ;-) */ if (fs) { atomic_set(&fs->count, 1); - fs->lock = RW_LOCK_UNLOCKED; + rwlock_init(&fs->lock); fs->umask = old->umask; read_lock(&old->lock); fs->rootmnt = mntget(old->rootmnt); @@ -621,6 +606,7 @@ static inline struct fs_struct *__copy_fs_struct(struct fs_struct *old) fs->altroot = NULL; } read_unlock(&old->lock); + atomic_inc(&vs_global_fs); } return fs; } @@ -644,129 +630,169 @@ static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk) return 0; } -static int count_open_files(struct files_struct *files, int size) +static int count_open_files(struct fdtable *fdt) { + int size = fdt->max_fds; int i; /* Find the last open fd */ for (i = size/(8*sizeof(long)); i > 0; ) { - if (files->open_fds->fds_bits[--i]) + if (fdt->open_fds->fds_bits[--i]) break; } i = (i+1) * 8 * sizeof(long); return i; } -static int copy_files(unsigned long clone_flags, struct task_struct * tsk) +static struct files_struct *alloc_files(void) { - struct files_struct *oldf, *newf; - struct file **old_fds, **new_fds; - int open_files, nfds, size, i, error = 0; + struct files_struct *newf; + struct fdtable *fdt; - /* - * A background process may not have any files ... - */ - oldf = current->files; - if (!oldf) + newf = kmem_cache_alloc(files_cachep, GFP_KERNEL); + if (!newf) goto out; - if (clone_flags & CLONE_FILES) { - atomic_inc(&oldf->count); - goto out; - } + atomic_set(&newf->count, 1); - /* - * Note: we may be using current for both targets (See exec.c) - * This works because we cache current->files (old) as oldf. Don't - * break this. - */ - tsk->files = NULL; - error = -ENOMEM; - newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL); - if (!newf) - goto out; + spin_lock_init(&newf->file_lock); + newf->next_fd = 0; + fdt = &newf->fdtab; + fdt->max_fds = NR_OPEN_DEFAULT; + fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init; + fdt->open_fds = (fd_set *)&newf->open_fds_init; + fdt->fd = &newf->fd_array[0]; + INIT_RCU_HEAD(&fdt->rcu); + fdt->next = NULL; + rcu_assign_pointer(newf->fdt, fdt); +out: + return newf; +} - atomic_set(&newf->count, 1); +/* + * Allocate a new files structure and copy contents from the + * passed in files structure. + * errorp will be valid only when the returned files_struct is NULL. 
+ */ +static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) +{ + struct files_struct *newf; + struct file **old_fds, **new_fds; + int open_files, size, i; + struct fdtable *old_fdt, *new_fdt; - newf->file_lock = SPIN_LOCK_UNLOCKED; - newf->next_fd = 0; - newf->max_fds = NR_OPEN_DEFAULT; - newf->max_fdset = __FD_SETSIZE; - newf->close_on_exec = &newf->close_on_exec_init; - newf->open_fds = &newf->open_fds_init; - newf->fd = &newf->fd_array[0]; - - /* We don't yet have the oldf readlock, but even if the old - fdset gets grown now, we'll only copy up to "size" fds */ - size = oldf->max_fdset; - if (size > __FD_SETSIZE) { - newf->max_fdset = 0; - spin_lock(&newf->file_lock); - error = expand_fdset(newf, size-1); - spin_unlock(&newf->file_lock); - if (error) - goto out_release; - } - spin_lock(&oldf->file_lock); + *errorp = -ENOMEM; + newf = alloc_files(); + if (!newf) + goto out; - open_files = count_open_files(oldf, size); + spin_lock(&oldf->file_lock); + old_fdt = files_fdtable(oldf); + new_fdt = files_fdtable(newf); + open_files = count_open_files(old_fdt); /* - * Check whether we need to allocate a larger fd array. - * Note: we're not a clone task, so the open count won't - * change. + * Check whether we need to allocate a larger fd array and fd set. + * Note: we're not a clone task, so the open count won't change. */ - nfds = NR_OPEN_DEFAULT; - if (open_files > nfds) { + if (open_files > new_fdt->max_fds) { + new_fdt->max_fds = 0; spin_unlock(&oldf->file_lock); - newf->max_fds = 0; spin_lock(&newf->file_lock); - error = expand_fd_array(newf, open_files-1); + *errorp = expand_files(newf, open_files-1); spin_unlock(&newf->file_lock); - if (error) + if (*errorp < 0) goto out_release; - nfds = newf->max_fds; + new_fdt = files_fdtable(newf); + /* + * Reacquire the oldf lock and a pointer to its fd table + * who knows it may have a new bigger fd table. We need + * the latest pointer. + */ spin_lock(&oldf->file_lock); + old_fdt = files_fdtable(oldf); } - old_fds = oldf->fd; - new_fds = newf->fd; + old_fds = old_fdt->fd; + new_fds = new_fdt->fd; - memcpy(newf->open_fds->fds_bits, oldf->open_fds->fds_bits, open_files/8); - memcpy(newf->close_on_exec->fds_bits, oldf->close_on_exec->fds_bits, open_files/8); + memcpy(new_fdt->open_fds->fds_bits, + old_fdt->open_fds->fds_bits, open_files/8); + memcpy(new_fdt->close_on_exec->fds_bits, + old_fdt->close_on_exec->fds_bits, open_files/8); for (i = open_files; i != 0; i--) { struct file *f = *old_fds++; - if (f) + if (f) { get_file(f); - *new_fds++ = f; + /* TODO: sum it first for check and performance */ + vx_openfd_inc(open_files - i); + } else { + /* + * The fd may be claimed in the fd bitmap but not yet + * instantiated in the files array if a sibling thread + * is partway through open(). So make sure that this + * fd is available to the new process. 
+			 */
+				FD_CLR(open_files - i, new_fdt->open_fds);
+			}
+			rcu_assign_pointer(*new_fds++, f);
 	}
 	spin_unlock(&oldf->file_lock);
 
 	/* compute the remainder to be cleared */
-	size = (newf->max_fds - open_files) * sizeof(struct file *);
+	size = (new_fdt->max_fds - open_files) * sizeof(struct file *);
 
 	/* This is long word aligned thus could use a optimized version */
 	memset(new_fds, 0, size);
 
-	if (newf->max_fdset > open_files) {
-		int left = (newf->max_fdset-open_files)/8;
+	if (new_fdt->max_fds > open_files) {
+		int left = (new_fdt->max_fds-open_files)/8;
 		int start = open_files / (8 * sizeof(unsigned long));
 
-		memset(&newf->open_fds->fds_bits[start], 0, left);
-		memset(&newf->close_on_exec->fds_bits[start], 0, left);
+		memset(&new_fdt->open_fds->fds_bits[start], 0, left);
+		memset(&new_fdt->close_on_exec->fds_bits[start], 0, left);
 	}
 
+	return newf;
+
+out_release:
+	kmem_cache_free(files_cachep, newf);
+out:
+	return NULL;
+}
+
+static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
+{
+	struct files_struct *oldf, *newf;
+	int error = 0;
+
+	/*
+	 * A background process may not have any files ...
+	 */
+	oldf = current->files;
+	if (!oldf)
+		goto out;
+
+	if (clone_flags & CLONE_FILES) {
+		atomic_inc(&oldf->count);
+		goto out;
+	}
+
+	/*
+	 * Note: we may be using current for both targets (See exec.c)
+	 * This works because we cache current->files (old) as oldf. Don't
+	 * break this.
+	 */
+	tsk->files = NULL;
+	newf = dup_fd(oldf, &error);
+	if (!newf)
+		goto out;
+
 	tsk->files = newf;
 	error = 0;
 out:
 	return error;
-
-out_release:
-	free_fdset (newf->close_on_exec, newf->max_fdset);
-	free_fdset (newf->open_fds, newf->max_fdset);
-	kmem_cache_free(files_cachep, newf);
-	goto out;
 }
 
 /*
@@ -780,8 +806,7 @@ int unshare_files(void)
 	struct files_struct *files  = current->files;
 	int rc;
 
-	if(!files)
-		BUG();
+	BUG_ON(!files);
 
 	/* This can race but the race causes us to copy when we don't
 	   need to and drop the copy */
@@ -807,29 +832,45 @@ static inline int copy_sighand(unsigned long clone_flags, struct task_struct * t
 		return 0;
 	}
 	sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
-	tsk->sighand = sig;
+	rcu_assign_pointer(tsk->sighand, sig);
 	if (!sig)
 		return -ENOMEM;
-	spin_lock_init(&sig->siglock);
 	atomic_set(&sig->count, 1);
 	memcpy(sig->action, current->sighand->action, sizeof(sig->action));
 	return 0;
 }
 
+void __cleanup_sighand(struct sighand_struct *sighand)
+{
+	if (atomic_dec_and_test(&sighand->count))
+		kmem_cache_free(sighand_cachep, sighand);
+}
+
 static inline int copy_signal(unsigned long clone_flags, struct task_struct * tsk)
 {
 	struct signal_struct *sig;
+	int ret;
 
 	if (clone_flags & CLONE_THREAD) {
 		atomic_inc(&current->signal->count);
+		atomic_inc(&current->signal->live);
 		return 0;
 	}
 	sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
 	tsk->signal = sig;
 	if (!sig)
 		return -ENOMEM;
+
+	ret = copy_thread_group_keys(tsk);
+	if (ret < 0) {
+		kmem_cache_free(signal_cachep, sig);
+		return ret;
+	}
+
 	atomic_set(&sig->count, 1);
-	sig->group_exit = 0;
+	atomic_set(&sig->live, 1);
+	init_waitqueue_head(&sig->wait_chldexit);
+	sig->flags = 0;
 	sig->group_exit_code = 0;
 	sig->group_exit_task = NULL;
 	sig->group_stop_count = 0;
@@ -837,23 +878,68 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
 	init_sigpending(&sig->shared_pending);
 	INIT_LIST_HEAD(&sig->posix_timers);
 
-	sig->tty = current->signal->tty;
-	sig->pgrp = process_group(current);
-	sig->session = current->signal->session;
+	hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_REL);
+	sig->it_real_incr.tv64 = 
0; + sig->real_timer.function = it_real_fn; + sig->tsk = tsk; + + sig->it_virt_expires = cputime_zero; + sig->it_virt_incr = cputime_zero; + sig->it_prof_expires = cputime_zero; + sig->it_prof_incr = cputime_zero; + sig->leader = 0; /* session leadership doesn't inherit */ sig->tty_old_pgrp = 0; + sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; + sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; + sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; + sig->sched_time = 0; + INIT_LIST_HEAD(&sig->cpu_timers[0]); + INIT_LIST_HEAD(&sig->cpu_timers[1]); + INIT_LIST_HEAD(&sig->cpu_timers[2]); + taskstats_tgid_init(sig); + + task_lock(current->group_leader); + memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); + task_unlock(current->group_leader); + + if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { + /* + * New sole thread in the process gets an expiry time + * of the whole CPU time limit. + */ + tsk->it_prof_expires = + secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); + } + acct_init_pacct(&sig->pacct); + return 0; } +void __cleanup_signal(struct signal_struct *sig) +{ + exit_thread_group_keys(sig); + kmem_cache_free(signal_cachep, sig); +} + +static inline void cleanup_signal(struct task_struct *tsk) +{ + struct signal_struct *sig = tsk->signal; + + atomic_dec(&sig->live); + + if (atomic_dec_and_test(&sig->count)) + __cleanup_signal(sig); +} + static inline void copy_flags(unsigned long clone_flags, struct task_struct *p) { unsigned long new_flags = p->flags; - new_flags &= ~PF_SUPERPRIV; + new_flags &= ~(PF_SUPERPRIV | PF_NOFREEZE); new_flags |= PF_FORKNOEXEC; - if (!(clone_flags & CLONE_PTRACE)) - p->ptrace = 0; + new_flags |= PF_STARTING; p->flags = new_flags; } @@ -864,6 +950,15 @@ asmlinkage long sys_set_tid_address(int __user *tidptr) return current->pid; } +static inline void rt_mutex_init_task(struct task_struct *p) +{ + spin_lock_init(&p->pi_lock); +#ifdef CONFIG_RT_MUTEXES + plist_head_init(&p->pi_waiters, &p->pi_lock); + p->pi_blocked_on = NULL; +#endif +} + /* * This creates a new process as a copy of the old one, * but does not actually start it yet. @@ -872,16 +967,18 @@ asmlinkage long sys_set_tid_address(int __user *tidptr) * parts of the process environment (as per the clone * flags). The actual kick-off is left to the caller. 
*/ -struct task_struct *copy_process(unsigned long clone_flags, - unsigned long stack_start, - struct pt_regs *regs, - unsigned long stack_size, - int __user *parent_tidptr, - int __user *child_tidptr) +static struct task_struct *copy_process(unsigned long clone_flags, + unsigned long stack_start, + struct pt_regs *regs, + unsigned long stack_size, + int __user *parent_tidptr, + int __user *child_tidptr, + int pid) { int retval; struct task_struct *p = NULL; struct vx_info *vxi; + struct nx_info *nxi; if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) return ERR_PTR(-EINVAL); @@ -906,37 +1003,40 @@ struct task_struct *copy_process(unsigned long clone_flags, goto fork_out; retval = -ENOMEM; - p = dup_task_struct(current); if (!p) goto fork_out; - p->vx_info = NULL; - set_vx_info(&p->vx_info, current->vx_info); - p->nx_info = NULL; - set_nx_info(&p->nx_info, current->nx_info); + rt_mutex_init_task(p); + +#ifdef CONFIG_TRACE_IRQFLAGS + DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); + DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); +#endif + init_vx_info(&p->vx_info, current->vx_info); + init_nx_info(&p->nx_info, current->nx_info); /* check vserver memory */ if (p->mm && !(clone_flags & CLONE_VM)) { if (vx_vmpages_avail(p->mm, p->mm->total_vm)) - vx_pages_add(p->mm->mm_vx_info, RLIMIT_AS, p->mm->total_vm); + vx_pages_add(p->vx_info, RLIMIT_AS, p->mm->total_vm); else goto bad_fork_free; } if (p->mm && vx_flags(VXF_FORK_RSS, 0)) { - if (!vx_rsspages_avail(p->mm, p->mm->rss)) - goto bad_fork_free; + if (!vx_rss_avail(p->mm, get_mm_counter(p->mm, file_rss))) + goto bad_fork_cleanup_vm; } retval = -EAGAIN; - if (!vx_nproc_avail(1)) - goto bad_fork_free; + if (!vx_nproc_avail(1)) + goto bad_fork_cleanup_vm; if (atomic_read(&p->user->processes) >= - p->rlim[RLIMIT_NPROC].rlim_cur) { + p->signal->rlim[RLIMIT_NPROC].rlim_cur) { if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && p->user != &root_user) - goto bad_fork_free; + goto bad_fork_cleanup_vm; } atomic_inc(&p->user->__count); @@ -951,60 +1051,96 @@ struct task_struct *copy_process(unsigned long clone_flags, if (nr_threads >= max_threads) goto bad_fork_cleanup_count; - if (!try_module_get(p->thread_info->exec_domain->module)) + if (!try_module_get(task_thread_info(p)->exec_domain->module)) goto bad_fork_cleanup_count; if (p->binfmt && !try_module_get(p->binfmt->module)) goto bad_fork_cleanup_put_domain; p->did_exec = 0; + delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ copy_flags(clone_flags, p); - if (clone_flags & CLONE_IDLETASK) - p->pid = 0; - else { - p->pid = alloc_pidmap(); - if (p->pid == -1) - goto bad_fork_cleanup; - } + p->pid = pid; retval = -EFAULT; if (clone_flags & CLONE_PARENT_SETTID) if (put_user(p->pid, parent_tidptr)) - goto bad_fork_cleanup; - - p->proc_dentry = NULL; + goto bad_fork_cleanup_delays_binfmt; INIT_LIST_HEAD(&p->children); INIT_LIST_HEAD(&p->sibling); - init_waitqueue_head(&p->wait_chldexit); p->vfork_done = NULL; spin_lock_init(&p->alloc_lock); - spin_lock_init(&p->proc_lock); + ptrace_init_task(p); clear_tsk_thread_flag(p, TIF_SIGPENDING); init_sigpending(&p->pending); - p->it_real_value = p->it_virt_value = p->it_prof_value = 0; - p->it_real_incr = p->it_virt_incr = p->it_prof_incr = 0; - init_timer(&p->real_timer); - p->real_timer.data = (unsigned long) p; + p->utime = cputime_zero; + p->stime = cputime_zero; + p->sched_time = 0; + p->rchar = 0; /* I/O counter: bytes read */ + p->wchar = 0; /* I/O counter: bytes written */ + p->syscr = 0; /* I/O counter: read syscalls */ + 
p->syscw = 0; /* I/O counter: write syscalls */ + task_io_accounting_init(p); + acct_clear_integrals(p); + + p->it_virt_expires = cputime_zero; + p->it_prof_expires = cputime_zero; + p->it_sched_expires = 0; + INIT_LIST_HEAD(&p->cpu_timers[0]); + INIT_LIST_HEAD(&p->cpu_timers[1]); + INIT_LIST_HEAD(&p->cpu_timers[2]); - p->utime = p->stime = 0; - p->cutime = p->cstime = 0; p->lock_depth = -1; /* -1 = no lock */ - p->start_time = get_jiffies_64(); + do_posix_clock_monotonic_gettime(&p->start_time); p->security = NULL; p->io_context = NULL; + p->io_wait = NULL; p->audit_context = NULL; + cpuset_fork(p); #ifdef CONFIG_NUMA p->mempolicy = mpol_copy(p->mempolicy); if (IS_ERR(p->mempolicy)) { retval = PTR_ERR(p->mempolicy); p->mempolicy = NULL; - goto bad_fork_cleanup; + goto bad_fork_cleanup_cpuset; } + mpol_fix_fork_child_flag(p); +#endif +#ifdef CONFIG_TRACE_IRQFLAGS + p->irq_events = 0; +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW + p->hardirqs_enabled = 1; +#else + p->hardirqs_enabled = 0; +#endif + p->hardirq_enable_ip = 0; + p->hardirq_enable_event = 0; + p->hardirq_disable_ip = _THIS_IP_; + p->hardirq_disable_event = 0; + p->softirqs_enabled = 1; + p->softirq_enable_ip = _THIS_IP_; + p->softirq_enable_event = 0; + p->softirq_disable_ip = 0; + p->softirq_disable_event = 0; + p->hardirq_context = 0; + p->softirq_context = 0; +#endif +#ifdef CONFIG_LOCKDEP + p->lockdep_depth = 0; /* no locks held yet */ + p->curr_chain_key = 0; + p->lockdep_recursion = 0; #endif - retval = -ENOMEM; +#ifdef CONFIG_DEBUG_MUTEXES + p->blocked_on = NULL; /* not blocked yet */ +#endif + + p->tgid = p->pid; + if (clone_flags & CLONE_THREAD) + p->tgid = current->tgid; + if ((retval = security_task_alloc(p))) goto bad_fork_cleanup_policy; if ((retval = audit_alloc(p))) @@ -1022,132 +1158,171 @@ struct task_struct *copy_process(unsigned long clone_flags, goto bad_fork_cleanup_sighand; if ((retval = copy_mm(clone_flags, p))) goto bad_fork_cleanup_signal; - if ((retval = copy_namespace(clone_flags, p))) + if ((retval = copy_keys(clone_flags, p))) goto bad_fork_cleanup_mm; + if ((retval = copy_namespaces(clone_flags, p))) + goto bad_fork_cleanup_keys; retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs); if (retval) - goto bad_fork_cleanup_namespace; + goto bad_fork_cleanup_namespaces; p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; /* * Clear TID on mm_release()? */ p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL; + p->robust_list = NULL; +#ifdef CONFIG_COMPAT + p->compat_robust_list = NULL; +#endif + INIT_LIST_HEAD(&p->pi_state_list); + p->pi_state_cache = NULL; + + /* + * sigaltstack should be cleared when sharing the same VM + */ + if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM) + p->sas_ss_sp = p->sas_ss_size = 0; /* * Syscall tracing should be turned off in the child regardless * of CLONE_PTRACE. */ clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE); +#ifdef TIF_SYSCALL_EMU + clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); +#endif /* Our parent execution domain becomes current domain These must match for thread signalling to apply */ - p->parent_exec_id = p->self_exec_id; /* ok, now we should be set up.. */ p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL); p->pdeath_signal = 0; - - /* Perform scheduler related setup */ - sched_fork(p); + p->exit_state = 0; /* * Ok, make it visible to the rest of the system. * We dont wake it up yet. 
 	 */
-	p->tgid = p->pid;
 	p->group_leader = p;
-	INIT_LIST_HEAD(&p->ptrace_children);
-	INIT_LIST_HEAD(&p->ptrace_list);
+	INIT_LIST_HEAD(&p->thread_group);
+
+	/* Perform scheduler related setup. Assign this task to a CPU. */
+	sched_fork(p, clone_flags);
 
 	/* Need tasklist lock for parent etc handling! */
 	write_lock_irq(&tasklist_lock);
+
+	/* for sys_ioprio_set(IOPRIO_WHO_PGRP) */
+	p->ioprio = current->ioprio;
+
 	/*
-	 * Check for pending SIGKILL! The new thread should not be allowed
-	 * to slip out of an OOM kill. (or normal SIGKILL.)
+	 * The task hasn't been attached yet, so its cpus_allowed mask will
+	 * not be changed, nor will its assigned CPU.
+	 *
+	 * The cpus_allowed mask of the parent may have changed after it was
+	 * copied first time - so re-copy it here, then check the child's CPU
+	 * to ensure it is on a valid CPU (and if not, just force it back to
+	 * parent's CPU). This avoids alot of nasty races.
 	 */
-	if (sigismember(&current->pending.signal, SIGKILL)) {
-		write_unlock_irq(&tasklist_lock);
-		retval = -EINTR;
-		goto bad_fork_cleanup_namespace;
-	}
+	p->cpus_allowed = current->cpus_allowed;
+	if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
+			!cpu_online(task_cpu(p))))
+		set_task_cpu(p, smp_processor_id());
 
 	/* CLONE_PARENT re-uses the old parent */
-	if (clone_flags & CLONE_PARENT)
-		p->real_parent = current->real_parent;
+	if (clone_flags & (CLONE_PARENT|CLONE_THREAD))
+		p->parent = current->parent;
 	else
-		p->real_parent = current;
-	p->parent = p->real_parent;
+		p->parent = current;
+
+	spin_lock(&current->sighand->siglock);
+
+	/*
+	 * Process group and session signals need to be delivered to just the
+	 * parent before the fork or both the parent and the child after the
+	 * fork. Restart if a signal comes in before we add the new process to
+	 * it's process group.
+	 * A fatal signal pending means that current will exit, so the new
+	 * thread can't slip out of an OOM kill (or normal SIGKILL).
+	 */
+	recalc_sigpending();
+	if (signal_pending(current)) {
+		spin_unlock(&current->sighand->siglock);
+		write_unlock_irq(&tasklist_lock);
+		retval = -ERESTARTNOINTR;
+		goto bad_fork_cleanup_namespaces;
+	}
 
 	if (clone_flags & CLONE_THREAD) {
-		spin_lock(&current->sighand->siglock);
-		/*
-		 * Important: if an exit-all has been started then
-		 * do not create this new thread - the whole thread
-		 * group is supposed to exit anyway.
-		 */
-		if (current->signal->group_exit) {
-			spin_unlock(&current->sighand->siglock);
-			write_unlock_irq(&tasklist_lock);
-			retval = -EAGAIN;
-			goto bad_fork_cleanup_namespace;
-		}
-		p->tgid = current->tgid;
 		p->group_leader = current->group_leader;
-
-		if (current->signal->group_stop_count > 0) {
+		list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
+
+		if (!cputime_eq(current->signal->it_virt_expires,
+				cputime_zero) ||
+		    !cputime_eq(current->signal->it_prof_expires,
+				cputime_zero) ||
+		    current->signal->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY ||
+		    !list_empty(&current->signal->cpu_timers[0]) ||
+		    !list_empty(&current->signal->cpu_timers[1]) ||
+		    !list_empty(&current->signal->cpu_timers[2])) {
 			/*
-			 * There is an all-stop in progress for the group.
-			 * We ourselves will stop as soon as we check signals.
-			 * Make the new thread part of that group stop too.
+			 * Have child wake up on its first tick to check
+			 * for process CPU timers.
 			 */
-			current->signal->group_stop_count++;
-			set_tsk_thread_flag(p, TIF_SIGPENDING);
+			p->it_prof_expires = jiffies_to_cputime(1);
 		}
-
-		spin_unlock(&current->sighand->siglock);
 	}
 
-	SET_LINKS(p);
-	if (p->ptrace & PT_PTRACED)
-		__ptrace_link(p, current->parent);
+	if (likely(p->pid)) {
+		add_parent(p);
+		tracehook_init_task(p);
+
+		if (thread_group_leader(p)) {
+			p->signal->tty = current->signal->tty;
+			p->signal->pgrp = process_group(current);
+			set_signal_session(p->signal, process_session(current));
+			attach_pid(p, PIDTYPE_PGID, process_group(p));
+			attach_pid(p, PIDTYPE_SID, process_session(p));
 
-	attach_pid(p, PIDTYPE_PID, p->pid);
-	if (thread_group_leader(p)) {
-		attach_pid(p, PIDTYPE_TGID, p->tgid);
-		attach_pid(p, PIDTYPE_PGID, process_group(p));
-		attach_pid(p, PIDTYPE_SID, p->signal->session);
-		if (p->pid)
+			list_add_tail_rcu(&p->tasks, &init_task.tasks);
 			__get_cpu_var(process_counts)++;
-	} else
-		link_pid(p, p->pids + PIDTYPE_TGID, &p->group_leader->pids[PIDTYPE_TGID].pid);
+		}
+		attach_pid(p, PIDTYPE_PID, p->pid);
+		nr_threads++;
+	}
 
-	nr_threads++;
-	vxi = current->vx_info;
+	total_forks++;
+	spin_unlock(&current->sighand->siglock);
+
+	/* p is copy of current */
+	vxi = p->vx_info;
 	if (vxi) {
-		atomic_inc(&vxi->cacct.nr_threads);
-		// atomic_inc(&vxi->limit.rcur[RLIMIT_NPROC]);
+		claim_vx_info(vxi, p);
+		atomic_inc(&vxi->cvirt.nr_threads);
+		atomic_inc(&vxi->cvirt.total_forks);
+		vx_nproc_inc(p);
 	}
-	vx_nproc_inc();
+	nxi = p->nx_info;
+	if (nxi)
+		claim_nx_info(nxi, p);
 	write_unlock_irq(&tasklist_lock);
-	retval = 0;
-
-fork_out:
-	if (retval)
-		return ERR_PTR(retval);
+	proc_fork_connector(p);
 	return p;
 
-bad_fork_cleanup_namespace:
-	exit_namespace(p);
+bad_fork_cleanup_namespaces:
+	exit_task_namespaces(p);
+bad_fork_cleanup_keys:
+	exit_keys(p);
 bad_fork_cleanup_mm:
-	exit_mm(p);
-	if (p->active_mm)
-		mmdrop(p->active_mm);
+	if (p->mm)
+		mmput(p->mm);
 bad_fork_cleanup_signal:
-	exit_signal(p);
+	cleanup_signal(p);
 bad_fork_cleanup_sighand:
-	exit_sighand(p);
+	__cleanup_sighand(p->sighand);
 bad_fork_cleanup_fs:
 	exit_fs(p); /* blocking */
 bad_fork_cleanup_files:
@@ -1161,37 +1336,44 @@ bad_fork_cleanup_security:
 bad_fork_cleanup_policy:
 #ifdef CONFIG_NUMA
 	mpol_free(p->mempolicy);
+bad_fork_cleanup_cpuset:
 #endif
-bad_fork_cleanup:
-	if (p->pid > 0)
-		free_pidmap(p->pid);
+	cpuset_exit(p);
+bad_fork_cleanup_delays_binfmt:
+	delayacct_tsk_free(p);
 	if (p->binfmt)
 		module_put(p->binfmt->module);
 bad_fork_cleanup_put_domain:
-	module_put(p->thread_info->exec_domain->module);
+	module_put(task_thread_info(p)->exec_domain->module);
 bad_fork_cleanup_count:
 	put_group_info(p->group_info);
 	atomic_dec(&p->user->processes);
 	free_uid(p->user);
+bad_fork_cleanup_vm:
+	if (p->mm && !(clone_flags & CLONE_VM))
+		vx_pages_sub(p->vx_info, RLIMIT_AS, p->mm->total_vm);
 bad_fork_free:
 	free_task(p);
-	goto fork_out;
+fork_out:
+	return ERR_PTR(retval);
 }
 
-static inline int fork_traceflag (unsigned clone_flags)
+noinline struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs)
 {
-	if (clone_flags & (CLONE_UNTRACED | CLONE_IDLETASK))
-		return 0;
-	else if (clone_flags & CLONE_VFORK) {
-		if (current->ptrace & PT_TRACE_VFORK)
-			return PTRACE_EVENT_VFORK;
-	} else if ((clone_flags & CSIGNAL) != SIGCHLD) {
-		if (current->ptrace & PT_TRACE_CLONE)
-			return PTRACE_EVENT_CLONE;
-	} else if (current->ptrace & PT_TRACE_FORK)
-		return PTRACE_EVENT_FORK;
+	memset(regs, 0, sizeof(struct pt_regs));
+	return regs;
+}
 
-	return 0;
+struct task_struct * __cpuinit fork_idle(int cpu)
+{
+	struct 
task_struct *task; + struct pt_regs regs; + + task = copy_process(CLONE_VM, 0, idle_regs(®s), 0, NULL, NULL, 0); + if (!IS_ERR(task)) + init_idle(task, cpu); + + return task; } /* @@ -1208,23 +1390,32 @@ long do_fork(unsigned long clone_flags, int __user *child_tidptr) { struct task_struct *p; - int trace = 0; - long pid; - - if (unlikely(current->ptrace)) { - trace = fork_traceflag (clone_flags); - if (trace) - clone_flags |= CLONE_PTRACE; + struct pid *pid = alloc_pid(); + long nr; + + if (!pid) + return -EAGAIN; + + /* kernel threads are host only */ + if ((clone_flags & CLONE_KTHREAD) && !vx_check(0, VS_ADMIN)) { + vxwprintk(1, "xid=%d tried to spawn a kernel thread.", + vx_current_xid()); + free_pid(pid); + return -EPERM; } - p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr); + nr = pid->nr; + + p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, nr); /* * Do this prior waking up the new thread - the thread pointer * might get invalid after that point, if the thread exits quickly. */ - pid = IS_ERR(p) ? PTR_ERR(p) : p->pid; - if (!IS_ERR(p)) { + /* + * When called from kernel_thread, don't do user tracing stuff. + */ + int is_user = likely(user_mode(regs)); struct completion vfork; if (clone_flags & CLONE_VFORK) { @@ -1232,83 +1423,56 @@ long do_fork(unsigned long clone_flags, init_completion(&vfork); } - if ((p->ptrace & PT_PTRACED) || (clone_flags & CLONE_STOPPED)) { + if (likely(is_user)) + tracehook_report_clone(clone_flags, p); + + p->flags &= ~PF_STARTING; + + if (clone_flags & CLONE_STOPPED) { /* * We'll start up with an immediate SIGSTOP. */ sigaddset(&p->pending.signal, SIGSTOP); set_tsk_thread_flag(p, TIF_SIGPENDING); - } - - if (!(clone_flags & CLONE_STOPPED)) { - /* - * Do the wakeup last. On SMP we treat fork() and - * CLONE_VM separately, because fork() has already - * created cache footprint on this CPU (due to - * copying the pagetables), hence migration would - * probably be costy. Threads on the other hand - * have less traction to the current CPU, and if - * there's an imbalance then the scheduler can - * migrate this fresh thread now, before it - * accumulates a larger cache footprint: - */ - if (clone_flags & CLONE_VM) - wake_up_forked_thread(p); - else - wake_up_forked_process(p); - } else { - int cpu = get_cpu(); - p->state = TASK_STOPPED; - if (cpu_is_offline(task_cpu(p))) - set_task_cpu(p, cpu); - - put_cpu(); } - ++total_forks; + else + wake_up_new_task(p, clone_flags); - if (unlikely (trace)) { - current->ptrace_message = pid; - ptrace_notify ((trace << 8) | SIGTRAP); - } + if (likely(is_user)) + tracehook_report_clone_complete(clone_flags, nr, p); if (clone_flags & CLONE_VFORK) { wait_for_completion(&vfork); - if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE)) - ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP); - } else - /* - * Let the child process run first, to avoid most of the - * COW overhead when the child exec()s afterwards. 
- */ - set_need_resched(); + if (likely(is_user)) + tracehook_report_vfork_done(p, nr); + } + } else { + free_pid(pid); + nr = PTR_ERR(p); } - return pid; + return nr; } -/* SLAB cache for signal_struct structures (tsk->signal) */ -kmem_cache_t *signal_cachep; - -/* SLAB cache for sighand_struct structures (tsk->sighand) */ -kmem_cache_t *sighand_cachep; - -/* SLAB cache for files_struct structures (tsk->files) */ -kmem_cache_t *files_cachep; - -/* SLAB cache for fs_struct structures (tsk->fs) */ -kmem_cache_t *fs_cachep; +#ifndef ARCH_MIN_MMSTRUCT_ALIGN +#define ARCH_MIN_MMSTRUCT_ALIGN 0 +#endif -/* SLAB cache for vm_area_struct structures */ -kmem_cache_t *vm_area_cachep; +static void sighand_ctor(void *data, struct kmem_cache *cachep, unsigned long flags) +{ + struct sighand_struct *sighand = data; -/* SLAB cache for mm_struct structures (tsk->mm) */ -kmem_cache_t *mm_cachep; + if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) == + SLAB_CTOR_CONSTRUCTOR) + spin_lock_init(&sighand->siglock); +} void __init proc_caches_init(void) { sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU, + sighand_ctor, NULL); signal_cachep = kmem_cache_create("signal_cache", sizeof(struct signal_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); @@ -1322,6 +1486,305 @@ void __init proc_caches_init(void) sizeof(struct vm_area_struct), 0, SLAB_PANIC, NULL, NULL); mm_cachep = kmem_cache_create("mm_struct", - sizeof(struct mm_struct), 0, + sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); } + + +/* + * Check constraints on flags passed to the unshare system call and + * force unsharing of additional process context as appropriate. + */ +static inline void check_unshare_flags(unsigned long *flags_ptr) +{ + /* + * If unsharing a thread from a thread group, must also + * unshare vm. + */ + if (*flags_ptr & CLONE_THREAD) + *flags_ptr |= CLONE_VM; + + /* + * If unsharing vm, must also unshare signal handlers. + */ + if (*flags_ptr & CLONE_VM) + *flags_ptr |= CLONE_SIGHAND; + + /* + * If unsharing signal handlers and the task was created + * using CLONE_THREAD, then must unshare the thread + */ + if ((*flags_ptr & CLONE_SIGHAND) && + (atomic_read(¤t->signal->count) > 1)) + *flags_ptr |= CLONE_THREAD; + + /* + * If unsharing namespace, must also unshare filesystem information. + */ + if (*flags_ptr & CLONE_NEWNS) + *flags_ptr |= CLONE_FS; +} + +/* + * Unsharing of tasks created with CLONE_THREAD is not supported yet + */ +static int unshare_thread(unsigned long unshare_flags) +{ + if (unshare_flags & CLONE_THREAD) + return -EINVAL; + + return 0; +} + +/* + * Unshare the filesystem structure if it is being shared + */ +static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) +{ + struct fs_struct *fs = current->fs; + + if ((unshare_flags & CLONE_FS) && + (fs && atomic_read(&fs->count) > 1)) { + *new_fsp = __copy_fs_struct(current->fs); + if (!*new_fsp) + return -ENOMEM; + } + + return 0; +} + +/* + * Unshare the mnt_namespace structure if it is being shared + */ +static int unshare_mnt_namespace(unsigned long unshare_flags, + struct mnt_namespace **new_nsp, struct fs_struct *new_fs) +{ + struct mnt_namespace *ns = current->nsproxy->mnt_ns; + + if ((unshare_flags & CLONE_NEWNS) && ns) { + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + *new_nsp = dup_mnt_ns(current, new_fs ? 
new_fs : current->fs); + if (!*new_nsp) + return -ENOMEM; + } + + return 0; +} + +/* + * Unsharing of sighand is not supported yet + */ +static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp) +{ + struct sighand_struct *sigh = current->sighand; + + if ((unshare_flags & CLONE_SIGHAND) && atomic_read(&sigh->count) > 1) + return -EINVAL; + else + return 0; +} + +/* + * Unshare vm if it is being shared + */ +static int unshare_vm(unsigned long unshare_flags, struct mm_struct **new_mmp) +{ + struct mm_struct *mm = current->mm; + + if ((unshare_flags & CLONE_VM) && + (mm && atomic_read(&mm->mm_users) > 1)) { + return -EINVAL; + } + + return 0; +} + +/* + * Unshare file descriptor table if it is being shared + */ +static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp) +{ + struct files_struct *fd = current->files; + int error = 0; + + if ((unshare_flags & CLONE_FILES) && + (fd && atomic_read(&fd->count) > 1)) { + *new_fdp = dup_fd(fd, &error); + if (!*new_fdp) + return error; + } + + return 0; +} + +/* + * Unsharing of semundo for tasks created with CLONE_SYSVSEM is not + * supported yet + */ +static int unshare_semundo(unsigned long unshare_flags, struct sem_undo_list **new_ulistp) +{ + if (unshare_flags & CLONE_SYSVSEM) + return -EINVAL; + + return 0; +} + +#ifndef CONFIG_IPC_NS +static inline int unshare_ipcs(unsigned long flags, struct ipc_namespace **ns) +{ + if (flags & CLONE_NEWIPC) + return -EINVAL; + + return 0; +} +#endif + +/* + * unshare allows a process to 'unshare' part of the process + * context which was originally shared using clone. copy_* + * functions used by do_fork() cannot be used here directly + * because they modify an inactive task_struct that is being + * constructed. Here we are modifying the current, active, + * task_struct. 
+ */ +asmlinkage long sys_unshare(unsigned long unshare_flags) +{ + int err = 0; + struct fs_struct *fs, *new_fs = NULL; + struct mnt_namespace *ns, *new_ns = NULL; + struct sighand_struct *new_sigh = NULL; + struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; + struct files_struct *fd, *new_fd = NULL; + struct sem_undo_list *new_ulist = NULL; + struct nsproxy *new_nsproxy = NULL, *old_nsproxy = NULL; + struct uts_namespace *uts, *new_uts = NULL; + struct ipc_namespace *ipc, *new_ipc = NULL; + + check_unshare_flags(&unshare_flags); + + /* Return -EINVAL for all unsupported flags */ + err = -EINVAL; + if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| + CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| + CLONE_NEWUTS|CLONE_NEWIPC)) + goto bad_unshare_out; + + if ((err = unshare_thread(unshare_flags))) + goto bad_unshare_out; + if ((err = unshare_fs(unshare_flags, &new_fs))) + goto bad_unshare_cleanup_thread; + if ((err = unshare_mnt_namespace(unshare_flags, &new_ns, new_fs))) + goto bad_unshare_cleanup_fs; + if ((err = unshare_sighand(unshare_flags, &new_sigh))) + goto bad_unshare_cleanup_ns; + if ((err = unshare_vm(unshare_flags, &new_mm))) + goto bad_unshare_cleanup_sigh; + if ((err = unshare_fd(unshare_flags, &new_fd))) + goto bad_unshare_cleanup_vm; + if ((err = unshare_semundo(unshare_flags, &new_ulist))) + goto bad_unshare_cleanup_fd; + if ((err = unshare_utsname(unshare_flags, &new_uts))) + goto bad_unshare_cleanup_semundo; + if ((err = unshare_ipcs(unshare_flags, &new_ipc))) + goto bad_unshare_cleanup_uts; + + if (new_ns || new_uts || new_ipc) { + old_nsproxy = current->nsproxy; + new_nsproxy = dup_namespaces(old_nsproxy); + if (!new_nsproxy) { + err = -ENOMEM; + goto bad_unshare_cleanup_ipc; + } + } + + if (new_fs || new_ns || new_mm || new_fd || new_ulist || + new_uts || new_ipc) { + + task_lock(current); + + if (new_nsproxy) { + current->nsproxy = new_nsproxy; + new_nsproxy = old_nsproxy; + } + + if (new_fs) { + fs = current->fs; + current->fs = new_fs; + new_fs = fs; + } + + if (new_ns) { + ns = current->nsproxy->mnt_ns; + current->nsproxy->mnt_ns = new_ns; + new_ns = ns; + } + + if (new_mm) { + mm = current->mm; + active_mm = current->active_mm; + current->mm = new_mm; + current->active_mm = new_mm; + activate_mm(active_mm, new_mm); + new_mm = mm; + } + + if (new_fd) { + fd = current->files; + current->files = new_fd; + new_fd = fd; + } + + if (new_uts) { + uts = current->nsproxy->uts_ns; + current->nsproxy->uts_ns = new_uts; + new_uts = uts; + } + + if (new_ipc) { + ipc = current->nsproxy->ipc_ns; + current->nsproxy->ipc_ns = new_ipc; + new_ipc = ipc; + } + + task_unlock(current); + } + + if (new_nsproxy) + put_nsproxy(new_nsproxy); + +bad_unshare_cleanup_ipc: + if (new_ipc) + put_ipc_ns(new_ipc); + +bad_unshare_cleanup_uts: + if (new_uts) + put_uts_ns(new_uts); + +bad_unshare_cleanup_semundo: +bad_unshare_cleanup_fd: + if (new_fd) + put_files_struct(new_fd); + +bad_unshare_cleanup_vm: + if (new_mm) + mmput(new_mm); + +bad_unshare_cleanup_sigh: + if (new_sigh) + if (atomic_dec_and_test(&new_sigh->count)) + kmem_cache_free(sighand_cachep, new_sigh); + +bad_unshare_cleanup_ns: + if (new_ns) + put_mnt_ns(new_ns); + +bad_unshare_cleanup_fs: + if (new_fs) + put_fs_struct(new_fs); + +bad_unshare_cleanup_thread: +bad_unshare_out: + return err; +}
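
Illustrative note (not part of the patch above): the comment block before sys_unshare() explains that unshare() detaches pieces of process context that clone() had shared, without creating a new task. A minimal userspace sketch of the call follows, assuming the glibc unshare() wrapper and the flag semantics this version enforces: check_unshare_flags() pulls in CLONE_FS for CLONE_NEWNS, unshare_mnt_namespace() requires CAP_SYS_ADMIN, and unsupported flags such as CLONE_VM fail with -EINVAL.

#define _GNU_SOURCE
#include <sched.h>	/* unshare(), CLONE_* flags */
#include <stdio.h>
#include <errno.h>
#include <string.h>

int main(void)
{
	/* Drop a shared fd table: this exercises the dup_fd() path in unshare_fd(). */
	if (unshare(CLONE_FILES) == -1)
		fprintf(stderr, "unshare(CLONE_FILES): %s\n", strerror(errno));
	else
		printf("file descriptor table is now private\n");

	/*
	 * A private mount namespace needs CAP_SYS_ADMIN, so expect EPERM as
	 * an ordinary user (see unshare_mnt_namespace() above). The kernel
	 * adds CLONE_FS automatically via check_unshare_flags().
	 */
	if (unshare(CLONE_NEWNS) == -1)
		fprintf(stderr, "unshare(CLONE_NEWNS): %s\n", strerror(errno));
	else
		printf("mount namespace is now private\n");

	return 0;
}

A reasonably recent glibc is assumed for the unshare() wrapper; on older systems the same call can be made with syscall(__NR_unshare, flags).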