X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=kernel%2Ffork.c;h=246075943ce11e5e3f7d37fd20b9dc81461e21b0;hb=97bf2856c6014879bd04983a3e9dfcdac1e7fe85;hp=ca2985a8f80f30f0232d576b252333174b3ea6e1;hpb=76828883507a47dae78837ab5dec5a5b4513c667;p=linux-2.6.git diff --git a/kernel/fork.c b/kernel/fork.c index ca2985a8f..246075943 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -11,7 +11,6 @@ * management can be a bitch. See 'mm/memory.c': 'copy_page_range()' */ -#include #include #include #include @@ -19,7 +18,7 @@ #include #include #include -#include +#include #include #include #include @@ -28,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -36,18 +36,25 @@ #include #include #include +#include #include -#include +#include #include #include #include #include #include +#include #include +#include +#include +#include +#include #include #include #include #include +#include #include #include @@ -66,9 +73,7 @@ int max_threads; /* tunable limit on nr_threads */ DEFINE_PER_CPU(unsigned long, process_counts) = 0; - __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ - -EXPORT_SYMBOL(tasklist_lock); +__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ int nr_processes(void) { @@ -84,49 +89,47 @@ int nr_processes(void) #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR # define alloc_task_struct() kmem_cache_alloc(task_struct_cachep, GFP_KERNEL) # define free_task_struct(tsk) kmem_cache_free(task_struct_cachep, (tsk)) -static kmem_cache_t *task_struct_cachep; +static struct kmem_cache *task_struct_cachep; #endif /* SLAB cache for signal_struct structures (tsk->signal) */ -kmem_cache_t *signal_cachep; +static struct kmem_cache *signal_cachep; /* SLAB cache for sighand_struct structures (tsk->sighand) */ -kmem_cache_t *sighand_cachep; +struct kmem_cache *sighand_cachep; /* SLAB cache for files_struct structures (tsk->files) */ -kmem_cache_t *files_cachep; +struct kmem_cache *files_cachep; /* SLAB cache for fs_struct structures (tsk->fs) */ -kmem_cache_t *fs_cachep; +struct kmem_cache *fs_cachep; /* SLAB cache for vm_area_struct structures */ -kmem_cache_t *vm_area_cachep; +struct kmem_cache *vm_area_cachep; /* SLAB cache for mm_struct structures (tsk->mm) */ -static kmem_cache_t *mm_cachep; +static struct kmem_cache *mm_cachep; void free_task(struct task_struct *tsk) { free_thread_info(tsk->thread_info); + rt_mutex_debug_task_free(tsk); clr_vx_info(&tsk->vx_info); clr_nx_info(&tsk->nx_info); free_task_struct(tsk); } EXPORT_SYMBOL(free_task); -void __put_task_struct_cb(struct rcu_head *rhp) +void __put_task_struct(struct task_struct *tsk) { - struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); - WARN_ON(!(tsk->exit_state & (EXIT_DEAD | EXIT_ZOMBIE))); WARN_ON(atomic_read(&tsk->usage)); WARN_ON(tsk == current); - if (unlikely(tsk->audit_context)) - audit_free(tsk); security_task_free(tsk); free_uid(tsk->user); put_group_info(tsk->group_info); + delayacct_tsk_free(tsk); if (!profile_handoff_task(tsk)) free_task(tsk); @@ -184,9 +187,17 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) tsk->thread_info = ti; setup_thread_stack(tsk, orig); +#ifdef CONFIG_CC_STACKPROTECTOR + tsk->stack_canary = get_random_int(); +#endif + /* One for us, one for whoever does the "release_task()" (usually parent) */ atomic_set(&tsk->usage,2); atomic_set(&tsk->fs_excl, 0); +#ifdef CONFIG_BLK_DEV_IO_TRACE + tsk->btrace_seq = 0; +#endif + tsk->splice_pipe = NULL; return tsk; } @@ -200,8 +211,11 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) struct mempolicy *pol; down_write(&oldmm->mmap_sem); - flush_cache_mm(oldmm); - down_write(&mm->mmap_sem); + flush_cache_dup_mm(oldmm); + /* + * Not linked in yet - no deadlock potential: + */ + down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); mm->locked_vm = 0; mm->mmap = NULL; @@ -234,7 +248,7 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) goto fail_nomem; charge = len; } - tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (!tmp) goto fail_nomem; *tmp = *mpnt; @@ -249,7 +263,7 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) anon_vma_link(tmp); file = tmp->vm_file; if (file) { - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file->f_path.dentry->d_inode; get_file(file); if (tmp->vm_flags & VM_DENYWRITE) atomic_dec(&inode->i_writecount); @@ -282,6 +296,9 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) if (retval) goto out; } +#ifdef arch_dup_mmap + arch_dup_mmap(mm, oldmm); +#endif retval = 0; out: up_write(&mm->mmap_sem); @@ -316,7 +333,7 @@ static inline void mm_free_pgd(struct mm_struct * mm) __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock); -#define allocate_mm() (kmem_cache_alloc(mm_cachep, SLAB_KERNEL)) +#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL)) #define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) #include @@ -329,6 +346,8 @@ static struct mm_struct * mm_init(struct mm_struct * mm) INIT_LIST_HEAD(&mm->mmlist); mm->core_waiters = 0; mm->nr_ptes = 0; + __set_mm_counter(mm, file_rss, 0); + __set_mm_counter(mm, anon_rss, 0); spin_lock_init(&mm->page_table_lock); rwlock_init(&mm->ioctx_list_lock); mm->ioctx_list = NULL; @@ -378,6 +397,8 @@ void fastcall __mmdrop(struct mm_struct *mm) */ void mmput(struct mm_struct *mm) { + might_sleep(); + if (atomic_dec_and_test(&mm->mm_users)) { exit_aio(mm); exit_mmap(mm); @@ -443,7 +464,16 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) tsk->vfork_done = NULL; complete(vfork_done); } - if (tsk->clear_child_tid && atomic_read(&mm->mm_users) > 1) { + + /* + * If we're exiting normally, clear a user-space tid field if + * requested. We leave this alone when dying by signal, to leave + * the value intact in a core dump, and to save the unnecessary + * trouble otherwise. Userland only wants this done for a sys_exit. + */ + if (tsk->clear_child_tid + && !(tsk->flags & PF_SIGNALED) + && atomic_read(&mm->mm_users) > 1) { u32 __user * tidptr = tsk->clear_child_tid; tsk->clear_child_tid = NULL; @@ -475,6 +505,10 @@ static struct mm_struct *dup_mm(struct task_struct *tsk) memcpy(mm, oldmm, sizeof(*mm)); mm->mm_vx_info = NULL; + /* Initializing for Swap token stuff */ + mm->token_priority = 0; + mm->last_interval = 0; + if (!mm_init(mm)) goto fail_nomem; @@ -539,6 +573,10 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) goto fail_nomem; good_mm: + /* Initializing for Swap token stuff */ + mm->token_priority = 0; + mm->last_interval = 0; + tsk->mm = mm; tsk->active_mm = mm; return 0; @@ -568,6 +606,7 @@ static inline struct fs_struct *__copy_fs_struct(struct fs_struct *old) fs->altroot = NULL; } read_unlock(&old->lock); + atomic_inc(&vs_global_fs); } return fs; } @@ -593,7 +632,7 @@ static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk) static int count_open_files(struct fdtable *fdt) { - int size = fdt->max_fdset; + int size = fdt->max_fds; int i; /* Find the last open fd */ @@ -610,22 +649,20 @@ static struct files_struct *alloc_files(void) struct files_struct *newf; struct fdtable *fdt; - newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL); + newf = kmem_cache_alloc(files_cachep, GFP_KERNEL); if (!newf) goto out; atomic_set(&newf->count, 1); spin_lock_init(&newf->file_lock); + newf->next_fd = 0; fdt = &newf->fdtab; - fdt->next_fd = 0; fdt->max_fds = NR_OPEN_DEFAULT; - fdt->max_fdset = __FD_SETSIZE; - fdt->close_on_exec = &newf->close_on_exec_init; - fdt->open_fds = &newf->open_fds_init; + fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init; + fdt->open_fds = (fd_set *)&newf->open_fds_init; fdt->fd = &newf->fd_array[0]; INIT_RCU_HEAD(&fdt->rcu); - fdt->free_files = NULL; fdt->next = NULL; rcu_assign_pointer(newf->fdt, fdt); out: @@ -635,14 +672,16 @@ out: /* * Allocate a new files structure and copy contents from the * passed in files structure. + * errorp will be valid only when the returned files_struct is NULL. */ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) { struct files_struct *newf; struct file **old_fds, **new_fds; - int open_files, size, i, expand; + int open_files, size, i; struct fdtable *old_fdt, *new_fdt; + *errorp = -ENOMEM; newf = alloc_files(); if (!newf) goto out; @@ -650,25 +689,14 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) spin_lock(&oldf->file_lock); old_fdt = files_fdtable(oldf); new_fdt = files_fdtable(newf); - size = old_fdt->max_fdset; open_files = count_open_files(old_fdt); - expand = 0; /* - * Check whether we need to allocate a larger fd array or fd set. - * Note: we're not a clone task, so the open count won't change. + * Check whether we need to allocate a larger fd array and fd set. + * Note: we're not a clone task, so the open count won't change. */ - if (open_files > new_fdt->max_fdset) { - new_fdt->max_fdset = 0; - expand = 1; - } if (open_files > new_fdt->max_fds) { new_fdt->max_fds = 0; - expand = 1; - } - - /* if the old fdset gets grown now, we'll only copy up to "size" fds */ - if (expand) { spin_unlock(&oldf->file_lock); spin_lock(&newf->file_lock); *errorp = expand_files(newf, open_files-1); @@ -688,14 +716,16 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) old_fds = old_fdt->fd; new_fds = new_fdt->fd; - memcpy(new_fdt->open_fds->fds_bits, old_fdt->open_fds->fds_bits, open_files/8); - memcpy(new_fdt->close_on_exec->fds_bits, old_fdt->close_on_exec->fds_bits, open_files/8); + memcpy(new_fdt->open_fds->fds_bits, + old_fdt->open_fds->fds_bits, open_files/8); + memcpy(new_fdt->close_on_exec->fds_bits, + old_fdt->close_on_exec->fds_bits, open_files/8); for (i = open_files; i != 0; i--) { struct file *f = *old_fds++; if (f) { get_file(f); - /* FIXME: sum it first for check and performance */ + /* TODO: sum it first for check and performance */ vx_openfd_inc(open_files - i); } else { /* @@ -716,22 +746,19 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) /* This is long word aligned thus could use a optimized version */ memset(new_fds, 0, size); - if (new_fdt->max_fdset > open_files) { - int left = (new_fdt->max_fdset-open_files)/8; + if (new_fdt->max_fds > open_files) { + int left = (new_fdt->max_fds-open_files)/8; int start = open_files / (8 * sizeof(unsigned long)); memset(&new_fdt->open_fds->fds_bits[start], 0, left); memset(&new_fdt->close_on_exec->fds_bits[start], 0, left); } -out: return newf; out_release: - free_fdset (new_fdt->close_on_exec, new_fdt->max_fdset); - free_fdset (new_fdt->open_fds, new_fdt->max_fdset); - free_fd_array(new_fdt->fd, new_fdt->max_fds); kmem_cache_free(files_cachep, newf); +out: return NULL; } @@ -758,7 +785,6 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk) * break this. */ tsk->files = NULL; - error = -ENOMEM; newf = dup_fd(oldf, &error); if (!newf) goto out; @@ -780,8 +806,7 @@ int unshare_files(void) struct files_struct *files = current->files; int rc; - if(!files) - BUG(); + BUG_ON(!files); /* This can race but the race causes us to copy when we don't need to and drop the copy */ @@ -798,14 +823,6 @@ int unshare_files(void) EXPORT_SYMBOL(unshare_files); -void sighand_free_cb(struct rcu_head *rhp) -{ - struct sighand_struct *sp; - - sp = container_of(rhp, struct sighand_struct, rcu); - kmem_cache_free(sighand_cachep, sp); -} - static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk) { struct sighand_struct *sig; @@ -818,12 +835,17 @@ static inline int copy_sighand(unsigned long clone_flags, struct task_struct * t rcu_assign_pointer(tsk->sighand, sig); if (!sig) return -ENOMEM; - spin_lock_init(&sig->siglock); atomic_set(&sig->count, 1); memcpy(sig->action, current->sighand->action, sizeof(sig->action)); return 0; } +void __cleanup_sighand(struct sighand_struct *sighand) +{ + if (atomic_dec_and_test(&sighand->count)) + kmem_cache_free(sighand_cachep, sighand); +} + static inline int copy_signal(unsigned long clone_flags, struct task_struct * tsk) { struct signal_struct *sig; @@ -859,7 +881,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_REL); sig->it_real_incr.tv64 = 0; sig->real_timer.function = it_real_fn; - sig->real_timer.data = tsk; + sig->tsk = tsk; sig->it_virt_expires = cputime_zero; sig->it_virt_incr = cputime_zero; @@ -876,6 +898,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts INIT_LIST_HEAD(&sig->cpu_timers[0]); INIT_LIST_HEAD(&sig->cpu_timers[1]); INIT_LIST_HEAD(&sig->cpu_timers[2]); + taskstats_tgid_init(sig); task_lock(current->group_leader); memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); @@ -889,18 +912,34 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts tsk->it_prof_expires = secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); } + acct_init_pacct(&sig->pacct); return 0; } +void __cleanup_signal(struct signal_struct *sig) +{ + exit_thread_group_keys(sig); + kmem_cache_free(signal_cachep, sig); +} + +static inline void cleanup_signal(struct task_struct *tsk) +{ + struct signal_struct *sig = tsk->signal; + + atomic_dec(&sig->live); + + if (atomic_dec_and_test(&sig->count)) + __cleanup_signal(sig); +} + static inline void copy_flags(unsigned long clone_flags, struct task_struct *p) { unsigned long new_flags = p->flags; new_flags &= ~(PF_SUPERPRIV | PF_NOFREEZE); new_flags |= PF_FORKNOEXEC; - if (!(clone_flags & CLONE_PTRACE)) - p->ptrace = 0; + new_flags |= PF_STARTING; p->flags = new_flags; } @@ -911,6 +950,15 @@ asmlinkage long sys_set_tid_address(int __user *tidptr) return current->pid; } +static inline void rt_mutex_init_task(struct task_struct *p) +{ + spin_lock_init(&p->pi_lock); +#ifdef CONFIG_RT_MUTEXES + plist_head_init(&p->pi_waiters, &p->pi_lock); + p->pi_blocked_on = NULL; +#endif +} + /* * This creates a new process as a copy of the old one, * but does not actually start it yet. @@ -919,13 +967,13 @@ asmlinkage long sys_set_tid_address(int __user *tidptr) * parts of the process environment (as per the clone * flags). The actual kick-off is left to the caller. */ -static task_t *copy_process(unsigned long clone_flags, - unsigned long stack_start, - struct pt_regs *regs, - unsigned long stack_size, - int __user *parent_tidptr, - int __user *child_tidptr, - int pid) +static struct task_struct *copy_process(unsigned long clone_flags, + unsigned long stack_start, + struct pt_regs *regs, + unsigned long stack_size, + int __user *parent_tidptr, + int __user *child_tidptr, + int pid) { int retval; struct task_struct *p = NULL; @@ -959,6 +1007,12 @@ static task_t *copy_process(unsigned long clone_flags, if (!p) goto fork_out; + rt_mutex_init_task(p); + +#ifdef CONFIG_TRACE_IRQFLAGS + DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); + DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); +#endif init_vx_info(&p->vx_info, current->vx_info); init_nx_info(&p->nx_info, current->nx_info); @@ -970,7 +1024,7 @@ static task_t *copy_process(unsigned long clone_flags, goto bad_fork_free; } if (p->mm && vx_flags(VXF_FORK_RSS, 0)) { - if (!vx_rsspages_avail(p->mm, get_mm_counter(p->mm, file_rss))) + if (!vx_rss_avail(p->mm, get_mm_counter(p->mm, file_rss))) goto bad_fork_cleanup_vm; } @@ -1004,20 +1058,19 @@ static task_t *copy_process(unsigned long clone_flags, goto bad_fork_cleanup_put_domain; p->did_exec = 0; + delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ copy_flags(clone_flags, p); p->pid = pid; retval = -EFAULT; if (clone_flags & CLONE_PARENT_SETTID) if (put_user(p->pid, parent_tidptr)) - goto bad_fork_cleanup; - - p->proc_dentry = NULL; + goto bad_fork_cleanup_delays_binfmt; INIT_LIST_HEAD(&p->children); INIT_LIST_HEAD(&p->sibling); p->vfork_done = NULL; spin_lock_init(&p->alloc_lock); - spin_lock_init(&p->proc_lock); + ptrace_init_task(p); clear_tsk_thread_flag(p, TIF_SIGPENDING); init_sigpending(&p->pending); @@ -1029,6 +1082,7 @@ static task_t *copy_process(unsigned long clone_flags, p->wchar = 0; /* I/O counter: bytes written */ p->syscr = 0; /* I/O counter: read syscalls */ p->syscw = 0; /* I/O counter: write syscalls */ + task_io_accounting_init(p); acct_clear_integrals(p); p->it_virt_expires = cputime_zero; @@ -1052,6 +1106,31 @@ static task_t *copy_process(unsigned long clone_flags, p->mempolicy = NULL; goto bad_fork_cleanup_cpuset; } + mpol_fix_fork_child_flag(p); +#endif +#ifdef CONFIG_TRACE_IRQFLAGS + p->irq_events = 0; +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW + p->hardirqs_enabled = 1; +#else + p->hardirqs_enabled = 0; +#endif + p->hardirq_enable_ip = 0; + p->hardirq_enable_event = 0; + p->hardirq_disable_ip = _THIS_IP_; + p->hardirq_disable_event = 0; + p->softirqs_enabled = 1; + p->softirq_enable_ip = _THIS_IP_; + p->softirq_enable_event = 0; + p->softirq_disable_ip = 0; + p->softirq_disable_event = 0; + p->hardirq_context = 0; + p->softirq_context = 0; +#endif +#ifdef CONFIG_LOCKDEP + p->lockdep_depth = 0; /* no locks held yet */ + p->curr_chain_key = 0; + p->lockdep_recursion = 0; #endif #ifdef CONFIG_DEBUG_MUTEXES @@ -1081,17 +1160,23 @@ static task_t *copy_process(unsigned long clone_flags, goto bad_fork_cleanup_signal; if ((retval = copy_keys(clone_flags, p))) goto bad_fork_cleanup_mm; - if ((retval = copy_namespace(clone_flags, p))) + if ((retval = copy_namespaces(clone_flags, p))) goto bad_fork_cleanup_keys; retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs); if (retval) - goto bad_fork_cleanup_namespace; + goto bad_fork_cleanup_namespaces; p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; /* * Clear TID on mm_release()? */ p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL; + p->robust_list = NULL; +#ifdef CONFIG_COMPAT + p->compat_robust_list = NULL; +#endif + INIT_LIST_HEAD(&p->pi_state_list); + p->pi_state_cache = NULL; /* * sigaltstack should be cleared when sharing the same VM @@ -1110,7 +1195,6 @@ static task_t *copy_process(unsigned long clone_flags, /* Our parent execution domain becomes current domain These must match for thread signalling to apply */ - p->parent_exec_id = p->self_exec_id; /* ok, now we should be set up.. */ @@ -1123,8 +1207,7 @@ static task_t *copy_process(unsigned long clone_flags, * We dont wake it up yet. */ p->group_leader = p; - INIT_LIST_HEAD(&p->ptrace_children); - INIT_LIST_HEAD(&p->ptrace_list); + INIT_LIST_HEAD(&p->thread_group); /* Perform scheduler related setup. Assign this task to a CPU. */ sched_fork(p, clone_flags); @@ -1132,6 +1215,9 @@ static task_t *copy_process(unsigned long clone_flags, /* Need tasklist lock for parent etc handling! */ write_lock_irq(&tasklist_lock); + /* for sys_ioprio_set(IOPRIO_WHO_PGRP) */ + p->ioprio = current->ioprio; + /* * The task hasn't been attached yet, so its cpus_allowed mask will * not be changed, nor will its assigned CPU. @@ -1146,47 +1232,33 @@ static task_t *copy_process(unsigned long clone_flags, !cpu_online(task_cpu(p)))) set_task_cpu(p, smp_processor_id()); - /* - * Check for pending SIGKILL! The new thread should not be allowed - * to slip out of an OOM kill. (or normal SIGKILL.) - */ - if (sigismember(¤t->pending.signal, SIGKILL)) { - write_unlock_irq(&tasklist_lock); - retval = -EINTR; - goto bad_fork_cleanup_namespace; - } - /* CLONE_PARENT re-uses the old parent */ if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) - p->real_parent = current->real_parent; + p->parent = current->parent; else - p->real_parent = current; - p->parent = p->real_parent; + p->parent = current; spin_lock(¤t->sighand->siglock); + + /* + * Process group and session signals need to be delivered to just the + * parent before the fork or both the parent and the child after the + * fork. Restart if a signal comes in before we add the new process to + * it's process group. + * A fatal signal pending means that current will exit, so the new + * thread can't slip out of an OOM kill (or normal SIGKILL). + */ + recalc_sigpending(); + if (signal_pending(current)) { + spin_unlock(¤t->sighand->siglock); + write_unlock_irq(&tasklist_lock); + retval = -ERESTARTNOINTR; + goto bad_fork_cleanup_namespaces; + } + if (clone_flags & CLONE_THREAD) { - /* - * Important: if an exit-all has been started then - * do not create this new thread - the whole thread - * group is supposed to exit anyway. - */ - if (current->signal->flags & SIGNAL_GROUP_EXIT) { - spin_unlock(¤t->sighand->siglock); - write_unlock_irq(&tasklist_lock); - retval = -EAGAIN; - goto bad_fork_cleanup_namespace; - } p->group_leader = current->group_leader; - - if (current->signal->group_stop_count > 0) { - /* - * There is an all-stop in progress for the group. - * We ourselves will stop as soon as we check signals. - * Make the new thread part of that group stop too. - */ - current->signal->group_stop_count++; - set_tsk_thread_flag(p, TIF_SIGPENDING); - } + list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); if (!cputime_eq(current->signal->it_virt_expires, cputime_zero) || @@ -1204,28 +1276,24 @@ static task_t *copy_process(unsigned long clone_flags, } } - /* - * inherit ioprio - */ - p->ioprio = current->ioprio; + if (likely(p->pid)) { + add_parent(p); + tracehook_init_task(p); - SET_LINKS(p); - if (unlikely(p->ptrace & PT_PTRACED)) - __ptrace_link(p, current->parent); - - if (thread_group_leader(p)) { - p->signal->tty = current->signal->tty; - p->signal->pgrp = process_group(current); - p->signal->session = current->signal->session; - attach_pid(p, PIDTYPE_PGID, process_group(p)); - attach_pid(p, PIDTYPE_SID, p->signal->session); - if (p->pid) + if (thread_group_leader(p)) { + p->signal->tty = current->signal->tty; + p->signal->pgrp = process_group(current); + set_signal_session(p->signal, process_session(current)); + attach_pid(p, PIDTYPE_PGID, process_group(p)); + attach_pid(p, PIDTYPE_SID, process_session(p)); + + list_add_tail_rcu(&p->tasks, &init_task.tasks); __get_cpu_var(process_counts)++; + } + attach_pid(p, PIDTYPE_PID, p->pid); + nr_threads++; } - attach_pid(p, PIDTYPE_TGID, p->tgid); - attach_pid(p, PIDTYPE_PID, p->pid); - nr_threads++; total_forks++; spin_unlock(¤t->sighand->siglock); @@ -1244,17 +1312,17 @@ static task_t *copy_process(unsigned long clone_flags, proc_fork_connector(p); return p; -bad_fork_cleanup_namespace: - exit_namespace(p); +bad_fork_cleanup_namespaces: + exit_task_namespaces(p); bad_fork_cleanup_keys: exit_keys(p); bad_fork_cleanup_mm: if (p->mm) mmput(p->mm); bad_fork_cleanup_signal: - exit_signal(p); + cleanup_signal(p); bad_fork_cleanup_sighand: - exit_sighand(p); + __cleanup_sighand(p->sighand); bad_fork_cleanup_fs: exit_fs(p); /* blocking */ bad_fork_cleanup_files: @@ -1271,7 +1339,8 @@ bad_fork_cleanup_policy: bad_fork_cleanup_cpuset: #endif cpuset_exit(p); -bad_fork_cleanup: +bad_fork_cleanup_delays_binfmt: + delayacct_tsk_free(p); if (p->binfmt) module_put(p->binfmt->module); bad_fork_cleanup_put_domain: @@ -1289,39 +1358,22 @@ fork_out: return ERR_PTR(retval); } -struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs) +noinline struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs) { memset(regs, 0, sizeof(struct pt_regs)); return regs; } -task_t * __devinit fork_idle(int cpu) +struct task_struct * __cpuinit fork_idle(int cpu) { - task_t *task; + struct task_struct *task; struct pt_regs regs; task = copy_process(CLONE_VM, 0, idle_regs(®s), 0, NULL, NULL, 0); - if (!task) - return ERR_PTR(-ENOMEM); - init_idle(task, cpu); - unhash_process(task); - return task; -} + if (!IS_ERR(task)) + init_idle(task, cpu); -static inline int fork_traceflag (unsigned clone_flags) -{ - if (clone_flags & CLONE_UNTRACED) - return 0; - else if (clone_flags & CLONE_VFORK) { - if (current->ptrace & PT_TRACE_VFORK) - return PTRACE_EVENT_VFORK; - } else if ((clone_flags & CSIGNAL) != SIGCHLD) { - if (current->ptrace & PT_TRACE_CLONE) - return PTRACE_EVENT_CLONE; - } else if (current->ptrace & PT_TRACE_FORK) - return PTRACE_EVENT_FORK; - - return 0; + return task; } /* @@ -1338,23 +1390,32 @@ long do_fork(unsigned long clone_flags, int __user *child_tidptr) { struct task_struct *p; - int trace = 0; - long pid = alloc_pidmap(); + struct pid *pid = alloc_pid(); + long nr; - if (pid < 0) + if (!pid) return -EAGAIN; - if (unlikely(current->ptrace)) { - trace = fork_traceflag (clone_flags); - if (trace) - clone_flags |= CLONE_PTRACE; + + /* kernel threads are host only */ + if ((clone_flags & CLONE_KTHREAD) && !vx_check(0, VS_ADMIN)) { + vxwprintk(1, "xid=%d tried to spawn a kernel thread.", + vx_current_xid()); + free_pid(pid); + return -EPERM; } - p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, pid); + nr = pid->nr; + + p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, nr); /* * Do this prior waking up the new thread - the thread pointer * might get invalid after that point, if the thread exits quickly. */ if (!IS_ERR(p)) { + /* + * When called from kernel_thread, don't do user tracing stuff. + */ + int is_user = likely(user_mode(regs)); struct completion vfork; if (clone_flags & CLONE_VFORK) { @@ -1362,45 +1423,56 @@ long do_fork(unsigned long clone_flags, init_completion(&vfork); } - if ((p->ptrace & PT_PTRACED) || (clone_flags & CLONE_STOPPED)) { + if (likely(is_user)) + tracehook_report_clone(clone_flags, p); + + p->flags &= ~PF_STARTING; + + if (clone_flags & CLONE_STOPPED) { /* * We'll start up with an immediate SIGSTOP. */ sigaddset(&p->pending.signal, SIGSTOP); set_tsk_thread_flag(p, TIF_SIGPENDING); + p->state = TASK_STOPPED; } - - if (!(clone_flags & CLONE_STOPPED)) - wake_up_new_task(p, clone_flags); else - p->state = TASK_STOPPED; + wake_up_new_task(p, clone_flags); - if (unlikely (trace)) { - current->ptrace_message = pid; - ptrace_notify ((trace << 8) | SIGTRAP); - } + if (likely(is_user)) + tracehook_report_clone_complete(clone_flags, nr, p); if (clone_flags & CLONE_VFORK) { wait_for_completion(&vfork); - if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE)) - ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP); + if (likely(is_user)) + tracehook_report_vfork_done(p, nr); } } else { - free_pidmap(pid); - pid = PTR_ERR(p); + free_pid(pid); + nr = PTR_ERR(p); } - return pid; + return nr; } #ifndef ARCH_MIN_MMSTRUCT_ALIGN #define ARCH_MIN_MMSTRUCT_ALIGN 0 #endif +static void sighand_ctor(void *data, struct kmem_cache *cachep, unsigned long flags) +{ + struct sighand_struct *sighand = data; + + if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) == + SLAB_CTOR_CONSTRUCTOR) + spin_lock_init(&sighand->siglock); +} + void __init proc_caches_init(void) { sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU, + sighand_ctor, NULL); signal_cachep = kmem_cache_create("signal_cache", sizeof(struct signal_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); @@ -1482,18 +1554,18 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) } /* - * Unshare the namespace structure if it is being shared + * Unshare the mnt_namespace structure if it is being shared */ -static int unshare_namespace(unsigned long unshare_flags, struct namespace **new_nsp, struct fs_struct *new_fs) +static int unshare_mnt_namespace(unsigned long unshare_flags, + struct mnt_namespace **new_nsp, struct fs_struct *new_fs) { - struct namespace *ns = current->namespace; + struct mnt_namespace *ns = current->nsproxy->mnt_ns; - if ((unshare_flags & CLONE_NEWNS) && - (ns && atomic_read(&ns->count) > 1)) { + if ((unshare_flags & CLONE_NEWNS) && ns) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; - *new_nsp = dup_namespace(current, new_fs ? new_fs : current->fs); + *new_nsp = dup_mnt_ns(current, new_fs ? new_fs : current->fs); if (!*new_nsp) return -ENOMEM; } @@ -1502,15 +1574,13 @@ static int unshare_namespace(unsigned long unshare_flags, struct namespace **new } /* - * Unsharing of sighand for tasks created with CLONE_SIGHAND is not - * supported yet + * Unsharing of sighand is not supported yet */ static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp) { struct sighand_struct *sigh = current->sighand; - if ((unshare_flags & CLONE_SIGHAND) && - (sigh && atomic_read(&sigh->count) > 1)) + if ((unshare_flags & CLONE_SIGHAND) && atomic_read(&sigh->count) > 1) return -EINVAL; else return 0; @@ -1561,6 +1631,16 @@ static int unshare_semundo(unsigned long unshare_flags, struct sem_undo_list **n return 0; } +#ifndef CONFIG_IPC_NS +static inline int unshare_ipcs(unsigned long flags, struct ipc_namespace **ns) +{ + if (flags & CLONE_NEWIPC) + return -EINVAL; + + return 0; +} +#endif + /* * unshare allows a process to 'unshare' part of the process * context which was originally shared using clone. copy_* @@ -1573,19 +1653,29 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) { int err = 0; struct fs_struct *fs, *new_fs = NULL; - struct namespace *ns, *new_ns = NULL; - struct sighand_struct *sigh, *new_sigh = NULL; + struct mnt_namespace *ns, *new_ns = NULL; + struct sighand_struct *new_sigh = NULL; struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; struct files_struct *fd, *new_fd = NULL; struct sem_undo_list *new_ulist = NULL; + struct nsproxy *new_nsproxy = NULL, *old_nsproxy = NULL; + struct uts_namespace *uts, *new_uts = NULL; + struct ipc_namespace *ipc, *new_ipc = NULL; check_unshare_flags(&unshare_flags); + /* Return -EINVAL for all unsupported flags */ + err = -EINVAL; + if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| + CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| + CLONE_NEWUTS|CLONE_NEWIPC)) + goto bad_unshare_out; + if ((err = unshare_thread(unshare_flags))) goto bad_unshare_out; if ((err = unshare_fs(unshare_flags, &new_fs))) goto bad_unshare_cleanup_thread; - if ((err = unshare_namespace(unshare_flags, &new_ns, new_fs))) + if ((err = unshare_mnt_namespace(unshare_flags, &new_ns, new_fs))) goto bad_unshare_cleanup_fs; if ((err = unshare_sighand(unshare_flags, &new_sigh))) goto bad_unshare_cleanup_ns; @@ -1595,11 +1685,30 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) goto bad_unshare_cleanup_vm; if ((err = unshare_semundo(unshare_flags, &new_ulist))) goto bad_unshare_cleanup_fd; + if ((err = unshare_utsname(unshare_flags, &new_uts))) + goto bad_unshare_cleanup_semundo; + if ((err = unshare_ipcs(unshare_flags, &new_ipc))) + goto bad_unshare_cleanup_uts; + + if (new_ns || new_uts || new_ipc) { + old_nsproxy = current->nsproxy; + new_nsproxy = dup_namespaces(old_nsproxy); + if (!new_nsproxy) { + err = -ENOMEM; + goto bad_unshare_cleanup_ipc; + } + } - if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist) { + if (new_fs || new_ns || new_mm || new_fd || new_ulist || + new_uts || new_ipc) { task_lock(current); + if (new_nsproxy) { + current->nsproxy = new_nsproxy; + new_nsproxy = old_nsproxy; + } + if (new_fs) { fs = current->fs; current->fs = new_fs; @@ -1607,17 +1716,11 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) } if (new_ns) { - ns = current->namespace; - current->namespace = new_ns; + ns = current->nsproxy->mnt_ns; + current->nsproxy->mnt_ns = new_ns; new_ns = ns; } - if (new_sigh) { - sigh = current->sighand; - rcu_assign_pointer(current->sighand, new_sigh); - new_sigh = sigh; - } - if (new_mm) { mm = current->mm; active_mm = current->active_mm; @@ -1633,9 +1736,33 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) new_fd = fd; } + if (new_uts) { + uts = current->nsproxy->uts_ns; + current->nsproxy->uts_ns = new_uts; + new_uts = uts; + } + + if (new_ipc) { + ipc = current->nsproxy->ipc_ns; + current->nsproxy->ipc_ns = new_ipc; + new_ipc = ipc; + } + task_unlock(current); } + if (new_nsproxy) + put_nsproxy(new_nsproxy); + +bad_unshare_cleanup_ipc: + if (new_ipc) + put_ipc_ns(new_ipc); + +bad_unshare_cleanup_uts: + if (new_uts) + put_uts_ns(new_uts); + +bad_unshare_cleanup_semundo: bad_unshare_cleanup_fd: if (new_fd) put_files_struct(new_fd); @@ -1651,7 +1778,7 @@ bad_unshare_cleanup_sigh: bad_unshare_cleanup_ns: if (new_ns) - put_namespace(new_ns); + put_mnt_ns(new_ns); bad_unshare_cleanup_fs: if (new_fs)