X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=kernel%2Fptrace.c;h=37118d72a9db7ed9c90acba57391bbccb06e634c;hb=97bf2856c6014879bd04983a3e9dfcdac1e7fe85;hp=786130d1b44e6323ea313fe9d2967aa3bd4fe2d7;hpb=16c70f8c1b54b61c3b951b6fb220df250fe09b32;p=linux-2.6.git

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 786130d1b..37118d72a 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -18,125 +18,17 @@
 #include <linux/ptrace.h>
 #include <linux/security.h>
 #include <linux/signal.h>
-
-#include <asm/pgtable.h>
-#include <asm/uaccess.h>
-
-#ifdef CONFIG_PTRACE
 #include <linux/utrace.h>
 #include <linux/tracehook.h>
+#include <linux/vs_context.h>
 #include <asm/tracehook.h>
-#endif
-
-int getrusage(struct task_struct *, int, struct rusage __user *);
-
-//#define PTRACE_DEBUG
-
-int __ptrace_may_attach(struct task_struct *task)
-{
-	/* May we inspect the given task?
-	 * This check is used both for attaching with ptrace
-	 * and for allowing access to sensitive information in /proc.
-	 *
-	 * ptrace_attach denies several cases that /proc allows
-	 * because setting up the necessary parent/child relationship
-	 * or halting the specified task is impossible.
-	 */
-	int dumpable = 0;
-	/* Don't let security modules deny introspection */
-	if (task == current)
-		return 0;
-	if (((current->uid != task->euid) ||
-	     (current->uid != task->suid) ||
-	     (current->uid != task->uid) ||
-	     (current->gid != task->egid) ||
-	     (current->gid != task->sgid) ||
-	     (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE))
-		return -EPERM;
-	smp_rmb();
-	if (task->mm)
-		dumpable = task->mm->dumpable;
-	if (!dumpable && !capable(CAP_SYS_PTRACE))
-		return -EPERM;
-
-	return security_ptrace(current, task);
-}
-
-int ptrace_may_attach(struct task_struct *task)
-{
-	int err;
-	task_lock(task);
-	err = __ptrace_may_attach(task);
-	task_unlock(task);
-	return !err;
-}
-
-/*
- * Access another process' address space.
- * Source/target buffer must be kernel space, 
- * Do not walk the page table directly, use get_user_pages
- */
-
-int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
-{
-	struct mm_struct *mm;
-	struct vm_area_struct *vma;
-	struct page *page;
-	void *old_buf = buf;
-
-	mm = get_task_mm(tsk);
-	if (!mm)
-		return 0;
-
-	down_read(&mm->mmap_sem);
-	/* ignore errors, just check how much was sucessfully transfered */
-	while (len) {
-		int bytes, ret, offset;
-		void *maddr;
-
-		ret = get_user_pages(tsk, mm, addr, 1,
-				write, 1, &page, &vma);
-		if (ret <= 0)
-			break;
-
-		bytes = len;
-		offset = addr & (PAGE_SIZE-1);
-		if (bytes > PAGE_SIZE-offset)
-			bytes = PAGE_SIZE-offset;
-
-		maddr = kmap(page);
-		if (write) {
-			copy_to_user_page(vma, page, addr,
-					  maddr + offset, buf, bytes);
-			set_page_dirty_lock(page);
-		} else {
-			copy_from_user_page(vma, page, addr,
-					    buf, maddr + offset, bytes);
-		}
-		kunmap(page);
-		page_cache_release(page);
-		len -= bytes;
-		buf += bytes;
-		addr += bytes;
-	}
-	up_read(&mm->mmap_sem);
-	mmput(mm);
-	
-	return buf - old_buf;
-}
-
-
-#ifndef CONFIG_PTRACE
-
-asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
-{
-	return -ENOSYS;
-}
-
-#else
+#include <asm/pgtable.h>
+#include <asm/uaccess.h>
 
 struct ptrace_state
 {
+	struct rcu_head rcu;
+ 
 	/*
 	 * These elements are always available, even when the struct is
 	 * awaiting destruction at the next RCU callback point.
@@ -146,39 +38,23 @@ struct ptrace_state
 	struct task_struct *parent; /* Whom we report to.  */
 	struct list_head entry;	/* Entry on parent->ptracees list.  */
 
-	union {
-		struct rcu_head dead;
-		struct {
-			u8 options; /* PTRACE_SETOPTIONS bits.  */
-			unsigned int stopped:1;	/* Stopped for report.  */
-			unsigned int reported:1; /* wait already reported.  */
-			unsigned int syscall:1;	/* Reporting for syscall.  */
+	u8 options;		/* PTRACE_SETOPTIONS bits.  */
+	unsigned int syscall:1;	/* Reporting for syscall.  */
 #ifdef PTRACE_SYSEMU
-			unsigned int sysemu:1; /* PTRACE_SYSEMU in progress. */
+	unsigned int sysemu:1;	/* PTRACE_SYSEMU in progress. */
 #endif
-			unsigned int have_eventmsg:1; /* u.eventmsg valid. */
-			unsigned int cap_sys_ptrace:1; /* Tracer capable.  */
-
-			union
-			{
-				unsigned long eventmsg;
-				siginfo_t *siginfo;
-			} u;
-		} live;
+	unsigned int have_eventmsg:1; /* u.eventmsg valid. */
+	unsigned int cap_sys_ptrace:1; /* Tracer capable.  */
+
+	union
+	{
+		unsigned long eventmsg;
+		siginfo_t *siginfo;
 	} u;
 };
 
 static const struct utrace_engine_ops ptrace_utrace_ops; /* Initialized below. */
 
-
-static void
-ptrace_state_link(struct ptrace_state *state)
-{
-	task_lock(state->parent);
-	list_add_rcu(&state->entry, &state->parent->ptracees);
-	task_unlock(state->parent);
-}
-
 static void
 ptrace_state_unlink(struct ptrace_state *state)
 {
@@ -187,48 +63,60 @@ ptrace_state_unlink(struct ptrace_state *state)
 	task_unlock(state->parent);
 }
 
-static int
+static struct ptrace_state *
 ptrace_setup(struct task_struct *target, struct utrace_attached_engine *engine,
-	     struct task_struct *parent, u8 options, int cap_sys_ptrace)
+	     struct task_struct *parent, u8 options, int cap_sys_ptrace,
+	     struct ptrace_state *state)
 {
-	struct ptrace_state *state = kzalloc(sizeof *state, GFP_USER);
-	if (unlikely(state == NULL))
-		return -ENOMEM;
+	if (state == NULL) {	/* no preallocated state: allocate one */
+		state = kzalloc(sizeof *state, GFP_USER);
+		if (unlikely(state == NULL))
+			return ERR_PTR(-ENOMEM);
+	}
 
 	state->engine = engine;
 	state->task = target;
 	state->parent = parent;
-	state->u.live.options = options;
-	state->u.live.cap_sys_ptrace = cap_sys_ptrace;
-	ptrace_state_link(state);
+	state->options = options;
+	state->cap_sys_ptrace = cap_sys_ptrace;
+
+	task_lock(parent);
+	if (unlikely(parent->flags & PF_EXITING)) {
+		task_unlock(parent);
+		kfree(state);	/* also frees a caller-preallocated state */
+		return ERR_PTR(-EALREADY);	/* parent is exiting */
+	}
+	list_add_rcu(&state->entry, &state->parent->ptracees);
+	task_unlock(state->parent);
 
 	BUG_ON(engine->data != 0);
 	rcu_assign_pointer(engine->data, (unsigned long) state);
 
-	return 0;
+	return state;	/* linked on parent->ptracees */
 }
 
 static void
 ptrace_state_free(struct rcu_head *rhead)
 {
 	struct ptrace_state *state = container_of(rhead,
-						  struct ptrace_state, u.dead);
+						  struct ptrace_state, rcu);
 	kfree(state);
 }
 
 static void
 ptrace_done(struct ptrace_state *state)
 {
-	INIT_RCU_HEAD(&state->u.dead);
-	call_rcu(&state->u.dead, ptrace_state_free);
+	INIT_RCU_HEAD(&state->rcu);
+	call_rcu(&state->rcu, ptrace_state_free);	/* deferred kfree */
 }
 
 /*
  * Update the tracing engine state to match the new ptrace state.
  */
-static void
-ptrace_update(struct task_struct *target, struct utrace_attached_engine *engine,
-	      unsigned long flags)
+static int __must_check
+ptrace_update(struct task_struct *target,
+	      struct utrace_attached_engine *engine,
+	      unsigned long flags, int from_stopped)
 {
 	struct ptrace_state *state = (struct ptrace_state *) engine->data;
 
@@ -236,7 +124,7 @@ ptrace_update(struct task_struct *target, struct utrace_attached_engine *engine,
 	 * These events are always reported.
 	 */
 	flags |= (UTRACE_EVENT(DEATH) | UTRACE_EVENT(EXEC)
-		  | UTRACE_EVENT_SIGNAL_ALL);
+		  | UTRACE_EVENT_SIGNAL_ALL | UTRACE_EVENT(JCTL));
 
 	/*
 	 * We always have to examine clone events to check for CLONE_PTRACE.
@@ -246,30 +134,55 @@ ptrace_update(struct task_struct *target, struct utrace_attached_engine *engine,
 	/*
 	 * PTRACE_SETOPTIONS can request more events.
 	 */
-	if (state->u.live.options & PTRACE_O_TRACEEXIT)
+	if (state->options & PTRACE_O_TRACEEXIT)
 		flags |= UTRACE_EVENT(EXIT);
-	if (state->u.live.options & PTRACE_O_TRACEVFORKDONE)
+	if (state->options & PTRACE_O_TRACEVFORKDONE)
 		flags |= UTRACE_EVENT(VFORK_DONE);
 
 	/*
 	 * ptrace always inhibits normal parent reaping.
-	 * But for a corner case we sometimes see the REAP event instead.
+	 * But for a corner case we sometimes see the REAP event anyway.
 	 */
 	flags |= UTRACE_ACTION_NOREAP | UTRACE_EVENT(REAP);
 
-	state->u.live.stopped = (flags & UTRACE_ACTION_QUIESCE) != 0;
-	if (!state->u.live.stopped) {
-		if (!state->u.live.have_eventmsg)
-			state->u.live.u.siginfo = NULL;
-		if (!(target->flags & PF_EXITING))
+	if (from_stopped && !(flags & UTRACE_ACTION_QUIESCE)) {
+		/*
+		 * We're letting the thread resume from ptrace stop.
+		 * If SIGKILL is waking it up, it can be racing with us here
+		 * to set its own exit_code in do_exit.  Though we clobber
+		 * it here, we check for the case in ptrace_report_death.
+		 */
+		if (!unlikely(target->flags & PF_SIGNALED))
 			target->exit_code = 0;
+
+		if (!state->have_eventmsg)
+			state->u.siginfo = NULL;
+
+		if (target->state == TASK_STOPPED) {
+			/*
+			 * We have to double-check for naughty de_thread
+			 * reaping despite NOREAP, before we can get siglock.
+			 */
+			read_lock(&tasklist_lock);
+			if (!target->exit_state) {
+				spin_lock_irq(&target->sighand->siglock);
+				if (target->state == TASK_STOPPED)
+					target->signal->flags &=
+						~SIGNAL_STOP_STOPPED;
+				spin_unlock_irq(&target->sighand->siglock);
+			}
+			read_unlock(&tasklist_lock);
+		}
 	}
-	utrace_set_flags(target, engine, flags);
+
+	return utrace_set_flags(target, engine, flags);
 }
 
 static int ptrace_traceme(void)
 {
 	struct utrace_attached_engine *engine;
+	struct ptrace_state *state;
+	struct task_struct *parent;
 	int retval;
 
 	engine = utrace_attach(current, (UTRACE_ATTACH_CREATE
@@ -283,16 +196,55 @@ static int ptrace_traceme(void)
 			retval = -EPERM;
 	}
 	else {
+		/*
+		 * We need to preallocate so that we can hold
+		 * rcu_read_lock from extracting ->parent through
+		 * ptrace_setup using it.
+		 */
+		state = kzalloc(sizeof *state, GFP_USER);
+		if (unlikely(state == NULL)) {
+			(void) utrace_detach(current, engine);
+			printk(KERN_ERR
+			       "ptrace out of memory, lost child %d of %d\n",
+			       current->pid, current->parent->pid);
+			return -ENOMEM;
+		}
+
+		rcu_read_lock();
+		parent = rcu_dereference(current->parent);
+
 		task_lock(current);
-		retval = security_ptrace(current->parent, current);
+		retval = security_ptrace(parent, current);
 		task_unlock(current);
-		if (!retval)
-			retval = ptrace_setup(current, engine,
-					      current->parent, 0, 0);
-		if (retval)
-			utrace_detach(current, engine);
-		else
-			ptrace_update(current, engine, 0);
+
+		if (retval) {
+			kfree(state);
+			(void) utrace_detach(current, engine);
+		}
+		else {
+			state = ptrace_setup(current, engine, parent, 0, 0,
+					     state);
+			if (IS_ERR(state))
+				retval = PTR_ERR(state);
+		}
+		rcu_read_unlock();
+
+		if (!retval) {
+			/*
+			 * This can't fail because we can't die while we
+			 * are here doing this.
+			 */
+			retval = ptrace_update(current, engine, 0, 0);
+			BUG_ON(retval);
+		}
+		else if (unlikely(retval == -EALREADY))
+			/*
+			 * We raced with our parent's exit, which would
+			 * have detached us just after our attach if
+			 * we'd won the race.  Pretend we got attached
+			 * and then detached immediately, no error.
+			 */
+			retval = 0;
 	}
 
 	return retval;
@@ -301,6 +253,7 @@ static int ptrace_traceme(void)
 static int ptrace_attach(struct task_struct *task)
 {
 	struct utrace_attached_engine *engine;
+	struct ptrace_state *state;
 	int retval;
 
 	retval = -EPERM;
@@ -311,6 +264,9 @@ static int ptrace_attach(struct task_struct *task)
 	if (!task->mm)		/* kernel threads */
 		goto bad;
 
+	pr_debug("%d ptrace_attach %d state %lu exit_code %x\n",
+		 current->pid, task->pid, task->state, task->exit_code);
+
 	engine = utrace_attach(task, (UTRACE_ATTACH_CREATE
 				      | UTRACE_ATTACH_EXCLUSIVE
 				      | UTRACE_ATTACH_MATCH_OPS),
@@ -322,54 +278,153 @@ static int ptrace_attach(struct task_struct *task)
 		goto bad;
 	}
 
-	if (ptrace_may_attach(task))
-		retval = ptrace_setup(task, engine, current, 0,
-				      capable(CAP_SYS_PTRACE));
+	pr_debug("%d ptrace_attach %d after utrace_attach: %lu exit_code %x\n",
+		 current->pid, task->pid, task->state, task->exit_code);
+
+	if (ptrace_may_attach(task)) {
+		state = ptrace_setup(task, engine, current, 0,
+				     capable(CAP_SYS_PTRACE), NULL);
+		if (IS_ERR(state))
+			retval = PTR_ERR(state);
+		else {
+			retval = ptrace_update(task, engine, 0, 0);
+
+			pr_debug("%d ptrace_attach %d after ptrace_update (%d)"
+				 " %lu exit_code %x\n",
+				 current->pid, task->pid, retval,
+				 task->state, task->exit_code);
+
+			if (retval) {
+				/*
+				 * It died before we enabled any callbacks.
+				 */
+				if (retval == -EALREADY)
+					retval = -ESRCH;
+				BUG_ON(retval != -ESRCH);
+				ptrace_state_unlink(state);
+				ptrace_done(state);
+			}
+		}
+	}
 	if (retval)
-		utrace_detach(task, engine);
+		(void) utrace_detach(task, engine);
 	else {
-		int stopped;
+		int stopped = 0;
 
-		/* Go */
-		ptrace_update(task, engine, 0);
-		force_sig_specific(SIGSTOP, task);
+		/*
+		 * We must double-check that task has not just died and
+		 * been reaped (after ptrace_update succeeded).
+		 * This happens when exec (de_thread) ignores NOREAP.
+		 * We cannot call into the signal code if it's dead.
+		 */
+		read_lock(&tasklist_lock);
+		if (likely(!task->exit_state)) {
+			force_sig_specific(SIGSTOP, task);
 
-		spin_lock_irq(&task->sighand->siglock);
-		stopped = (task->state == TASK_STOPPED);
-		spin_unlock_irq(&task->sighand->siglock);
+			spin_lock_irq(&task->sighand->siglock);
+			stopped = (task->state == TASK_STOPPED);
+			spin_unlock_irq(&task->sighand->siglock);
+		}
+		read_unlock(&tasklist_lock);
 
 		if (stopped) {
+			const struct utrace_regset *regset;
+
+			/*
+			 * Set QUIESCE immediately, so we can allow
+			 * ptrace requests while he's in TASK_STOPPED.
+			 */
+			retval = ptrace_update(task, engine,
+					       UTRACE_ACTION_QUIESCE, 0);
+			if (retval)
+				BUG_ON(retval != -ESRCH);
+			retval = 0;
+
 			/*
 			 * Do now the regset 0 writeback that we do on every
 			 * stop, since it's never been done.  On register
 			 * window machines, this makes sure the user memory
 			 * backing the register data is up to date.
 			 */
-			const struct utrace_regset *regset;
 			regset = utrace_regset(task, engine,
 					       utrace_native_view(task), 0);
 			if (regset->writeback)
 				(*regset->writeback)(task, regset, 1);
 		}
+
+		pr_debug("%d ptrace_attach %d complete (%sstopped)"
+			 " state %lu code %x\n",
+			 current->pid, task->pid, stopped ? "" : "not ",
+			 task->state, task->exit_code);
 	}
 
 bad:
 	return retval;
 }
 
+/*
+ * Detach; returns utrace_detach's error.  The task may be dying or being
+ * reaped in parallel, so engine/state may be stale: utrace_detach checks.
+ */
 static int ptrace_detach(struct task_struct *task,
-			 struct utrace_attached_engine *engine)
+			 struct utrace_attached_engine *engine,
+			 struct ptrace_state *state)
 {
-	struct ptrace_state *state = (struct ptrace_state *) engine->data;
+
+	int error;	/* utrace_detach result, returned to caller */
+
+#ifdef HAVE_ARCH_PTRACE_DETACH
 	/*
-	 * Clearing ->data before detach makes sure an unrelated task
-	 * calling into ptrace_tracer_task won't try to touch stale state.
+	 * Some funky compatibility code in arch_ptrace may have
+	 * needed to install special state it should clean up now.
 	 */
-	rcu_assign_pointer(engine->data, 0UL);
-	utrace_detach(task, engine);
-	ptrace_state_unlink(state);
-	ptrace_done(state);
-	return 0;
+	arch_ptrace_detach(task);
+#endif
+
+	/*
+	 * Traditional ptrace behavior does wake_up_process no matter what
+	 * in ptrace_detach.  But utrace_detach will not do a wakeup if
+	 * it's in a proper job control stop.  We need it to wake up from
+	 * TASK_STOPPED and either resume or process more signals.  A
+	 * pending stop signal will just leave it stopped again, but will
+	 * consume the signal, and reset task->exit_code for the next wait
+	 * call to see.  This is important to userland if ptrace_do_wait
+	 * "stole" the previous unwaited-for-ness (clearing exit_code), but
+	 * there is a pending SIGSTOP, e.g. sent by a PTRACE_ATTACH done
+	 * while already in job control stop.
+	 */
+	read_lock(&tasklist_lock);
+	if (likely(task->signal != NULL)) {
+		spin_lock_irq(&task->sighand->siglock);
+		task->signal->flags &= ~SIGNAL_STOP_STOPPED;
+		spin_unlock_irq(&task->sighand->siglock);
+	}
+	read_unlock(&tasklist_lock);
+
+	error = utrace_detach(task, engine);
+	if (!error) {
+		/*
+		 * We can only get here from the ptracer itself or via
+		 * detach_zombie from another thread in its group.
+		 */
+		BUG_ON(state->parent->tgid != current->tgid);
+		ptrace_state_unlink(state);
+		ptrace_done(state);
+
+		/*
+		 * Wake up any other threads that might be blocked in
+		 * wait.  Though traditional ptrace does not guarantee
+		 * this wakeup on PTRACE_DETACH, it does prevent
+		 * erroneous blocking in wait when another racing
+		 * thread's wait call reap-detaches the last child.
+		 * Without this wakeup, another thread might stay
+		 * blocked when it should return -ECHILD.
+		 */
+		spin_lock_irq(&current->sighand->siglock);
+		wake_up_interruptible(&current->signal->wait_chldexit);
+		spin_unlock_irq(&current->sighand->siglock);
+	}
+	return error;
 }
 
 
@@ -379,30 +434,51 @@ static int ptrace_detach(struct task_struct *task,
 void
 ptrace_exit(struct task_struct *tsk)
 {
-	rcu_read_lock();
-	if (unlikely(!list_empty(&tsk->ptracees))) {
-		struct ptrace_state *state, *next;
+	struct list_head *pos, *n;
 
-		/*
-		 * First detach the utrace layer from all the tasks.
-		 * We don't want to hold any locks while calling utrace_detach.
-		 */
-		list_for_each_entry_rcu(state, &tsk->ptracees, entry) {
-			rcu_assign_pointer(state->engine->data, 0UL);
-			utrace_detach(state->task, state->engine);
-		}
+	/*
+	 * Taking the task_lock after PF_EXITING is set ensures that a
+	 * child in ptrace_traceme will not put itself on our list when
+	 * we might already be tearing it down.
+	 */
+	task_lock(tsk);
+	if (likely(list_empty(&tsk->ptracees))) {
+		task_unlock(tsk);
+		return;
+	}
+	task_unlock(tsk);
 
-		/*
-		 * Now clear out our list and clean up our data structures.
-		 * The task_lock protects our list structure.
-		 */
-		task_lock(tsk);
-		list_for_each_entry_safe(state, next, &tsk->ptracees, entry) {
-			list_del_rcu(&state->entry);
+restart:
+	rcu_read_lock();
+
+	list_for_each_safe_rcu(pos, n, &tsk->ptracees) {
+		struct ptrace_state *state = list_entry(pos,
+							struct ptrace_state,
+							entry);
+		int error = utrace_detach(state->task, state->engine);
+		BUG_ON(state->parent != tsk);
+		if (likely(error == 0)) {
+			ptrace_state_unlink(state);
 			ptrace_done(state);
 		}
-		task_unlock(tsk);
+		else if (unlikely(error == -EALREADY)) {
+			/*
+			 * It's still doing report_death callbacks.
+			 * Just wait for it to settle down.
+			 * Since wait_task_inactive might yield,
+			 * we must go out of rcu_read_lock and restart.
+			 */
+			struct task_struct *p = state->task;
+			get_task_struct(p);
+			rcu_read_unlock();
+			wait_task_inactive(p);
+			put_task_struct(p);
+			goto restart;
+		}
+		else
+			BUG_ON(error != -ESRCH);
 	}
+
 	rcu_read_unlock();
 
 	BUG_ON(!list_empty(&tsk->ptracees));
@@ -421,15 +497,15 @@ ptrace_induce_signal(struct task_struct *target,
 	if (!valid_signal(signr))
 		return -EIO;
 
-	if (state->u.live.syscall) {
+	if (state->syscall) {
 		/*
 		 * This is the traditional ptrace behavior when given
 		 * a signal to resume from a syscall tracing stop.
 		 */
 		send_sig(signr, target, 1);
 	}
-	else if (!state->u.live.have_eventmsg && state->u.live.u.siginfo) {
-		siginfo_t *info = state->u.live.u.siginfo;
+	else if (!state->have_eventmsg && state->u.siginfo) {
+		siginfo_t *info = state->u.siginfo;
 
 		/* Update the siginfo structure if the signal has
 		   changed.  If the debugger wanted something
@@ -450,7 +526,7 @@ ptrace_induce_signal(struct task_struct *target,
 	return 0;
 }
 
-fastcall int
+int
 ptrace_regset_access(struct task_struct *target,
 		     struct utrace_attached_engine *engine,
 		     const struct utrace_regset_view *view,
@@ -485,7 +561,7 @@ ptrace_regset_access(struct task_struct *target,
 	return ret;
 }
 
-fastcall int
+int
 ptrace_onereg_access(struct task_struct *target,
 		     struct utrace_attached_engine *engine,
 		     const struct utrace_regset_view *view,
@@ -523,7 +599,7 @@ ptrace_onereg_access(struct task_struct *target,
 	return ret;
 }
 
-fastcall int
+int
 ptrace_layout_access(struct task_struct *target,
 		     struct utrace_attached_engine *engine,
 		     const struct utrace_regset_view *view,
@@ -556,7 +632,7 @@ ptrace_layout_access(struct task_struct *target,
 			 * This is a no-op/zero-fill portion of struct user.
 			 */
 			ret = 0;
-			if (!write) {
+			if (!write && seg->offset == 0) {
 				if (kdata)
 					memset(kdata, 0, n);
 				else if (clear_user(udata, n))
@@ -623,9 +699,7 @@ ptrace_start(long pid, long request,
 	if (child)
 		get_task_struct(child);
 	read_unlock(&tasklist_lock);
-#ifdef PTRACE_DEBUG
-	printk("ptrace pid %ld => %p\n", pid, child);
-#endif
+	pr_debug("ptrace pid %ld => %p\n", pid, child);
 	if (!child)
 		goto out;
 
@@ -633,7 +707,8 @@ ptrace_start(long pid, long request,
 	if (pid == 1)		/* you may not mess with init */
 		goto out_tsk;
 
-	if (!vx_check(vx_task_xid(child), VX_WATCH|VX_IDENT))
+	ret = -EPERM;
+	if (!vx_check(vx_task_xid(child), VS_WATCH_P|VS_IDENT))
 		goto out_tsk;
 
 	if (request == PTRACE_ATTACH) {
@@ -641,31 +716,25 @@ ptrace_start(long pid, long request,
 		goto out_tsk;
 	}
 
+	rcu_read_lock();
 	engine = utrace_attach(child, UTRACE_ATTACH_MATCH_OPS,
 			       &ptrace_utrace_ops, 0);
 	ret = -ESRCH;
 	if (IS_ERR(engine) || engine == NULL)
-		goto out_tsk;
-	rcu_read_lock();
+		goto out_tsk_rcu;
 	state = rcu_dereference((struct ptrace_state *) engine->data);
-	if (state == NULL || state->parent != current) {
-		rcu_read_unlock();
-		goto out_tsk;
-	}
+	if (state == NULL || state->parent != current)
+		goto out_tsk_rcu;
 	rcu_read_unlock();
 
 	/*
 	 * Traditional ptrace behavior demands that the target already be
 	 * quiescent, but not dead.
 	 */
-	if (request != PTRACE_KILL && !state->u.live.stopped) {
-#ifdef PTRACE_DEBUG
-		printk("%d not stopped (%lx)\n", child->pid, child->state);
-#endif
-		if (child->state != TASK_STOPPED)
-			goto out_tsk;
-		utrace_set_flags(child, engine,
-				 engine->flags | UTRACE_ACTION_QUIESCE);
+	if (request != PTRACE_KILL
+	    && !(engine->flags & UTRACE_ACTION_QUIESCE)) {
+		pr_debug("%d not stopped (%lu)\n", child->pid, child->state);
+		goto out_tsk;
 	}
 
 	/*
@@ -690,6 +759,8 @@ ptrace_start(long pid, long request,
 	*statep = state;
 	return -EIO;
 
+out_tsk_rcu:
+	rcu_read_unlock();
 out_tsk:
 	put_task_struct(child);
 out:
@@ -711,8 +782,13 @@ ptrace_common(long request, struct task_struct *child,
 		 * Detach a process that was attached.
 		 */
 		ret = ptrace_induce_signal(child, engine, data);
-		if (!ret)
-			ret = ptrace_detach(child, engine);
+		if (!ret) {
+			ret = ptrace_detach(child, engine, state);
+			if (ret == -EALREADY) /* Already a zombie.  */
+				ret = -ESRCH;
+			if (ret)
+				BUG_ON(ret != -ESRCH);
+		}
 		break;
 
 		/*
@@ -755,7 +831,7 @@ ptrace_common(long request, struct task_struct *child,
 		 */
 		flags = 0;
 #ifdef PTRACE_SYSEMU
-		state->u.live.sysemu = (request == PTRACE_SYSEMU_SINGLESTEP
+		state->sysemu = (request == PTRACE_SYSEMU_SINGLESTEP
 					|| request == PTRACE_SYSEMU);
 #endif
 		if (request == PTRACE_SINGLESTEP
@@ -775,7 +851,9 @@ ptrace_common(long request, struct task_struct *child,
 			 || request == PTRACE_SYSEMU_SINGLESTEP)
 			flags |= UTRACE_EVENT(SYSCALL_ENTRY);
 #endif
-		ptrace_update(child, engine, flags);
+		ret = ptrace_update(child, engine, flags, 1);
+		if (ret)
+			BUG_ON(ret != -ESRCH);
 		ret = 0;
 		break;
 
@@ -786,8 +864,10 @@ ptrace_common(long request, struct task_struct *child,
 		ret = -EINVAL;
 		if (data & ~PTRACE_O_MASK)
 			break;
-		state->u.live.options = data;
-		ptrace_update(child, engine, UTRACE_ACTION_QUIESCE);
+		state->options = data;
+		ret = ptrace_update(child, engine, UTRACE_ACTION_QUIESCE, 1);
+		if (ret)
+			BUG_ON(ret != -ESRCH);
 		ret = 0;
 		break;
 	}
@@ -803,10 +883,8 @@ asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
 	struct ptrace_state *state;
 	long ret, val;
 
-#ifdef PTRACE_DEBUG
-	printk("%d sys_ptrace(%ld, %ld, %lx, %lx)\n",
-	       current->pid, request, pid, addr, data);
-#endif
+	pr_debug("%d sys_ptrace(%ld, %ld, %lx, %lx)\n",
+		 current->pid, request, pid, addr, data);
 
 	ret = ptrace_start(pid, request, &child, &engine, &state);
 	if (ret != -EIO)
@@ -849,32 +927,32 @@ asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
 		break;
 
 	case PTRACE_GETEVENTMSG:
-		ret = put_user(state->u.live.have_eventmsg
-			       ? state->u.live.u.eventmsg : 0L,
+		ret = put_user(state->have_eventmsg
+			       ? state->u.eventmsg : 0L,
 			       (unsigned long __user *) data);
 		break;
 	case PTRACE_GETSIGINFO:
 		ret = -EINVAL;
-		if (!state->u.live.have_eventmsg && state->u.live.u.siginfo)
+		if (!state->have_eventmsg && state->u.siginfo)
 			ret = copy_siginfo_to_user((siginfo_t __user *) data,
-						   state->u.live.u.siginfo);
+						   state->u.siginfo);
 		break;
 	case PTRACE_SETSIGINFO:
 		ret = -EINVAL;
-		if (!state->u.live.have_eventmsg && state->u.live.u.siginfo
-		    && copy_from_user(state->u.live.u.siginfo,
-				      (siginfo_t __user *) data,
-				      sizeof(siginfo_t)))
-			ret = -EFAULT;
+		if (!state->have_eventmsg && state->u.siginfo) {
+			ret = 0;
+			if (copy_from_user(state->u.siginfo,
+					   (siginfo_t __user *) data,
+					   sizeof(siginfo_t)))
+				ret = -EFAULT;
+		}
 		break;
 	}
 
 out_tsk:
 	put_task_struct(child);
 out:
-#ifdef PTRACE_DEBUG
-	printk("%d ptrace -> %x\n", current->pid, ret);
-#endif
+	pr_debug("%d ptrace -> %lx\n", current->pid, ret);
 	return ret;
 }
 
@@ -891,10 +969,8 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
 	struct ptrace_state *state;
 	compat_long_t ret, val;
 
-#ifdef PTRACE_DEBUG
-	printk("%d compat_sys_ptrace(%d, %d, %x, %x)\n",
-	       current->pid, request, pid, addr, cdata);
-#endif
+	pr_debug("%d compat_sys_ptrace(%d, %d, %x, %x)\n",
+		 current->pid, request, pid, addr, cdata);
 	ret = ptrace_start(pid, request, &child, &engine, &state);
 	if (ret != -EIO)
 		goto out;
@@ -936,22 +1012,22 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
 		break;
 
 	case PTRACE_GETEVENTMSG:
-		ret = put_user(state->u.live.have_eventmsg
-			       ? state->u.live.u.eventmsg : 0L,
+		ret = put_user(state->have_eventmsg
+			       ? state->u.eventmsg : 0L,
 			       (compat_long_t __user *) data);
 		break;
 	case PTRACE_GETSIGINFO:
 		ret = -EINVAL;
-		if (!state->u.live.have_eventmsg && state->u.live.u.siginfo)
+		if (!state->have_eventmsg && state->u.siginfo)
 			ret = copy_siginfo_to_user32(
 				(struct compat_siginfo __user *) data,
-				state->u.live.u.siginfo);
+				state->u.siginfo);
 		break;
 	case PTRACE_SETSIGINFO:
 		ret = -EINVAL;
-		if (!state->u.live.have_eventmsg && state->u.live.u.siginfo
+		if (!state->have_eventmsg && state->u.siginfo
 		    && copy_siginfo_from_user32(
-			    state->u.live.u.siginfo,
+			    state->u.siginfo,
 			    (struct compat_siginfo __user *) data))
 			ret = -EFAULT;
 		break;
@@ -960,14 +1036,66 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
 out_tsk:
 	put_task_struct(child);
 out:
-#ifdef PTRACE_DEBUG
-	printk("%d ptrace -> %x\n", current->pid, ret);
-#endif
+	pr_debug("%d ptrace -> %lx\n", current->pid, (long)ret);
 	return ret;
 }
 #endif
 
 
+/*
+ * Detach zombie @p from tracer @tsk once its death was reported by wait.
+ */
+static inline void
+detach_zombie(struct task_struct *tsk,
+	      struct task_struct *p, struct ptrace_state *state)
+{
+	int detach_error;
+	struct utrace_attached_engine *engine;
+
+restart:
+	detach_error = 0;
+	rcu_read_lock();
+	if (tsk == current)	/* caller is the tracer itself */
+		engine = state->engine;
+	else {
+		/*
+		 * We've excluded other ptrace_do_wait calls.  But the
+		 * ptracer itself might have done ptrace_detach while we
+		 * did not have rcu_read_lock.  So double-check that state
+		 * is still valid.
+		 */
+		engine = utrace_attach(
+			p, (UTRACE_ATTACH_MATCH_OPS
+			    | UTRACE_ATTACH_MATCH_DATA),
+			&ptrace_utrace_ops,
+			(unsigned long) state);
+		if (IS_ERR(engine) || state->parent != tsk)
+			detach_error = -ESRCH;
+		else
+			BUG_ON(state->engine != engine);
+	}
+	rcu_read_unlock();
+	if (likely(!detach_error))
+		detach_error = ptrace_detach(p, engine, state);
+	if (unlikely(detach_error == -EALREADY)) {
+		/*
+		 * It's still doing report_death callbacks.
+		 * Just wait for it to settle down.
+		 */
+		wait_task_inactive(p); /* Might block.  */
+		goto restart;
+	}
+	/*
+	 * A failure with -ESRCH means that report_reap is
+	 * already running and will do the cleanup, or that
+	 * we lost a race with ptrace_detach in another
+	 * thread or with the automatic detach in
+	 * report_death.
+	 */
+	if (detach_error)
+		BUG_ON(detach_error != -ESRCH);
+}
+
 /*
  * We're called with tasklist_lock held for reading.
  * If we return -ECHILD or zero, next_thread(tsk) must still be valid to use.
@@ -982,7 +1110,7 @@ ptrace_do_wait(struct task_struct *tsk,
 	struct ptrace_state *state;
 	struct task_struct *p;
 	int err = -ECHILD;
-	int why, status;
+	int exit_code, why, status;
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(state, &tsk->ptracees, entry) {
@@ -1004,34 +1132,88 @@ ptrace_do_wait(struct task_struct *tsk,
 		if (security_task_wait(p))
 			continue;
 
+		/*
+		 * This is a matching child.  If we don't win now, tell
+		 * our caller to block and repeat.  From this point we
+		 * must ensure that wait_chldexit will get a wakeup for
+		 * any tracee stopping, dying, or being detached.
+		 * For death, tasklist_lock guarantees this already.
+		 */
 		err = 0;
-		if (state->u.live.reported)
-			continue;
 
-		if (state->u.live.stopped)
-			goto found;
-		if ((p->state & (TASK_TRACED | TASK_STOPPED))
-		    && (p->signal->flags & SIGNAL_STOP_STOPPED))
-			goto found;
-		if (p->exit_state == EXIT_ZOMBIE) {
+		switch (p->exit_state) {
+		case EXIT_ZOMBIE:
 			if (!likely(options & WEXITED))
 				continue;
-			if (delay_group_leader(p))
+			if (delay_group_leader(p)) {
+				struct task_struct *next = next_thread(p);
+				pr_debug("%d ptrace_do_wait leaving %d "
+					 "zombie code %x "
+					 "delay_group_leader (%d/%lu)\n",
+					 current->pid, p->pid, p->exit_code,
+					 next->pid, next->state);
 				continue;
+			}
+			exit_code = p->exit_code;
 			goto found;
+		case EXIT_DEAD:
+			continue;
+		default:
+			/*
+			 * tasklist_lock holds up any transitions to
+			 * EXIT_ZOMBIE.  After releasing it we are
+			 * guaranteed a wakeup on wait_chldexit after
+			 * any new deaths.
+			 */
+			if (p->flags & PF_EXITING)
+				/*
+				 * It's in do_exit and might have set
+				 * p->exit_code already, but it's not quite
+				 * dead yet.  It will get to report_death
+				 * and wakes us up when it finishes.
+				 */
+				continue;
+			break;
 		}
+
+		/*
+		 * This xchg atomically ensures that only one do_wait
+		 * call can report this thread.  Because exit_code is
+		 * always set before do_notify wakes us up, after this
+		 * check fails we are sure to get a wakeup if it stops.
+		 */
+		exit_code = xchg(&p->exit_code, 0);
+		if (exit_code)
+			goto found;
+
 		// XXX should handle WCONTINUED
+
+		pr_debug("%d ptrace_do_wait leaving %d state %lu code %x\n",
+			 current->pid, p->pid, p->state, p->exit_code);
 	}
 	rcu_read_unlock();
+	if (err == 0)
+		pr_debug("%d ptrace_do_wait blocking\n", current->pid);
+
 	return err;
 
 found:
+	BUG_ON(state->parent != tsk);
 	rcu_read_unlock();
 
-	BUG_ON(state->parent != tsk);
+	pr_debug("%d ptrace_do_wait (%d) found %d code %x (%lu/%d)\n",
+		 current->pid, tsk->pid, p->pid, exit_code,
+		 p->exit_state, p->exit_signal);
+
+	/*
+	 * If there was a group exit in progress, all threads report that
+	 * status.  Most will have SIGKILL in their own exit_code.
+	 */
+	if (p->signal->flags & SIGNAL_GROUP_EXIT)
+		exit_code = p->signal->group_exit_code;
 
 	if (p->exit_state) {
-		if (unlikely(p->parent == state->parent))
+		if (unlikely(p->parent == tsk && p->exit_signal != -1))
 			/*
 			 * This is our natural child we were ptracing.
 			 * When it dies it detaches (see ptrace_report_death).
@@ -1040,23 +1222,26 @@ found:
 			 * the normal wait_task_zombie path instead.
 			 */
 			return 0;
-		if ((p->exit_code & 0x7f) == 0) {
+		if ((exit_code & 0x7f) == 0) {
 			why = CLD_EXITED;
-			status = p->exit_code >> 8;
-		} else {
-			why = (p->exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED;
-			status = p->exit_code & 0xff;
+			status = exit_code >> 8;
+		}
+		else {
+			why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED;
+			status = exit_code & 0x7f;
 		}
 	}
 	else {
 		why = CLD_TRAPPED;
-		status = (p->exit_code << 8) | 0x7f;
+		status = exit_code;
+		exit_code = (status << 8) | 0x7f;
 	}
 
 	/*
 	 * At this point we are committed to a successful return
 	 * or a user error return.  Release the tasklist_lock.
 	 */
+	get_task_struct(p);
 	read_unlock(&tasklist_lock);
 
 	if (rusagep)
@@ -1076,41 +1261,56 @@ found:
 			err = put_user(status, &infop->si_status);
 	}
 	if (!err && stat_addr)
-		err = put_user(status, stat_addr);
+		err = put_user(exit_code, stat_addr);
 
 	if (!err) {
-		struct utrace *utrace;
-
+		if (why != CLD_TRAPPED)
+			/*
+			 * This was a death report.  The ptracer's wait
+			 * does an implicit detach, so the zombie reports
+			 * to its real parent now.
+			 */
+			detach_zombie(tsk, p, state);
 		err = p->pid;
-
-		/*
-		 * If this was a non-death report, the child might now be
-		 * detaching on death in the same race possible in the
-		 * p->exit_state check above.  So check for p->utrace being
-		 * NULL, then we don't need to update the state any more.
-		 */
-		rcu_read_lock();
-		utrace = rcu_dereference(p->utrace);
-		if (likely(utrace != NULL)) {
-			utrace_lock(utrace);
-			if (unlikely(state->u.live.reported))
-				/*
-				 * Another thread in the group got here
-				 * first and reaped it before we locked.
-				 */
-				err = -ERESTARTNOINTR;
-			state->u.live.reported = 1;
-			utrace_unlock(utrace);
-		}
-		rcu_read_unlock();
-
-		if (err > 0 && why != CLD_TRAPPED)
-			ptrace_detach(p, state->engine);
 	}
 
+	put_task_struct(p);
+
 	return err;
 }
 
+
+/*
+ * All the report callbacks (except death and reap) are subject to a race
+ * with ptrace_exit doing a quick detach and ptrace_done.  It can do this
+ * even when the target is not quiescent, so a callback may already be in
+ * progress when it does ptrace_done.  Callbacks use this function to fetch
+ * the struct ptrace_state while ensuring it doesn't disappear until
+ * put_ptrace_state is called.  This just uses RCU, since state and
+ * anything we try to do to state->parent is safe under rcu_read_lock.
+ */
+static struct ptrace_state *
+get_ptrace_state(struct utrace_attached_engine *engine,
+		 struct task_struct *tsk)
+{
+	struct ptrace_state *found;
+
+	/* On success we return still inside the RCU read-side section. */
+	rcu_read_lock();
+	found = rcu_dereference((struct ptrace_state *) engine->data);
+	if (unlikely(found == NULL))
+		rcu_read_unlock();
+
+	return found;
+}
+
+/* Drop the RCU read lock still held after get_ptrace_state gave non-NULL. */
+static inline void
+put_ptrace_state(struct ptrace_state *state)
+{
+	rcu_read_unlock();
+}
+
 static void
 do_notify(struct task_struct *tsk, struct task_struct *parent, int why)
 {
@@ -1147,6 +1347,10 @@ do_notify(struct task_struct *tsk, struct task_struct *parent, int why)
 		}
 	}
 
+	read_lock(&tasklist_lock);
+	if (unlikely(parent->signal == NULL))
+		goto out;
+
 	sighand = parent->sighand;
 	spin_lock_irqsave(&sighand->siglock, flags);
 	if (sighand->action[SIGCHLD-1].sa.sa_handler != SIG_IGN &&
@@ -1157,32 +1361,33 @@ do_notify(struct task_struct *tsk, struct task_struct *parent, int why)
 	 */
 	wake_up_interruptible_sync(&parent->signal->wait_chldexit);
 	spin_unlock_irqrestore(&sighand->siglock, flags);
+
+out:
+	read_unlock(&tasklist_lock);
 }
 
 static u32
-ptrace_report(struct utrace_attached_engine *engine, struct task_struct *tsk,
+ptrace_report(struct utrace_attached_engine *engine,
+	      struct task_struct *tsk,
+	      struct ptrace_state *state,
 	      int code)
 {
-	struct ptrace_state *state = (struct ptrace_state *) engine->data;
 	const struct utrace_regset *regset;
 
-#ifdef PTRACE_DEBUG
-	printk("%d ptrace_report %d engine %p state %p code %x parent %d (%p)\n",
-	       current->pid, tsk->pid, engine, state, code,
-	       state->parent->pid, state->parent);
-	if (!state->u.live.have_eventmsg && state->u.live.u.siginfo) {
-		const siginfo_t *si = state->u.live.u.siginfo;
-		printk("  si %d code %x errno %d addr %p\n",
-		       si->si_signo, si->si_code, si->si_errno,
-		       si->si_addr);
+	pr_debug("%d ptrace_report %d engine %p"
+		 " state %p code %x parent %d (%p)\n",
+		 current->pid, tsk->pid, engine, state, code,
+		 state->parent->pid, state->parent);
+	if (!state->have_eventmsg && state->u.siginfo) {
+		const siginfo_t *si = state->u.siginfo;
+		pr_debug("  si %d code %x errno %d addr %p\n",
+			 si->si_signo, si->si_code, si->si_errno,
+			 si->si_addr);
 	}
-#endif
-
-	BUG_ON(state->u.live.stopped);
 
 	/*
 	 * Set our QUIESCE flag right now, before notifying the tracer.
-	 * We do this before setting state->u.live.stopped rather than
+	 * We do this before setting tsk->exit_code rather than
 	 * by using UTRACE_ACTION_NEWSTATE in our return value, to
 	 * ensure that the tracer can't get the notification and then
 	 * try to resume us with PTRACE_CONT before we set the flag.
@@ -1199,43 +1404,57 @@ ptrace_report(struct utrace_attached_engine *engine, struct task_struct *tsk,
 	if (regset->writeback)
 		(*regset->writeback)(tsk, regset, 0);
 
-	state->u.live.stopped = 1;
-	state->u.live.reported = 0;
+	BUG_ON(code == 0);
 	tsk->exit_code = code;
 	do_notify(tsk, state->parent, CLD_TRAPPED);
 
-#ifdef PTRACE_DEBUG
-	printk("%d ptrace_report quiescing exit_code %x\n",
-	       current->pid, current->exit_code);
-#endif
+	pr_debug("%d ptrace_report quiescing exit_code %x\n",
+		 current->pid, current->exit_code);
+
+	put_ptrace_state(state);
 
 	return UTRACE_ACTION_RESUME;
 }
 
 static inline u32
-ptrace_event(struct utrace_attached_engine *engine, struct task_struct *tsk,
+ptrace_event(struct utrace_attached_engine *engine,
+	     struct task_struct *tsk,
+	     struct ptrace_state *state,
 	     int event)
 {
-	struct ptrace_state *state = (struct ptrace_state *) engine->data;
-	state->u.live.syscall = 0;
-	return ptrace_report(engine, tsk, (event << 8) | SIGTRAP);
+	state->syscall = 0;
+	return ptrace_report(engine, tsk, state, (event << 8) | SIGTRAP);
 }
 
-
+/*
+ * Unlike other report callbacks, this can't be called while ptrace_exit
+ * is doing ptrace_done in parallel, so we don't need get_ptrace_state.
+ */
 static u32
 ptrace_report_death(struct utrace_attached_engine *engine,
 		    struct task_struct *tsk)
 {
 	struct ptrace_state *state = (struct ptrace_state *) engine->data;
 
-	if (tsk->parent == state->parent) {
+	if (tsk->exit_code == 0 && unlikely(tsk->flags & PF_SIGNALED))
+		/*
+		 * This can only mean that tsk->exit_code was clobbered
+		 * by ptrace_update or ptrace_do_wait in a race with
+		 * an asynchronous wakeup and exit for SIGKILL.
+		 */
+		tsk->exit_code = SIGKILL;
+
+	if (tsk->parent == state->parent && tsk->exit_signal != -1) {
 		/*
-		 * This is a natural child, so we detach and let the normal
+		 * This is a natural child (excluding clone siblings of a
+		 * child group_leader), so we detach and let the normal
 		 * reporting happen once our NOREAP action is gone.  But
 		 * first, generate a SIGCHLD for those cases where normal
 		 * behavior won't.  A ptrace'd child always generates SIGCHLD.
 		 */
-		if (tsk->exit_signal == -1 || !thread_group_empty(tsk))
+		pr_debug("ptrace %d death natural parent %d exit_code %x\n",
+			 tsk->pid, state->parent->pid, tsk->exit_code);
+		if (!thread_group_empty(tsk))
 			do_notify(tsk, state->parent, CLD_EXITED);
 		ptrace_state_unlink(state);
 		rcu_assign_pointer(engine->data, 0UL);
@@ -1243,8 +1462,17 @@ ptrace_report_death(struct utrace_attached_engine *engine,
 		return UTRACE_ACTION_DETACH;
 	}
 
-	state->u.live.reported = 0;
+	/*
+	 * This might be a second report_death callback for a group leader
+	 * that was delayed when its original report_death callback was made.
+	 * Repeating do_notify is exactly what we need for that case too.
+	 * After the wakeup, ptrace_do_wait will see delay_group_leader false.
+	 */
+
+	pr_debug("ptrace %d death notify %d exit_code %x: ",
+		 tsk->pid, state->parent->pid, tsk->exit_code);
 	do_notify(tsk, state->parent, CLD_EXITED);
+	pr_debug("%d notified %d\n", tsk->pid, state->parent->pid);
 	return UTRACE_ACTION_RESUME;
 }
 
@@ -1256,36 +1484,99 @@ static void
 ptrace_report_reap(struct utrace_attached_engine *engine,
 		   struct task_struct *tsk)
 {
-	struct ptrace_state *state;
-	rcu_read_lock();
-	state = rcu_dereference((struct ptrace_state *) engine->data);
+	struct ptrace_state *state = get_ptrace_state(engine, tsk);
 	if (state != NULL) {
 		ptrace_state_unlink(state);
 		rcu_assign_pointer(engine->data, 0UL);
 		ptrace_done(state);
+		put_ptrace_state(state);
 	}
-	rcu_read_unlock();
 }
 
+/*
+ * Start tracing the child on behalf of our own tracer.  The tracer,
+ * options and cap_sys_ptrace are copied out of state and the tracer is
+ * pinned first, because this has to do put_ptrace_state before it can do
+ * allocation that might block; the caller must not use state afterwards.
+ * On success the child has SIGSTOP pending.  Failures are only logged
+ * (out of memory) or ignored (tracer already exiting).
+ */
+static void
+ptrace_clone_setup(struct utrace_attached_engine *engine,
+		   struct task_struct *parent,
+		   struct ptrace_state *state,
+		   struct task_struct *child)
+{
+	struct task_struct *tracer;
+	struct utrace_attached_engine *child_engine;
+	struct ptrace_state *child_state;
+	int ret;
+	u8 options;
+	int cap_sys_ptrace;
+
+	tracer = state->parent;
+	options = state->options;
+	cap_sys_ptrace = state->cap_sys_ptrace;
+	get_task_struct(tracer);
+	put_ptrace_state(state);
+
+	child_engine = utrace_attach(child, (UTRACE_ATTACH_CREATE
+					     | UTRACE_ATTACH_EXCLUSIVE
+					     | UTRACE_ATTACH_MATCH_OPS),
+				     &ptrace_utrace_ops, 0UL);
+	if (unlikely(IS_ERR(child_engine))) {
+		BUG_ON(PTR_ERR(child_engine) != -ENOMEM);
+		put_task_struct(tracer);
+		goto nomem;
+	}
+
+	child_state = ptrace_setup(child, child_engine,
+				   tracer, options, cap_sys_ptrace, NULL);
+	put_task_struct(tracer);
+
+	if (unlikely(IS_ERR(child_state))) {
+		(void) utrace_detach(child, child_engine);
+		if (PTR_ERR(child_state) == -ENOMEM)
+			goto nomem;
+
+		/*
+		 * Our tracer has started exiting.  It's
+		 * too late to set it up tracing the child.
+		 */
+		BUG_ON(PTR_ERR(child_state) != -EALREADY);
+	}
+	else {
+		sigaddset(&child->pending.signal, SIGSTOP);
+		set_tsk_thread_flag(child, TIF_SIGPENDING);
+		ret = ptrace_update(child, child_engine, 0, 0);
+
+		/* The child hasn't run yet, it can't have died already. */
+		BUG_ON(ret);
+	}
+
+	return;
+
+nomem:
+	printk(KERN_ERR "ptrace out of memory, lost child %d of %d\n",
+	       child->pid, parent->pid);
+}
 
 static u32
 ptrace_report_clone(struct utrace_attached_engine *engine,
 		    struct task_struct *parent,
 		    unsigned long clone_flags, struct task_struct *child)
 {
-	struct ptrace_state *state = (struct ptrace_state *) engine->data;
-	struct utrace_attached_engine *child_engine;
-	int event = PTRACE_EVENT_FORK;
-	int option = PTRACE_O_TRACEFORK;
-
-#ifdef PTRACE_DEBUG
-	printk("%d (%p) engine %p ptrace_report_clone child %d (%p) fl %lx\n",
-	       parent->pid, parent, engine, child->pid, child, clone_flags);
-#endif
+	int event, option;
+	struct ptrace_state *state = get_ptrace_state(engine, parent);
+	if (unlikely(state == NULL))
+		return UTRACE_ACTION_RESUME;
 
-	if (clone_flags & CLONE_UNTRACED)
-		goto out;
+	pr_debug("%d (%p) engine %p"
+		 " ptrace_report_clone child %d (%p) fl %lx\n",
+		 parent->pid, parent, engine, child->pid, child, clone_flags);
 
+	event = PTRACE_EVENT_FORK;
+	option = PTRACE_O_TRACEFORK;
 	if (clone_flags & CLONE_VFORK) {
 		event = PTRACE_EVENT_VFORK;
 		option = PTRACE_O_TRACEVFORK;
@@ -1295,45 +1586,34 @@ ptrace_report_clone(struct utrace_attached_engine *engine,
 		option = PTRACE_O_TRACECLONE;
 	}
 
-	if (!(clone_flags & CLONE_PTRACE) && !(state->u.live.options & option))
-		goto out;
-
-	child_engine = utrace_attach(child, (UTRACE_ATTACH_CREATE
-					     | UTRACE_ATTACH_EXCLUSIVE
-					     | UTRACE_ATTACH_MATCH_OPS),
-				     &ptrace_utrace_ops, 0UL);
-	if (unlikely(IS_ERR(child_engine))) {
-		BUG_ON(PTR_ERR(child_engine) != -ENOMEM);
-		printk(KERN_ERR
-		       "ptrace out of memory, lost child %d of %d",
-		       child->pid, parent->pid);
-	}
-	else {
-		int ret = ptrace_setup(child, child_engine,
-				       state->parent,
-				       state->u.live.options,
-				       state->u.live.cap_sys_ptrace);
-		if (unlikely(ret != 0)) {
-			BUG_ON(ret != -ENOMEM);
-			printk(KERN_ERR
-			       "ptrace out of memory, lost child %d of %d",
-			       child->pid, parent->pid);
-			utrace_detach(child, child_engine);
-		}
-		else {
-			sigaddset(&child->pending.signal, SIGSTOP);
-			set_tsk_thread_flag(child, TIF_SIGPENDING);
-			ptrace_update(child, child_engine, 0);
-		}
+	if (state->options & option) {
+		state->have_eventmsg = 1;
+		state->u.eventmsg = child->pid;
 	}
+	else
+		event = 0;
+
+	if (!(clone_flags & CLONE_UNTRACED)
+	    && (event || (clone_flags & CLONE_PTRACE))) {
+		/*
+		 * Have our tracer start following the child too.
+		 */
+		ptrace_clone_setup(engine, parent, state, child);
 
-	if (state->u.live.options & option) {
-		state->u.live.have_eventmsg = 1;
-		state->u.live.u.eventmsg = child->pid;
-		return ptrace_event(engine, parent, event);
+		/*
+		 * That did put_ptrace_state, so we have to check
+		 * again in case our tracer just started exiting.
+		 */
+		state = get_ptrace_state(engine, parent);
+		if (unlikely(state == NULL))
+			return UTRACE_ACTION_RESUME;
 	}
 
-out:
+	if (event)
+		return ptrace_event(engine, parent, state, event);
+
+	put_ptrace_state(state);
+
 	return UTRACE_ACTION_RESUME;
 }
 
@@ -1342,10 +1622,13 @@ static u32
 ptrace_report_vfork_done(struct utrace_attached_engine *engine,
 			 struct task_struct *parent, pid_t child_pid)
 {
-	struct ptrace_state *state = (struct ptrace_state *) engine->data;
-	state->u.live.have_eventmsg = 1;
-	state->u.live.u.eventmsg = child_pid;
-	return ptrace_event(engine, parent, PTRACE_EVENT_VFORK_DONE);
+	struct ptrace_state *state = get_ptrace_state(engine, parent);
+	if (unlikely(state == NULL))
+		return UTRACE_ACTION_RESUME;
+
+	state->have_eventmsg = 1;
+	state->u.eventmsg = child_pid;
+	return ptrace_event(engine, parent, state, PTRACE_EVENT_VFORK_DONE);
 }
 
 
@@ -1356,20 +1639,31 @@ ptrace_report_signal(struct utrace_attached_engine *engine,
 		     const struct k_sigaction *orig_ka,
 		     struct k_sigaction *return_ka)
 {
-	struct ptrace_state *state = (struct ptrace_state *) engine->data;
 	int signo = info == NULL ? SIGTRAP : info->si_signo;
-	state->u.live.syscall = 0;
-	state->u.live.have_eventmsg = 0;
-	state->u.live.u.siginfo = info;
-	return ptrace_report(engine, tsk, signo) | UTRACE_SIGNAL_IGN;
+	struct ptrace_state *state = get_ptrace_state(engine, tsk);
+	if (unlikely(state == NULL))
+		return UTRACE_ACTION_RESUME;
+
+	state->syscall = 0;
+	state->have_eventmsg = 0;
+	state->u.siginfo = info;
+	return ptrace_report(engine, tsk, state, signo) | UTRACE_SIGNAL_IGN;
 }
 
 static u32
 ptrace_report_jctl(struct utrace_attached_engine *engine,
 		   struct task_struct *tsk, int type)
 {
-	struct ptrace_state *state = (struct ptrace_state *) engine->data;
+	struct ptrace_state *state = get_ptrace_state(engine, tsk);
+	if (unlikely(state == NULL))
+		return UTRACE_ACTION_RESUME;
+
+	pr_debug("ptrace %d jctl notify %d type %x exit_code %x\n",
+		 tsk->pid, state->parent->pid, type, tsk->exit_code);
+
 	do_notify(tsk, state->parent, type);
+	put_ptrace_state(state);
+
 	return UTRACE_JCTL_NOSIGCHLD;
 }
 
@@ -1379,11 +1673,13 @@ ptrace_report_exec(struct utrace_attached_engine *engine,
 		   const struct linux_binprm *bprm,
 		   struct pt_regs *regs)
 {
-	struct ptrace_state *state = (struct ptrace_state *) engine->data;
-	if (state->u.live.options & PTRACE_O_TRACEEXEC)
-		return ptrace_event(engine, tsk, PTRACE_EVENT_EXEC);
-	state->u.live.syscall = 0;
-	return ptrace_report(engine, tsk, SIGTRAP);
+	struct ptrace_state *state = get_ptrace_state(engine, tsk);
+	if (unlikely(state == NULL))
+		return UTRACE_ACTION_RESUME;
+
+	return ptrace_event(engine, tsk, state,
+			    (state->options & PTRACE_O_TRACEEXEC)
+			    ? PTRACE_EVENT_EXEC : 0);
 }
 
 static u32
@@ -1391,14 +1687,17 @@ ptrace_report_syscall(struct utrace_attached_engine *engine,
 		      struct task_struct *tsk, struct pt_regs *regs,
 		      int entry)
 {
-	struct ptrace_state *state = (struct ptrace_state *) engine->data;
+	struct ptrace_state *state = get_ptrace_state(engine, tsk);
+	if (unlikely(state == NULL))
+		return UTRACE_ACTION_RESUME;
+
 #ifdef PTRACE_SYSEMU
-	if (entry && state->u.live.sysemu)
+	if (entry && state->sysemu)
 		tracehook_abort_syscall(regs);
 #endif
-	state->u.live.syscall = 1;
-	return ptrace_report(engine, tsk,
-			     ((state->u.live.options & PTRACE_O_TRACESYSGOOD)
+	state->syscall = 1;
+	return ptrace_report(engine, tsk, state,
+			     ((state->options & PTRACE_O_TRACESYSGOOD)
 			      ? 0x80 : 0) | SIGTRAP);
 }
 
@@ -1420,20 +1719,24 @@ static u32
 ptrace_report_exit(struct utrace_attached_engine *engine,
 		   struct task_struct *tsk, long orig_code, long *code)
 {
-	struct ptrace_state *state = (struct ptrace_state *) engine->data;
-	state->u.live.have_eventmsg = 1;
-	state->u.live.u.eventmsg = *code;
-	return ptrace_event(engine, tsk, PTRACE_EVENT_EXIT);
+	struct ptrace_state *state = get_ptrace_state(engine, tsk);
+	if (unlikely(state == NULL))
+		return UTRACE_ACTION_RESUME;
+
+	state->have_eventmsg = 1;
+	state->u.eventmsg = *code;
+	return ptrace_event(engine, tsk, state, PTRACE_EVENT_EXIT);
 }
 
 static int
 ptrace_unsafe_exec(struct utrace_attached_engine *engine,
 		   struct task_struct *tsk)
 {
-	struct ptrace_state *state = (struct ptrace_state *) engine->data;
-	int unsafe = LSM_UNSAFE_PTRACE;
-	if (state->u.live.cap_sys_ptrace)
-		unsafe = LSM_UNSAFE_PTRACE_CAP;
-	return unsafe;
+	struct ptrace_state *state = get_ptrace_state(engine, tsk);
+	int cap = state != NULL && state->cap_sys_ptrace;
+	/* A NULL state means get_ptrace_state already did rcu_read_unlock. */
+	if (likely(state != NULL))
+		put_ptrace_state(state);
+	return cap ? LSM_UNSAFE_PTRACE_CAP : LSM_UNSAFE_PTRACE;
 }
 
@@ -1441,16 +1744,13 @@ static struct task_struct *
 ptrace_tracer_task(struct utrace_attached_engine *engine,
 		   struct task_struct *target)
 {
-	struct ptrace_state *state;
-
-	/*
-	 * This call is not necessarily made by the target task,
-	 * so ptrace might be getting detached while we run here.
-	 * The state pointer will be NULL if that happens.
-	 */
-	state = rcu_dereference((struct ptrace_state *) engine->data);
-
-	return state == NULL ? NULL : state->parent;
+	struct task_struct *parent = NULL;
+	struct ptrace_state *state = get_ptrace_state(engine, target);
+	if (likely(state != NULL)) {
+		parent = state->parent;
+		put_ptrace_state(state);
+	}
+	return parent;
 }
 
 static int
@@ -1459,20 +1759,15 @@ ptrace_allow_access_process_vm(struct utrace_attached_engine *engine,
 			       struct task_struct *caller)
 {
 	struct ptrace_state *state;
-	int ours;
-
-	/*
-	 * This call is not necessarily made by the target task,
-	 * so ptrace might be getting detached while we run here.
-	 * The state pointer will be NULL if that happens.
-	 */
-	rcu_read_lock();
-	state = rcu_dereference((struct ptrace_state *) engine->data);
-	ours = (state != NULL
-		&& ((engine->flags & UTRACE_ACTION_QUIESCE)
-		    || (target->state == TASK_STOPPED))
-		&& state->parent == caller);
-	rcu_read_unlock();
+	int ours = 0;
+
+	state = get_ptrace_state(engine, target);
+	if (likely(state != NULL)) {
+		ours = (((engine->flags & UTRACE_ACTION_QUIESCE)
+			 || target->state == TASK_STOPPED)
+			&& state->parent == caller);
+		put_ptrace_state(state);
+	}
 
 	return ours && security_ptrace(caller, target) == 0;
 }
@@ -1494,5 +1789,3 @@ static const struct utrace_engine_ops ptrace_utrace_ops =
 	.tracer_task = ptrace_tracer_task,
 	.allow_access_process_vm = ptrace_allow_access_process_vm,
 };
-
-#endif