fedora core 6 1.2949 + vserver 2.2.0
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index ee0b4cb..37118d7 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
 #include <linux/ptrace.h>
 #include <linux/security.h>
 #include <linux/signal.h>
-
-#include <asm/pgtable.h>
-#include <asm/uaccess.h>
-
-#ifdef CONFIG_PTRACE
 #include <linux/utrace.h>
 #include <linux/tracehook.h>
+#include <linux/vs_context.h>
 #include <asm/tracehook.h>
-#endif
-
-int getrusage(struct task_struct *, int, struct rusage __user *);
-
-//#define PTRACE_DEBUG
-
-int __ptrace_may_attach(struct task_struct *task)
-{
-       /* May we inspect the given task?
-        * This check is used both for attaching with ptrace
-        * and for allowing access to sensitive information in /proc.
-        *
-        * ptrace_attach denies several cases that /proc allows
-        * because setting up the necessary parent/child relationship
-        * or halting the specified task is impossible.
-        */
-       int dumpable = 0;
-       /* Don't let security modules deny introspection */
-       if (task == current)
-               return 0;
-       if (((current->uid != task->euid) ||
-            (current->uid != task->suid) ||
-            (current->uid != task->uid) ||
-            (current->gid != task->egid) ||
-            (current->gid != task->sgid) ||
-            (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE))
-               return -EPERM;
-       smp_rmb();
-       if (task->mm)
-               dumpable = task->mm->dumpable;
-       if (!dumpable && !capable(CAP_SYS_PTRACE))
-               return -EPERM;
-
-       return security_ptrace(current, task);
-}
-
-int ptrace_may_attach(struct task_struct *task)
-{
-       int err;
-       task_lock(task);
-       err = __ptrace_may_attach(task);
-       task_unlock(task);
-       return !err;
-}
-
-/*
- * Access another process' address space.
- * Source/target buffer must be kernel space, 
- * Do not walk the page table directly, use get_user_pages
- */
-
-int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
-{
-       struct mm_struct *mm;
-       struct vm_area_struct *vma;
-       struct page *page;
-       void *old_buf = buf;
-
-       mm = get_task_mm(tsk);
-       if (!mm)
-               return 0;
-
-       down_read(&mm->mmap_sem);
-       /* ignore errors, just check how much was sucessfully transfered */
-       while (len) {
-               int bytes, ret, offset;
-               void *maddr;
-
-               ret = get_user_pages(tsk, mm, addr, 1,
-                               write, 1, &page, &vma);
-               if (ret <= 0)
-                       break;
-
-               bytes = len;
-               offset = addr & (PAGE_SIZE-1);
-               if (bytes > PAGE_SIZE-offset)
-                       bytes = PAGE_SIZE-offset;
-
-               maddr = kmap(page);
-               if (write) {
-                       copy_to_user_page(vma, page, addr,
-                                         maddr + offset, buf, bytes);
-                       set_page_dirty_lock(page);
-               } else {
-                       copy_from_user_page(vma, page, addr,
-                                           buf, maddr + offset, bytes);
-               }
-               kunmap(page);
-               page_cache_release(page);
-               len -= bytes;
-               buf += bytes;
-               addr += bytes;
-       }
-       up_read(&mm->mmap_sem);
-       mmput(mm);
-       
-       return buf - old_buf;
-}
-
-
-#ifndef CONFIG_PTRACE
-
-asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
-{
-       return -ENOSYS;
-}
-
-#else
+#include <asm/pgtable.h>
+#include <asm/uaccess.h>
 
 struct ptrace_state
 {
+       struct rcu_head rcu;
        /*
         * These elements are always available, even when the struct is
         * awaiting destruction at the next RCU callback point.
@@ -146,39 +38,23 @@ struct ptrace_state
        struct task_struct *parent; /* Whom we report to.  */
        struct list_head entry; /* Entry on parent->ptracees list.  */
 
-       union {
-               struct rcu_head dead;
-               struct {
-                       u8 options; /* PTRACE_SETOPTIONS bits.  */
-                       unsigned int stopped:1; /* Stopped for report.  */
-                       unsigned int reported:1; /* wait already reported.  */
-                       unsigned int syscall:1; /* Reporting for syscall.  */
+       u8 options;             /* PTRACE_SETOPTIONS bits.  */
+       unsigned int syscall:1; /* Reporting for syscall.  */
 #ifdef PTRACE_SYSEMU
-                       unsigned int sysemu:1; /* PTRACE_SYSEMU in progress. */
+       unsigned int sysemu:1;  /* PTRACE_SYSEMU in progress. */
 #endif
-                       unsigned int have_eventmsg:1; /* u.eventmsg valid. */
-                       unsigned int cap_sys_ptrace:1; /* Tracer capable.  */
-
-                       union
-                       {
-                               unsigned long eventmsg;
-                               siginfo_t *siginfo;
-                       } u;
-               } live;
+       unsigned int have_eventmsg:1; /* u.eventmsg valid. */
+       unsigned int cap_sys_ptrace:1; /* Tracer capable.  */
+
+       union
+       {
+               unsigned long eventmsg;
+               siginfo_t *siginfo;
        } u;
 };
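
The old layout put the live fields in a union with the rcu_head, so they became invalid the moment the state was queued for freeing; the flat layout above keeps them readable to RCU readers until the grace period ends. A minimal sketch of that deferred-free idiom, using a hypothetical structure rather than anything from this patch:

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

/* Hypothetical example structure, not part of the patch. */
struct example_state {
	struct rcu_head rcu;	/* no longer overlays the payload below */
	int payload;		/* readable under rcu_read_lock until the grace period ends */
};

static void example_state_free(struct rcu_head *rhead)
{
	kfree(container_of(rhead, struct example_state, rcu));
}

static void example_state_release(struct example_state *s)
{
	/* Readers that found s under rcu_read_lock may still use s->payload. */
	call_rcu(&s->rcu, example_state_free);
}
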
 
 static const struct utrace_engine_ops ptrace_utrace_ops; /* Initialized below. */
 
-
-static void
-ptrace_state_link(struct ptrace_state *state)
-{
-       task_lock(state->parent);
-       list_add_rcu(&state->entry, &state->parent->ptracees);
-       task_unlock(state->parent);
-}
-
 static void
 ptrace_state_unlink(struct ptrace_state *state)
 {
@@ -187,48 +63,60 @@ ptrace_state_unlink(struct ptrace_state *state)
        task_unlock(state->parent);
 }
 
-static int
+static struct ptrace_state *
 ptrace_setup(struct task_struct *target, struct utrace_attached_engine *engine,
-            struct task_struct *parent, u8 options, int cap_sys_ptrace)
+            struct task_struct *parent, u8 options, int cap_sys_ptrace,
+            struct ptrace_state *state)
 {
-       struct ptrace_state *state = kzalloc(sizeof *state, GFP_USER);
-       if (unlikely(state == NULL))
-               return -ENOMEM;
+       if (state == NULL) {
+               state = kzalloc(sizeof *state, GFP_USER);
+               if (unlikely(state == NULL))
+                       return ERR_PTR(-ENOMEM);
+       }
 
        state->engine = engine;
        state->task = target;
        state->parent = parent;
-       state->u.live.options = options;
-       state->u.live.cap_sys_ptrace = cap_sys_ptrace;
-       ptrace_state_link(state);
+       state->options = options;
+       state->cap_sys_ptrace = cap_sys_ptrace;
+
+       task_lock(parent);
+       if (unlikely(parent->flags & PF_EXITING)) {
+               task_unlock(parent);
+               kfree(state);
+               return ERR_PTR(-EALREADY);
+       }
+       list_add_rcu(&state->entry, &state->parent->ptracees);
+       task_unlock(state->parent);
 
        BUG_ON(engine->data != 0);
        rcu_assign_pointer(engine->data, (unsigned long) state);
 
-       return 0;
+       return state;
 }
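
ptrace_setup now reports failure through the pointer itself, the usual ERR_PTR/IS_ERR/PTR_ERR convention, instead of a separate int. A minimal sketch of that convention with hypothetical names:

#include <linux/err.h>
#include <linux/slab.h>

struct widget { int id; };			/* hypothetical */

static struct widget *widget_create(void)
{
	struct widget *w = kzalloc(sizeof(*w), GFP_KERNEL);
	if (!w)
		return ERR_PTR(-ENOMEM);	/* encode the errno in the pointer */
	return w;
}

static int widget_use(void)
{
	struct widget *w = widget_create();
	if (IS_ERR(w))
		return PTR_ERR(w);		/* recover the errno, as ptrace_attach does below */
	kfree(w);
	return 0;
}
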
 
 static void
 ptrace_state_free(struct rcu_head *rhead)
 {
        struct ptrace_state *state = container_of(rhead,
-                                                 struct ptrace_state, u.dead);
+                                                 struct ptrace_state, rcu);
        kfree(state);
 }
 
 static void
 ptrace_done(struct ptrace_state *state)
 {
-       INIT_RCU_HEAD(&state->u.dead);
-       call_rcu(&state->u.dead, ptrace_state_free);
+       INIT_RCU_HEAD(&state->rcu);
+       call_rcu(&state->rcu, ptrace_state_free);
 }
 
 /*
  * Update the tracing engine state to match the new ptrace state.
  */
-static void
-ptrace_update(struct task_struct *target, struct utrace_attached_engine *engine,
-             unsigned long flags)
+static int __must_check
+ptrace_update(struct task_struct *target,
+             struct utrace_attached_engine *engine,
+             unsigned long flags, int from_stopped)
 {
        struct ptrace_state *state = (struct ptrace_state *) engine->data;
 
@@ -236,7 +124,7 @@ ptrace_update(struct task_struct *target, struct utrace_attached_engine *engine,
         * These events are always reported.
         */
        flags |= (UTRACE_EVENT(DEATH) | UTRACE_EVENT(EXEC)
-                 | UTRACE_EVENT_SIGNAL_ALL);
+                 | UTRACE_EVENT_SIGNAL_ALL | UTRACE_EVENT(JCTL));
 
        /*
         * We always have to examine clone events to check for CLONE_PTRACE.
@@ -246,30 +134,55 @@ ptrace_update(struct task_struct *target, struct utrace_attached_engine *engine,
        /*
         * PTRACE_SETOPTIONS can request more events.
         */
-       if (state->u.live.options & PTRACE_O_TRACEEXIT)
+       if (state->options & PTRACE_O_TRACEEXIT)
                flags |= UTRACE_EVENT(EXIT);
-       if (state->u.live.options & PTRACE_O_TRACEVFORKDONE)
+       if (state->options & PTRACE_O_TRACEVFORKDONE)
                flags |= UTRACE_EVENT(VFORK_DONE);
 
        /*
         * ptrace always inhibits normal parent reaping.
-        * But for a corner case we sometimes see the REAP event instead.
+        * But for a corner case we sometimes see the REAP event anyway.
         */
        flags |= UTRACE_ACTION_NOREAP | UTRACE_EVENT(REAP);
 
-       state->u.live.stopped = (flags & UTRACE_ACTION_QUIESCE) != 0;
-       if (!state->u.live.stopped) {
-               if (!state->u.live.have_eventmsg)
-                       state->u.live.u.siginfo = NULL;
-               if (!(target->flags & PF_EXITING))
+       if (from_stopped && !(flags & UTRACE_ACTION_QUIESCE)) {
+               /*
+                * We're letting the thread resume from ptrace stop.
+                * If SIGKILL is waking it up, it can be racing with us here
+                * to set its own exit_code in do_exit.  Though we clobber
+                * it here, we check for the case in ptrace_report_death.
+                */
+               if (!unlikely(target->flags & PF_SIGNALED))
                        target->exit_code = 0;
+
+               if (!state->have_eventmsg)
+                       state->u.siginfo = NULL;
+
+               if (target->state == TASK_STOPPED) {
+                       /*
+                        * We have to double-check for naughty de_thread
+                        * reaping despite NOREAP, before we can get siglock.
+                        */
+                       read_lock(&tasklist_lock);
+                       if (!target->exit_state) {
+                               spin_lock_irq(&target->sighand->siglock);
+                               if (target->state == TASK_STOPPED)
+                                       target->signal->flags &=
+                                               ~SIGNAL_STOP_STOPPED;
+                               spin_unlock_irq(&target->sighand->siglock);
+                       }
+                       read_unlock(&tasklist_lock);
+               }
        }
-       utrace_set_flags(target, engine, flags);
+
+       return utrace_set_flags(target, engine, flags);
 }
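
ptrace_update translates PTRACE_SETOPTIONS bits into utrace event flags; PTRACE_O_TRACEEXIT, for instance, becomes UTRACE_EVENT(EXIT) and surfaces to the tracer as a PTRACE_EVENT_EXIT stop. A rough user-space sketch of that flow, assuming standard ptrace(2) semantics and with error handling trimmed:

#include <stdio.h>
#include <stdlib.h>
#include <signal.h>
#include <unistd.h>
#include <sys/ptrace.h>
#include <sys/wait.h>

int main(void)
{
	pid_t pid = fork();
	if (pid == 0) {
		ptrace(PTRACE_TRACEME, 0, 0, 0);
		raise(SIGSTOP);				/* let the parent set options */
		exit(42);
	}

	int status;
	waitpid(pid, &status, 0);			/* SIGSTOP stop */
	ptrace(PTRACE_SETOPTIONS, pid, 0, PTRACE_O_TRACEEXIT);
	ptrace(PTRACE_CONT, pid, 0, 0);

	waitpid(pid, &status, 0);			/* PTRACE_EVENT_EXIT stop */
	if (status >> 8 == (SIGTRAP | (PTRACE_EVENT_EXIT << 8))) {
		unsigned long msg;
		ptrace(PTRACE_GETEVENTMSG, pid, 0, &msg);	/* wait-style status */
		printf("tracee exiting, status %#lx\n", msg);
	}
	ptrace(PTRACE_CONT, pid, 0, 0);			/* let it really die */
	waitpid(pid, &status, 0);
	return 0;
}
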
 
 static int ptrace_traceme(void)
 {
        struct utrace_attached_engine *engine;
+       struct ptrace_state *state;
+       struct task_struct *parent;
        int retval;
 
        engine = utrace_attach(current, (UTRACE_ATTACH_CREATE
@@ -283,16 +196,55 @@ static int ptrace_traceme(void)
                        retval = -EPERM;
        }
        else {
+               /*
+                * We preallocate here so that we can hold rcu_read_lock
+                * continuously from extracting ->parent until ptrace_setup
+                * has finished using it (the allocation might sleep).
+                */
+               state = kzalloc(sizeof *state, GFP_USER);
+               if (unlikely(state == NULL)) {
+                       (void) utrace_detach(current, engine);
+                       printk(KERN_ERR
+                              "ptrace out of memory, lost child %d of %d",
+                              current->pid, current->parent->pid);
+                       return -ENOMEM;
+               }
+
+               rcu_read_lock();
+               parent = rcu_dereference(current->parent);
+
                task_lock(current);
-               retval = security_ptrace(current->parent, current);
+               retval = security_ptrace(parent, current);
                task_unlock(current);
-               if (!retval)
-                       retval = ptrace_setup(current, engine,
-                                             current->parent, 0, 0);
-               if (retval)
-                       utrace_detach(current, engine);
-               else
-                       ptrace_update(current, engine, 0);
+
+               if (retval) {
+                       kfree(state);
+                       (void) utrace_detach(current, engine);
+               }
+               else {
+                       state = ptrace_setup(current, engine, parent, 0, 0,
+                                            state);
+                       if (IS_ERR(state))
+                               retval = PTR_ERR(state);
+               }
+               rcu_read_unlock();
+
+               if (!retval) {
+                       /*
+                        * This can't fail because we can't die while we
+                        * are here doing this.
+                        */
+                       retval = ptrace_update(current, engine, 0, 0);
+                       BUG_ON(retval);
+               }
+               else if (unlikely(retval == -EALREADY))
+                       /*
+                        * We raced with our parent's exit, which would
+                        * have detached us just after our attach if
+                        * we'd won the race.  Pretend we got attached
+                        * and then detached immediately, no error.
+                        */
+                       retval = 0;
        }
 
        return retval;
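
ptrace_traceme hooks the caller up to be traced by its current parent; the classic user-space counterpart is a child that requests tracing before exec and then stops at the exec trap. A minimal sketch, assuming standard ptrace(2) behaviour:

#include <stdio.h>
#include <unistd.h>
#include <sys/ptrace.h>
#include <sys/wait.h>

int main(void)
{
	pid_t pid = fork();
	if (pid == 0) {
		ptrace(PTRACE_TRACEME, 0, 0, 0);	/* parent becomes the tracer */
		execlp("true", "true", (char *)NULL);	/* stops with SIGTRAP at exec */
		_exit(127);
	}

	int status;
	waitpid(pid, &status, 0);
	if (WIFSTOPPED(status))
		printf("child stopped by signal %d after exec\n", WSTOPSIG(status));
	ptrace(PTRACE_CONT, pid, 0, 0);
	waitpid(pid, &status, 0);			/* normal exit */
	return 0;
}
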
@@ -301,6 +253,7 @@ static int ptrace_traceme(void)
 static int ptrace_attach(struct task_struct *task)
 {
        struct utrace_attached_engine *engine;
+       struct ptrace_state *state;
        int retval;
 
        retval = -EPERM;
@@ -311,6 +264,9 @@ static int ptrace_attach(struct task_struct *task)
        if (!task->mm)          /* kernel threads */
                goto bad;
 
+       pr_debug("%d ptrace_attach %d state %lu exit_code %x\n",
+                current->pid, task->pid, task->state, task->exit_code);
+
        engine = utrace_attach(task, (UTRACE_ATTACH_CREATE
                                      | UTRACE_ATTACH_EXCLUSIVE
                                      | UTRACE_ATTACH_MATCH_OPS),
@@ -322,54 +278,153 @@ static int ptrace_attach(struct task_struct *task)
                goto bad;
        }
 
-       if (ptrace_may_attach(task))
-               retval = ptrace_setup(task, engine, current, 0,
-                                     capable(CAP_SYS_PTRACE));
+       pr_debug("%d ptrace_attach %d after utrace_attach: %lu exit_code %x\n",
+                current->pid, task->pid, task->state, task->exit_code);
+
+       if (ptrace_may_attach(task)) {
+               state = ptrace_setup(task, engine, current, 0,
+                                    capable(CAP_SYS_PTRACE), NULL);
+               if (IS_ERR(state))
+                       retval = PTR_ERR(state);
+               else {
+                       retval = ptrace_update(task, engine, 0, 0);
+
+                       pr_debug("%d ptrace_attach %d after ptrace_update (%d)"
+                                " %lu exit_code %x\n",
+                                current->pid, task->pid, retval,
+                                task->state, task->exit_code);
+
+                       if (retval) {
+                               /*
+                                * It died before we enabled any callbacks.
+                                */
+                               if (retval == -EALREADY)
+                                       retval = -ESRCH;
+                               BUG_ON(retval != -ESRCH);
+                               ptrace_state_unlink(state);
+                               ptrace_done(state);
+                       }
+               }
+       }
        if (retval)
-               utrace_detach(task, engine);
+               (void) utrace_detach(task, engine);
        else {
-               int stopped;
+               int stopped = 0;
 
-               /* Go */
-               ptrace_update(task, engine, 0);
-               force_sig_specific(SIGSTOP, task);
+               /*
+                * We must double-check that task has not just died and
+                * been reaped (after ptrace_update succeeded).
+                * This happens when exec (de_thread) ignores NOREAP.
+                * We cannot call into the signal code if it's dead.
+                */
+               read_lock(&tasklist_lock);
+               if (likely(!task->exit_state)) {
+                       force_sig_specific(SIGSTOP, task);
 
-               spin_lock_irq(&task->sighand->siglock);
-               stopped = (task->state == TASK_STOPPED);
-               spin_unlock_irq(&task->sighand->siglock);
+                       spin_lock_irq(&task->sighand->siglock);
+                       stopped = (task->state == TASK_STOPPED);
+                       spin_unlock_irq(&task->sighand->siglock);
+               }
+               read_unlock(&tasklist_lock);
 
                if (stopped) {
+                       const struct utrace_regset *regset;
+
+                       /*
+                        * Set QUIESCE immediately, so we can allow
+                        * ptrace requests while it's in TASK_STOPPED.
+                        */
+                       retval = ptrace_update(task, engine,
+                                              UTRACE_ACTION_QUIESCE, 0);
+                       if (retval)
+                               BUG_ON(retval != -ESRCH);
+                       retval = 0;
+
                        /*
                         * Do now the regset 0 writeback that we do on every
                         * stop, since it's never been done.  On register
                         * window machines, this makes sure the user memory
                         * backing the register data is up to date.
                         */
-                       const struct utrace_regset *regset;
                        regset = utrace_regset(task, engine,
                                               utrace_native_view(task), 0);
                        if (regset->writeback)
                                (*regset->writeback)(task, regset, 1);
                }
+
+               pr_debug("%d ptrace_attach %d complete (%sstopped)"
+                        " state %lu code %x\n",
+                        current->pid, task->pid, stopped ? "" : "not ",
+                        task->state, task->exit_code);
        }
 
 bad:
        return retval;
 }
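
From user space, the path through ptrace_attach is the familiar attach, wait for the SIGSTOP stop, then detach sequence. A short sketch against a freshly forked sleeper, error handling trimmed:

#include <stdio.h>
#include <signal.h>
#include <unistd.h>
#include <sys/ptrace.h>
#include <sys/wait.h>

int main(void)
{
	pid_t pid = fork();
	if (pid == 0) {
		while (1)
			pause();		/* something to attach to */
	}

	int status;
	ptrace(PTRACE_ATTACH, pid, 0, 0);	/* queues SIGSTOP for the target */
	waitpid(pid, &status, 0);		/* observe the attach stop */
	printf("attached, stop signal %d\n", WSTOPSIG(status));

	ptrace(PTRACE_DETACH, pid, 0, 0);	/* target resumes its pause() loop */
	kill(pid, SIGKILL);
	waitpid(pid, &status, 0);
	return 0;
}
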
 
+/*
+ * The task might be dying or being reaped in parallel, in which case
+ * engine and state may no longer be valid.  utrace_detach checks for us.
+ */
 static int ptrace_detach(struct task_struct *task,
-                        struct utrace_attached_engine *engine)
+                        struct utrace_attached_engine *engine,
+                        struct ptrace_state *state)
 {
-       struct ptrace_state *state = (struct ptrace_state *) engine->data;
+
+       int error;
+
+#ifdef HAVE_ARCH_PTRACE_DETACH
        /*
-        * Clearing ->data before detach makes sure an unrelated task
-        * calling into ptrace_tracer_task won't try to touch stale state.
+        * Some funky compatibility code in arch_ptrace may have
+        * needed to install special state it should clean up now.
         */
-       rcu_assign_pointer(engine->data, 0UL);
-       utrace_detach(task, engine);
-       ptrace_state_unlink(state);
-       ptrace_done(state);
-       return 0;
+       arch_ptrace_detach(task);
+#endif
+
+       /*
+        * Traditional ptrace behavior does wake_up_process no matter what
+        * in ptrace_detach.  But utrace_detach will not do a wakeup if
+        * it's in a proper job control stop.  We need it to wake up from
+        * TASK_STOPPED and either resume or process more signals.  A
+        * pending stop signal will just leave it stopped again, but will
+        * consume the signal, and reset task->exit_code for the next wait
+        * call to see.  This is important to userland if ptrace_do_wait
+        * "stole" the previous unwaited-for-ness (clearing exit_code), but
+        * there is a pending SIGSTOP, e.g. sent by a PTRACE_ATTACH done
+        * while already in job control stop.
+        */
+       read_lock(&tasklist_lock);
+       if (likely(task->signal != NULL)) {
+               spin_lock_irq(&task->sighand->siglock);
+               task->signal->flags &= ~SIGNAL_STOP_STOPPED;
+               spin_unlock_irq(&task->sighand->siglock);
+       }
+       read_unlock(&tasklist_lock);
+
+       error = utrace_detach(task, engine);
+       if (!error) {
+               /*
+                * We can only get here from the ptracer itself or via
+                * detach_zombie from another thread in its group.
+                */
+               BUG_ON(state->parent->tgid != current->tgid);
+               ptrace_state_unlink(state);
+               ptrace_done(state);
+
+               /*
+                * Wake up any other threads that might be blocked in
+                * wait.  Though traditional ptrace does not guarantee
+                * this wakeup on PTRACE_DETACH, it does prevent
+                * erroneous blocking in wait when another racing
+                * thread's wait call reap-detaches the last child.
+                * Without this wakeup, another thread might stay
+                * blocked when it should return -ECHILD.
+                */
+               spin_lock_irq(&current->sighand->siglock);
+               wake_up_interruptible(&current->signal->wait_chldexit);
+               spin_unlock_irq(&current->sighand->siglock);
+       }
+       return error;
 }
 
 
@@ -379,30 +434,51 @@ static int ptrace_detach(struct task_struct *task,
 void
 ptrace_exit(struct task_struct *tsk)
 {
-       rcu_read_lock();
-       if (unlikely(!list_empty(&tsk->ptracees))) {
-               struct ptrace_state *state, *next;
+       struct list_head *pos, *n;
 
-               /*
-                * First detach the utrace layer from all the tasks.
-                * We don't want to hold any locks while calling utrace_detach.
-                */
-               list_for_each_entry_rcu(state, &tsk->ptracees, entry) {
-                       rcu_assign_pointer(state->engine->data, 0UL);
-                       utrace_detach(state->task, state->engine);
-               }
+       /*
+        * Taking the task_lock after PF_EXITING is set ensures that a
+        * child in ptrace_traceme will not put itself on our list when
+        * we might already be tearing it down.
+        */
+       task_lock(tsk);
+       if (likely(list_empty(&tsk->ptracees))) {
+               task_unlock(tsk);
+               return;
+       }
+       task_unlock(tsk);
 
-               /*
-                * Now clear out our list and clean up our data structures.
-                * The task_lock protects our list structure.
-                */
-               task_lock(tsk);
-               list_for_each_entry_safe(state, next, &tsk->ptracees, entry) {
-                       list_del_rcu(&state->entry);
+restart:
+       rcu_read_lock();
+
+       list_for_each_safe_rcu(pos, n, &tsk->ptracees) {
+               struct ptrace_state *state = list_entry(pos,
+                                                       struct ptrace_state,
+                                                       entry);
+               int error = utrace_detach(state->task, state->engine);
+               BUG_ON(state->parent != tsk);
+               if (likely(error == 0)) {
+                       ptrace_state_unlink(state);
                        ptrace_done(state);
                }
-               task_unlock(tsk);
+               else if (unlikely(error == -EALREADY)) {
+                       /*
+                        * It's still doing report_death callbacks.
+                        * Just wait for it to settle down.
+                        * Since wait_task_inactive might yield,
+                        * we must go out of rcu_read_lock and restart.
+                        */
+                       struct task_struct *p = state->task;
+                       get_task_struct(p);
+                       rcu_read_unlock();
+                       wait_task_inactive(p);
+                       put_task_struct(p);
+                       goto restart;
+               }
+               else
+                       BUG_ON(error != -ESRCH);
        }
+
        rcu_read_unlock();
 
        BUG_ON(!list_empty(&tsk->ptracees));
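
ptrace_exit cannot call wait_task_inactive while under rcu_read_lock, so it pins the task, drops the read lock, sleeps, and restarts the walk. A stripped-down sketch of that restart idiom over a hypothetical RCU-protected list (illustrative only; needs_wait is a made-up predicate):

#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>

/* Hypothetical node type, not from the patch. */
struct tracked {
	struct list_head entry;
	struct task_struct *task;
};

static void drain_tracked(struct list_head *head)
{
	struct tracked *t;

restart:
	rcu_read_lock();
	list_for_each_entry_rcu(t, head, entry) {
		if (needs_wait(t)) {			/* hypothetical predicate */
			struct task_struct *p = t->task;
			get_task_struct(p);		/* keep p alive across the sleep */
			rcu_read_unlock();		/* cannot block under RCU */
			wait_task_inactive(p);		/* may sleep */
			put_task_struct(p);
			goto restart;			/* the list may have changed */
		}
		/* ... non-blocking handling of t ... */
	}
	rcu_read_unlock();
}
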
@@ -421,15 +497,15 @@ ptrace_induce_signal(struct task_struct *target,
        if (!valid_signal(signr))
                return -EIO;
 
-       if (state->u.live.syscall) {
+       if (state->syscall) {
                /*
                 * This is the traditional ptrace behavior when given
                 * a signal to resume from a syscall tracing stop.
                 */
                send_sig(signr, target, 1);
        }
-       else if (!state->u.live.have_eventmsg && state->u.live.u.siginfo) {
-               siginfo_t *info = state->u.live.u.siginfo;
+       else if (!state->have_eventmsg && state->u.siginfo) {
+               siginfo_t *info = state->u.siginfo;
 
                /* Update the siginfo structure if the signal has
                   changed.  If the debugger wanted something
@@ -450,7 +526,7 @@ ptrace_induce_signal(struct task_struct *target,
        return 0;
 }
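
ptrace_induce_signal is what carries the signal argument of a resume request into the tracee; from user space that is simply the data argument of PTRACE_CONT. A small sketch, assuming standard semantics:

#include <stdio.h>
#include <signal.h>
#include <unistd.h>
#include <sys/ptrace.h>
#include <sys/wait.h>

int main(void)
{
	pid_t pid = fork();
	if (pid == 0) {
		ptrace(PTRACE_TRACEME, 0, 0, 0);
		raise(SIGSTOP);
		while (1)
			pause();
	}

	int status;
	waitpid(pid, &status, 0);			/* SIGSTOP stop */
	ptrace(PTRACE_CONT, pid, 0, SIGTERM);		/* resume and deliver SIGTERM instead */
	waitpid(pid, &status, 0);
	if (WIFSIGNALED(status))
		printf("tracee killed by signal %d\n", WTERMSIG(status));
	return 0;
}
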
 
-fastcall int
+int
 ptrace_regset_access(struct task_struct *target,
                     struct utrace_attached_engine *engine,
                     const struct utrace_regset_view *view,
@@ -485,7 +561,7 @@ ptrace_regset_access(struct task_struct *target,
        return ret;
 }
 
-fastcall int
+int
 ptrace_onereg_access(struct task_struct *target,
                     struct utrace_attached_engine *engine,
                     const struct utrace_regset_view *view,
@@ -523,7 +599,7 @@ ptrace_onereg_access(struct task_struct *target,
        return ret;
 }
 
-fastcall int
+int
 ptrace_layout_access(struct task_struct *target,
                     struct utrace_attached_engine *engine,
                     const struct utrace_regset_view *view,
@@ -556,7 +632,7 @@ ptrace_layout_access(struct task_struct *target,
                         * This is a no-op/zero-fill portion of struct user.
                         */
                        ret = 0;
-                       if (!write) {
+                       if (!write && seg->offset == 0) {
                                if (kdata)
                                        memset(kdata, 0, n);
                                else if (clear_user(udata, n))
@@ -623,9 +699,7 @@ ptrace_start(long pid, long request,
        if (child)
                get_task_struct(child);
        read_unlock(&tasklist_lock);
-#ifdef PTRACE_DEBUG
-       printk("ptrace pid %ld => %p\n", pid, child);
-#endif
+       pr_debug("ptrace pid %ld => %p\n", pid, child);
        if (!child)
                goto out;
 
@@ -633,7 +707,8 @@ ptrace_start(long pid, long request,
        if (pid == 1)           /* you may not mess with init */
                goto out_tsk;
 
-       if (!vx_check(vx_task_xid(child), VX_WATCH|VX_IDENT))
+       ret = -EPERM;
+       if (!vx_check(vx_task_xid(child), VS_WATCH_P|VS_IDENT))
                goto out_tsk;
 
        if (request == PTRACE_ATTACH) {
@@ -641,31 +716,25 @@ ptrace_start(long pid, long request,
                goto out_tsk;
        }
 
+       rcu_read_lock();
        engine = utrace_attach(child, UTRACE_ATTACH_MATCH_OPS,
                               &ptrace_utrace_ops, 0);
        ret = -ESRCH;
        if (IS_ERR(engine) || engine == NULL)
-               goto out_tsk;
-       rcu_read_lock();
+               goto out_tsk_rcu;
        state = rcu_dereference((struct ptrace_state *) engine->data);
-       if (state == NULL || state->parent != current) {
-               rcu_read_unlock();
-               goto out_tsk;
-       }
+       if (state == NULL || state->parent != current)
+               goto out_tsk_rcu;
        rcu_read_unlock();
 
        /*
         * Traditional ptrace behavior demands that the target already be
         * quiescent, but not dead.
         */
-       if (request != PTRACE_KILL && !state->u.live.stopped) {
-#ifdef PTRACE_DEBUG
-               printk("%d not stopped (%lx)\n", child->pid, child->state);
-#endif
-               if (child->state != TASK_STOPPED)
-                       goto out_tsk;
-               utrace_set_flags(child, engine,
-                                engine->flags | UTRACE_ACTION_QUIESCE);
+       if (request != PTRACE_KILL
+           && !(engine->flags & UTRACE_ACTION_QUIESCE)) {
+               pr_debug("%d not stopped (%lu)\n", child->pid, child->state);
+               goto out_tsk;
        }
 
        /*
@@ -690,6 +759,8 @@ ptrace_start(long pid, long request,
        *statep = state;
        return -EIO;
 
+out_tsk_rcu:
+       rcu_read_unlock();
 out_tsk:
        put_task_struct(child);
 out:
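
ptrace_start is the common gate for every request: it resolves the pid, checks the vserver context, and, except for PTRACE_KILL, requires the tracee to already be quiescent, which is why a tracer always waits before issuing requests such as PTRACE_PEEKDATA. A hedged user-space sketch of that ordering:

#include <stdio.h>
#include <signal.h>
#include <unistd.h>
#include <sys/ptrace.h>
#include <sys/wait.h>

static long secret = 0x1234abcd;	/* same address in parent and child after fork */

int main(void)
{
	pid_t pid = fork();
	if (pid == 0) {
		ptrace(PTRACE_TRACEME, 0, 0, 0);
		raise(SIGSTOP);				/* enter ptrace stop */
		_exit(0);
	}

	int status;
	waitpid(pid, &status, 0);			/* must be stopped before requests */
	long v = ptrace(PTRACE_PEEKDATA, pid, &secret, 0);
	printf("peeked %#lx from the tracee\n", v);

	ptrace(PTRACE_DETACH, pid, 0, 0);
	waitpid(pid, &status, 0);
	return 0;
}
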
@@ -711,8 +782,13 @@ ptrace_common(long request, struct task_struct *child,
                 * Detach a process that was attached.
                 */
                ret = ptrace_induce_signal(child, engine, data);
-               if (!ret)
-                       ret = ptrace_detach(child, engine);
+               if (!ret) {
+                       ret = ptrace_detach(child, engine, state);
+                       if (ret == -EALREADY) /* Already a zombie.  */
+                               ret = -ESRCH;
+                       if (ret)
+                               BUG_ON(ret != -ESRCH);
+               }
                break;
 
                /*
@@ -755,7 +831,7 @@ ptrace_common(long request, struct task_struct *child,
                 */
                flags = 0;
 #ifdef PTRACE_SYSEMU
-               state->u.live.sysemu = (request == PTRACE_SYSEMU_SINGLESTEP
+               state->sysemu = (request == PTRACE_SYSEMU_SINGLESTEP
                                        || request == PTRACE_SYSEMU);
 #endif
                if (request == PTRACE_SINGLESTEP
@@ -775,7 +851,9 @@ ptrace_common(long request, struct task_struct *child,
                         || request == PTRACE_SYSEMU_SINGLESTEP)
                        flags |= UTRACE_EVENT(SYSCALL_ENTRY);
 #endif
-               ptrace_update(child, engine, flags);
+               ret = ptrace_update(child, engine, flags, 1);
+               if (ret)
+                       BUG_ON(ret != -ESRCH);
                ret = 0;
                break;
 
@@ -786,8 +864,10 @@ ptrace_common(long request, struct task_struct *child,
                ret = -EINVAL;
                if (data & ~PTRACE_O_MASK)
                        break;
-               state->u.live.options = data;
-               ptrace_update(child, engine, UTRACE_ACTION_QUIESCE);
+               state->options = data;
+               ret = ptrace_update(child, engine, UTRACE_ACTION_QUIESCE, 1);
+               if (ret)
+                       BUG_ON(ret != -ESRCH);
                ret = 0;
                break;
        }
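
The resume requests handled above differ only in which utrace events they enable: PTRACE_SYSCALL adds syscall reporting, PTRACE_SINGLESTEP adds the single-step action, PTRACE_SYSEMU sets the sysemu flag. A compact user-space sketch of the classic PTRACE_SYSCALL loop this supports, assuming standard semantics:

#include <stdio.h>
#include <signal.h>
#include <unistd.h>
#include <sys/ptrace.h>
#include <sys/wait.h>

int main(void)
{
	pid_t pid = fork();
	if (pid == 0) {
		ptrace(PTRACE_TRACEME, 0, 0, 0);
		raise(SIGSTOP);
		execlp("true", "true", (char *)NULL);
		_exit(127);
	}

	int status, stops = 0;
	waitpid(pid, &status, 0);			/* initial SIGSTOP stop */
	while (WIFSTOPPED(status)) {
		ptrace(PTRACE_SYSCALL, pid, 0, 0);	/* run to the next syscall entry/exit */
		waitpid(pid, &status, 0);
		stops++;
	}
	printf("saw %d stops before exit\n", stops);
	return 0;
}
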
@@ -803,10 +883,8 @@ asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
        struct ptrace_state *state;
        long ret, val;
 
-#ifdef PTRACE_DEBUG
-       printk("%d sys_ptrace(%ld, %ld, %lx, %lx)\n",
-              current->pid, request, pid, addr, data);
-#endif
+       pr_debug("%d sys_ptrace(%ld, %ld, %lx, %lx)\n",
+                current->pid, request, pid, addr, data);
 
        ret = ptrace_start(pid, request, &child, &engine, &state);
        if (ret != -EIO)
@@ -849,32 +927,32 @@ asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
                break;
 
        case PTRACE_GETEVENTMSG:
-               ret = put_user(state->u.live.have_eventmsg
-                              ? state->u.live.u.eventmsg : 0L,
+               ret = put_user(state->have_eventmsg
+                              ? state->u.eventmsg : 0L,
                               (unsigned long __user *) data);
                break;
        case PTRACE_GETSIGINFO:
                ret = -EINVAL;
-               if (!state->u.live.have_eventmsg && state->u.live.u.siginfo)
+               if (!state->have_eventmsg && state->u.siginfo)
                        ret = copy_siginfo_to_user((siginfo_t __user *) data,
-                                                  state->u.live.u.siginfo);
+                                                  state->u.siginfo);
                break;
        case PTRACE_SETSIGINFO:
                ret = -EINVAL;
-               if (!state->u.live.have_eventmsg && state->u.live.u.siginfo
-                   && copy_from_user(state->u.live.u.siginfo,
-                                     (siginfo_t __user *) data,
-                                     sizeof(siginfo_t)))
-                       ret = -EFAULT;
+               if (!state->have_eventmsg && state->u.siginfo) {
+                       ret = 0;
+                       if (copy_from_user(state->u.siginfo,
+                                          (siginfo_t __user *) data,
+                                          sizeof(siginfo_t)))
+                               ret = -EFAULT;
+               }
                break;
        }
 
 out_tsk:
        put_task_struct(child);
 out:
-#ifdef PTRACE_DEBUG
-       printk("%d ptrace -> %x\n", current->pid, ret);
-#endif
+       pr_debug("%d ptrace -> %lx\n", current->pid, ret);
        return ret;
 }
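
PTRACE_GETSIGINFO and PTRACE_SETSIGINFO let the tracer inspect or rewrite the siginfo of the signal that caused the stop, which is what the state->u.siginfo plumbing above serves. A brief user-space sketch, assuming standard semantics:

#include <stdio.h>
#include <signal.h>
#include <unistd.h>
#include <sys/ptrace.h>
#include <sys/wait.h>

int main(void)
{
	pid_t pid = fork();
	if (pid == 0) {
		ptrace(PTRACE_TRACEME, 0, 0, 0);
		raise(SIGUSR1);				/* signal-delivery stop */
		_exit(0);
	}

	int status;
	waitpid(pid, &status, 0);

	siginfo_t si;
	if (ptrace(PTRACE_GETSIGINFO, pid, 0, &si) == 0)
		printf("stopped on signal %d, si_code %d\n", si.si_signo, si.si_code);

	ptrace(PTRACE_CONT, pid, 0, 0);			/* resume, suppressing the signal */
	waitpid(pid, &status, 0);
	return 0;
}
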
 
@@ -891,10 +969,8 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
        struct ptrace_state *state;
        compat_long_t ret, val;
 
-#ifdef PTRACE_DEBUG
-       printk("%d compat_sys_ptrace(%d, %d, %x, %x)\n",
-              current->pid, request, pid, addr, cdata);
-#endif
+       pr_debug("%d compat_sys_ptrace(%d, %d, %x, %x)\n",
+                current->pid, request, pid, addr, cdata);
        ret = ptrace_start(pid, request, &child, &engine, &state);
        if (ret != -EIO)
                goto out;
@@ -936,22 +1012,22 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
                break;
 
        case PTRACE_GETEVENTMSG:
-               ret = put_user(state->u.live.have_eventmsg
-                              ? state->u.live.u.eventmsg : 0L,
+               ret = put_user(state->have_eventmsg
+                              ? state->u.eventmsg : 0L,
                               (compat_long_t __user *) data);
                break;
        case PTRACE_GETSIGINFO:
                ret = -EINVAL;
-               if (!state->u.live.have_eventmsg && state->u.live.u.siginfo)
+               if (!state->have_eventmsg && state->u.siginfo)
                        ret = copy_siginfo_to_user32(
                                (struct compat_siginfo __user *) data,
-                               state->u.live.u.siginfo);
+                               state->u.siginfo);
                break;
        case PTRACE_SETSIGINFO:
                ret = -EINVAL;
-               if (!state->u.live.have_eventmsg && state->u.live.u.siginfo
+               if (!state->have_eventmsg && state->u.siginfo
                    && copy_siginfo_from_user32(
-                           state->u.live.u.siginfo,
+                           state->u.siginfo,
                            (struct compat_siginfo __user *) data))
                        ret = -EFAULT;
                break;
@@ -960,14 +1036,66 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
 out_tsk:
        put_task_struct(child);
 out:
-#ifdef PTRACE_DEBUG
-       printk("%d ptrace -> %x\n", current->pid, ret);
-#endif
+       pr_debug("%d ptrace -> %lx\n", current->pid, (long)ret);
        return ret;
 }
 #endif
 
 
+/*
+ * Detach the zombie being reported for wait.
+ */
+static inline void
+detach_zombie(struct task_struct *tsk,
+             struct task_struct *p, struct ptrace_state *state)
+{
+       int detach_error;
+       struct utrace_attached_engine *engine;
+
+restart:
+       detach_error = 0;
+       rcu_read_lock();
+       if (tsk == current)
+               engine = state->engine;
+       else {
+               /*
+                * We've excluded other ptrace_do_wait calls.  But the
+                * ptracer itself might have done ptrace_detach while we
+                * did not have rcu_read_lock.  So double-check that state
+                * is still valid.
+                */
+               engine = utrace_attach(
+                       p, (UTRACE_ATTACH_MATCH_OPS
+                           | UTRACE_ATTACH_MATCH_DATA),
+                       &ptrace_utrace_ops,
+                       (unsigned long) state);
+               if (IS_ERR(engine) || state->parent != tsk)
+                       detach_error = -ESRCH;
+               else
+                       BUG_ON(state->engine != engine);
+       }
+       rcu_read_unlock();
+       if (likely(!detach_error))
+               detach_error = ptrace_detach(p, engine, state);
+       if (unlikely(detach_error == -EALREADY)) {
+               /*
+                * It's still doing report_death callbacks.
+                * Just wait for it to settle down.
+                */
+               wait_task_inactive(p); /* Might block.  */
+               goto restart;
+       }
+       /*
+        * A failure with -ESRCH means that report_reap is
+        * already running and will do the cleanup, or that
+        * we lost a race with ptrace_detach in another
+        * thread or with the automatic detach in
+        * report_death.
+        */
+       if (detach_error)
+               BUG_ON(detach_error != -ESRCH);
+}
+
 /*
  * We're called with tasklist_lock held for reading.
  * If we return -ECHILD or zero, next_thread(tsk) must still be valid to use.
@@ -982,7 +1110,7 @@ ptrace_do_wait(struct task_struct *tsk,
        struct ptrace_state *state;
        struct task_struct *p;
        int err = -ECHILD;
-       int why, status;
+       int exit_code, why, status;
 
        rcu_read_lock();
        list_for_each_entry_rcu(state, &tsk->ptracees, entry) {
@@ -1004,34 +1132,88 @@ ptrace_do_wait(struct task_struct *tsk,
                if (security_task_wait(p))
                        continue;
 
+               /*
+                * This is a matching child.  If we don't win now, tell
+                * our caller to block and repeat.  From this point we
+                * must ensure that wait_chldexit will get a wakeup for
+                * any tracee stopping, dying, or being detached.
+                * For death, tasklist_lock guarantees this already.
+                */
                err = 0;
-               if (state->u.live.reported)
-                       continue;
 
-               if (state->u.live.stopped)
-                       goto found;
-               if ((p->state & (TASK_TRACED | TASK_STOPPED))
-                   && (p->signal->flags & SIGNAL_STOP_STOPPED))
-                       goto found;
-               if (p->exit_state == EXIT_ZOMBIE) {
+               switch (p->exit_state) {
+               case EXIT_ZOMBIE:
                        if (!likely(options & WEXITED))
                                continue;
-                       if (delay_group_leader(p))
+                       if (delay_group_leader(p)) {
+                               struct task_struct *next = next_thread(p);
+                               pr_debug("%d ptrace_do_wait leaving %d "
+                                        "zombie code %x "
+                                        "delay_group_leader (%d/%lu)\n",
+                                        current->pid, p->pid, p->exit_code,
+                                        next->pid, next->state);
                                continue;
+                       }
+                       exit_code = p->exit_code;
                        goto found;
+               case EXIT_DEAD:
+                       continue;
+               default:
+                       /*
+                        * tasklist_lock holds up any transitions to
+                        * EXIT_ZOMBIE.  After releasing it we are
+                        * guaranteed a wakeup on wait_chldexit after
+                        * any new deaths.
+                        */
+                       if (p->flags & PF_EXITING)
+                               /*
+                                * It's in do_exit and might have set
+                                * p->exit_code already, but it's not quite
+                                * dead yet.  It will get to report_death
+                                * and will wake us up when it finishes.
+                                */
+                               continue;
+                       break;
                }
+
+               /*
+                * This xchg atomically ensures that only one do_wait
+                * call can report this thread.  Because exit_code is
+                * always set before do_notify wakes us up, after this
+                * check fails we are sure to get a wakeup if it stops.
+                */
+               exit_code = xchg(&p->exit_code, 0);
+               if (exit_code)
+                       goto found;
+
                // XXX should handle WCONTINUED
+
+               pr_debug("%d ptrace_do_wait leaving %d state %lu code %x\n",
+                        current->pid, p->pid, p->state, p->exit_code);
        }
        rcu_read_unlock();
+       if (err == 0)
+               pr_debug("%d ptrace_do_wait blocking\n", current->pid);
+
        return err;
 
 found:
+       BUG_ON(state->parent != tsk);
        rcu_read_unlock();
 
-       BUG_ON(state->parent != tsk);
+       pr_debug("%d ptrace_do_wait (%d) found %d code %x (%lu/%d)\n",
+                current->pid, tsk->pid, p->pid, exit_code,
+                p->exit_state, p->exit_signal);
+
+       /*
+        * If there was a group exit in progress, all threads report that
+        * status.  Most will have SIGKILL in their own exit_code.
+        */
+       if (p->signal->flags & SIGNAL_GROUP_EXIT)
+               exit_code = p->signal->group_exit_code;
 
        if (p->exit_state) {
-               if (unlikely(p->parent == state->parent))
+               if (unlikely(p->parent == tsk && p->exit_signal != -1))
                        /*
                         * This is our natural child we were ptracing.
                         * When it dies it detaches (see ptrace_report_death).
@@ -1040,23 +1222,26 @@ found:
                         * the normal wait_task_zombie path instead.
                         */
                        return 0;
-               if ((p->exit_code & 0x7f) == 0) {
+               if ((exit_code & 0x7f) == 0) {
                        why = CLD_EXITED;
-                       status = p->exit_code >> 8;
-               } else {
-                       why = (p->exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED;
-                       status = p->exit_code & 0xff;
+                       status = exit_code >> 8;
+               }
+               else {
+                       why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED;
+                       status = exit_code & 0x7f;
                }
        }
        else {
                why = CLD_TRAPPED;
-               status = (p->exit_code << 8) | 0x7f;
+               status = exit_code;
+               exit_code = (status << 8) | 0x7f;
        }
 
        /*
         * At this point we are committed to a successful return
         * or a user error return.  Release the tasklist_lock.
         */
+       get_task_struct(p);
        read_unlock(&tasklist_lock);
 
        if (rusagep)
@@ -1076,41 +1261,56 @@ found:
                        err = put_user(status, &infop->si_status);
        }
        if (!err && stat_addr)
-               err = put_user(status, stat_addr);
+               err = put_user(exit_code, stat_addr);
 
        if (!err) {
-               struct utrace *utrace;
-
+               if (why != CLD_TRAPPED)
+                       /*
+                        * This was a death report.  The ptracer's wait
+                        * does an implicit detach, so the zombie reports
+                        * to its real parent now.
+                        */
+                       detach_zombie(tsk, p, state);
                err = p->pid;
-
-               /*
-                * If this was a non-death report, the child might now be
-                * detaching on death in the same race possible in the
-                * p->exit_state check above.  So check for p->utrace being
-                * NULL, then we don't need to update the state any more.
-                */
-               rcu_read_lock();
-               utrace = rcu_dereference(p->utrace);
-               if (likely(utrace != NULL)) {
-                       utrace_lock(utrace);
-                       if (unlikely(state->u.live.reported))
-                               /*
-                                * Another thread in the group got here
-                                * first and reaped it before we locked.
-                                */
-                               err = -ERESTARTNOINTR;
-                       state->u.live.reported = 1;
-                       utrace_unlock(utrace);
-               }
-               rcu_read_unlock();
-
-               if (err > 0 && why != CLD_TRAPPED)
-                       ptrace_detach(p, state->engine);
        }
 
+       put_task_struct(p);
+
        return err;
 }
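
The exit_code handling above follows the classic wait status encoding: a stop is (signal << 8) | 0x7f, a normal exit is (code << 8), and a fatal signal sits in the low seven bits (0x80 marks a core dump). A tiny sketch of decoding it portably:

#include <stdio.h>
#include <signal.h>
#include <sys/wait.h>

/* Decode a raw wait(2) status the way ptrace_do_wait encodes it. */
static void describe_status(int status)
{
	if (WIFSTOPPED(status))
		printf("stopped by signal %d\n", WSTOPSIG(status));	/* (sig << 8) | 0x7f */
	else if (WIFEXITED(status))
		printf("exited with code %d\n", WEXITSTATUS(status));	/* code << 8 */
	else if (WIFSIGNALED(status))
		printf("killed by signal %d\n", WTERMSIG(status));	/* low 7 bits; 0x80 = core */
}

int main(void)
{
	describe_status((SIGTRAP << 8) | 0x7f);	/* a ptrace stop on SIGTRAP */
	describe_status(42 << 8);		/* exit(42) */
	return 0;
}
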
 
+
+/*
+ * All the report callbacks (except death and reap) are subject to a race
+ * with ptrace_exit doing a quick detach and ptrace_done.  It can do this
+ * even when the target is not quiescent, so a callback may already be in
+ * progress when it does ptrace_done.  Callbacks use this function to fetch
+ * the struct ptrace_state while ensuring it doesn't disappear until
+ * put_ptrace_state is called.  This just uses RCU, since state and
+ * anything we try to do to state->parent is safe under rcu_read_lock.
+ */
+static struct ptrace_state *
+get_ptrace_state(struct utrace_attached_engine *engine,
+                struct task_struct *tsk)
+{
+       struct ptrace_state *state;
+
+       rcu_read_lock();
+       state = rcu_dereference((struct ptrace_state *) engine->data);
+       if (likely(state != NULL))
+               return state;
+
+       rcu_read_unlock();
+       return NULL;
+}
+
+static inline void
+put_ptrace_state(struct ptrace_state *state)
+{
+       rcu_read_unlock();
+}
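
get_ptrace_state returns with rcu_read_lock still held, so every successful call must be paired with put_ptrace_state before the caller may block. A hedged sketch of that caller-side discipline, using a hypothetical callback rather than one from the patch:

/* Hypothetical callback showing the pairing rule; not part of the patch. */
static u32 example_report(struct utrace_attached_engine *engine,
			  struct task_struct *tsk)
{
	struct ptrace_state *state = get_ptrace_state(engine, tsk);

	if (unlikely(state == NULL))		/* raced with ptrace_exit's detach */
		return UTRACE_ACTION_RESUME;

	/* rcu_read_lock is held: only non-blocking work with state here. */
	/* ... inspect state->options, notify state->parent, etc. ... */

	put_ptrace_state(state);		/* drops rcu_read_lock */
	return UTRACE_ACTION_RESUME;
}
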
+
+
 static void
 do_notify(struct task_struct *tsk, struct task_struct *parent, int why)
 {
@@ -1147,6 +1347,10 @@ do_notify(struct task_struct *tsk, struct task_struct *parent, int why)
                }
        }
 
+       read_lock(&tasklist_lock);
+       if (unlikely(parent->signal == NULL))
+               goto out;
+
        sighand = parent->sighand;
        spin_lock_irqsave(&sighand->siglock, flags);
        if (sighand->action[SIGCHLD-1].sa.sa_handler != SIG_IGN &&
@@ -1157,30 +1361,33 @@ do_notify(struct task_struct *tsk, struct task_struct *parent, int why)
         */
        wake_up_interruptible_sync(&parent->signal->wait_chldexit);
        spin_unlock_irqrestore(&sighand->siglock, flags);
+
+out:
+       read_unlock(&tasklist_lock);
 }
 
 static u32
-ptrace_report(struct utrace_attached_engine *engine, struct task_struct *tsk,
+ptrace_report(struct utrace_attached_engine *engine,
+             struct task_struct *tsk,
+             struct ptrace_state *state,
              int code)
 {
-       struct ptrace_state *state = (struct ptrace_state *) engine->data;
        const struct utrace_regset *regset;
 
-#ifdef PTRACE_DEBUG
-       printk("%d ptrace_report %d engine %p state %p code %x parent %d (%p)\n",
-              current->pid, tsk->pid, engine, state, code,
-              state->parent->pid, state->parent);
-       if (!state->u.live.have_eventmsg && state->u.live.u.siginfo) {
-               const siginfo_t *si = state->u.live.u.siginfo;
-               printk("  si %d code %x errno %d addr %p\n",
-                      si->si_signo, si->si_code, si->si_errno,
-                      si->si_addr);
+       pr_debug("%d ptrace_report %d engine %p"
+                " state %p code %x parent %d (%p)\n",
+                current->pid, tsk->pid, engine, state, code,
+                state->parent->pid, state->parent);
+       if (!state->have_eventmsg && state->u.siginfo) {
+               const siginfo_t *si = state->u.siginfo;
+               pr_debug("  si %d code %x errno %d addr %p\n",
+                        si->si_signo, si->si_code, si->si_errno,
+                        si->si_addr);
        }
-#endif
 
        /*
         * Set our QUIESCE flag right now, before notifying the tracer.
-        * We do this before setting state->u.live.stopped rather than
+        * We do this before setting tsk->exit_code rather than
         * by using UTRACE_ACTION_NEWSTATE in our return value, to
         * ensure that the tracer can't get the notification and then
         * try to resume us with PTRACE_CONT before we set the flag.
@@ -1197,43 +1404,57 @@ ptrace_report(struct utrace_attached_engine *engine, struct task_struct *tsk,
        if (regset->writeback)
                (*regset->writeback)(tsk, regset, 0);
 
-       state->u.live.stopped = 1;
-       state->u.live.reported = 0;
+       BUG_ON(code == 0);
        tsk->exit_code = code;
        do_notify(tsk, state->parent, CLD_TRAPPED);
 
-#ifdef PTRACE_DEBUG
-       printk("%d ptrace_report quiescing exit_code %x\n",
-              current->pid, current->exit_code);
-#endif
+       pr_debug("%d ptrace_report quiescing exit_code %x\n",
+                current->pid, current->exit_code);
+
+       put_ptrace_state(state);
 
        return UTRACE_ACTION_RESUME;
 }
 
 static inline u32
-ptrace_event(struct utrace_attached_engine *engine, struct task_struct *tsk,
+ptrace_event(struct utrace_attached_engine *engine,
+            struct task_struct *tsk,
+            struct ptrace_state *state,
             int event)
 {
-       struct ptrace_state *state = (struct ptrace_state *) engine->data;
-       state->u.live.syscall = 0;
-       return ptrace_report(engine, tsk, (event << 8) | SIGTRAP);
+       state->syscall = 0;
+       return ptrace_report(engine, tsk, state, (event << 8) | SIGTRAP);
 }
 
-
+/*
+ * Unlike other report callbacks, this can't be called while ptrace_exit
+ * is doing ptrace_done in parallel, so we don't need get_ptrace_state.
+ */
 static u32
 ptrace_report_death(struct utrace_attached_engine *engine,
                    struct task_struct *tsk)
 {
        struct ptrace_state *state = (struct ptrace_state *) engine->data;
 
-       if (tsk->parent == state->parent) {
+       if (tsk->exit_code == 0 && unlikely(tsk->flags & PF_SIGNALED))
+               /*
+                * This can only mean that tsk->exit_code was clobbered
+                * by ptrace_update or ptrace_do_wait in a race with
+                * an asynchronous wakeup and exit for SIGKILL.
+                */
+               tsk->exit_code = SIGKILL;
+
+       if (tsk->parent == state->parent && tsk->exit_signal != -1) {
                /*
-                * This is a natural child, so we detach and let the normal
+                * This is a natural child (excluding clone siblings of a
+                * child group_leader), so we detach and let the normal
                 * reporting happen once our NOREAP action is gone.  But
                 * first, generate a SIGCHLD for those cases where normal
                 * behavior won't.  A ptrace'd child always generates SIGCHLD.
                 */
-               if (tsk->exit_signal == -1 || !thread_group_empty(tsk))
+               pr_debug("ptrace %d death natural parent %d exit_code %x\n",
+                        tsk->pid, state->parent->pid, tsk->exit_code);
+               if (!thread_group_empty(tsk))
                        do_notify(tsk, state->parent, CLD_EXITED);
                ptrace_state_unlink(state);
                rcu_assign_pointer(engine->data, 0UL);
@@ -1241,8 +1462,17 @@ ptrace_report_death(struct utrace_attached_engine *engine,
                return UTRACE_ACTION_DETACH;
        }
 
-       state->u.live.reported = 0;
+       /*
+        * This might be a second report_death callback for a group leader
+        * that was delayed when its original report_death callback was made.
+        * Repeating do_notify is exactly what we need for that case too.
+        * After the wakeup, ptrace_do_wait will see delay_group_leader false.
+        */
+
+       pr_debug("ptrace %d death notify %d exit_code %x: ",
+                tsk->pid, state->parent->pid, tsk->exit_code);
        do_notify(tsk, state->parent, CLD_EXITED);
+       pr_debug("%d notified %d\n", tsk->pid, state->parent->pid);
        return UTRACE_ACTION_RESUME;
 }
 
@@ -1254,36 +1484,99 @@ static void
 ptrace_report_reap(struct utrace_attached_engine *engine,
                   struct task_struct *tsk)
 {
-       struct ptrace_state *state;
-       rcu_read_lock();
-       state = rcu_dereference((struct ptrace_state *) engine->data);
+       struct ptrace_state *state = get_ptrace_state(engine, tsk);
        if (state != NULL) {
                ptrace_state_unlink(state);
                rcu_assign_pointer(engine->data, 0UL);
                ptrace_done(state);
+               put_ptrace_state(state);
        }
-       rcu_read_unlock();
 }
 
+/*
+ * Start tracing the child.  This has to do put_ptrace_state before it can
+ * do allocation that might block.
+ */
+static void
+ptrace_clone_setup(struct utrace_attached_engine *engine,
+                  struct task_struct *parent,
+                  struct ptrace_state *state,
+                  struct task_struct *child)
+{
+       struct task_struct *tracer;
+       struct utrace_attached_engine *child_engine;
+       struct ptrace_state *child_state;
+       int ret;
+       u8 options;
+       int cap_sys_ptrace;
+
+       tracer = state->parent;
+       options = state->options;
+       cap_sys_ptrace = state->cap_sys_ptrace;
+       get_task_struct(tracer);
+       put_ptrace_state(state);
+
+       child_engine = utrace_attach(child, (UTRACE_ATTACH_CREATE
+                                            | UTRACE_ATTACH_EXCLUSIVE
+                                            | UTRACE_ATTACH_MATCH_OPS),
+                                    &ptrace_utrace_ops, 0UL);
+       if (unlikely(IS_ERR(child_engine))) {
+               BUG_ON(PTR_ERR(child_engine) != -ENOMEM);
+               put_task_struct(tracer);
+               goto nomem;
+       }
+
+       child_state = ptrace_setup(child, child_engine,
+                                  tracer, options, cap_sys_ptrace, NULL);
+
+       put_task_struct(tracer);
+
+       if (unlikely(IS_ERR(child_state))) {
+               (void) utrace_detach(child, child_engine);
+
+               if (PTR_ERR(child_state) == -ENOMEM)
+                       goto nomem;
+
+               /*
+                * Our tracer has started exiting.  It's
+                * too late to set it up to trace the child.
+                */
+               BUG_ON(PTR_ERR(child_state) != -EALREADY);
+       }
+       else {
+               sigaddset(&child->pending.signal, SIGSTOP);
+               set_tsk_thread_flag(child, TIF_SIGPENDING);
+               ret = ptrace_update(child, child_engine, 0, 0);
+
+               /*
+                * The child hasn't run yet, it can't have died already.
+                */
+               BUG_ON(ret);
+       }
+
+       return;
+
+nomem:
+       printk(KERN_ERR "ptrace out of memory, lost child %d of %d",
+              child->pid, parent->pid);
+}
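
ptrace_clone_setup is the kernel side of PTRACE_O_TRACEFORK and friends: the new child is attached to the same tracer and given a pending SIGSTOP before it ever runs. A user-space sketch of what that looks like, assuming standard ptrace(2) event semantics and with error handling trimmed:

#include <stdio.h>
#include <stdlib.h>
#include <signal.h>
#include <unistd.h>
#include <sys/ptrace.h>
#include <sys/wait.h>

int main(void)
{
	pid_t pid = fork();
	if (pid == 0) {
		ptrace(PTRACE_TRACEME, 0, 0, 0);
		raise(SIGSTOP);
		if (fork() == 0)
			_exit(1);			/* grandchild, traced automatically */
		_exit(0);
	}

	int status;
	waitpid(pid, &status, 0);
	ptrace(PTRACE_SETOPTIONS, pid, 0, PTRACE_O_TRACEFORK);
	ptrace(PTRACE_CONT, pid, 0, 0);

	waitpid(pid, &status, 0);			/* PTRACE_EVENT_FORK stop */
	if (status >> 8 == (SIGTRAP | (PTRACE_EVENT_FORK << 8))) {
		unsigned long child;
		ptrace(PTRACE_GETEVENTMSG, pid, 0, &child);
		printf("new tracee %lu, already stopped for us\n", child);
		waitpid((pid_t)child, &status, 0);	/* its initial SIGSTOP stop */
		ptrace(PTRACE_CONT, (pid_t)child, 0, 0);
	}
	ptrace(PTRACE_CONT, pid, 0, 0);
	waitpid(pid, &status, 0);
	return 0;
}
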
 
 static u32
 ptrace_report_clone(struct utrace_attached_engine *engine,
                    struct task_struct *parent,
                    unsigned long clone_flags, struct task_struct *child)
 {
-       struct ptrace_state *state = (struct ptrace_state *) engine->data;
-       struct utrace_attached_engine *child_engine;
-       int event = PTRACE_EVENT_FORK;
-       int option = PTRACE_O_TRACEFORK;
-
-#ifdef PTRACE_DEBUG
-       printk("%d (%p) engine %p ptrace_report_clone child %d (%p) fl %lx\n",
-              parent->pid, parent, engine, child->pid, child, clone_flags);
-#endif
+       int event, option;
+       struct ptrace_state *state = get_ptrace_state(engine, parent);
+       if (unlikely(state == NULL))
+               return UTRACE_ACTION_RESUME;
 
-       if (clone_flags & CLONE_UNTRACED)
-               goto out;
+       pr_debug("%d (%p) engine %p"
+                " ptrace_report_clone child %d (%p) fl %lx\n",
+                parent->pid, parent, engine, child->pid, child, clone_flags);
 
+       event = PTRACE_EVENT_FORK;
+       option = PTRACE_O_TRACEFORK;
        if (clone_flags & CLONE_VFORK) {
                event = PTRACE_EVENT_VFORK;
                option = PTRACE_O_TRACEVFORK;
@@ -1293,45 +1586,34 @@ ptrace_report_clone(struct utrace_attached_engine *engine,
                option = PTRACE_O_TRACECLONE;
        }
 
-       if (!(clone_flags & CLONE_PTRACE) && !(state->u.live.options & option))
-               goto out;
-
-       child_engine = utrace_attach(child, (UTRACE_ATTACH_CREATE
-                                            | UTRACE_ATTACH_EXCLUSIVE
-                                            | UTRACE_ATTACH_MATCH_OPS),
-                                    &ptrace_utrace_ops, 0UL);
-       if (unlikely(IS_ERR(child_engine))) {
-               BUG_ON(PTR_ERR(child_engine) != -ENOMEM);
-               printk(KERN_ERR
-                      "ptrace out of memory, lost child %d of %d",
-                      child->pid, parent->pid);
-       }
-       else {
-               int ret = ptrace_setup(child, child_engine,
-                                      state->parent,
-                                      state->u.live.options,
-                                      state->u.live.cap_sys_ptrace);
-               if (unlikely(ret != 0)) {
-                       BUG_ON(ret != -ENOMEM);
-                       printk(KERN_ERR
-                              "ptrace out of memory, lost child %d of %d",
-                              child->pid, parent->pid);
-                       utrace_detach(child, child_engine);
-               }
-               else {
-                       sigaddset(&child->pending.signal, SIGSTOP);
-                       set_tsk_thread_flag(child, TIF_SIGPENDING);
-                       ptrace_update(child, child_engine, 0);
-               }
+       if (state->options & option) {
+               state->have_eventmsg = 1;
+               state->u.eventmsg = child->pid;
        }
+       else
+               event = 0;
+
+       if (!(clone_flags & CLONE_UNTRACED)
+           && (event || (clone_flags & CLONE_PTRACE))) {
+               /*
+                * Have our tracer start following the child too.
+                */
+               ptrace_clone_setup(engine, parent, state, child);
 
-       if (state->u.live.options & option) {
-               state->u.live.have_eventmsg = 1;
-               state->u.live.u.eventmsg = child->pid;
-               return ptrace_event(engine, parent, event);
+               /*
+                * That did put_ptrace_state, so we have to check
+                * again in case our tracer just started exiting.
+                */
+               state = get_ptrace_state(engine, parent);
+               if (unlikely(state == NULL))
+                       return UTRACE_ACTION_RESUME;
        }
 
-out:
+       if (event)
+               return ptrace_event(engine, parent, state, event);
+
+       put_ptrace_state(state);
+
        return UTRACE_ACTION_RESUME;
 }
 
@@ -1340,10 +1622,13 @@ static u32
 ptrace_report_vfork_done(struct utrace_attached_engine *engine,
                         struct task_struct *parent, pid_t child_pid)
 {
-       struct ptrace_state *state = (struct ptrace_state *) engine->data;
-       state->u.live.have_eventmsg = 1;
-       state->u.live.u.eventmsg = child_pid;
-       return ptrace_event(engine, parent, PTRACE_EVENT_VFORK_DONE);
+       struct ptrace_state *state = get_ptrace_state(engine, parent);
+       if (unlikely(state == NULL))
+               return UTRACE_ACTION_RESUME;
+
+       state->have_eventmsg = 1;
+       state->u.eventmsg = child_pid;
+       return ptrace_event(engine, parent, state, PTRACE_EVENT_VFORK_DONE);
 }
 
 
@@ -1354,20 +1639,31 @@ ptrace_report_signal(struct utrace_attached_engine *engine,
                     const struct k_sigaction *orig_ka,
                     struct k_sigaction *return_ka)
 {
-       struct ptrace_state *state = (struct ptrace_state *) engine->data;
        int signo = info == NULL ? SIGTRAP : info->si_signo;
-       state->u.live.syscall = 0;
-       state->u.live.have_eventmsg = 0;
-       state->u.live.u.siginfo = info;
-       return ptrace_report(engine, tsk, signo) | UTRACE_SIGNAL_IGN;
+       struct ptrace_state *state = get_ptrace_state(engine, tsk);
+       if (unlikely(state == NULL))
+               return UTRACE_ACTION_RESUME;
+
+       state->syscall = 0;
+       state->have_eventmsg = 0;
+       state->u.siginfo = info;
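+       /*
+        * The signal is reported to the tracer before delivery.
+        * UTRACE_SIGNAL_IGN keeps it from being delivered now; the
+        * tracer's resume request (e.g. the signal argument to
+        * PTRACE_CONT) decides what, if anything, is delivered.
+        */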
+       return ptrace_report(engine, tsk, state, signo) | UTRACE_SIGNAL_IGN;
 }
 
 static u32
 ptrace_report_jctl(struct utrace_attached_engine *engine,
                   struct task_struct *tsk, int type)
 {
-       struct ptrace_state *state = (struct ptrace_state *) engine->data;
+       struct ptrace_state *state = get_ptrace_state(engine, tsk);
+       if (unlikely(state == NULL))
+               return UTRACE_ACTION_RESUME;
+
+       pr_debug("ptrace %d jctl notify %d type %x exit_code %x\n",
+                tsk->pid, state->parent->pid, type, tsk->exit_code);
+
        do_notify(tsk, state->parent, type);
+       put_ptrace_state(state);
+
        return UTRACE_JCTL_NOSIGCHLD;
 }
 
@@ -1377,11 +1673,13 @@ ptrace_report_exec(struct utrace_attached_engine *engine,
                   const struct linux_binprm *bprm,
                   struct pt_regs *regs)
 {
-       struct ptrace_state *state = (struct ptrace_state *) engine->data;
-       if (state->u.live.options & PTRACE_O_TRACEEXEC)
-               return ptrace_event(engine, tsk, PTRACE_EVENT_EXEC);
-       state->u.live.syscall = 0;
-       return ptrace_report(engine, tsk, SIGTRAP);
+       struct ptrace_state *state = get_ptrace_state(engine, tsk);
+       if (unlikely(state == NULL))
+               return UTRACE_ACTION_RESUME;
+
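+       /*
+        * Exec is reported as PTRACE_EVENT_EXEC only when
+        * PTRACE_O_TRACEEXEC is set; a plain traced exec traditionally
+        * stops with SIGTRAP.
+        */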
+       return ptrace_event(engine, tsk, state,
+                           (state->options & PTRACE_O_TRACEEXEC)
+                           ? PTRACE_EVENT_EXEC : 0);
 }
 
 static u32
@@ -1389,14 +1687,17 @@ ptrace_report_syscall(struct utrace_attached_engine *engine,
                      struct task_struct *tsk, struct pt_regs *regs,
                      int entry)
 {
-       struct ptrace_state *state = (struct ptrace_state *) engine->data;
+       struct ptrace_state *state = get_ptrace_state(engine, tsk);
+       if (unlikely(state == NULL))
+               return UTRACE_ACTION_RESUME;
+
 #ifdef PTRACE_SYSEMU
-       if (entry && state->u.live.sysemu)
+       if (entry && state->sysemu)
                tracehook_abort_syscall(regs);
 #endif
-       state->u.live.syscall = 1;
-       return ptrace_report(engine, tsk,
-                            ((state->u.live.options & PTRACE_O_TRACESYSGOOD)
+       state->syscall = 1;
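+       /*
+        * With PTRACE_O_TRACESYSGOOD the stop signal is SIGTRAP | 0x80,
+        * letting the tracer tell syscall stops from real SIGTRAPs.
+        */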
+       return ptrace_report(engine, tsk, state,
+                            ((state->options & PTRACE_O_TRACESYSGOOD)
                              ? 0x80 : 0) | SIGTRAP);
 }
 
@@ -1418,20 +1719,24 @@ static u32
 ptrace_report_exit(struct utrace_attached_engine *engine,
                   struct task_struct *tsk, long orig_code, long *code)
 {
-       struct ptrace_state *state = (struct ptrace_state *) engine->data;
-       state->u.live.have_eventmsg = 1;
-       state->u.live.u.eventmsg = *code;
-       return ptrace_event(engine, tsk, PTRACE_EVENT_EXIT);
+       struct ptrace_state *state = get_ptrace_state(engine, tsk);
+       if (unlikely(state == NULL))
+               return UTRACE_ACTION_RESUME;
+
+       state->have_eventmsg = 1;
+       state->u.eventmsg = *code;
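+       /*
+        * Save the exit status as the event message so the tracer can
+        * fetch it with PTRACE_GETEVENTMSG at the PTRACE_EVENT_EXIT stop.
+        */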
+       return ptrace_event(engine, tsk, state, PTRACE_EVENT_EXIT);
 }
 
 static int
 ptrace_unsafe_exec(struct utrace_attached_engine *engine,
                   struct task_struct *tsk)
 {
-       struct ptrace_state *state = (struct ptrace_state *) engine->data;
        int unsafe = LSM_UNSAFE_PTRACE;
-       if (state->u.live.cap_sys_ptrace)
+       struct ptrace_state *state = get_ptrace_state(engine, tsk);
+       if (likely(state != NULL) && state->cap_sys_ptrace)
                unsafe = LSM_UNSAFE_PTRACE_CAP;
+       put_ptrace_state(state);
        return unsafe;
 }
 
@@ -1439,16 +1744,13 @@ static struct task_struct *
 ptrace_tracer_task(struct utrace_attached_engine *engine,
                   struct task_struct *target)
 {
-       struct ptrace_state *state;
-
-       /*
-        * This call is not necessarily made by the target task,
-        * so ptrace might be getting detached while we run here.
-        * The state pointer will be NULL if that happens.
-        */
-       state = rcu_dereference((struct ptrace_state *) engine->data);
-
-       return state == NULL ? NULL : state->parent;
+       struct task_struct *parent = NULL;
+       struct ptrace_state *state = get_ptrace_state(engine, target);
+       if (likely(state != NULL)) {
+               parent = state->parent;
+               put_ptrace_state(state);
+       }
+       return parent;
 }
 
 static int
@@ -1457,20 +1759,15 @@ ptrace_allow_access_process_vm(struct utrace_attached_engine *engine,
                               struct task_struct *caller)
 {
        struct ptrace_state *state;
-       int ours;
-
-       /*
-        * This call is not necessarily made by the target task,
-        * so ptrace might be getting detached while we run here.
-        * The state pointer will be NULL if that happens.
-        */
-       rcu_read_lock();
-       state = rcu_dereference((struct ptrace_state *) engine->data);
-       ours = (state != NULL
-               && ((engine->flags & UTRACE_ACTION_QUIESCE)
-                   || (target->state == TASK_STOPPED))
-               && state->parent == caller);
-       rcu_read_unlock();
+       int ours = 0;
+
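+       /*
+        * Allow access only while the target is stopped for its tracer
+        * (quiescent or in TASK_STOPPED) and the caller is that tracer.
+        */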
+       state = get_ptrace_state(engine, target);
+       if (likely(state != NULL)) {
+               ours = (((engine->flags & UTRACE_ACTION_QUIESCE)
+                        || target->state == TASK_STOPPED)
+                       && state->parent == caller);
+               put_ptrace_state(state);
+       }
 
        return ours && security_ptrace(caller, target) == 0;
 }
@@ -1492,5 +1789,3 @@ static const struct utrace_engine_ops ptrace_utrace_ops =
        .tracer_task = ptrace_tracer_task,
        .allow_access_process_vm = ptrace_allow_access_process_vm,
 };
-
-#endif