fedora core 6 1.2949 + vserver 2.2.0
[linux-2.6.git] / kernel / ptrace.c
index cc576b6..37118d7 100644 (file)
 #include <linux/ptrace.h>
 #include <linux/security.h>
 #include <linux/signal.h>
-#include <linux/vs_cvirt.h>
-
+#include <linux/utrace.h>
+#include <linux/tracehook.h>
+#include <linux/vs_context.h>
+#include <asm/tracehook.h>
 #include <asm/pgtable.h>
 #include <asm/uaccess.h>
 
-/*
- * ptrace a task: make the debugger its new parent and
- * move it to the ptrace list.
- *
- * Must be called with the tasklist lock write-held.
- */
-void __ptrace_link(task_t *child, task_t *new_parent)
+/*
+ * Per-tracee ptrace bookkeeping.  One of these is filled in by
+ * ptrace_setup, linked on the tracer's ->ptracees list, published in
+ * engine->data, and released through an RCU callback by ptrace_done.
+ */
+struct ptrace_state
 {
-       if (!list_empty(&child->ptrace_list))
-               BUG();
-       if (child->parent == new_parent)
-               return;
-       list_add(&child->ptrace_list, &child->parent->ptrace_children);
-       REMOVE_LINKS(child);
-       child->parent = new_parent;
-       SET_LINKS(child);
-}
+       struct rcu_head rcu;    /* For the deferred kfree in ptrace_done.  */
  
-/*
- * Turn a tracing stop into a normal stop now, since with no tracer there
- * would be no way to wake it up with SIGCONT or SIGKILL.  If there was a
- * signal sent that would resume the child, but didn't because it was in
- * TASK_TRACED, resume it now.
- * Requires that irqs be disabled.
- */
-void ptrace_untrace(task_t *child)
-{
-       spin_lock(&child->sighand->siglock);
-       if (child->state == TASK_TRACED) {
-               if (child->signal->flags & SIGNAL_STOP_STOPPED) {
-                       child->state = TASK_STOPPED;
-               } else {
-                       signal_wake_up(child, 1);
-               }
-       }
-       spin_unlock(&child->sighand->siglock);
+       /*
+        * These elements are always available, even when the struct is
+        * awaiting destruction at the next RCU callback point.
+        */
+       struct utrace_attached_engine *engine;
+       struct task_struct *task; /* Target task.  */
+       struct task_struct *parent; /* Whom we report to.  */
+       struct list_head entry; /* Entry on parent->ptracees list.  */
+
+       u8 options;             /* PTRACE_SETOPTIONS bits.  */
+       unsigned int syscall:1; /* Reporting for syscall.  */
+#ifdef PTRACE_SYSEMU
+       unsigned int sysemu:1;  /* PTRACE_SYSEMU in progress. */
+#endif
+       unsigned int have_eventmsg:1; /* u.eventmsg valid. */
+       unsigned int cap_sys_ptrace:1; /* Tracer capable.  */
+
+       /*
+        * u.eventmsg is the live member when have_eventmsg is set;
+        * otherwise u.siginfo is used (it may be NULL).
+        */
+       union
+       {
+               unsigned long eventmsg;
+               siginfo_t *siginfo;
+       } u;
+};
+
+static const struct utrace_engine_ops ptrace_utrace_ops; /* Initialized below. */
+
+/*
+ * Take STATE off its tracer's ->ptracees list.  The removal is done
+ * under the tracer's task_lock using the RCU list primitive.
+ * STATE itself is not freed here.
+ */
+static void
+ptrace_state_unlink(struct ptrace_state *state)
+{
+       task_lock(state->parent);
+       list_del_rcu(&state->entry);
+       task_unlock(state->parent);
 }
 
-/*
- * unptrace a task: move it back to its original parent and
- * remove it from the ptrace list.
- *
- * Must be called with the tasklist lock write-held.
- */
-void __ptrace_unlink(task_t *child)
+/*
+ * Fill in a ptrace_state for TARGET traced by PARENT through ENGINE,
+ * link it on PARENT's ->ptracees list and publish it in engine->data.
+ *
+ * STATE may be a caller-preallocated struct, or NULL to allocate one
+ * here.  Returns the state on success, ERR_PTR(-ENOMEM) if allocation
+ * fails, or ERR_PTR(-EALREADY) if PARENT is already exiting.  In the
+ * -EALREADY case the state is freed here, even when the caller
+ * supplied it.
+ */
+static struct ptrace_state *
+ptrace_setup(struct task_struct *target, struct utrace_attached_engine *engine,
+            struct task_struct *parent, u8 options, int cap_sys_ptrace,
+            struct ptrace_state *state)
 {
-       BUG_ON(!child->ptrace);
+       if (state == NULL) {
+               state = kzalloc(sizeof *state, GFP_USER);
+               if (unlikely(state == NULL))
+                       return ERR_PTR(-ENOMEM);
+       }
 
-       child->ptrace = 0;
-       if (!list_empty(&child->ptrace_list)) {
-               list_del_init(&child->ptrace_list);
-               REMOVE_LINKS(child);
-               child->parent = child->real_parent;
-               SET_LINKS(child);
+       state->engine = engine;
+       state->task = target;
+       state->parent = parent;
+       state->options = options;
+       state->cap_sys_ptrace = cap_sys_ptrace;
+
+       /* The PF_EXITING check and the list insertion share one task_lock.  */
+       task_lock(parent);
+       if (unlikely(parent->flags & PF_EXITING)) {
+               task_unlock(parent);
+               kfree(state);
+               return ERR_PTR(-EALREADY);
        }
+       list_add_rcu(&state->entry, &state->parent->ptracees);
+       task_unlock(state->parent);
 
-       if (child->state == TASK_TRACED)
-               ptrace_untrace(child);
+       /* The engine must not already carry a state.  */
+       BUG_ON(engine->data != 0);
+       rcu_assign_pointer(engine->data, (unsigned long) state);
+
+       return state;
+}
+
+/*
+ * RCU callback queued by ptrace_done: release the ptrace_state that
+ * embeds RHEAD as its rcu member.
+ */
+static void
+ptrace_state_free(struct rcu_head *rhead)
+{
+       kfree(container_of(rhead, struct ptrace_state, rcu));
+}
+
+/*
+ * Queue STATE to be freed by ptrace_state_free via call_rcu.
+ * The callers in this file take STATE off the tracer's list with
+ * ptrace_state_unlink first.
+ */
+static void
+ptrace_done(struct ptrace_state *state)
+{
+       INIT_RCU_HEAD(&state->rcu);
+       call_rcu(&state->rcu, ptrace_state_free);
 }
 
 /*
- * Check that we have indeed attached to the thing..
+ * Update the tracing engine state to match the new ptrace state.
+ *
+ * TARGET is the traced task and ENGINE its ptrace engine, whose ->data
+ * holds the struct ptrace_state.  FLAGS are the utrace flags the caller
+ * asks for; the events ptrace always needs, plus those selected by the
+ * PTRACE_SETOPTIONS bits, are or'ed in here.  FROM_STOPPED is nonzero
+ * when the caller is letting TARGET resume from a ptrace stop.
+ *
+ * Returns the result of utrace_set_flags.
  */
-int ptrace_check_attach(struct task_struct *child, int kill)
+static int __must_check
+ptrace_update(struct task_struct *target,
+             struct utrace_attached_engine *engine,
+             unsigned long flags, int from_stopped)
 {
-       int ret = -ESRCH;
+       struct ptrace_state *state = (struct ptrace_state *) engine->data;
 
        /*
-        * We take the read lock around doing both checks to close a
-        * possible race where someone else was tracing our child and
-        * detached between these two checks.  After this locked check,
-        * we are sure that this is our traced child and that can only
-        * be changed by us so it's not changing right after this.
+        * These events are always reported.
         */
-       read_lock(&tasklist_lock);
-       if ((child->ptrace & PT_PTRACED) && child->parent == current &&
-           (!(child->ptrace & PT_ATTACHED) || child->real_parent != current)
-           && child->signal != NULL) {
-               ret = 0;
-               spin_lock_irq(&child->sighand->siglock);
-               if (child->state == TASK_STOPPED) {
-                       child->state = TASK_TRACED;
-               } else if (child->state != TASK_TRACED && !kill) {
-                       ret = -ESRCH;
-               }
-               spin_unlock_irq(&child->sighand->siglock);
-       }
-       read_unlock(&tasklist_lock);
+       flags |= (UTRACE_EVENT(DEATH) | UTRACE_EVENT(EXEC)
+                 | UTRACE_EVENT_SIGNAL_ALL | UTRACE_EVENT(JCTL));
+
+       /*
+        * We always have to examine clone events to check for CLONE_PTRACE.
+        */
+       flags |= UTRACE_EVENT(CLONE);
 
-       if (!ret && !kill) {
-               wait_task_inactive(child);
+       /*
+        * PTRACE_SETOPTIONS can request more events.
+        */
+       if (state->options & PTRACE_O_TRACEEXIT)
+               flags |= UTRACE_EVENT(EXIT);
+       if (state->options & PTRACE_O_TRACEVFORKDONE)
+               flags |= UTRACE_EVENT(VFORK_DONE);
+
+       /*
+        * ptrace always inhibits normal parent reaping.
+        * But for a corner case we sometimes see the REAP event anyway.
+        */
+       flags |= UTRACE_ACTION_NOREAP | UTRACE_EVENT(REAP);
+
+       if (from_stopped && !(flags & UTRACE_ACTION_QUIESCE)) {
+               /*
+                * We're letting the thread resume from ptrace stop.
+                * If SIGKILL is waking it up, it can be racing with us here
+                * to set its own exit_code in do_exit.  Though we clobber
+                * it here, we check for the case in ptrace_report_death.
+                */
+               if (!unlikely(target->flags & PF_SIGNALED))
+                       target->exit_code = 0;
+
+               /*
+                * Forget the saved siginfo pointer, unless the union
+                * currently holds an event message instead.
+                */
+               if (!state->have_eventmsg)
+                       state->u.siginfo = NULL;
+
+               if (target->state == TASK_STOPPED) {
+                       /*
+                        * We have to double-check for naughty de_thread
+                        * reaping despite NOREAP, before we can get siglock.
+                        */
+                       read_lock(&tasklist_lock);
+                       if (!target->exit_state) {
+                               spin_lock_irq(&target->sighand->siglock);
+                               if (target->state == TASK_STOPPED)
+                                       target->signal->flags &=
+                                               ~SIGNAL_STOP_STOPPED;
+                               spin_unlock_irq(&target->sighand->siglock);
+                       }
+                       read_unlock(&tasklist_lock);
+               }
        }
 
-       /* All systems go.. */
-       return ret;
+       return utrace_set_flags(target, engine, flags);
 }
 
+/*
+ * PTRACE_TRACEME: attach a ptrace engine to the calling task on behalf
+ * of its parent.
+ *
+ * Returns 0 on success.  An -EEXIST from utrace_attach is reported as
+ * -EPERM; other utrace_attach errors, -ENOMEM and a security_ptrace
+ * refusal are passed back.  Losing a race with the parent's exit is
+ * reported as success, with nothing left attached.
+ */
-static int may_attach(struct task_struct *task)
+static int ptrace_traceme(void)
 {
-       if (!task->mm)
-               return -EPERM;
-       if (((current->uid != task->euid) ||
-            (current->uid != task->suid) ||
-            (current->uid != task->uid) ||
-            (current->gid != task->egid) ||
-            (current->gid != task->sgid) ||
-            (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE))
-               return -EPERM;
-       smp_rmb();
-       if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE))
-               return -EPERM;
+       struct utrace_attached_engine *engine;
+       struct ptrace_state *state;
+       struct task_struct *parent;
+       int retval;
 
-       return security_ptrace(current, task);
-}
+       engine = utrace_attach(current, (UTRACE_ATTACH_CREATE
+                                        | UTRACE_ATTACH_EXCLUSIVE
+                                        | UTRACE_ATTACH_MATCH_OPS),
+                              &ptrace_utrace_ops, 0UL);
 
-int ptrace_may_attach(struct task_struct *task)
-{
-       int err;
-       task_lock(task);
-       err = may_attach(task);
-       task_unlock(task);
-       return !err;
+       if (IS_ERR(engine)) {
+               retval = PTR_ERR(engine);
+               if (retval == -EEXIST)
+                       retval = -EPERM;
+       }
+       else {
+               /*
+                * We need to preallocate so that we can hold
+                * rcu_read_lock from extracting ->parent through
+                * ptrace_setup using it.
+                */
+               state = kzalloc(sizeof *state, GFP_USER);
+               if (unlikely(state == NULL)) {
+                       (void) utrace_detach(current, engine);
+                       printk(KERN_ERR
+                              "ptrace out of memory, lost child %d of %d\n",
+                              current->pid, current->parent->pid);
+                       return -ENOMEM;
+               }
+
+               rcu_read_lock();
+               parent = rcu_dereference(current->parent);
+
+               task_lock(current);
+               retval = security_ptrace(parent, current);
+               task_unlock(current);
+
+               if (retval) {
+                       kfree(state);
+                       (void) utrace_detach(current, engine);
+               }
+               else {
+                       state = ptrace_setup(current, engine, parent, 0, 0,
+                                            state);
+                       if (IS_ERR(state)) {
+                               /*
+                                * ptrace_setup has already freed the state.
+                                * Drop the engine too, so it is not left
+                                * attached with nothing behind it.
+                                */
+                               retval = PTR_ERR(state);
+                               (void) utrace_detach(current, engine);
+                       }
+               }
+               rcu_read_unlock();
+
+               if (!retval) {
+                       /*
+                        * This can't fail because we can't die while we
+                        * are here doing this.
+                        */
+                       retval = ptrace_update(current, engine, 0, 0);
+                       BUG_ON(retval);
+               }
+               else if (unlikely(retval == -EALREADY))
+                       /*
+                        * We raced with our parent's exit, which would
+                        * have detached us just after our attach if
+                        * we'd won the race.  Pretend we got attached
+                        * and then detached immediately, no error.
+                        */
+                       retval = 0;
+       }
+
+       return retval;
 }
 
+/*
+ * PTRACE_ATTACH: make the caller the ptracer of TASK and send TASK a
+ * SIGSTOP.
+ *
+ * Returns 0 on success.  -EPERM is returned for pid <= 1, for a task in
+ * the caller's own thread group, for a task without an mm, when
+ * utrace_attach reports -EEXIST, or when ptrace_may_attach refuses.
+ * -ESRCH is returned if TASK died before any callbacks were enabled.
+ * Other errors from utrace_attach or ptrace_setup are passed back.
+ */
-int ptrace_attach(struct task_struct *task)
+static int ptrace_attach(struct task_struct *task)
 {
+       struct utrace_attached_engine *engine;
+       struct ptrace_state *state;
        int retval;
 
        retval = -EPERM;
        if (task->pid <= 1)
-               goto out;
+               goto bad;
        if (task->tgid == current->tgid)
-               goto out;
+               goto bad;
+       if (!task->mm)          /* kernel threads */
+               goto bad;
 
-repeat:
-       /*
-        * Nasty, nasty.
-        *
-        * We want to hold both the task-lock and the
-        * tasklist_lock for writing at the same time.
-        * But that's against the rules (tasklist_lock
-        * is taken for reading by interrupts on other
-        * cpu's that may have task_lock).
-        */
-       task_lock(task);
-       local_irq_disable();
-       if (!write_trylock(&tasklist_lock)) {
-               local_irq_enable();
-               task_unlock(task);
-               do {
-                       cpu_relax();
-               } while (!write_can_lock(&tasklist_lock));
-               goto repeat;
-       }
-
-       /* the same process cannot be attached many times */
-       if (task->ptrace & PT_PTRACED)
+       pr_debug("%d ptrace_attach %d state %lu exit_code %x\n",
+                current->pid, task->pid, task->state, task->exit_code);
+
+       engine = utrace_attach(task, (UTRACE_ATTACH_CREATE
+                                     | UTRACE_ATTACH_EXCLUSIVE
+                                     | UTRACE_ATTACH_MATCH_OPS),
+                              &ptrace_utrace_ops, 0);
+       if (IS_ERR(engine)) {
+               retval = PTR_ERR(engine);
+               if (retval == -EEXIST)
+                       retval = -EPERM;
                goto bad;
-       retval = may_attach(task);
+       }
+
+       pr_debug("%d ptrace_attach %d after utrace_attach: %lu exit_code %x\n",
+                current->pid, task->pid, task->state, task->exit_code);
+
+       /* retval is still -EPERM here if ptrace_may_attach says no.  */
+       if (ptrace_may_attach(task)) {
+               state = ptrace_setup(task, engine, current, 0,
+                                    capable(CAP_SYS_PTRACE), NULL);
+               if (IS_ERR(state))
+                       retval = PTR_ERR(state);
+               else {
+                       retval = ptrace_update(task, engine, 0, 0);
+
+                       pr_debug("%d ptrace_attach %d after ptrace_update (%d)"
+                                " %lu exit_code %x\n",
+                                current->pid, task->pid, retval,
+                                task->state, task->exit_code);
+
+                       if (retval) {
+                               /*
+                                * It died before we enabled any callbacks.
+                                */
+                               if (retval == -EALREADY)
+                                       retval = -ESRCH;
+                               BUG_ON(retval != -ESRCH);
+                               ptrace_state_unlink(state);
+                               ptrace_done(state);
+                       }
+               }
+       }
        if (retval)
-               goto bad;
+               (void) utrace_detach(task, engine);
+       else {
+               int stopped = 0;
 
-       /* Go */
-       task->ptrace |= PT_PTRACED | ((task->real_parent != current)
-                                     ? PT_ATTACHED : 0);
-       if (capable(CAP_SYS_PTRACE))
-               task->ptrace |= PT_PTRACE_CAP;
+               /*
+                * We must double-check that task has not just died and
+                * been reaped (after ptrace_update succeeded).
+                * This happens when exec (de_thread) ignores NOREAP.
+                * We cannot call into the signal code if it's dead.
+                */
+               read_lock(&tasklist_lock);
+               if (likely(!task->exit_state)) {
+                       force_sig_specific(SIGSTOP, task);
 
-       __ptrace_link(task, current);
+                       spin_lock_irq(&task->sighand->siglock);
+                       stopped = (task->state == TASK_STOPPED);
+                       spin_unlock_irq(&task->sighand->siglock);
+               }
+               read_unlock(&tasklist_lock);
+
+               if (stopped) {
+                       const struct utrace_regset *regset;
+
+                       /*
+                        * Set QUIESCE immediately, so we can allow
+                        * ptrace requests while he's in TASK_STOPPED.
+                        */
+                       retval = ptrace_update(task, engine,
+                                              UTRACE_ACTION_QUIESCE, 0);
+                       if (retval)
+                               BUG_ON(retval != -ESRCH);
+                       retval = 0;
+
+                       /*
+                        * Do now the regset 0 writeback that we do on every
+                        * stop, since it's never been done.  On register
+                        * window machines, this makes sure the user memory
+                        * backing the register data is up to date.
+                        */
+                       regset = utrace_regset(task, engine,
+                                              utrace_native_view(task), 0);
+                       if (regset->writeback)
+                               (*regset->writeback)(task, regset, 1);
+               }
 
-       force_sig_specific(SIGSTOP, task);
+               pr_debug("%d ptrace_attach %d complete (%sstopped)"
+                        " state %lu code %x\n",
+                        current->pid, task->pid, stopped ? "" : "not ",
+                        task->state, task->exit_code);
+       }
 
 bad:
-       write_unlock_irq(&tasklist_lock);
-       task_unlock(task);
-out:
        return retval;
 }
 
-void __ptrace_detach(struct task_struct *child, unsigned int data)
+/*
+ * The task might be dying or being reaped in parallel, in which case
+ * engine and state may no longer be valid.  utrace_detach checks for us.
+ *
+ * Detach ENGINE from TASK.  Returns the result of utrace_detach.  Only
+ * when that is 0 is STATE touched: it is unlinked from the tracer's
+ * list, queued for RCU freeing, and waiters on the caller's
+ * wait_chldexit queue are woken.
+ */
+static int ptrace_detach(struct task_struct *task,
+                        struct utrace_attached_engine *engine,
+                        struct ptrace_state *state)
 {
-       child->exit_code = data;
-       /* .. re-parent .. */
-       __ptrace_unlink(child);
-       /* .. and wake it up. */
-       if (child->exit_state != EXIT_ZOMBIE)
-               wake_up_process(child);
-}
 
-int ptrace_detach(struct task_struct *child, unsigned int data)
-{
-       if (!valid_signal(data))
-               return -EIO;
+       int error;
 
-       /* Architecture-specific hardware disable .. */
-       ptrace_disable(child);
+#ifdef HAVE_ARCH_PTRACE_DETACH
+       /*
+        * Some funky compatibility code in arch_ptrace may have
+        * needed to install special state it should clean up now.
+        */
+       arch_ptrace_detach(task);
+#endif
 
-       write_lock_irq(&tasklist_lock);
-       if (child->ptrace)
-               __ptrace_detach(child, data);
-       write_unlock_irq(&tasklist_lock);
+       /*
+        * Traditional ptrace behavior does wake_up_process no matter what
+        * in ptrace_detach.  But utrace_detach will not do a wakeup if
+        * it's in a proper job control stop.  We need it to wake up from
+        * TASK_STOPPED and either resume or process more signals.  A
+        * pending stop signal will just leave it stopped again, but will
+        * consume the signal, and reset task->exit_code for the next wait
+        * call to see.  This is important to userland if ptrace_do_wait
+        * "stole" the previous unwaited-for-ness (clearing exit_code), but
+        * there is a pending SIGSTOP, e.g. sent by a PTRACE_ATTACH done
+        * while already in job control stop.
+        */
+       read_lock(&tasklist_lock);
+       if (likely(task->signal != NULL)) {
+               spin_lock_irq(&task->sighand->siglock);
+               task->signal->flags &= ~SIGNAL_STOP_STOPPED;
+               spin_unlock_irq(&task->sighand->siglock);
+       }
+       read_unlock(&tasklist_lock);
 
-       return 0;
+       error = utrace_detach(task, engine);
+       if (!error) {
+               /*
+                * We can only get here from the ptracer itself or via
+                * detach_zombie from another thread in its group.
+                */
+               BUG_ON(state->parent->tgid != current->tgid);
+               ptrace_state_unlink(state);
+               ptrace_done(state);
+
+               /*
+                * Wake up any other threads that might be blocked in
+                * wait.  Though traditional ptrace does not guarantee
+                * this wakeup on PTRACE_DETACH, it does prevent
+                * erroneous blocking in wait when another racing
+                * thread's wait call reap-detaches the last child.
+                * Without this wakeup, another thread might stay
+                * blocked when it should return -ECHILD.
+                */
+               spin_lock_irq(&current->sighand->siglock);
+               wake_up_interruptible(&current->signal->wait_chldexit);
+               spin_unlock_irq(&current->sighand->siglock);
+       }
+       return error;
 }
 
+
 /*
- * Access another process' address space.
- * Source/target buffer must be kernel space, 
- * Do not walk the page table directly, use get_user_pages
+ * This is called when we are exiting.  We must stop all our ptracing.
+ *
+ * TSK is the exiting tracer.  Each entry on tsk->ptracees is detached
+ * with utrace_detach; on success its state is unlinked and queued for
+ * RCU freeing.  An -EALREADY result makes us wait for that tracee and
+ * rescan the list from the start.  -ESRCH entries are left alone here.
+ * NOTE(review): presumably something else unlinks the -ESRCH entries,
+ * since the final BUG_ON expects an empty list -- confirm.
  */
+void
+ptrace_exit(struct task_struct *tsk)
+{
+       struct list_head *pos, *n;
+
+       /*
+        * Taking the task_lock after PF_EXITING is set ensures that a
+        * child in ptrace_traceme will not put itself on our list when
+        * we might already be tearing it down.
+        */
+       task_lock(tsk);
+       if (likely(list_empty(&tsk->ptracees))) {
+               task_unlock(tsk);
+               return;
+       }
+       task_unlock(tsk);
+
+restart:
+       rcu_read_lock();
+
+       list_for_each_safe_rcu(pos, n, &tsk->ptracees) {
+               struct ptrace_state *state = list_entry(pos,
+                                                       struct ptrace_state,
+                                                       entry);
+               int error = utrace_detach(state->task, state->engine);
+               BUG_ON(state->parent != tsk);
+               if (likely(error == 0)) {
+                       ptrace_state_unlink(state);
+                       ptrace_done(state);
+               }
+               else if (unlikely(error == -EALREADY)) {
+                       /*
+                        * It's still doing report_death callbacks.
+                        * Just wait for it to settle down.
+                        * Since wait_task_inactive might yield,
+                        * we must go out of rcu_read_lock and restart.
+                        * A task reference is held across the wait.
+                        */
+                       struct task_struct *p = state->task;
+                       get_task_struct(p);
+                       rcu_read_unlock();
+                       wait_task_inactive(p);
+                       put_task_struct(p);
+                       goto restart;
+               }
+               else
+                       BUG_ON(error != -ESRCH);
+       }
+
+       rcu_read_unlock();
 
-int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
+       BUG_ON(!list_empty(&tsk->ptracees));
+}
+
+/*
+ * Hand signal SIGNR to TARGET on behalf of the tracer.
+ *
+ * SIGNR of 0 means no signal and returns 0.  An invalid signal number
+ * returns -EIO.  If the state is marked as a syscall report, the signal
+ * is sent with send_sig.  Otherwise, if a siginfo pointer is saved in
+ * the state, the signal is injected through utrace_inject_signal with
+ * UTRACE_ACTION_RESUME and that call's result is returned.  In any
+ * other case nothing is sent and 0 is returned.
+ */
+static int
+ptrace_induce_signal(struct task_struct *target,
+                    struct utrace_attached_engine *engine,
+                    long signr)
 {
-       struct mm_struct *mm;
-       struct vm_area_struct *vma;
-       struct page *page;
-       void *old_buf = buf;
+       struct ptrace_state *state = (struct ptrace_state *) engine->data;
 
-       mm = get_task_mm(tsk);
-       if (!mm)
+       if (signr == 0)
                return 0;
 
-       down_read(&mm->mmap_sem);
-       /* ignore errors, just check how much was sucessfully transfered */
-       while (len) {
-               int bytes, ret, offset;
-               void *maddr;
-
-               ret = get_user_pages(tsk, mm, addr, 1,
-                               write, 1, &page, &vma);
-               if (ret <= 0)
-                       break;
+       if (!valid_signal(signr))
+               return -EIO;
 
-               bytes = len;
-               offset = addr & (PAGE_SIZE-1);
-               if (bytes > PAGE_SIZE-offset)
-                       bytes = PAGE_SIZE-offset;
-
-               maddr = kmap(page);
-               if (write) {
-                       copy_to_user_page(vma, page, addr,
-                                         maddr + offset, buf, bytes);
-                       set_page_dirty_lock(page);
-               } else {
-                       copy_from_user_page(vma, page, addr,
-                                           buf, maddr + offset, bytes);
+       if (state->syscall) {
+               /*
+                * This is the traditional ptrace behavior when given
+                * a signal to resume from a syscall tracing stop.
+                */
+               send_sig(signr, target, 1);
+       }
+       else if (!state->have_eventmsg && state->u.siginfo) {
+               siginfo_t *info = state->u.siginfo;
+
+               /* Update the siginfo structure if the signal has
+                  changed.  If the debugger wanted something
+                  specific in the siginfo structure then it should
+                  have updated *info via PTRACE_SETSIGINFO.  */
+               if (signr != info->si_signo) {
+                       info->si_signo = signr;
+                       info->si_errno = 0;
+                       info->si_code = SI_USER;
+                       info->si_pid = current->pid;
+                       info->si_uid = current->uid;
                }
-               kunmap(page);
-               page_cache_release(page);
-               len -= bytes;
-               buf += bytes;
-               addr += bytes;
+
+               return utrace_inject_signal(target, engine,
+                                           UTRACE_ACTION_RESUME, info, NULL);
        }
-       up_read(&mm->mmap_sem);
-       mmput(mm);
-       
-       return buf - old_buf;
+
+       return 0;
 }
 
-int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len)
+/*
+ * Transfer SIZE bytes at OFFSET within regset SETNO of VIEW between
+ * TARGET and the user buffer DATA.  A nonzero WRITE stores into the
+ * regset through its ->set method; otherwise the regset is read through
+ * ->get.  A SIZE of (unsigned int) -1 means the whole regset.
+ *
+ * Returns -EIO if utrace_regset finds no such regset or if DATA fails
+ * access_ok; otherwise the value returned by the regset method.
+ */
+int
+ptrace_regset_access(struct task_struct *target,
+                    struct utrace_attached_engine *engine,
+                    const struct utrace_regset_view *view,
+                    int setno, unsigned long offset, unsigned int size,
+                    void __user *data, int write)
 {
-       int copied = 0;
+       const struct utrace_regset *regset = utrace_regset(target, engine,
+                                                          view, setno);
+       int ret;
 
-       while (len > 0) {
-               char buf[128];
-               int this_len, retval;
+       if (unlikely(regset == NULL))
+               return -EIO;
 
-               this_len = (len > sizeof(buf)) ? sizeof(buf) : len;
-               retval = access_process_vm(tsk, src, buf, this_len, 0);
-               if (!retval) {
-                       if (copied)
-                               break;
-                       return -EIO;
-               }
-               if (copy_to_user(dst, buf, retval))
-                       return -EFAULT;
-               copied += retval;
-               src += retval;
-               dst += retval;
-               len -= retval;                  
+       if (size == (unsigned int) -1)
+               size = regset->size * regset->n;
+
+       if (write) {
+               if (!access_ok(VERIFY_READ, data, size))
+                       ret = -EIO;
+               else
+                       ret = (*regset->set)(target, regset,
+                                            offset, size, NULL, data);
+       }
+       else {
+               if (!access_ok(VERIFY_WRITE, data, size))
+                       ret = -EIO;
+               else
+                       ret = (*regset->get)(target, regset,
+                                            offset, size, NULL, data);
        }
-       return copied;
+
+       return ret;
 }
 
-int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long dst, int len)
+/*
+ * Read or write the single register REGNO of regset SETNO in VIEW,
+ * using the user buffer DATA.  A nonzero WRITE goes through the
+ * regset's ->set method; otherwise ->get is used.
+ *
+ * Returns -EIO if utrace_regset finds no such regset or if DATA fails
+ * access_ok, -EINVAL if REGNO lies outside [bias, bias + n) for the
+ * regset, and otherwise the value returned by the regset method.
+ */
+int
+ptrace_onereg_access(struct task_struct *target,
+                    struct utrace_attached_engine *engine,
+                    const struct utrace_regset_view *view,
+                    int setno, unsigned long regno,
+                    void __user *data, int write)
 {
-       int copied = 0;
+       const struct utrace_regset *regset = utrace_regset(target, engine,
+                                                          view, setno);
+       unsigned int pos;
+       int ret;
 
-       while (len > 0) {
-               char buf[128];
-               int this_len, retval;
+       if (unlikely(regset == NULL))
+               return -EIO;
 
-               this_len = (len > sizeof(buf)) ? sizeof(buf) : len;
-               if (copy_from_user(buf, src, this_len))
-                       return -EFAULT;
-               retval = access_process_vm(tsk, dst, buf, this_len, 1);
-               if (!retval) {
-                       if (copied)
-                               break;
-                       return -EIO;
-               }
-               copied += retval;
-               src += retval;
-               dst += retval;
-               len -= retval;                  
+       if (regno < regset->bias || regno >= regset->bias + regset->n)
+               return -EINVAL;
+
+       /* Byte position of the register within the regset.  */
+       pos = (regno - regset->bias) * regset->size;
+
+       if (write) {
+               if (!access_ok(VERIFY_READ, data, regset->size))
+                       ret = -EIO;
+               else
+                       ret = (*regset->set)(target, regset, pos, regset->size,
+                                            NULL, data);
+       }
+       else {
+               if (!access_ok(VERIFY_WRITE, data, regset->size))
+                       ret = -EIO;
+               else
+                       ret = (*regset->get)(target, regset, pos, regset->size,
+                                            NULL, data);
        }
-       return copied;
+
+       return ret;
 }
 
-static int ptrace_setoptions(struct task_struct *child, long data)
+/*
+ * Copy SIZE bytes at offset ADDR of a ptrace-visible layout to or from
+ * the target's regsets.  LAYOUT is an array of segments ended by an entry
+ * whose end is zero.  Each segment maps [start, end) onto a regset
+ * beginning at seg->offset, or, when seg->regset is -1, onto a portion
+ * that has no backing data.  The caller's buffer is KDATA when that is
+ * non-NULL and the user pointer UDATA otherwise.  WRITE nonzero stores
+ * into the target; zero fetches from it.
+ *
+ * Returns 0 on success; -EIO when the user buffer fails access_ok, ADDR is
+ * not covered by LAYOUT, the regset is unavailable, or the position is
+ * misaligned or out of range; -EFAULT if clear_user fails; otherwise
+ * whatever the regset's get or set call returns.
+ */
+int
+ptrace_layout_access(struct task_struct *target,
+                    struct utrace_attached_engine *engine,
+                    const struct utrace_regset_view *view,
+                    const struct ptrace_layout_segment layout[],
+                    unsigned long addr, unsigned int size,
+                    void __user *udata, void *kdata, int write)
 {
-       child->ptrace &= ~PT_TRACE_MASK;
-
-       if (data & PTRACE_O_TRACESYSGOOD)
-               child->ptrace |= PT_TRACESYSGOOD;
+       const struct ptrace_layout_segment *seg;
+       int ret = -EIO;
 
-       if (data & PTRACE_O_TRACEFORK)
-               child->ptrace |= PT_TRACE_FORK;
+       /*
+        * A write into the target reads the user buffer, hence
+        * VERIFY_READ for writes and VERIFY_WRITE for reads.
+        */
+       if (kdata == NULL &&
+           !access_ok(write ? VERIFY_READ : VERIFY_WRITE, udata, size))
+               return -EIO;
 
-       if (data & PTRACE_O_TRACEVFORK)
-               child->ptrace |= PT_TRACE_VFORK;
+       seg = layout;
+       do {
+               unsigned int pos, n;
 
-       if (data & PTRACE_O_TRACECLONE)
-               child->ptrace |= PT_TRACE_CLONE;
+               /*
+                * Skip segments ending at or before ADDR.  The terminating
+                * entry (end == 0) stops the scan and then fails the
+                * range check below.
+                */
+               while (addr >= seg->end && seg->end != 0)
+                       ++seg;
 
-       if (data & PTRACE_O_TRACEEXEC)
-               child->ptrace |= PT_TRACE_EXEC;
+               if (addr < seg->start || addr >= seg->end)
+                       return -EIO;
 
-       if (data & PTRACE_O_TRACEVFORKDONE)
-               child->ptrace |= PT_TRACE_VFORK_DONE;
+               /*
+                * POS is the position handed to the regset; N is how much
+                * of the remaining request fits in this segment.
+                */
+               pos = addr - seg->start + seg->offset;
+               n = min(size, seg->end - (unsigned int) addr);
+
+               if (unlikely(seg->regset == (unsigned int) -1)) {
+                       /*
+                        * This is a no-op/zero-fill portion of struct user.
+                        */
+                       ret = 0;
+                       /*
+                        * Only a read of a segment whose offset is zero is
+                        * zero-filled; writes and other offsets leave the
+                        * buffer untouched.
+                        */
+                       if (!write && seg->offset == 0) {
+                               if (kdata)
+                                       memset(kdata, 0, n);
+                               else if (clear_user(udata, n))
+                                       ret = -EFAULT;
+                       }
+               }
+               else {
+                       unsigned int align;
+                       const struct utrace_regset *regset = utrace_regset(
+                               target, engine, view, seg->regset);
+                       if (unlikely(regset == NULL))
+                               return -EIO;
+
+                       /*
+                        * A ptrace compatibility layout can do a misaligned
+                        * regset access, e.g. word access to larger data.
+                        * An arch's compat layout can be this way only if
+                        * it is actually ok with the regset code despite the
+                        * regset->align setting.
+                        */
+                       align = min(regset->align, size);
+                       if ((pos & (align - 1))
+                           || pos >= regset->n * regset->size)
+                               return -EIO;
+
+                       if (write)
+                               ret = (*regset->set)(target, regset,
+                                                    pos, n, kdata, udata);
+                       else
+                               ret = (*regset->get)(target, regset,
+                                                    pos, n, kdata, udata);
+               }
 
-       if (data & PTRACE_O_TRACEEXIT)
-               child->ptrace |= PT_TRACE_EXIT;
+               /* Advance whichever buffer is in use, and the request.  */
+               if (kdata)
+                       kdata += n;
+               else
+                       udata += n;
+               addr += n;
+               size -= n;
+       } while (ret == 0 && size > 0);
 
-       return (data & ~PTRACE_O_MASK) ? -EINVAL : 0;
+       return ret;
 }
 
-static int ptrace_getsiginfo(struct task_struct *child, siginfo_t __user * data)
+
+/*
+ * Common front end of the ptrace system call entry points.
+ *
+ * PTRACE_TRACEME and PTRACE_ATTACH are handled completely here and their
+ * result is returned.  For every other request this looks up the task for
+ * PID, takes a reference on it, and checks that the calling task is the
+ * parent recorded in its ptrace_state and (except for PTRACE_KILL) that
+ * the engine has UTRACE_ACTION_QUIESCE set.
+ *
+ * NOTE: a return value of -EIO means success: *CHILDP, *ENGINEP and
+ * *STATEP have been filled in and the task reference is still held, so
+ * the caller must do put_task_struct when finished.  Any other return
+ * value is final; the output pointers are not written and no reference
+ * is held.
+ */
+static int
+ptrace_start(long pid, long request,
+            struct task_struct **childp,
+            struct utrace_attached_engine **enginep,
+            struct ptrace_state **statep)
+
 {
-       siginfo_t lastinfo;
-       int error = -ESRCH;
+       struct task_struct *child;
+       struct utrace_attached_engine *engine;
+       struct ptrace_state *state;
+       int ret;
+
+       if (request == PTRACE_TRACEME)
+               return ptrace_traceme();
 
+       ret = -ESRCH;
        read_lock(&tasklist_lock);
-       if (likely(child->sighand != NULL)) {
-               error = -EINVAL;
-               spin_lock_irq(&child->sighand->siglock);
-               if (likely(child->last_siginfo != NULL)) {
-                       lastinfo = *child->last_siginfo;
-                       error = 0;
-               }
-               spin_unlock_irq(&child->sighand->siglock);
-       }
+       child = find_task_by_pid(pid);
+       if (child)
+               get_task_struct(child);
        read_unlock(&tasklist_lock);
-       if (!error)
-               return copy_siginfo_to_user(data, &lastinfo);
-       return error;
-}
+       pr_debug("ptrace pid %ld => %p\n", pid, child);
+       if (!child)
+               goto out;
 
-static int ptrace_setsiginfo(struct task_struct *child, siginfo_t __user * data)
-{
-       siginfo_t newinfo;
-       int error = -ESRCH;
+       ret = -EPERM;
+       if (pid == 1)           /* you may not mess with init */
+               goto out_tsk;
 
-       if (copy_from_user(&newinfo, data, sizeof (siginfo_t)))
-               return -EFAULT;
+       ret = -EPERM;
+       if (!vx_check(vx_task_xid(child), VS_WATCH_P|VS_IDENT))
+               goto out_tsk;
 
-       read_lock(&tasklist_lock);
-       if (likely(child->sighand != NULL)) {
-               error = -EINVAL;
-               spin_lock_irq(&child->sighand->siglock);
-               if (likely(child->last_siginfo != NULL)) {
-                       *child->last_siginfo = newinfo;
-                       error = 0;
-               }
-               spin_unlock_irq(&child->sighand->siglock);
+       if (request == PTRACE_ATTACH) {
+               ret = ptrace_attach(child);
+               goto out_tsk;
        }
-       read_unlock(&tasklist_lock);
-       return error;
+
+       /*
+        * Find the existing ptrace engine on the child (no new engine is
+        * requested here) and read its state pointer under
+        * rcu_read_lock.  Every failure from here on reports -ESRCH.
+        */
+       rcu_read_lock();
+       engine = utrace_attach(child, UTRACE_ATTACH_MATCH_OPS,
+                              &ptrace_utrace_ops, 0);
+       ret = -ESRCH;
+       if (IS_ERR(engine) || engine == NULL)
+               goto out_tsk_rcu;
+       state = rcu_dereference((struct ptrace_state *) engine->data);
+       if (state == NULL || state->parent != current)
+               goto out_tsk_rcu;
+       rcu_read_unlock();
+
+       /*
+        * Traditional ptrace behavior demands that the target already be
+        * quiescent, but not dead.
+        */
+       if (request != PTRACE_KILL
+           && !(engine->flags & UTRACE_ACTION_QUIESCE)) {
+               pr_debug("%d not stopped (%lu)\n", child->pid, child->state);
+               goto out_tsk;
+       }
+
+       /*
+        * We do this for all requests to match traditional ptrace behavior.
+        * If the machine state synchronization done at context switch time
+        * includes e.g. writing back to user memory, we want to make sure
+        * that has finished before a PTRACE_PEEKDATA can fetch the results.
+        * On most machines, only regset data is affected by context switch
+        * and calling utrace_regset later on will take care of that, so
+        * this is superfluous.
+        *
+        * To do this purely in utrace terms, we could do:
+        *  (void) utrace_regset(child, engine, utrace_native_view(child), 0);
+        */
+       wait_task_inactive(child);
+
+       if (child->exit_state)
+               goto out_tsk;
+
+       /*
+        * Success: hand everything back, keeping the task reference.
+        * -EIO is the "go on and process the request" value that the
+        * callers test for.
+        */
+       *childp = child;
+       *enginep = engine;
+       *statep = state;
+       return -EIO;
+
+out_tsk_rcu:
+       rcu_read_unlock();
+out_tsk:
+       put_task_struct(child);
+out:
+       return ret;
 }
 
-int ptrace_request(struct task_struct *child, long request,
-                  long addr, long data)
+/*
+ * Handle the requests whose implementation does not depend on the word
+ * size of the caller: PTRACE_DETACH, the requests that resume the child
+ * (PTRACE_KILL, PTRACE_CONT, PTRACE_SYSCALL, the single-step variants and,
+ * where defined, PTRACE_SYSEMU*) and PTRACE_SETOPTIONS.
+ *
+ * Returns 0 on success, the error from ptrace_induce_signal or
+ * ptrace_detach, -EINVAL for unknown option bits, and -EIO (the initial
+ * value of ret) for any request not handled here or for a stepping
+ * request the architecture does not support.
+ */
+static int
+ptrace_common(long request, struct task_struct *child,
+             struct utrace_attached_engine *engine,
+             struct ptrace_state *state,
+             unsigned long addr, long data)
 {
+       unsigned long flags;
        int ret = -EIO;
 
        switch (request) {
+       case PTRACE_DETACH:
+               /*
+                * Detach a process that was attached.
+                */
+               ret = ptrace_induce_signal(child, engine, data);
+               if (!ret) {
+                       ret = ptrace_detach(child, engine, state);
+                       if (ret == -EALREADY) /* Already a zombie.  */
+                               ret = -ESRCH;
+                       if (ret)
+                               BUG_ON(ret != -ESRCH);
+               }
+               break;
+
+               /*
+                * These are the operations that resume the child running.
+                */
+       case PTRACE_KILL:
+               data = SIGKILL;
+               /* Fall through: resume the child with SIGKILL induced.  */
+       case PTRACE_CONT:
+       case PTRACE_SYSCALL:
+#ifdef PTRACE_SYSEMU
+       case PTRACE_SYSEMU:
+       case PTRACE_SYSEMU_SINGLESTEP:
+#endif
+#ifdef PTRACE_SINGLEBLOCK
+       case PTRACE_SINGLEBLOCK:
+               /*
+                * Without block-step support, PTRACE_SINGLEBLOCK bails out
+                * here with ret still -EIO.  The other requests that share
+                * this code path continue below.
+                */
+# ifdef ARCH_HAS_BLOCK_STEP
+               if (! ARCH_HAS_BLOCK_STEP)
+# endif
+                       if (request == PTRACE_SINGLEBLOCK)
+                               break;
+#endif
+       case PTRACE_SINGLESTEP:
+               /*
+                * Likewise the single-step requests fail with -EIO when
+                * the architecture cannot single-step.
+                */
+#ifdef ARCH_HAS_SINGLE_STEP
+               if (! ARCH_HAS_SINGLE_STEP)
+#endif
+                       if (request == PTRACE_SINGLESTEP
+#ifdef PTRACE_SYSEMU_SINGLESTEP
+                           || request == PTRACE_SYSEMU_SINGLESTEP
+#endif
+                               )
+                               break;
+
+               ret = ptrace_induce_signal(child, engine, data);
+               if (ret)
+                       break;
+
+
+               /*
+                * Reset the action flags without QUIESCE, so it resumes.
+                */
+               flags = 0;
+#ifdef PTRACE_SYSEMU
+               state->sysemu = (request == PTRACE_SYSEMU_SINGLESTEP
+                                       || request == PTRACE_SYSEMU);
+#endif
+               if (request == PTRACE_SINGLESTEP
+#ifdef PTRACE_SYSEMU
+                   || request == PTRACE_SYSEMU_SINGLESTEP
+#endif
+                       )
+                       flags |= UTRACE_ACTION_SINGLESTEP;
+#ifdef PTRACE_SINGLEBLOCK
+               else if (request == PTRACE_SINGLEBLOCK)
+                       flags |= UTRACE_ACTION_BLOCKSTEP;
+#endif
+               if (request == PTRACE_SYSCALL)
+                       flags |= UTRACE_EVENT_SYSCALL;
+#ifdef PTRACE_SYSEMU
+               else if (request == PTRACE_SYSEMU
+                        || request == PTRACE_SYSEMU_SINGLESTEP)
+                       flags |= UTRACE_EVENT(SYSCALL_ENTRY);
+#endif
+               /*
+                * The only error tolerated from ptrace_update is -ESRCH,
+                * and even that is reported to the caller as success.
+                */
+               ret = ptrace_update(child, engine, flags, 1);
+               if (ret)
+                       BUG_ON(ret != -ESRCH);
+               ret = 0;
+               break;
+
 #ifdef PTRACE_OLDSETOPTIONS
        case PTRACE_OLDSETOPTIONS:
 #endif
        case PTRACE_SETOPTIONS:
-               ret = ptrace_setoptions(child, data);
+               ret = -EINVAL;
+               if (data & ~PTRACE_O_MASK)
+                       break;
+               state->options = data;
+               /* Keep QUIESCE set: changing options does not resume.  */
+               ret = ptrace_update(child, engine, UTRACE_ACTION_QUIESCE, 1);
+               if (ret)
+                       BUG_ON(ret != -ESRCH);
+               ret = 0;
+               break;
+       }
+
+       return ret;
+}
+
+
+/*
+ * The native ptrace system call.  ptrace_start validates the request and
+ * returns -EIO when there is a tracee to operate on; any other value is
+ * the final result.  The request is then offered to arch_ptrace, and only
+ * if that returns -ENOSYS is it handled by the generic code below.
+ */
+asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
+{
+       struct task_struct *child;
+       struct utrace_attached_engine *engine;
+       struct ptrace_state *state;
+       long ret, val;
+
+       pr_debug("%d sys_ptrace(%ld, %ld, %lx, %lx)\n",
+                current->pid, request, pid, addr, data);
+
+       ret = ptrace_start(pid, request, &child, &engine, &state);
+       if (ret != -EIO)
+               goto out;
+
+       /*
+        * When arch_ptrace succeeds, the value it stored in VAL becomes
+        * the system call's return value.
+        */
+       val = 0;
+       ret = arch_ptrace(&request, child, engine, addr, data, &val);
+       if (ret != -ENOSYS) {
+               if (ret == 0) {
+                       ret = val;
+                       force_successful_syscall_return();
+               }
+               goto out_tsk;
+       }
+
+       switch (request) {
+       default:
+               ret = ptrace_common(request, child, engine, state, addr, data);
                break;
+
+       case PTRACE_PEEKTEXT: /* read word at location addr. */
+       case PTRACE_PEEKDATA: {
+               unsigned long tmp;
+               int copied;
+
+               copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0);
+               ret = -EIO;
+               if (copied != sizeof(tmp))
+                       break;
+               ret = put_user(tmp, (unsigned long __user *) data);
+               break;
+       }
+
+       case PTRACE_POKETEXT: /* write the word at location addr. */
+       case PTRACE_POKEDATA:
+               ret = 0;
+               if (access_process_vm(child, addr, &data, sizeof(data), 1) == sizeof(data))
+                       break;
+               ret = -EIO;
+               break;
+
        case PTRACE_GETEVENTMSG:
-               ret = put_user(child->ptrace_message, (unsigned long __user *) data);
+               /* Reports zero when no event message is recorded.  */
+               ret = put_user(state->have_eventmsg
+                              ? state->u.eventmsg : 0L,
+                              (unsigned long __user *) data);
                break;
        case PTRACE_GETSIGINFO:
-               ret = ptrace_getsiginfo(child, (siginfo_t __user *) data);
+               /*
+                * state->u holds either an event message or a siginfo
+                * pointer; have_eventmsg says which one is valid.
+                */
+               ret = -EINVAL;
+               if (!state->have_eventmsg && state->u.siginfo)
+                       ret = copy_siginfo_to_user((siginfo_t __user *) data,
+                                                  state->u.siginfo);
                break;
        case PTRACE_SETSIGINFO:
-               ret = ptrace_setsiginfo(child, (siginfo_t __user *) data);
+               ret = -EINVAL;
+               if (!state->have_eventmsg && state->u.siginfo) {
+                       ret = 0;
+                       if (copy_from_user(state->u.siginfo,
+                                          (siginfo_t __user *) data,
+                                          sizeof(siginfo_t)))
+                               ret = -EFAULT;
+               }
                break;
+       }
+
+out_tsk:
+       /* Drop the reference ptrace_start left us holding.  */
+       put_task_struct(child);
+out:
+       pr_debug("%d ptrace -> %lx\n", current->pid, ret);
+       return ret;
+}
+
+
+#ifdef CONFIG_COMPAT
+#include <linux/compat.h>
+
+/*
+ * The ptrace system call for compat (32-bit) callers.  It mirrors
+ * sys_ptrace: ptrace_start returns -EIO when there is a tracee to operate
+ * on, the request is offered to arch_compat_ptrace first, and only a
+ * -ENOSYS result from that falls through to the generic handling below,
+ * which uses compat-sized words and the compat siginfo layout.
+ */
+asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
+                                 compat_ulong_t addr, compat_long_t cdata)
+{
+       const unsigned long data = (unsigned long) (compat_ulong_t) cdata;
+       struct task_struct *child;
+       struct utrace_attached_engine *engine;
+       struct ptrace_state *state;
+       compat_long_t ret, val;
+
+       pr_debug("%d compat_sys_ptrace(%d, %d, %x, %x)\n",
+                current->pid, request, pid, addr, cdata);
+       ret = ptrace_start(pid, request, &child, &engine, &state);
+       if (ret != -EIO)
+               goto out;
+
+       /*
+        * When arch_compat_ptrace succeeds, the value it stored in VAL
+        * becomes the system call's return value.
+        */
+       val = 0;
+       ret = arch_compat_ptrace(&request, child, engine, addr, cdata, &val);
+       if (ret != -ENOSYS) {
+               if (ret == 0) {
+                       ret = val;
+                       force_successful_syscall_return();
+               }
+               goto out_tsk;
+       }
+
+       switch (request) {
        default:
+               ret = ptrace_common(request, child, engine, state, addr, data);
+               break;
+
+       case PTRACE_PEEKTEXT: /* read word at location addr. */
+       case PTRACE_PEEKDATA: {
+               compat_ulong_t tmp;
+               int copied;
+
+               copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0);
+               ret = -EIO;
+               if (copied != sizeof(tmp))
+                       break;
+               ret = put_user(tmp, (compat_ulong_t __user *) data);
                break;
        }
 
+       case PTRACE_POKETEXT: /* write the word at location addr. */
+       case PTRACE_POKEDATA:
+               ret = 0;
+               if (access_process_vm(child, addr, &cdata, sizeof(cdata), 1) == sizeof(cdata))
+                       break;
+               ret = -EIO;
+               break;
+
+       case PTRACE_GETEVENTMSG:
+               /* Reports zero when no event message is recorded.  */
+               ret = put_user(state->have_eventmsg
+                              ? state->u.eventmsg : 0L,
+                              (compat_long_t __user *) data);
+               break;
+       case PTRACE_GETSIGINFO:
+               /*
+                * state->u holds either an event message or a siginfo
+                * pointer; have_eventmsg says which one is valid.
+                */
+               ret = -EINVAL;
+               if (!state->have_eventmsg && state->u.siginfo)
+                       ret = copy_siginfo_to_user32(
+                               (struct compat_siginfo __user *) data,
+                               state->u.siginfo);
+               break;
+       case PTRACE_SETSIGINFO:
+               /*
+                * -EINVAL only when there is no siginfo to replace.  A
+                * successful copy must return 0, as in sys_ptrace; leaving
+                * ret at -EINVAL would report failure after the siginfo
+                * had in fact been updated.
+                */
+               ret = -EINVAL;
+               if (!state->have_eventmsg && state->u.siginfo) {
+                       ret = 0;
+                       if (copy_siginfo_from_user32(
+                                   state->u.siginfo,
+                                   (struct compat_siginfo __user *) data))
+                               ret = -EFAULT;
+               }
+               break;
+       }
+
+out_tsk:
+       /* Drop the reference ptrace_start left us holding.  */
+       put_task_struct(child);
+out:
+       pr_debug("%d ptrace -> %lx\n", current->pid, (long)ret);
        return ret;
 }
+#endif
 
-/**
- * ptrace_traceme  --  helper for PTRACE_TRACEME
- *
- * Performs checks and sets PT_PTRACED.
- * Should be used by all ptrace implementations for PTRACE_TRACEME.
+
+/*
+ * Detach the zombie being reported for wait.
+ *
+ * TSK is the tracer on whose behalf the wait is done, P is the zombie and
+ * STATE is the ptrace_state connecting them (STATE->parent is expected to
+ * be TSK).  If ptrace_detach returns -EALREADY this waits for P with
+ * wait_task_inactive and starts over, so it may block.  The only error
+ * tolerated at the end is -ESRCH.
  */
-int ptrace_traceme(void)
+static inline void
+detach_zombie(struct task_struct *tsk,
+             struct task_struct *p, struct ptrace_state *state)
 {
-       int ret = -EPERM;
-
+       int detach_error;
+       struct utrace_attached_engine *engine;
+
+restart:
+       detach_error = 0;
+       rcu_read_lock();
+       if (tsk == current)
+               engine = state->engine;
+       else {
+               /*
+                * We've excluded other ptrace_do_wait calls.  But the
+                * ptracer itself might have done ptrace_detach while we
+                * did not have rcu_read_lock.  So double-check that state
+                * is still valid.
+                */
+               engine = utrace_attach(
+                       p, (UTRACE_ATTACH_MATCH_OPS
+                           | UTRACE_ATTACH_MATCH_DATA),
+                       &ptrace_utrace_ops,
+                       (unsigned long) state);
+               if (IS_ERR(engine) || state->parent != tsk)
+                       detach_error = -ESRCH;
+               else
+                       BUG_ON(state->engine != engine);
+       }
+       rcu_read_unlock();
+       /* ENGINE is only used when the lookup above did not fail.  */
+       if (likely(!detach_error))
+               detach_error = ptrace_detach(p, engine, state);
+       if (unlikely(detach_error == -EALREADY)) {
+               /*
+                * It's still doing report_death callbacks.
+                * Just wait for it to settle down.
+                */
+               wait_task_inactive(p); /* Might block.  */
+               goto restart;
+       }
        /*
-        * Are we already being traced?
+        * A failure with -ESRCH means that report_reap is
+        * already running and will do the cleanup, or that
+        * we lost a race with ptrace_detach in another
+        * thread or with the automatic detach in
+        * report_death.
         */
-       task_lock(current);
-       if (!(current->ptrace & PT_PTRACED)) {
-               ret = security_ptrace(current->parent, current);
+       if (detach_error)
+               BUG_ON(detach_error != -ESRCH);
+}
+
+/*
+ * Look through TSK's ptracees list for a tracee matching PID and OPTIONS
+ * that has something to report to wait: a zombie, or a nonzero exit_code
+ * left by a ptrace stop.
+ *
+ * We're called with tasklist_lock held for reading.
+ * If we return -ECHILD or zero, next_thread(tsk) must still be valid to use.
+ * If we return another error code, or a successful PID value, we
+ * release tasklist_lock first.
+ *
+ * -ECHILD means no tracee matched at all; zero means at least one matched
+ * but none is ready, so the caller should block and retry.  On success
+ * the results are stored through INFOP, STAT_ADDR and RUSAGEP (each may
+ * be NULL) and the tracee's pid is returned.
+ */
+int
+ptrace_do_wait(struct task_struct *tsk,
+              pid_t pid, int options, struct siginfo __user *infop,
+              int __user *stat_addr, struct rusage __user *rusagep)
+{
+       struct ptrace_state *state;
+       struct task_struct *p;
+       int err = -ECHILD;
+       int exit_code, why, status;
+
+       rcu_read_lock();
+       list_for_each_entry_rcu(state, &tsk->ptracees, entry) {
+               p = state->task;
+
+               /*
+                * PID selects as for wait4: a positive value names one
+                * task, zero the caller's process group, -1 anything, and
+                * any other negative value the process group -PID.
+                */
+               if (pid > 0) {
+                       if (p->pid != pid)
+                               continue;
+               } else if (!pid) {
+                       if (process_group(p) != process_group(current))
+                               continue;
+               } else if (pid != -1) {
+                       if (process_group(p) != -pid)
+                               continue;
+               }
+               if (((p->exit_signal != SIGCHLD) ^ ((options & __WCLONE) != 0))
+                   && !(options & __WALL))
+                       continue;
+               if (security_task_wait(p))
+                       continue;
+
                /*
-                * Set the ptrace bit in the process ptrace flags.
+                * This is a matching child.  If we don't win now, tell
+                * our caller to block and repeat.  From this point we
+                * must ensure that wait_chldexit will get a wakeup for
+                * any tracee stopping, dying, or being detached.
+                * For death, tasklist_lock guarantees this already.
                 */
-               if (!ret)
-                       current->ptrace |= PT_PTRACED;
+               err = 0;
+
+               switch (p->exit_state) {
+               case EXIT_ZOMBIE:
+                       if (!likely(options & WEXITED))
+                               continue;
+                       if (delay_group_leader(p)) {
+                               struct task_struct *next = next_thread(p);
+                               pr_debug("%d ptrace_do_wait leaving %d "
+                                        "zombie code %x "
+                                        "delay_group_leader (%d/%lu)\n",
+                                        current->pid, p->pid, p->exit_code,
+                                        next->pid, next->state);
+                               continue;
+                       }
+                       exit_code = p->exit_code;
+                       goto found;
+               case EXIT_DEAD:
+                       continue;
+               default:
+                       /*
+                        * tasklist_lock holds up any transitions to
+                        * EXIT_ZOMBIE.  After releasing it we are
+                        * guaranteed a wakeup on wait_chldexit after
+                        * any new deaths.
+                        */
+                       if (p->flags & PF_EXITING)
+                               /*
+                                * It's in do_exit and might have set
+                                * p->exit_code already, but it's not quite
+                                * dead yet.  It will get to report_death
+                                * and wakes us up when it finishes.
+                                */
+                               continue;
+                       break;
+               }
+
+               /*
+                * This xchg atomically ensures that only one do_wait
+                * call can report this thread.  Because exit_code is
+                * always set before do_notify wakes us up, after this
+                * check fails we are sure to get a wakeup if it stops.
+                */
+               exit_code = xchg(&p->exit_code, 0);
+               if (exit_code)
+                       goto found;
+
+               // XXX should handle WCONTINUED
+
+               pr_debug("%d ptrace_do_wait leaving %d state %lu code %x\n",
+                        current->pid, p->pid, p->state, p->exit_code);
        }
-       task_unlock(current);
-       return ret;
+       rcu_read_unlock();
+       if (err == 0)
+               pr_debug("%d ptrace_do_wait blocking\n", current->pid);
+
+       return err;
+
+found:
+       /* Every jump here comes after the err = 0 assignment above.  */
+       BUG_ON(state->parent != tsk);
+       rcu_read_unlock();
+
+       pr_debug("%d ptrace_do_wait (%d) found %d code %x (%lu/%d)\n",
+                current->pid, tsk->pid, p->pid, exit_code,
+                p->exit_state, p->exit_signal);
+
+       /*
+        * If there was a group exit in progress, all threads report that
+        * status.  Most will have SIGKILL in their own exit_code.
+        */
+       if (p->signal->flags & SIGNAL_GROUP_EXIT)
+               exit_code = p->signal->group_exit_code;
+
+       if (p->exit_state) {
+               if (unlikely(p->parent == tsk && p->exit_signal != -1))
+                       /*
+                        * This is our natural child we were ptracing.
+                        * When it dies it detaches (see ptrace_report_death).
+                        * So we're seeing it here in a race.  When it
+                        * finishes detaching it will become reapable in
+                        * the normal wait_task_zombie path instead.
+                        */
+                       return 0;
+               /*
+                * Decode the exit status: the low seven bits are the
+                * terminating signal (zero for a normal exit), bit 0x80
+                * marks a core dump, and the exit value is in the bits
+                * above those.
+                */
+               if ((exit_code & 0x7f) == 0) {
+                       why = CLD_EXITED;
+                       status = exit_code >> 8;
+               }
+               else {
+                       why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED;
+                       status = exit_code & 0x7f;
+               }
+       }
+       else {
+               /*
+                * A ptrace stop: encode it as a stopped status, with the
+                * stop code in the high byte and 0x7f in the low byte.
+                */
+               why = CLD_TRAPPED;
+               status = exit_code;
+               exit_code = (status << 8) | 0x7f;
+       }
+
+       /*
+        * At this point we are committed to a successful return
+        * or a user error return.  Release the tasklist_lock.
+        */
+       get_task_struct(p);
+       read_unlock(&tasklist_lock);
+
+       if (rusagep)
+               err = getrusage(p, RUSAGE_BOTH, rusagep);
+       if (infop) {
+               if (!err)
+                       err = put_user(SIGCHLD, &infop->si_signo);
+               if (!err)
+                       err = put_user(0, &infop->si_errno);
+               if (!err)
+                       err = put_user((short)why, &infop->si_code);
+               if (!err)
+                       err = put_user(p->pid, &infop->si_pid);
+               if (!err)
+                       err = put_user(p->uid, &infop->si_uid);
+               if (!err)
+                       err = put_user(status, &infop->si_status);
+       }
+       if (!err && stat_addr)
+               err = put_user(exit_code, stat_addr);
+
+       if (!err) {
+               if (why != CLD_TRAPPED)
+                       /*
+                        * This was a death report.  The ptracer's wait
+                        * does an implicit detach, so the zombie reports
+                        * to its real parent now.
+                        */
+                       detach_zombie(tsk, p, state);
+               err = p->pid;
+       }
+
+       put_task_struct(p);
+
+       return err;
 }
 
-/**
- * ptrace_get_task_struct  --  grab a task struct reference for ptrace
- * @pid:       process id to grab a task_struct reference of
- *
- * This function is a helper for ptrace implementations.  It checks
- * permissions and then grabs a task struct for use of the actual
- * ptrace implementation.
- *
- * Returns the task_struct for @pid or an ERR_PTR() on failure.
+
+/*
+ * All the report callbacks (except death and reap) are subject to a race
+ * with ptrace_exit doing a quick detach and ptrace_done.  It can do this
+ * even when the target is not quiescent, so a callback may already be in
+ * progress when it does ptrace_done.  Callbacks use this function to fetch
+ * the struct ptrace_state while ensuring it doesn't disappear until
+ * put_ptrace_state is called.  This just uses RCU, since state and
+ * anything we try to do to state->parent is safe under rcu_read_lock.
+ *
+ * Returns the state with rcu_read_lock still held, or NULL (with the
+ * lock already dropped) when engine->data is NULL.  TSK is not used by
+ * the body.
  */
-struct task_struct *ptrace_get_task_struct(pid_t pid)
+static struct ptrace_state *
+get_ptrace_state(struct utrace_attached_engine *engine,
+                struct task_struct *tsk)
 {
-       struct task_struct *child;
+       struct ptrace_state *state;
+
+       rcu_read_lock();
+       state = rcu_dereference((struct ptrace_state *) engine->data);
+       if (likely(state != NULL))
+               return state;
 
+       rcu_read_unlock();
+       return NULL;
+}
+
+/*
+ * Counterpart of get_ptrace_state: drop the rcu_read_lock that a
+ * successful get_ptrace_state call left held.  STATE itself is not
+ * touched; after this call it must no longer be relied on.
+ */
+static inline void
+put_ptrace_state(struct ptrace_state *state)
+{
+       rcu_read_unlock();
+}
+
+
+/*
+ * Tell PARENT about a change in TSK: queue a SIGCHLD carrying a siginfo
+ * built from TSK and WHY (one of the CLD_* codes), unless PARENT ignores
+ * SIGCHLD or has the suppressing sigaction flag set, and in every case
+ * wake up sleepers on PARENT's wait_chldexit queue.  Does nothing if
+ * PARENT->signal is already NULL.
+ */
+static void
+do_notify(struct task_struct *tsk, struct task_struct *parent, int why)
+{
+       struct siginfo info;
+       unsigned long flags;
+       struct sighand_struct *sighand;
+       int sa_mask;
+
+       info.si_signo = SIGCHLD;
+       info.si_errno = 0;
+       info.si_pid = tsk->pid;
+       info.si_uid = tsk->uid;
+
+       /* FIXME: find out whether or not this is supposed to be c*time. */
+       info.si_utime = cputime_to_jiffies(tsk->utime);
+       info.si_stime = cputime_to_jiffies(tsk->stime);
+
+       /*
+        * SA_MASK is the sigaction flag that suppresses the signal:
+        * SA_NOCLDSTOP for everything except CLD_EXITED, which uses
+        * SA_NOCLDWAIT.  For CLD_EXITED the si_code is refined from the
+        * exit status into CLD_DUMPED, CLD_KILLED or CLD_EXITED.
+        */
+       sa_mask = SA_NOCLDSTOP;
+       info.si_code = why;
+       info.si_status = tsk->exit_code & 0x7f;
+       if (why == CLD_CONTINUED)
+               info.si_status = SIGCONT;
+       else if (why == CLD_STOPPED)
+               info.si_status = tsk->signal->group_exit_code & 0x7f;
+       else if (why == CLD_EXITED) {
+               sa_mask = SA_NOCLDWAIT;
+               if (tsk->exit_code & 0x80)
+                       info.si_code = CLD_DUMPED;
+               else if (tsk->exit_code & 0x7f)
+                       info.si_code = CLD_KILLED;
+               else {
+                       info.si_code = CLD_EXITED;
+                       info.si_status = tsk->exit_code >> 8;
+               }
+       }
+
+       read_lock(&tasklist_lock);
+       if (unlikely(parent->signal == NULL))
+               goto out;
+
+       sighand = parent->sighand;
+       spin_lock_irqsave(&sighand->siglock, flags);
+       if (sighand->action[SIGCHLD-1].sa.sa_handler != SIG_IGN &&
+           !(sighand->action[SIGCHLD-1].sa.sa_flags & sa_mask))
+               __group_send_sig_info(SIGCHLD, &info, parent);
        /*
-        * Tracing init is not allowed.
+        * Even if SIGCHLD is not generated, we must wake up wait4 calls.
         */
-       if (pid == 1)
-               return ERR_PTR(-EPERM);
+       wake_up_interruptible_sync(&parent->signal->wait_chldexit);
+       spin_unlock_irqrestore(&sighand->siglock, flags);
 
-       read_lock(&tasklist_lock);
-       child = find_task_by_pid(pid);
-       if (child)
-               get_task_struct(child);
+out:
        read_unlock(&tasklist_lock);
-       if (!child)
-               return ERR_PTR(-ESRCH);
-       return child;
 }
 
-#ifndef __ARCH_SYS_PTRACE
-asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
+/*
+ * Report a ptrace stop of TSK to its tracer: set QUIESCE on the engine,
+ * run the writeback hook of regset 0 if there is one, store CODE (which
+ * must be nonzero) in tsk->exit_code and notify state->parent with
+ * CLD_TRAPPED.  Finishes by calling put_ptrace_state on STATE, so the
+ * caller presumably obtained STATE from get_ptrace_state and must not
+ * use it afterwards -- NOTE(review): confirm against the callers.
+ * Always returns UTRACE_ACTION_RESUME.
+ */
+static u32
+ptrace_report(struct utrace_attached_engine *engine,
+             struct task_struct *tsk,
+             struct ptrace_state *state,
+             int code)
 {
-       struct task_struct *child;
-       long ret;
+       const struct utrace_regset *regset;
+
+       pr_debug("%d ptrace_report %d engine %p"
+                " state %p code %x parent %d (%p)\n",
+                current->pid, tsk->pid, engine, state, code,
+                state->parent->pid, state->parent);
+       if (!state->have_eventmsg && state->u.siginfo) {
+               const siginfo_t *si = state->u.siginfo;
+               pr_debug("  si %d code %x errno %d addr %p\n",
+                        si->si_signo, si->si_code, si->si_errno,
+                        si->si_addr);
+       }
 
        /*
-        * This lock_kernel fixes a subtle race with suid exec
+        * Set our QUIESCE flag right now, before notifying the tracer.
+        * We do this before setting tsk->exit_code rather than
+        * by using UTRACE_ACTION_NEWSTATE in our return value, to
+        * ensure that the tracer can't get the notification and then
+        * try to resume us with PTRACE_CONT before we set the flag.
         */
-       lock_kernel();
-       if (request == PTRACE_TRACEME) {
-               ret = ptrace_traceme();
-               goto out;
+       utrace_set_flags(tsk, engine, engine->flags | UTRACE_ACTION_QUIESCE);
+
+       /*
+        * If regset 0 has a writeback call, do it now.  On register window
+        * machines, this makes sure the user memory backing the register
+        * data is up to date by the time wait_task_inactive returns to
+        * ptrace_start in our tracer doing a PTRACE_PEEKDATA or the like.
+        */
+       /*
+        * NOTE(review): the result of utrace_regset is dereferenced
+        * without a NULL check here; presumably regset 0 of the native
+        * view always exists -- confirm.
+        */
+       regset = utrace_regset(tsk, engine, utrace_native_view(tsk), 0);
+       if (regset->writeback)
+               (*regset->writeback)(tsk, regset, 0);
+
+       BUG_ON(code == 0);
+       tsk->exit_code = code;
+       do_notify(tsk, state->parent, CLD_TRAPPED);
+
+       pr_debug("%d ptrace_report quiescing exit_code %x\n",
+                current->pid, current->exit_code);
+
+       put_ptrace_state(state);
+
+       return UTRACE_ACTION_RESUME;
+}
+
+/*
+ * Report a ptrace event stop: clear state->syscall and report with a
+ * stop code of SIGTRAP in the low byte and EVENT in the next byte up.
+ * Returns whatever ptrace_report returns.
+ */
+static inline u32
+ptrace_event(struct utrace_attached_engine *engine,
+            struct task_struct *tsk,
+            struct ptrace_state *state,
+            int event)
+{
+       state->syscall = 0;
+       return ptrace_report(engine, tsk, state, (event << 8) | SIGTRAP);
+}
+
+/*
+ * Death-report callback for a traced task.
+ *
+ * Unlike other report callbacks, this can't be called while ptrace_exit
+ * is doing ptrace_done in parallel, so we don't need get_ptrace_state.
+ *
+ * Returns UTRACE_ACTION_DETACH when the tracer is also the task's own
+ * parent and the task has an exit signal (the ptrace state is unlinked
+ * and released here); otherwise notifies the tracer with CLD_EXITED and
+ * returns UTRACE_ACTION_RESUME, leaving the state in place.
+ */
+static u32
+ptrace_report_death(struct utrace_attached_engine *engine,
+                   struct task_struct *tsk)
+{
+       struct ptrace_state *state = (struct ptrace_state *) engine->data;
+
+       if (tsk->exit_code == 0 && unlikely(tsk->flags & PF_SIGNALED))
+               /*
+                * This can only mean that tsk->exit_code was clobbered
+                * by ptrace_update or ptrace_do_wait in a race with
+                * an asynchronous wakeup and exit for SIGKILL.
+                */
+               tsk->exit_code = SIGKILL;
+
+       if (tsk->parent == state->parent && tsk->exit_signal != -1) {
+               /*
+                * This is a natural child (excluding clone siblings of a
+                * child group_leader), so we detach and let the normal
+                * reporting happen once our NOREAP action is gone.  But
+                * first, generate a SIGCHLD for those cases where normal
+                * behavior won't.  A ptrace'd child always generates SIGCHLD.
+                */
+               pr_debug("ptrace %d death natural parent %d exit_code %x\n",
+                        tsk->pid, state->parent->pid, tsk->exit_code);
+               if (!thread_group_empty(tsk))
+                       do_notify(tsk, state->parent, CLD_EXITED);
+               /*
+                * Tear down in the same order as ptrace_report_reap:
+                * unlink the state, clear the engine's pointer to it,
+                * then hand it to ptrace_done.
+                */
+               ptrace_state_unlink(state);
+               rcu_assign_pointer(engine->data, 0UL);
+               ptrace_done(state);
+               return UTRACE_ACTION_DETACH;
        }
 
-       child = ptrace_get_task_struct(pid);
-       if (IS_ERR(child)) {
-               ret = PTR_ERR(child);
-               goto out;
+       /*
+        * This might be a second report_death callback for a group leader
+        * that was delayed when its original report_death callback was made.
+        * Repeating do_notify is exactly what we need for that case too.
+        * After the wakeup, ptrace_do_wait will see delay_group_leader false.
+        */
+
+       /*
+        * The first debug line deliberately has no newline; the second
+        * pr_debug after do_notify completes it.
+        */
+       pr_debug("ptrace %d death notify %d exit_code %x: ",
+                tsk->pid, state->parent->pid, tsk->exit_code);
+       do_notify(tsk, state->parent, CLD_EXITED);
+       pr_debug("%d notified %d\n", tsk->pid, state->parent->pid);
+       return UTRACE_ACTION_RESUME;
+}
+
+/*
+ * Reap-report callback: drop the ptrace state attached to @engine.
+ *
+ * We get this only in the case where our UTRACE_ACTION_NOREAP was ignored.
+ * That happens solely when a non-leader exec reaps the old leader.
+ *
+ * Nothing is done when get_ptrace_state finds no state for the engine.
+ */
+static void
+ptrace_report_reap(struct utrace_attached_engine *engine,
+                  struct task_struct *tsk)
+{
+       struct ptrace_state *state = get_ptrace_state(engine, tsk);
+       if (state != NULL) {
+               /*
+                * Unlink the state, clear the engine's pointer to it,
+                * and pass it to ptrace_done before dropping the
+                * reference taken by get_ptrace_state above.
+                */
+               ptrace_state_unlink(state);
+               rcu_assign_pointer(engine->data, 0UL);
+               ptrace_done(state);
+               put_ptrace_state(state);
        }
+}
 
-       ret = -EPERM;
-       if (!vx_check(vx_task_xid(child), VX_WATCH|VX_IDENT))
-               goto out_put_task_struct;
+/*
+ * Start tracing the child.  This has to do put_ptrace_state before it can
+ * do allocation that might block.
+ *
+ * @engine: the engine attached to @parent (not used directly here).
+ * @parent: the traced task that is cloning; used only for the error report.
+ * @state:  @parent's ptrace state, as returned by get_ptrace_state.  It is
+ *          released with put_ptrace_state before this function returns, so
+ *          the caller must look it up again afterwards.
+ * @child:  the newly created task to attach the tracer to.
+ *
+ * The tracer, options and cap_sys_ptrace are copied out of @state, and a
+ * task reference is held on the tracer, before @state is released.  On
+ * success the child gets a pending SIGSTOP.  If memory runs out the child
+ * is left untraced and an error is logged.  If ptrace_setup reports
+ * -EALREADY (the tracer has started exiting) the new engine is detached
+ * again silently.
+ */
+static void
+ptrace_clone_setup(struct utrace_attached_engine *engine,
+                  struct task_struct *parent,
+                  struct ptrace_state *state,
+                  struct task_struct *child)
+{
+       struct task_struct *tracer;
+       struct utrace_attached_engine *child_engine;
+       struct ptrace_state *child_state;
+       int ret;
+       u8 options;
+       int cap_sys_ptrace;
+
+       /*
+        * Copy what we need out of the state and pin the tracer, so
+        * that nothing below touches @state after it is released.
+        */
+       tracer = state->parent;
+       options = state->options;
+       cap_sys_ptrace = state->cap_sys_ptrace;
+       get_task_struct(tracer);
+       put_ptrace_state(state);
+
+       child_engine = utrace_attach(child, (UTRACE_ATTACH_CREATE
+                                            | UTRACE_ATTACH_EXCLUSIVE
+                                            | UTRACE_ATTACH_MATCH_OPS),
+                                    &ptrace_utrace_ops, 0UL);
+       if (unlikely(IS_ERR(child_engine))) {
+               BUG_ON(PTR_ERR(child_engine) != -ENOMEM);
+               put_task_struct(tracer);
+               goto nomem;
+       }
 
-       if (request == PTRACE_ATTACH) {
-               ret = ptrace_attach(child);
-               goto out_put_task_struct;
+       child_state = ptrace_setup(child, child_engine,
+                                  tracer, options, cap_sys_ptrace, NULL);
+
+       put_task_struct(tracer);
+
+       if (unlikely(IS_ERR(child_state))) {
+               (void) utrace_detach(child, child_engine);
+
+               if (PTR_ERR(child_state) == -ENOMEM)
+                       goto nomem;
+
+               /*
+                * Our tracer has started exiting.  It's
+                * too late to set it up tracing the child.
+                */
+               BUG_ON(PTR_ERR(child_state) != -EALREADY);
        }
+       else {
+               /*
+                * Queue a SIGSTOP for the child and flag it as having
+                * a signal pending before updating its tracing state.
+                */
+               sigaddset(&child->pending.signal, SIGSTOP);
+               set_tsk_thread_flag(child, TIF_SIGPENDING);
+               ret = ptrace_update(child, child_engine, 0, 0);
 
-       ret = ptrace_check_attach(child, request == PTRACE_KILL);
-       if (ret < 0)
-               goto out_put_task_struct;
+               /*
+                * The child hasn't run yet, it can't have died already.
+                */
+               BUG_ON(ret);
+       }
 
-       ret = arch_ptrace(child, request, addr, data);
-       if (ret < 0)
-               goto out_put_task_struct;
+       return;
 
- out_put_task_struct:
-       put_task_struct(child);
- out:
-       unlock_kernel();
-       return ret;
+nomem:
+       /* Terminate the record so it cannot run into later log output. */
+       printk(KERN_ERR "ptrace out of memory, lost child %d of %d\n",
+              child->pid, parent->pid);
 }
-#endif /* __ARCH_SYS_PTRACE */
+
+/*
+ * Clone-report callback for a traced @parent creating @child.
+ *
+ * Picks the ptrace event matching @clone_flags (vfork, clone or fork)
+ * and keeps it only if the tracer enabled the corresponding option, in
+ * which case the child's pid is recorded as the event message.  Unless
+ * CLONE_UNTRACED is set, the tracer is also attached to the child when
+ * an event is being reported or CLONE_PTRACE was requested.
+ *
+ * Returns the result of ptrace_event when an event is reported, and
+ * UTRACE_ACTION_RESUME otherwise (including when no ptrace state is
+ * available for @parent).
+ */
+static u32
+ptrace_report_clone(struct utrace_attached_engine *engine,
+                   struct task_struct *parent,
+                   unsigned long clone_flags, struct task_struct *child)
+{
+       struct ptrace_state *state;
+       int event, option;
+       int follow;
+
+       state = get_ptrace_state(engine, parent);
+       if (unlikely(state == NULL))
+               return UTRACE_ACTION_RESUME;
+
+       pr_debug("%d (%p) engine %p"
+                " ptrace_report_clone child %d (%p) fl %lx\n",
+                parent->pid, parent, engine, child->pid, child, clone_flags);
+
+       /* Classify the clone: vfork wins, then non-SIGCHLD clones. */
+       if (clone_flags & CLONE_VFORK) {
+               event = PTRACE_EVENT_VFORK;
+               option = PTRACE_O_TRACEVFORK;
+       } else if ((clone_flags & CSIGNAL) != SIGCHLD) {
+               event = PTRACE_EVENT_CLONE;
+               option = PTRACE_O_TRACECLONE;
+       } else {
+               event = PTRACE_EVENT_FORK;
+               option = PTRACE_O_TRACEFORK;
+       }
+
+       /* Only report the event if the tracer asked for it. */
+       if (!(state->options & option)) {
+               event = 0;
+       } else {
+               state->have_eventmsg = 1;
+               state->u.eventmsg = child->pid;
+       }
+
+       follow = (event || (clone_flags & CLONE_PTRACE))
+               && !(clone_flags & CLONE_UNTRACED);
+       if (follow) {
+               /*
+                * Have our tracer start following the child too.
+                */
+               ptrace_clone_setup(engine, parent, state, child);
+
+               /*
+                * That did put_ptrace_state, so we have to check
+                * again in case our tracer just started exiting.
+                */
+               state = get_ptrace_state(engine, parent);
+               if (unlikely(state == NULL))
+                       return UTRACE_ACTION_RESUME;
+       }
+
+       if (!event) {
+               put_ptrace_state(state);
+               return UTRACE_ACTION_RESUME;
+       }
+
+       return ptrace_event(engine, parent, state, event);
+}
+
+
+/*
+ * Vfork-done callback: record @child_pid as the event message and
+ * report PTRACE_EVENT_VFORK_DONE.  Returns UTRACE_ACTION_RESUME without
+ * reporting when no ptrace state is available for @parent.
+ */
+static u32
+ptrace_report_vfork_done(struct utrace_attached_engine *engine,
+                        struct task_struct *parent, pid_t child_pid)
+{
+       struct ptrace_state *state;
+
+       state = get_ptrace_state(engine, parent);
+       if (unlikely(state == NULL)) {
+               return UTRACE_ACTION_RESUME;
+       }
+
+       state->u.eventmsg = child_pid;
+       state->have_eventmsg = 1;
+
+       return ptrace_event(engine, parent, state, PTRACE_EVENT_VFORK_DONE);
+}
+
+
+/*
+ * Signal-report callback: stop and report the signal to the tracer.
+ *
+ * The stop code is the signal number from @info, or SIGTRAP when @info
+ * is NULL.  @info is stored in the ptrace state in place of an event
+ * message.  UTRACE_SIGNAL_IGN is always or'ed into the result of
+ * ptrace_report.  Returns UTRACE_ACTION_RESUME when no ptrace state is
+ * available for @tsk.
+ */
+static u32
+ptrace_report_signal(struct utrace_attached_engine *engine,
+                    struct task_struct *tsk, struct pt_regs *regs,
+                    u32 action, siginfo_t *info,
+                    const struct k_sigaction *orig_ka,
+                    struct k_sigaction *return_ka)
+{
+       struct ptrace_state *state;
+       int signo;
+       u32 result;
+
+       if (info == NULL)
+               signo = SIGTRAP;
+       else
+               signo = info->si_signo;
+
+       state = get_ptrace_state(engine, tsk);
+       if (unlikely(state == NULL))
+               return UTRACE_ACTION_RESUME;
+
+       state->syscall = 0;
+       state->have_eventmsg = 0;
+       state->u.siginfo = info;
+
+       result = ptrace_report(engine, tsk, state, signo);
+       return result | UTRACE_SIGNAL_IGN;
+}
+
+/*
+ * Job-control callback: pass the job-control notification of @type on
+ * to the tracer via do_notify, then return UTRACE_JCTL_NOSIGCHLD.
+ * Returns UTRACE_ACTION_RESUME when no ptrace state is available.
+ */
+static u32
+ptrace_report_jctl(struct utrace_attached_engine *engine,
+                  struct task_struct *tsk, int type)
+{
+       struct ptrace_state *state;
+       struct task_struct *tracer;
+
+       state = get_ptrace_state(engine, tsk);
+       if (unlikely(state == NULL))
+               return UTRACE_ACTION_RESUME;
+
+       tracer = state->parent;
+
+       pr_debug("ptrace %d jctl notify %d type %x exit_code %x\n",
+                tsk->pid, tracer->pid, type, tsk->exit_code);
+
+       do_notify(tsk, tracer, type);
+       put_ptrace_state(state);
+
+       return UTRACE_JCTL_NOSIGCHLD;
+}
+
+/*
+ * Exec-report callback: report PTRACE_EVENT_EXEC when the tracer set
+ * PTRACE_O_TRACEEXEC, and event 0 otherwise (which ptrace_event turns
+ * into a plain SIGTRAP stop code).  Returns UTRACE_ACTION_RESUME when
+ * no ptrace state is available for @tsk.
+ */
+static u32
+ptrace_report_exec(struct utrace_attached_engine *engine,
+                  struct task_struct *tsk,
+                  const struct linux_binprm *bprm,
+                  struct pt_regs *regs)
+{
+       struct ptrace_state *state;
+       int event = 0;
+
+       state = get_ptrace_state(engine, tsk);
+       if (unlikely(state == NULL))
+               return UTRACE_ACTION_RESUME;
+
+       if (state->options & PTRACE_O_TRACEEXEC)
+               event = PTRACE_EVENT_EXEC;
+
+       return ptrace_event(engine, tsk, state, event);
+}
+
+/*
+ * Common body of the syscall entry/exit callbacks.
+ *
+ * @entry is nonzero for syscall entry.  Where PTRACE_SYSEMU is defined
+ * and the state has sysemu set, the syscall is aborted on entry.  The
+ * stop is then reported as SIGTRAP, with 0x80 or'ed in when the tracer
+ * set PTRACE_O_TRACESYSGOOD.  Returns UTRACE_ACTION_RESUME when no
+ * ptrace state is available for @tsk.
+ */
+static u32
+ptrace_report_syscall(struct utrace_attached_engine *engine,
+                     struct task_struct *tsk, struct pt_regs *regs,
+                     int entry)
+{
+       struct ptrace_state *state;
+       int code = SIGTRAP;
+
+       state = get_ptrace_state(engine, tsk);
+       if (unlikely(state == NULL))
+               return UTRACE_ACTION_RESUME;
+
+#ifdef PTRACE_SYSEMU
+       if (entry && state->sysemu)
+               tracehook_abort_syscall(regs);
+#endif
+       state->syscall = 1;
+
+       if (state->options & PTRACE_O_TRACESYSGOOD)
+               code |= 0x80;
+
+       return ptrace_report(engine, tsk, state, code);
+}
+
+/*
+ * Syscall-entry callback: hand off to ptrace_report_syscall with the
+ * entry flag set.
+ */
+static u32
+ptrace_report_syscall_entry(struct utrace_attached_engine *engine,
+                           struct task_struct *tsk, struct pt_regs *regs)
+{
+       const int entry = 1;
+
+       return ptrace_report_syscall(engine, tsk, regs, entry);
+}
+
+/*
+ * Syscall-exit callback: hand off to ptrace_report_syscall with the
+ * entry flag clear.
+ */
+static u32
+ptrace_report_syscall_exit(struct utrace_attached_engine *engine,
+                           struct task_struct *tsk, struct pt_regs *regs)
+{
+       const int entry = 0;
+
+       return ptrace_report_syscall(engine, tsk, regs, entry);
+}
+
+/*
+ * Exit-report callback: record the current exit code (*@code) as the
+ * event message and report PTRACE_EVENT_EXIT.  @orig_code is not used.
+ * Returns UTRACE_ACTION_RESUME when no ptrace state is available.
+ */
+static u32
+ptrace_report_exit(struct utrace_attached_engine *engine,
+                  struct task_struct *tsk, long orig_code, long *code)
+{
+       struct ptrace_state *state;
+
+       state = get_ptrace_state(engine, tsk);
+       if (unlikely(state == NULL)) {
+               return UTRACE_ACTION_RESUME;
+       }
+
+       state->u.eventmsg = *code;
+       state->have_eventmsg = 1;
+
+       return ptrace_event(engine, tsk, state, PTRACE_EVENT_EXIT);
+}
+
+/*
+ * Tell exec which LSM_UNSAFE_* flag applies to the traced task @tsk.
+ *
+ * Returns LSM_UNSAFE_PTRACE_CAP when the ptrace state has
+ * cap_sys_ptrace set, and LSM_UNSAFE_PTRACE otherwise, including when
+ * no ptrace state is available for @tsk.
+ */
+static int
+ptrace_unsafe_exec(struct utrace_attached_engine *engine,
+                  struct task_struct *tsk)
+{
+       int unsafe = LSM_UNSAFE_PTRACE;
+       struct ptrace_state *state = get_ptrace_state(engine, tsk);
+
+       /*
+        * put_ptrace_state is paired only with a successful
+        * get_ptrace_state, as in the other callbacks in this file;
+        * a NULL result leaves nothing to release.
+        */
+       if (likely(state != NULL)) {
+               if (state->cap_sys_ptrace)
+                       unsafe = LSM_UNSAFE_PTRACE_CAP;
+               put_ptrace_state(state);
+       }
+
+       return unsafe;
+}
+
+/*
+ * Return the task that is tracing @target through @engine, i.e. the
+ * parent recorded in the ptrace state, or NULL when no ptrace state is
+ * available.
+ */
+static struct task_struct *
+ptrace_tracer_task(struct utrace_attached_engine *engine,
+                  struct task_struct *target)
+{
+       struct ptrace_state *state = get_ptrace_state(engine, target);
+       struct task_struct *tracer;
+
+       if (unlikely(state == NULL))
+               return NULL;
+
+       tracer = state->parent;
+       put_ptrace_state(state);
+
+       return tracer;
+}
+
+/*
+ * Decide whether @caller may access @target's memory through this
+ * engine.
+ *
+ * Access is allowed only when all of the following hold: ptrace state
+ * exists for @target, the engine has UTRACE_ACTION_QUIESCE set or the
+ * target is in TASK_STOPPED, @caller is the recorded tracer, and
+ * security_ptrace(caller, target) returns 0.  security_ptrace is only
+ * consulted once the other conditions are met.  Returns nonzero when
+ * access is allowed, 0 otherwise.
+ */
+static int
+ptrace_allow_access_process_vm(struct utrace_attached_engine *engine,
+                              struct task_struct *target,
+                              struct task_struct *caller)
+{
+       struct ptrace_state *state;
+       int stopped;
+       int ours;
+
+       state = get_ptrace_state(engine, target);
+       if (unlikely(state == NULL))
+               return 0;
+
+       stopped = (engine->flags & UTRACE_ACTION_QUIESCE)
+               || target->state == TASK_STOPPED;
+       ours = stopped && state->parent == caller;
+       put_ptrace_state(state);
+
+       if (!ours)
+               return 0;
+
+       return security_ptrace(caller, target) == 0;
+}
+
+
+/*
+ * Callback table that ptrace registers with utrace for each traced task.
+ */
+static const struct utrace_engine_ops ptrace_utrace_ops = {
+       /* Process creation and exec. */
+       .report_clone = ptrace_report_clone,
+       .report_vfork_done = ptrace_report_vfork_done,
+       .report_exec = ptrace_report_exec,
+
+       /* System calls. */
+       .report_syscall_entry = ptrace_report_syscall_entry,
+       .report_syscall_exit = ptrace_report_syscall_exit,
+
+       /* Signals and job control. */
+       .report_signal = ptrace_report_signal,
+       .report_jctl = ptrace_report_jctl,
+
+       /* Exit, death and reaping. */
+       .report_exit = ptrace_report_exit,
+       .report_death = ptrace_report_death,
+       .report_reap = ptrace_report_reap,
+
+       /* Queries about the tracing relationship. */
+       .unsafe_exec = ptrace_unsafe_exec,
+       .tracer_task = ptrace_tracer_task,
+       .allow_access_process_vm = ptrace_allow_access_process_vm,
+};