vserver 2.0-rc4

[linux-2.6.git] / arch / ia64 / kernel / perfmon.c
diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c

index 6ea20f2..4f1543c 100644 (file)
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -5,13 +5,13 @@
   * The initial version of perfmon.c was written by
   * Ganesh Venkitachalam, IBM Corp.
   *
- * Then it was modified for perfmon-1.x by Stephane Eranian and 
+ * Then it was modified for perfmon-1.x by Stephane Eranian and
   * David Mosberger, Hewlett Packard Co.
- * 
+ *
   * Version Perfmon-2.x is a rewrite of perfmon-1.x
- * by Stephane Eranian, Hewlett Packard Co. 
+ * by Stephane Eranian, Hewlett Packard Co.
   *
- * Copyright (C) 1999-2003  Hewlett Packard Co
+ * Copyright (C) 1999-2003, 2005  Hewlett Packard Co
   *               Stephane Eranian <eranian@hpl.hp.com>
   *               David Mosberger-Tang <davidm@hpl.hp.com>
   *
@@ -38,8 +38,10 @@
  #include <linux/pagemap.h>
  #include <linux/mount.h>
  #include <linux/version.h>
+#include <linux/bitops.h>
+#include <linux/vs_memory.h>
+#include <linux/vs_cvirt.h>
  
-#include <asm/bitops.h>
  #include <asm/errno.h>
  #include <asm/intrinsics.h>
  #include <asm/page.h>
@@ -311,6 +313,7 @@ typedef struct pfm_context {
         unsigned int            ctx_cpu;                /* cpu to which perfmon is applied (system wide) */
  
         int                     ctx_fd;                 /* file descriptor used my this context */
+       pfm_ovfl_arg_t          ctx_ovfl_arg;           /* argument to custom buffer format handler */
  
         pfm_buffer_fmt_t        *ctx_buf_fmt;           /* buffer format callbacks */
         void                    *ctx_smpl_hdr;          /* points to sampling buffer header kernel vaddr */
@@ -571,12 +574,6 @@ pfm_unreserve_page(unsigned long a)
         ClearPageReserved(vmalloc_to_page((void*)a));
  }
  
-static inline int
-pfm_remap_page_range(struct vm_area_struct *vma, unsigned long from, unsigned long phys_addr, unsigned long size, pgprot_t prot)
-{
-       return remap_page_range(vma, from, phys_addr, size, prot);
-}
-
  static inline unsigned long
  pfm_protect_ctx_ctxsw(pfm_context_t *x)
  {
@@ -804,18 +801,6 @@ pfm_reset_msgq(pfm_context_t *ctx)
         DPRINT(("ctx=%p msgq reset\n", ctx));
  }
  
-
-/* Here we want the physical address of the memory.
- * This is used when initializing the contents of the
- * area and marking the pages as reserved.
- */
-static inline unsigned long
-pfm_kvirt_to_pa(unsigned long adr)
-{
-       __u64 pa = ia64_tpa(adr);
-       return pa;
-}
-
  static void *
  pfm_rvmalloc(unsigned long size)
  {
@@ -1511,15 +1496,8 @@ exit_pfm_fs(void)
         mntput(pfmfs_mnt);
  }
  
-static loff_t
-pfm_lseek(struct file *file, loff_t offset, int whence)
-{
-       DPRINT(("pfm_lseek called\n"));
-       return -ESPIPE;
-}
-
  static ssize_t
-pfm_read(struct file *filp, char *buf, size_t size, loff_t *ppos)
+pfm_read(struct file *filp, char __user *buf, size_t size, loff_t *ppos)
  {
         pfm_context_t *ctx;
         pfm_msg_t *msg;
@@ -1544,10 +1522,6 @@ pfm_read(struct file *filp, char *buf, size_t size, loff_t *ppos)
                 DPRINT(("message is too small ctx=%p (>=%ld)\n", ctx, sizeof(pfm_msg_t)));
                 return -EINVAL;
         }
-       /*
-        * seeks are not allowed on message queues
-        */
-       if (ppos != &filp->f_pos) return -ESPIPE;
  
         PROTECT_CTX(ctx, flags);
  
@@ -1616,7 +1590,7 @@ abort:
  }
  
  static ssize_t
-pfm_write(struct file *file, const char *ubuf,
+pfm_write(struct file *file, const char __user *ubuf,
                           size_t size, loff_t *ppos)
  {
         DPRINT(("pfm_write called\n"));
@@ -1666,7 +1640,7 @@ pfm_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned lon
  }
  
  /*
- * context is locked when coming here and interrupts are disabled
+ * interrupts cannot be masked when coming here
   */
  static inline int
  pfm_do_fasync(int fd, struct file *filp, pfm_context_t *ctx, int on)
@@ -1688,7 +1662,6 @@ static int
  pfm_fasync(int fd, struct file *filp, int on)
  {
         pfm_context_t *ctx;
-       unsigned long flags;
         int ret;
  
         if (PFM_IS_FILE(filp) == 0) {
@@ -1701,19 +1674,21 @@ pfm_fasync(int fd, struct file *filp, int on)
                 printk(KERN_ERR "perfmon: pfm_fasync NULL ctx [%d]\n", current->pid);
                 return -EBADF;
         }
-
-
-       PROTECT_CTX(ctx, flags);
-
+       /*
+        * we cannot mask interrupts during this call because this
+        * may go to sleep if memory is not readily available.
+        *
+        * We are protected from the context disappearing by the get_fd()/put_fd()
+        * done in caller. Serialization of this function is ensured by caller.
+        */
         ret = pfm_do_fasync(fd, filp, ctx, on);
  
+
         DPRINT(("pfm_fasync called on ctx_fd=%d on=%d async_queue=%p ret=%d\n",
                 fd,
                 on,
                 ctx->ctx_async_queue, ret));
  
-       UNPROTECT_CTX(ctx, flags);
-
         return ret;
  }
  
@@ -2025,7 +2000,7 @@ pfm_close(struct inode *inode, struct file *filp)
  
                 /*
                  * XXX: check for signals :
-                *      - ok of explicit close
+                *      - ok for explicit close
                  *      - not ok when coming from exit_files()
                  */
                 schedule();
@@ -2140,7 +2115,7 @@ pfm_no_open(struct inode *irrelevant, struct file *dontcare)
  
  
  static struct file_operations pfm_file_ops = {
-       .llseek   = pfm_lseek,
+       .llseek   = no_llseek,
         .read     = pfm_read,
         .write    = pfm_write,
         .poll     = pfm_poll,
@@ -2187,9 +2162,7 @@ pfm_alloc_fd(struct file **cfile)
  
         DPRINT(("new inode ino=%ld @%p\n", inode->i_ino, inode));
  
-       inode->i_sb   = pfmfs_mnt->mnt_sb;
         inode->i_mode = S_IFCHR|S_IRUGO;
-       inode->i_sock = 0;
         inode->i_uid  = current->fsuid;
         inode->i_gid  = current->fsgid;
  
@@ -2237,6 +2210,15 @@ out:
  static void
  pfm_free_fd(int fd, struct file *file)
  {
+       struct files_struct *files = current->files;
+
+       /* 
+        * there is no fd_uninstall(), so we do it here
+        */
+       spin_lock(&files->file_lock);
+        files->fd[fd] = NULL;
+       spin_unlock(&files->file_lock);
+
         if (file) put_filp(file);
         put_unused_fd(fd);
  }
@@ -2244,14 +2226,14 @@ pfm_free_fd(int fd, struct file *file)
  static int
  pfm_remap_buffer(struct vm_area_struct *vma, unsigned long buf, unsigned long addr, unsigned long size)
  {
-       unsigned long page;
-
         DPRINT(("CPU%d buf=0x%lx addr=0x%lx size=%ld\n", smp_processor_id(), buf, addr, size));
  
         while (size > 0) {
-               page = pfm_kvirt_to_pa(buf);
+               unsigned long pfn = ia64_tpa(buf) >> PAGE_SHIFT;
  
-               if (pfm_remap_page_range(vma, addr, page, PAGE_SIZE, PAGE_READONLY)) return -ENOMEM;
+
+               if (remap_pfn_range(vma, addr, pfn, PAGE_SIZE, PAGE_READONLY))
+                       return -ENOMEM;
  
                 addr  += PAGE_SIZE;
                 buf   += PAGE_SIZE;
@@ -2287,7 +2269,8 @@ pfm_smpl_buffer_alloc(struct task_struct *task, pfm_context_t *ctx, unsigned lon
          * if ((mm->total_vm << PAGE_SHIFT) + len> task->rlim[RLIMIT_AS].rlim_cur)
          *      return -ENOMEM;
          */
-       if (size > task->rlim[RLIMIT_MEMLOCK].rlim_cur) return -EAGAIN;
+       if (size > task->signal->rlim[RLIMIT_MEMLOCK].rlim_cur)
+               return -ENOMEM;
  
         /*
          * We do the easy to undo allocations first.
@@ -2312,10 +2295,6 @@ pfm_smpl_buffer_alloc(struct task_struct *task, pfm_context_t *ctx, unsigned lon
  
         /*
          * partially initialize the vma for the sampling buffer
-        *
-        * The VM_DONTCOPY flag is very important as it ensures that the mapping
-        * will never be inherited for any child process (via fork()) which is always
-        * what we want.
          */
         vma->vm_mm           = mm;
         vma->vm_flags        = VM_READ| VM_MAYREAD |VM_RESERVED;
@@ -2345,6 +2324,7 @@ pfm_smpl_buffer_alloc(struct task_struct *task, pfm_context_t *ctx, unsigned lon
                 goto error;
         }
         vma->vm_end = vma->vm_start + size;
+       vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
  
         DPRINT(("aligned size=%ld, hdr=%p mapped @0x%lx\n", size, ctx->ctx_smpl_hdr, vma->vm_start));
  
@@ -2361,9 +2341,8 @@ pfm_smpl_buffer_alloc(struct task_struct *task, pfm_context_t *ctx, unsigned lon
          */
         insert_vm_struct(mm, vma);
  
-       // mm->total_vm  += size >> PAGE_SHIFT;
         vx_vmpages_add(mm, size >> PAGE_SHIFT);
-
+       vm_stat_account(vma);
         up_write(&task->mm->mmap_sem);
  
         /*
@@ -2593,7 +2572,7 @@ pfm_task_incompatible(pfm_context_t *ctx, struct task_struct *task)
                 return -EINVAL;
         }
  
-       if (task->state == TASK_ZOMBIE) {
+       if (task->exit_state == EXIT_ZOMBIE) {
                 DPRINT(("cannot attach to  zombie task [%d]\n", task->pid));
                 return -EBUSY;
         }
@@ -2603,7 +2582,7 @@ pfm_task_incompatible(pfm_context_t *ctx, struct task_struct *task)
          */
         if (task == current) return 0;
  
-       if (task->state != TASK_STOPPED) {
+       if ((task->state != TASK_STOPPED) && (task->state != TASK_TRACED)) {
                 DPRINT(("cannot attach to non-stopped task [%d] state=%ld\n", task->pid, task->state));
                 return -EBUSY;
         }
@@ -2670,8 +2649,10 @@ pfm_context_create(pfm_context_t *ctx, void *arg, int count, struct pt_regs *reg
         ctx = pfm_context_alloc();
         if (!ctx) goto error;
  
-       req->ctx_fd = ctx->ctx_fd = pfm_alloc_fd(&filp);
-       if (req->ctx_fd < 0) goto error_file;
+       ret = pfm_alloc_fd(&filp);
+       if (ret < 0) goto error_file;
+
+       req->ctx_fd = ctx->ctx_fd = ret;
  
         /*
          * attach context to file
@@ -3076,11 +3057,12 @@ pfm_write_pmcs(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
  #endif
                 }
  
-               DPRINT(("pmc[%u]=0x%lx loaded=%d access_pmu=%d all_pmcs=0x%lx used_pmds=0x%lx eventid=%ld smpl_pmds=0x%lx reset_pmds=0x%lx reloads_pmcs=0x%lx used_monitors=0x%lx ovfl_regs=0x%lx\n",
+               DPRINT(("pmc[%u]=0x%lx ld=%d apmu=%d flags=0x%x all_pmcs=0x%lx used_pmds=0x%lx eventid=%ld smpl_pmds=0x%lx reset_pmds=0x%lx reloads_pmcs=0x%lx used_monitors=0x%lx ovfl_regs=0x%lx\n",
                           cnum,
                           value,
                           is_loaded,
                           can_access_pmu,
+                         flags,
                           ctx->ctx_all_pmcs[0],
                           ctx->ctx_used_pmds[0],
                           ctx->ctx_pmds[cnum].eventid,
@@ -3256,8 +3238,8 @@ pfm_write_pmds(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
                         }
                 }
  
-               DPRINT(("pmd[%u]=0x%lx loaded=%d access_pmu=%d, hw_value=0x%lx ctx_pmd=0x%lx  short_reset=0x%lx "
-                         "long_reset=0x%lx notify=%c used_pmds=0x%lx reset_pmds=0x%lx reload_pmds=0x%lx all_pmds=0x%lx ovfl_regs=0x%lx\n",
+               DPRINT(("pmd[%u]=0x%lx ld=%d apmu=%d, hw_value=0x%lx ctx_pmd=0x%lx  short_reset=0x%lx "
+                         "long_reset=0x%lx notify=%c seed=0x%lx mask=0x%lx used_pmds=0x%lx reset_pmds=0x%lx reload_pmds=0x%lx all_pmds=0x%lx ovfl_regs=0x%lx\n",
                         cnum,
                         value,
                         is_loaded,
@@ -3267,6 +3249,8 @@ pfm_write_pmds(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
                         ctx->ctx_pmds[cnum].short_reset,
                         ctx->ctx_pmds[cnum].long_reset,
                         PMC_OVFL_NOTIFY(ctx, cnum) ? 'Y':'N',
+                       ctx->ctx_pmds[cnum].seed,
+                       ctx->ctx_pmds[cnum].mask,
                         ctx->ctx_used_pmds[0],
                         ctx->ctx_pmds[cnum].reset_pmds[0],
                         ctx->ctx_reload_pmds[0],
@@ -3344,7 +3328,7 @@ pfm_read_pmds(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
         }
         expert_mode = pfm_sysctl.expert_mode; 
  
-       DPRINT(("loaded=%d access_pmu=%d ctx_state=%d\n",
+       DPRINT(("ld=%d apmu=%d ctx_state=%d\n",
                 is_loaded,
                 can_access_pmu,
                 state));
@@ -3884,7 +3868,7 @@ pfm_write_ibr_dbr(int mode, pfm_context_t *ctx, void *arg, int count, struct pt_
  
                         ctx->ctx_ibrs[rnum] = dbreg.val;
  
-                       DPRINT(("write ibr%u=0x%lx used_ibrs=0x%x is_loaded=%d access_pmu=%d\n",
+                       DPRINT(("write ibr%u=0x%lx used_ibrs=0x%x ld=%d apmu=%d\n",
                                 rnum, dbreg.val, ctx->ctx_used_ibrs[0], is_loaded, can_access_pmu));
                 } else {
                         CTX_USED_DBR(ctx, rnum);
@@ -3895,7 +3879,7 @@ pfm_write_ibr_dbr(int mode, pfm_context_t *ctx, void *arg, int count, struct pt_
                         }
                         ctx->ctx_dbrs[rnum] = dbreg.val;
  
-                       DPRINT(("write dbr%u=0x%lx used_dbrs=0x%x is_loaded=%d access_pmu=%d\n",
+                       DPRINT(("write dbr%u=0x%lx used_dbrs=0x%x ld=%d apmu=%d\n",
                                 rnum, dbreg.val, ctx->ctx_used_dbrs[0], is_loaded, can_access_pmu));
                 }
         }
@@ -3996,7 +3980,10 @@ pfm_stop(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
         state     = ctx->ctx_state;
         is_system = ctx->ctx_fl_system;
  
-       if (state != PFM_CTX_LOADED && state != PFM_CTX_MASKED) return -EINVAL;
+       /*
+        * context must be attached to issue the stop command (includes LOADED,MASKED,ZOMBIE)
+        */
+       if (state == PFM_CTX_UNLOADED) return -EINVAL;
  
         /*
          * In system wide and when the context is loaded, access can only happen
@@ -4585,31 +4572,6 @@ pfm_context_unload(pfm_context_t *ctx, void *arg, int count, struct pt_regs *reg
         return 0;
  }
  
-static void
-pfm_force_cleanup(pfm_context_t *ctx, struct pt_regs *regs)
-{
-       struct task_struct *task = ctx->ctx_task;
-
-       ia64_psr(regs)->up = 0;
-       ia64_psr(regs)->sp = 1;
-
-       if (GET_PMU_OWNER() == task) {
-               DPRINT(("cleared ownership for [%d]\n", ctx->ctx_task->pid));
-               SET_PMU_OWNER(NULL, NULL);
-       }
-
-       /*
-        * disconnect the task from the context and vice-versa
-        */
-       PFM_SET_WORK_PENDING(task, 0);
-
-       task->thread.pfm_context  = NULL;
-       task->thread.flags       &= ~IA64_THREAD_PM_VALID;
-
-       DPRINT(("force cleanupf for [%d]\n",  task->pid));
-}
-
-
  
  /*
   * called only from exit_thread(): task == current
@@ -4727,21 +4689,22 @@ static int
  pfm_check_task_state(pfm_context_t *ctx, int cmd, unsigned long flags)
  {
         struct task_struct *task;
-       int state;
+       int state, old_state;
  
+recheck:
         state = ctx->ctx_state;
+       task  = ctx->ctx_task;
  
-       task = PFM_CTX_TASK(ctx);
         if (task == NULL) {
                 DPRINT(("context %d no task, state=%d\n", ctx->ctx_fd, state));
                 return 0;
         }
  
         DPRINT(("context %d state=%d [%d] task_state=%ld must_stop=%d\n",
-                               ctx->ctx_fd,
-                               state,
-                               task->pid,
-                               task->state, PFM_CMD_STOPPED(cmd)));
+               ctx->ctx_fd,
+               state,
+               task->pid,
+               task->state, PFM_CMD_STOPPED(cmd)));
  
         /*
          * self-monitoring always ok.
@@ -4753,31 +4716,63 @@ pfm_check_task_state(pfm_context_t *ctx, int cmd, unsigned long flags)
         if (task == current || ctx->ctx_fl_system) return 0;
  
         /*
-        * context is UNLOADED, MASKED we are safe to go
+        * if context is UNLOADED we are safe to go
          */
-       if (state != PFM_CTX_LOADED) return 0;
+       if (state == PFM_CTX_UNLOADED) return 0;
  
-       if (state == PFM_CTX_ZOMBIE) return -EINVAL;
+       /*
+        * no command can operate on a zombie context
+        */
+       if (state == PFM_CTX_ZOMBIE) {
+               DPRINT(("cmd %d state zombie cannot operate on context\n", cmd));
+               return -EINVAL;
+       }
  
         /*
-        * context is loaded, we must make sure the task is stopped
+        * context is LOADED or MASKED. Some commands may need to have 
+        * the task stopped.
+        *
          * We could lift this restriction for UP but it would mean that
          * the user has no guarantee the task would not run between
          * two successive calls to perfmonctl(). That's probably OK.
          * If this user wants to ensure the task does not run, then
          * the task must be stopped.
          */
-       if (PFM_CMD_STOPPED(cmd) && task->state != TASK_STOPPED) {
-               DPRINT(("[%d] task not in stopped state\n", task->pid));
-               return -EBUSY;
-       }
+       if (PFM_CMD_STOPPED(cmd)) {
+               if ((task->state != TASK_STOPPED) && (task->state != TASK_TRACED)) {
+                       DPRINT(("[%d] task not in stopped state\n", task->pid));
+                       return -EBUSY;
+               }
+               /*
+                * task is now stopped, wait for ctxsw out
+                *
+                * This is an interesting point in the code.
+                * We need to unprotect the context because
+                * the pfm_save_regs() routine needs to grab
+                * the same lock. There is danger in doing
+                * this because it leaves a window open for
+                * another task to get access to the context
+                * and possibly change its state. The one thing
+                * that is not possible is for the context to disappear
+                * because we are protected by the VFS layer, i.e.,
+                * get_fd()/put_fd().
+                */
+               old_state = state;
  
-       UNPROTECT_CTX(ctx, flags);
+               UNPROTECT_CTX(ctx, flags);
  
-       wait_task_inactive(task);
+               wait_task_inactive(task);
  
-       PROTECT_CTX(ctx, flags);
+               PROTECT_CTX(ctx, flags);
  
+               /*
+                * we must recheck to verify if state has changed
+                */
+               if (ctx->ctx_state != old_state) {
+                       DPRINT(("old_state=%d new_state=%d\n", old_state, ctx->ctx_state));
+                       goto recheck;
+               }
+       }
         return 0;
  }
  
@@ -4785,10 +4780,8 @@ pfm_check_task_state(pfm_context_t *ctx, int cmd, unsigned long flags)
   * system-call entry point (must return long)
   */
  asmlinkage long
-sys_perfmonctl (int fd, int cmd, void *arg, int count, long arg5, long arg6, long arg7,
-               long arg8, long stack)
+sys_perfmonctl (int fd, int cmd, void __user *arg, int count)
  {
-       struct pt_regs *regs = (struct pt_regs *)&stack;
         struct file *file = NULL;
         pfm_context_t *ctx = NULL;
         unsigned long flags = 0UL;
@@ -4912,7 +4905,7 @@ restart_args:
         if (unlikely(ret)) goto abort_locked;
  
  skip_fd:
-       ret = (*func)(ctx, args_k, count, regs);
+       ret = (*func)(ctx, args_k, count, ia64_task_regs(current));
  
         call_made = 1;
  
@@ -4983,26 +4976,14 @@ pfm_resume_after_ovfl(pfm_context_t *ctx, unsigned long ovfl_regs, struct pt_reg
  static void
  pfm_context_force_terminate(pfm_context_t *ctx, struct pt_regs *regs)
  {
-       if (ctx->ctx_fl_system) {
-               printk(KERN_ERR "perfmon: pfm_context_force_terminate [%d] is system-wide\n", current->pid);
-               return;
-       }
-       /*
-        * we stop the whole thing, we do no need to flush
-        * we know we WERE masked
-        */
-       pfm_clear_psr_up();
-       ia64_psr(regs)->up = 0;
-       ia64_psr(regs)->sp = 1;
+       int ret;
  
-       /*
-        * disconnect the task from the context and vice-versa
-        */
-       current->thread.pfm_context  = NULL;
-       current->thread.flags       &= ~IA64_THREAD_PM_VALID;
-       ctx->ctx_task = NULL;
+       DPRINT(("entering for [%d]\n", current->pid));
  
-       DPRINT(("context terminated\n"));
+       ret = pfm_context_unload(ctx, NULL, 0, regs);
+       if (ret) {
+               printk(KERN_ERR "pfm_context_force_terminate: [%d] unloaded failed with %d\n", current->pid, ret);
+       }
  
         /*
          * and wakeup controlling task, indicating we are now disconnected
@@ -5062,6 +5043,18 @@ pfm_handle_work(void)
  
         UNPROTECT_CTX(ctx, flags);
  
+        /*
+         * pfm_handle_work() is currently called with interrupts disabled.
+         * The down_interruptible call may sleep, therefore we
+         * must re-enable interrupts to avoid deadlocks. It is
+         * safe to do so because this function is called ONLY
+         * when returning to user level (PUStk=1), in which case
+         * there is no risk of kernel stack overflow due to deep
+         * interrupt nesting.
+         */
+       BUG_ON(flags & IA64_PSR_I);
+       local_irq_enable();
+
         DPRINT(("before block sleeping\n"));
  
         /*
@@ -5072,6 +5065,12 @@ pfm_handle_work(void)
  
         DPRINT(("after block sleeping ret=%d\n", ret));
  
+       /*
+        * disable interrupts to restore state we had upon entering
+        * this function
+        */
+       local_irq_disable();
+
         PROTECT_CTX(ctx, flags);
  
         /*
@@ -5186,7 +5185,7 @@ pfm_end_notify_user(pfm_context_t *ctx)
  static void
  pfm_overflow_handler(struct task_struct *task, pfm_context_t *ctx, u64 pmc0, struct pt_regs *regs)
  {
-       pfm_ovfl_arg_t ovfl_arg;
+       pfm_ovfl_arg_t *ovfl_arg;
         unsigned long mask;
         unsigned long old_val, ovfl_val, new_val;
         unsigned long ovfl_notify = 0UL, ovfl_pmds = 0UL, smpl_pmds = 0UL, reset_pmds;
@@ -5273,7 +5272,8 @@ pfm_overflow_handler(struct task_struct *task, pfm_context_t *ctx, u64 pmc0, str
                 int j, k, ret = 0;
                 int this_cpu = smp_processor_id();
  
-               pmd_mask   = ovfl_pmds >> PMU_FIRST_COUNTER;
+               pmd_mask = ovfl_pmds >> PMU_FIRST_COUNTER;
+               ovfl_arg = &ctx->ctx_ovfl_arg;
  
                 prefetch(ctx->ctx_smpl_hdr);
  
@@ -5283,15 +5283,15 @@ pfm_overflow_handler(struct task_struct *task, pfm_context_t *ctx, u64 pmc0, str
  
                         if ((pmd_mask & 0x1) == 0) continue;
  
-                       ovfl_arg.ovfl_pmd      = (unsigned char )i;
-                       ovfl_arg.ovfl_notify   = ovfl_notify & mask ? 1 : 0;
-                       ovfl_arg.active_set    = 0;
-                       ovfl_arg.ovfl_ctrl.val = 0; /* module must fill in all fields */
-                       ovfl_arg.smpl_pmds[0]  = smpl_pmds = ctx->ctx_pmds[i].smpl_pmds[0];
+                       ovfl_arg->ovfl_pmd      = (unsigned char )i;
+                       ovfl_arg->ovfl_notify   = ovfl_notify & mask ? 1 : 0;
+                       ovfl_arg->active_set    = 0;
+                       ovfl_arg->ovfl_ctrl.val = 0; /* module must fill in all fields */
+                       ovfl_arg->smpl_pmds[0]  = smpl_pmds = ctx->ctx_pmds[i].smpl_pmds[0];
  
-                       ovfl_arg.pmd_value      = ctx->ctx_pmds[i].val;
-                       ovfl_arg.pmd_last_reset = ctx->ctx_pmds[i].lval;
-                       ovfl_arg.pmd_eventid    = ctx->ctx_pmds[i].eventid;
+                       ovfl_arg->pmd_value      = ctx->ctx_pmds[i].val;
+                       ovfl_arg->pmd_last_reset = ctx->ctx_pmds[i].lval;
+                       ovfl_arg->pmd_eventid    = ctx->ctx_pmds[i].eventid;
  
                         /*
                          * copy values of pmds of interest. Sampling format may copy them
@@ -5300,8 +5300,8 @@ pfm_overflow_handler(struct task_struct *task, pfm_context_t *ctx, u64 pmc0, str
                         if (smpl_pmds) {
                                 for(j=0, k=0; smpl_pmds; j++, smpl_pmds >>=1) {
                                         if ((smpl_pmds & 0x1) == 0) continue;
-                                       ovfl_arg.smpl_pmds_values[k++] = PMD_IS_COUNTING(j) ?  pfm_read_soft_counter(ctx, j) : ia64_get_pmd(j);
-                                       DPRINT_ovfl(("smpl_pmd[%d]=pmd%u=0x%lx\n", k-1, j, ovfl_arg.smpl_pmds_values[k-1]));
+                                       ovfl_arg->smpl_pmds_values[k++] = PMD_IS_COUNTING(j) ?  pfm_read_soft_counter(ctx, j) : ia64_get_pmd(j);
+                                       DPRINT_ovfl(("smpl_pmd[%d]=pmd%u=0x%lx\n", k-1, j, ovfl_arg->smpl_pmds_values[k-1]));
                                 }
                         }
  
@@ -5312,7 +5312,7 @@ pfm_overflow_handler(struct task_struct *task, pfm_context_t *ctx, u64 pmc0, str
                         /*
                          * call custom buffer format record (handler) routine
                          */
-                       ret = (*ctx->ctx_buf_fmt->fmt_handler)(task, ctx->ctx_smpl_hdr, &ovfl_arg, regs, tstamp);
+                       ret = (*ctx->ctx_buf_fmt->fmt_handler)(task, ctx->ctx_smpl_hdr, ovfl_arg, regs, tstamp);
  
                         end_cycles = ia64_get_itc();
  
@@ -5320,13 +5320,13 @@ pfm_overflow_handler(struct task_struct *task, pfm_context_t *ctx, u64 pmc0, str
                          * For those controls, we take the union because they have
                          * an all or nothing behavior.
                          */
-                       ovfl_ctrl.bits.notify_user     |= ovfl_arg.ovfl_ctrl.bits.notify_user;
-                       ovfl_ctrl.bits.block_task      |= ovfl_arg.ovfl_ctrl.bits.block_task;
-                       ovfl_ctrl.bits.mask_monitoring |= ovfl_arg.ovfl_ctrl.bits.mask_monitoring;
+                       ovfl_ctrl.bits.notify_user     |= ovfl_arg->ovfl_ctrl.bits.notify_user;
+                       ovfl_ctrl.bits.block_task      |= ovfl_arg->ovfl_ctrl.bits.block_task;
+                       ovfl_ctrl.bits.mask_monitoring |= ovfl_arg->ovfl_ctrl.bits.mask_monitoring;
                         /*
                          * build the bitmask of pmds to reset now
                          */
-                       if (ovfl_arg.ovfl_ctrl.bits.reset_ovfl_pmds) reset_pmds |= mask;
+                       if (ovfl_arg->ovfl_ctrl.bits.reset_ovfl_pmds) reset_pmds |= mask;
  
                         pfm_stats[this_cpu].pfm_smpl_handler_cycles += end_cycles - start_cycles;
                 }
@@ -5356,9 +5356,8 @@ pfm_overflow_handler(struct task_struct *task, pfm_context_t *ctx, u64 pmc0, str
                 if (ovfl_notify == 0) reset_pmds = ovfl_pmds;
         }
  
-       DPRINT(("ovfl_pmds=0x%lx reset_pmds=0x%lx\n",
-               ovfl_pmds,
-               reset_pmds));
+       DPRINT_ovfl(("ovfl_pmds=0x%lx reset_pmds=0x%lx\n", ovfl_pmds, reset_pmds));
+
         /*
          * reset the requested PMD registers using the short reset values
          */
@@ -5794,6 +5793,32 @@ pfm_syst_wide_update_task(struct task_struct *task, unsigned long info, int is_c
  }
  
  #ifdef CONFIG_SMP
+
+static void
+pfm_force_cleanup(pfm_context_t *ctx, struct pt_regs *regs)
+{
+       struct task_struct *task = ctx->ctx_task;
+
+       ia64_psr(regs)->up = 0;
+       ia64_psr(regs)->sp = 1;
+
+       if (GET_PMU_OWNER() == task) {
+               DPRINT(("cleared ownership for [%d]\n", ctx->ctx_task->pid));
+               SET_PMU_OWNER(NULL, NULL);
+       }
+
+       /*
+        * disconnect the task from the context and vice-versa
+        */
+       PFM_SET_WORK_PENDING(task, 0);
+
+       task->thread.pfm_context  = NULL;
+       task->thread.flags       &= ~IA64_THREAD_PM_VALID;
+
+       DPRINT(("force cleanup for [%d]\n",  task->pid));
+}
+
+
  /*
   * in 2.6, interrupts are masked when we come here and the runqueue lock is held
   */
@@ -5832,14 +5857,6 @@ pfm_save_regs(struct task_struct *task)
                 return;
         }
  
-       /*
-        * sanity check
-        */
-       if (ctx->ctx_last_activation != GET_ACTIVATION()) {
-               pfm_unprotect_ctx_ctxsw(ctx, flags);
-               return;
-       }
-
         /*
          * save current PSR: needed because we modify it
          */
@@ -6288,15 +6305,15 @@ pfm_flush_pmds(struct task_struct *task, pfm_context_t *ctx)
          */
         is_self = ctx->ctx_task == task ? 1 : 0;
  
-#ifdef CONFIG_SMP
-       if (task == current) {
-#else
         /*
-        * in UP, the state can still be in the registers
+        * can access PMU if task is the owner of the PMU state on the current CPU
+        * or if we are running on the CPU bound to the context in system-wide mode
+        * (that is not necessarily the task the context is attached to in this mode).
+        * In system-wide we always have can_access_pmu true because a task running on an
+        * invalid processor is flagged earlier in the call stack (see pfm_stop).
          */
-       if (task == current || GET_PMU_OWNER() == task) {
-#endif
-               can_access_pmu = 1;
+       can_access_pmu = (GET_PMU_OWNER() == task) || (ctx->ctx_fl_system && ctx->ctx_cpu == smp_processor_id());
+       if (can_access_pmu) {
                 /*
                  * Mark the PMU as not owned
                  * This will cause the interrupt handler to do nothing in case an overflow
@@ -6306,6 +6323,7 @@ pfm_flush_pmds(struct task_struct *task, pfm_context_t *ctx)
                  * on.
                  */
                 SET_PMU_OWNER(NULL, NULL);
+               DPRINT(("releasing ownership\n"));
  
                 /*
                  * read current overflow status:
@@ -6334,6 +6352,9 @@ pfm_flush_pmds(struct task_struct *task, pfm_context_t *ctx)
          * XXX: sampling situation is not taken into account here
          */
         mask2 = ctx->ctx_used_pmds[0];
+
+       DPRINT(("is_self=%d ovfl_val=0x%lx mask2=0x%lx\n", is_self, ovfl_val, mask2));
+
         for (i = 0; mask2; i++, mask2>>=1) {
  
                 /* skip non used pmds */
@@ -6372,7 +6393,7 @@ pfm_flush_pmds(struct task_struct *task, pfm_context_t *ctx)
                         }
                 }
  
-               DPRINT(("[%d] is_self=%d ctx_pmd[%d]=0x%lx  pmd_val=0x%lx\n", task->pid, is_self, i, val, pmd_val));
+               DPRINT(("[%d] ctx_pmd[%d]=0x%lx  pmd_val=0x%lx\n", task->pid, i, val, pmd_val));
  
                 if (is_self) task->thread.pmds[i] = pmd_val;
  
@@ -6650,8 +6671,7 @@ pfm_inherit(struct task_struct *task, struct pt_regs *regs)
  }
  #else  /* !CONFIG_PERFMON */
  asmlinkage long
-sys_perfmonctl (int fd, int cmd, void *arg, int count, long arg5, long arg6, long arg7,
-               long arg8, long stack)
+sys_perfmonctl (int fd, int cmd, void *arg, int count)
  {
         return -ENOSYS;
  }