* The initial version of perfmon.c was written by
* Ganesh Venkitachalam, IBM Corp.
*
- * Then it was modified for perfmon-1.x by Stephane Eranian and
+ * Then it was modified for perfmon-1.x by Stephane Eranian and
* David Mosberger, Hewlett Packard Co.
- *
+ *
* Version Perfmon-2.x is a rewrite of perfmon-1.x
- * by Stephane Eranian, Hewlett Packard Co.
+ * by Stephane Eranian, Hewlett Packard Co.
*
- * Copyright (C) 1999-2003 Hewlett Packard Co
+ * Copyright (C) 1999-2005 Hewlett Packard Co
* Stephane Eranian <eranian@hpl.hp.com>
* David Mosberger-Tang <davidm@hpl.hp.com>
*
#include <linux/interrupt.h>
#include <linux/smp_lock.h>
#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
#include <linux/init.h>
#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/vfs.h>
#include <linux/pagemap.h>
#include <linux/mount.h>
-#include <linux/version.h>
+#include <linux/bitops.h>
+#include <linux/capability.h>
+#include <linux/rcupdate.h>
+#include <linux/completion.h>
+#include <linux/vs_memory.h>
+#include <linux/vs_cvirt.h>
-#include <asm/bitops.h>
#include <asm/errno.h>
#include <asm/intrinsics.h>
#include <asm/page.h>
#define PFM_REG_CONFIG (0x8<<4|PFM_REG_IMPL) /* configuration register */
#define PFM_REG_BUFFER (0xc<<4|PFM_REG_IMPL) /* PMD used as buffer */
-#define PMC_IS_LAST(i) (pmu_conf.pmc_desc[i].type & PFM_REG_END)
-#define PMD_IS_LAST(i) (pmu_conf.pmd_desc[i].type & PFM_REG_END)
-
-#define PFM_IS_DISABLED() (pmu_conf.enabled == 0)
+#define PMC_IS_LAST(i) (pmu_conf->pmc_desc[i].type & PFM_REG_END)
+#define PMD_IS_LAST(i) (pmu_conf->pmd_desc[i].type & PFM_REG_END)
#define PMC_OVFL_NOTIFY(ctx, i) ((ctx)->ctx_pmds[i].flags & PFM_REGFL_OVFL_NOTIFY)
/* i is assumed to be unsigned */
-#define PMC_IS_IMPL(i) (i< PMU_MAX_PMCS && (pmu_conf.pmc_desc[i].type & PFM_REG_IMPL))
-#define PMD_IS_IMPL(i) (i< PMU_MAX_PMDS && (pmu_conf.pmd_desc[i].type & PFM_REG_IMPL))
+#define PMC_IS_IMPL(i)	  (i < PMU_MAX_PMCS && (pmu_conf->pmc_desc[i].type & PFM_REG_IMPL))
+#define PMD_IS_IMPL(i)	  (i < PMU_MAX_PMDS && (pmu_conf->pmd_desc[i].type & PFM_REG_IMPL))
/* XXX: these assume that register i is implemented */
-#define PMD_IS_COUNTING(i) ((pmu_conf.pmd_desc[i].type & PFM_REG_COUNTING) == PFM_REG_COUNTING)
-#define PMC_IS_COUNTING(i) ((pmu_conf.pmc_desc[i].type & PFM_REG_COUNTING) == PFM_REG_COUNTING)
-#define PMC_IS_MONITOR(i) ((pmu_conf.pmc_desc[i].type & PFM_REG_MONITOR) == PFM_REG_MONITOR)
-#define PMC_IS_CONTROL(i) ((pmu_conf.pmc_desc[i].type & PFM_REG_CONTROL) == PFM_REG_CONTROL)
+#define PMD_IS_COUNTING(i) ((pmu_conf->pmd_desc[i].type & PFM_REG_COUNTING) == PFM_REG_COUNTING)
+#define PMC_IS_COUNTING(i) ((pmu_conf->pmc_desc[i].type & PFM_REG_COUNTING) == PFM_REG_COUNTING)
+#define PMC_IS_MONITOR(i) ((pmu_conf->pmc_desc[i].type & PFM_REG_MONITOR) == PFM_REG_MONITOR)
+#define PMC_IS_CONTROL(i) ((pmu_conf->pmc_desc[i].type & PFM_REG_CONTROL) == PFM_REG_CONTROL)
-#define PMC_DFL_VAL(i) pmu_conf.pmc_desc[i].default_value
-#define PMC_RSVD_MASK(i) pmu_conf.pmc_desc[i].reserved_mask
-#define PMD_PMD_DEP(i) pmu_conf.pmd_desc[i].dep_pmd[0]
-#define PMC_PMD_DEP(i) pmu_conf.pmc_desc[i].dep_pmd[0]
+#define PMC_DFL_VAL(i) pmu_conf->pmc_desc[i].default_value
+#define PMC_RSVD_MASK(i) pmu_conf->pmc_desc[i].reserved_mask
+#define PMD_PMD_DEP(i) pmu_conf->pmd_desc[i].dep_pmd[0]
+#define PMC_PMD_DEP(i) pmu_conf->pmc_desc[i].dep_pmd[0]
#define PFM_NUM_IBRS IA64_NUM_DBG_REGS
#define PFM_NUM_DBRS IA64_NUM_DBG_REGS
#define PFM_CPUINFO_SET(v) pfm_get_cpu_var(pfm_syst_info) |= (v)
#define PFM_CPUINFO_GET() pfm_get_cpu_var(pfm_syst_info)
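+/* single-bit register dependency mask, used by the per-PMU register description tables */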
+#define RDEP(x) (1UL<<(x))
+
/*
* context protection macros
* in SMP:
unsigned long ctx_ovfl_regs[4]; /* which registers overflowed (notification) */
- struct semaphore ctx_restart_sem; /* use for blocking notification mode */
+ struct completion ctx_restart_done; /* use for blocking notification mode */
unsigned long ctx_used_pmds[4]; /* bitmask of PMD used */
unsigned long ctx_all_pmds[4]; /* bitmask of all accessible PMDs */
unsigned int ctx_cpu; /* cpu to which perfmon is applied (system wide) */
	int			ctx_fd;			/* file descriptor used by this context */
+ pfm_ovfl_arg_t ctx_ovfl_arg; /* argument to custom buffer format handler */
pfm_buffer_fmt_t *ctx_buf_fmt; /* buffer format callbacks */
void *ctx_smpl_hdr; /* points to sampling buffer header kernel vaddr */
* dep_pmd[]: a bitmask of dependent PMD registers
* dep_pmc[]: a bitmask of dependent PMC registers
*/
+typedef int (*pfm_reg_check_t)(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs);
typedef struct {
unsigned int type;
int pm_pos;
unsigned long default_value; /* power-on default value */
unsigned long reserved_mask; /* bitmask of reserved bits */
- int (*read_check)(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs);
- int (*write_check)(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs);
+ pfm_reg_check_t read_check;
+ pfm_reg_check_t write_check;
unsigned long dep_pmd[4];
unsigned long dep_pmc[4];
} pfm_reg_desc_t;
/* assume cnum is a valid monitor */
-#define PMC_PM(cnum, val) (((val) >> (pmu_conf.pmc_desc[cnum].pm_pos)) & 0x1)
-#define PMC_WR_FUNC(cnum) (pmu_conf.pmc_desc[cnum].write_check)
-#define PMD_WR_FUNC(cnum) (pmu_conf.pmd_desc[cnum].write_check)
-#define PMD_RD_FUNC(cnum) (pmu_conf.pmd_desc[cnum].read_check)
+#define PMC_PM(cnum, val) (((val) >> (pmu_conf->pmc_desc[cnum].pm_pos)) & 0x1)
/*
* This structure is initialized at boot time and contains
* a description of the PMU main characteristics.
+ *
+ * If the probe function is defined, detection is based
+ * on its return value:
+ * - 0 means recognized PMU
+ * - anything else means not supported
+ * When the probe function is not defined, then the pmu_family field
+ * is used and it must match the host CPU family such that:
+ * 	- (cpu->family & config->pmu_family) != 0
*/
typedef struct {
unsigned long ovfl_val; /* overflow value for counters */
unsigned long impl_pmds[4]; /* bitmask of implemented PMDS */
char *pmu_name; /* PMU family name */
- unsigned int enabled; /* indicates if perfmon initialized properly */
unsigned int pmu_family; /* cpuid family pattern used to identify pmu */
-
+ unsigned int flags; /* pmu specific flags */
unsigned int num_ibrs; /* number of IBRS: computed at init time */
unsigned int num_dbrs; /* number of DBRS: computed at init time */
unsigned int num_counters; /* PMC/PMD counting pairs : computed at init time */
-
+ int (*probe)(void); /* customized probe routine */
unsigned int use_rr_dbregs:1; /* set if debug registers used for range restriction */
} pmu_config_t;
+/*
+ * PMU specific flags
+ */
+#define PFM_PMU_IRQ_RESEND 1 /* PMU needs explicit IRQ resend */
/*
* debug register related type definitions
#define PFM_CMD_ARG_MANY -1 /* cannot be zero */
-typedef struct {
- int debug; /* turn on/off debugging via syslog */
- int debug_ovfl; /* turn on/off debug printk in overflow handler */
- int fastctxsw; /* turn on/off fast (unsecure) ctxsw */
- int expert_mode; /* turn on/off value checking */
- int debug_pfm_read;
-} pfm_sysctl_t;
-
typedef struct {
unsigned long pfm_spurious_ovfl_intr_count; /* keep track of spurious ovfl interrupts */
unsigned long pfm_replay_ovfl_intr_count; /* keep track of replayed ovfl interrupts */
static pfm_stats_t pfm_stats[NR_CPUS];
static pfm_session_t pfm_sessions; /* global sessions information */
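+/* serializes install/remove of the alternate PMU interrupt handler */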
+static DEFINE_SPINLOCK(pfm_alt_install_check);
+static pfm_intr_handler_desc_t *pfm_alt_intr_handler;
+
static struct proc_dir_entry *perfmon_dir;
static pfm_uuid_t pfm_null_uuid = {0,};
static spinlock_t pfm_buffer_fmt_lock;
static LIST_HEAD(pfm_buffer_fmt_list);
+static pmu_config_t *pmu_conf;
+
/* sysctl() controls */
-static pfm_sysctl_t pfm_sysctl;
-int pfm_debug_var;
+pfm_sysctl_t pfm_sysctl;
+EXPORT_SYMBOL(pfm_sysctl);
static ctl_table pfm_ctl_table[]={
{1, "debug", &pfm_sysctl.debug, sizeof(int), 0666, NULL, &proc_dointvec, NULL,},
ClearPageReserved(vmalloc_to_page((void*)a));
}
-static inline int
-pfm_remap_page_range(struct vm_area_struct *vma, unsigned long from, unsigned long phys_addr, unsigned long size, pgprot_t prot)
-{
- return remap_page_range(vma, from, phys_addr, size, prot);
-}
-
static inline unsigned long
pfm_protect_ctx_ctxsw(pfm_context_t *x)
{
return 0UL;
}
-static inline unsigned long
+static inline void
pfm_unprotect_ctx_ctxsw(pfm_context_t *x, unsigned long f)
{
spin_unlock(&(x)->ctx_lock);
DEFINE_PER_CPU(struct task_struct *, pmu_owner);
DEFINE_PER_CPU(pfm_context_t *, pmu_ctx);
DEFINE_PER_CPU(unsigned long, pmu_activation_number);
+EXPORT_PER_CPU_SYMBOL_GPL(pfm_syst_info);
/* forward declaration */
#endif
void dump_pmu_state(const char *);
+static int pfm_write_ibr_dbr(int mode, pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs);
-/*
- * the HP simulator must be first because
- * CONFIG_IA64_HP_SIM is independent of CONFIG_MCKINLEY or CONFIG_ITANIUM
- */
-#if defined(CONFIG_IA64_HP_SIM)
-#include "perfmon_hpsim.h"
-#elif defined(CONFIG_ITANIUM)
#include "perfmon_itanium.h"
-#elif defined(CONFIG_MCKINLEY)
#include "perfmon_mckinley.h"
-#else
+#include "perfmon_montecito.h"
#include "perfmon_generic.h"
-#endif
+
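+/* candidate PMU descriptions, tried in order by pfm_probe_pmu() */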
+static pmu_config_t *pmu_confs[]={
+ &pmu_conf_mont,
+ &pmu_conf_mck,
+ &pmu_conf_ita,
+ &pmu_conf_gen, /* must be last */
+ NULL
+};
+
static int pfm_end_notify_user(pfm_context_t *ctx);
for (i=0; i < nibrs; i++) {
ia64_set_ibr(i, ibrs[i]);
+ ia64_dv_serialize_instruction();
}
ia64_srlz_i();
}
for (i=0; i < ndbrs; i++) {
ia64_set_dbr(i, dbrs[i]);
+ ia64_dv_serialize_data();
}
ia64_srlz_d();
}
static inline unsigned long
pfm_read_soft_counter(pfm_context_t *ctx, int i)
{
- return ctx->ctx_pmds[i].val + (ia64_get_pmd(i) & pmu_conf.ovfl_val);
+ return ctx->ctx_pmds[i].val + (ia64_get_pmd(i) & pmu_conf->ovfl_val);
}
/*
static inline void
pfm_write_soft_counter(pfm_context_t *ctx, int i, unsigned long val)
{
- unsigned long ovfl_val = pmu_conf.ovfl_val;
+ unsigned long ovfl_val = pmu_conf->ovfl_val;
ctx->ctx_pmds[i].val = val & ~ovfl_val;
/*
DPRINT(("ctx=%p msgq reset\n", ctx));
}
-
-/* Here we want the physical address of the memory.
- * This is used when initializing the contents of the
- * area and marking the pages as reserved.
- */
-static inline unsigned long
-pfm_kvirt_to_pa(unsigned long adr)
-{
- __u64 pa = ia64_tpa(adr);
- return pa;
-}
-
static void *
pfm_rvmalloc(unsigned long size)
{
DPRINT_ovfl(("masking monitoring for [%d]\n", task->pid));
- ovfl_mask = pmu_conf.ovfl_val;
+ ovfl_mask = pmu_conf->ovfl_val;
/*
* monitoring can only be masked as a result of a valid
* counter overflow. In UP, it means that the PMU still
int i, is_system;
is_system = ctx->ctx_fl_system;
- ovfl_mask = pmu_conf.ovfl_val;
+ ovfl_mask = pmu_conf->ovfl_val;
if (task != current) {
printk(KERN_ERR "perfmon.%d: invalid task[%d] current[%d]\n", __LINE__, task->pid, current->pid);
* XXX: need to optimize
*/
if (ctx->ctx_fl_using_dbreg) {
- pfm_restore_ibrs(ctx->ctx_ibrs, pmu_conf.num_ibrs);
- pfm_restore_dbrs(ctx->ctx_dbrs, pmu_conf.num_dbrs);
+ pfm_restore_ibrs(ctx->ctx_ibrs, pmu_conf->num_ibrs);
+ pfm_restore_dbrs(ctx->ctx_dbrs, pmu_conf->num_dbrs);
}
/*
pfm_restore_pmds(unsigned long *pmds, unsigned long mask)
{
int i;
- unsigned long val, ovfl_val = pmu_conf.ovfl_val;
+ unsigned long val, ovfl_val = pmu_conf->ovfl_val;
for (i=0; mask; i++, mask>>=1) {
if ((mask & 0x1) == 0) continue;
pfm_copy_pmds(struct task_struct *task, pfm_context_t *ctx)
{
struct thread_struct *thread = &task->thread;
- unsigned long ovfl_val = pmu_conf.ovfl_val;
+ unsigned long ovfl_val = pmu_conf->ovfl_val;
unsigned long mask = ctx->ctx_all_pmds[0];
unsigned long val;
int i;
}
EXPORT_SYMBOL(pfm_unregister_buffer_fmt);
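+/* from process.c: controls whether default_idle() is allowed to use PAL_HALT */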
+extern void update_pal_halt_status(int);
+
static int
pfm_reserve_session(struct task_struct *task, int is_syswide, unsigned int cpu)
{
is_syswide,
cpu));
+ /*
+ * disable default_idle() to go to PAL_HALT
+ */
+ update_pal_halt_status(0);
+
UNLOCK_PFS(flags);
return 0;
error_conflict:
DPRINT(("system wide not possible, conflicting session [%d] on CPU%d\n",
pfm_sessions.pfs_sys_session[cpu]->pid,
- smp_processor_id()));
+ cpu));
abort:
UNLOCK_PFS(flags);
is_syswide,
cpu));
+ /*
+ * if possible, enable default_idle() to go into PAL_HALT
+ */
+ if (pfm_sessions.pfs_task_sessions == 0 && pfm_sessions.pfs_sys_sessions == 0)
+ update_pal_halt_status(1);
+
UNLOCK_PFS(flags);
return 0;
mntput(pfmfs_mnt);
}
-static loff_t
-pfm_lseek(struct file *file, loff_t offset, int whence)
-{
- DPRINT(("pfm_lseek called\n"));
- return -ESPIPE;
-}
-
static ssize_t
-pfm_read(struct file *filp, char *buf, size_t size, loff_t *ppos)
+pfm_read(struct file *filp, char __user *buf, size_t size, loff_t *ppos)
{
pfm_context_t *ctx;
pfm_msg_t *msg;
DPRINT(("message is too small ctx=%p (>=%ld)\n", ctx, sizeof(pfm_msg_t)));
return -EINVAL;
}
- /*
- * seeks are not allowed on message queues
- */
- if (ppos != &filp->f_pos) return -ESPIPE;
PROTECT_CTX(ctx, flags);
goto abort_locked;
}
- DPRINT(("[%d] fd=%d type=%d\n", current->pid, msg->pfm_gen_msg.msg_ctx_fd, msg->pfm_gen_msg.msg_type));
+ DPRINT(("fd=%d type=%d\n", msg->pfm_gen_msg.msg_ctx_fd, msg->pfm_gen_msg.msg_type));
ret = -EFAULT;
if(copy_to_user(buf, msg, sizeof(pfm_msg_t)) == 0) ret = sizeof(pfm_msg_t);
}
static ssize_t
-pfm_write(struct file *file, const char *ubuf,
+pfm_write(struct file *file, const char __user *ubuf,
size_t size, loff_t *ppos)
{
DPRINT(("pfm_write called\n"));
}
/*
- * context is locked when coming here and interrupts are disabled
+ * interrupt cannot be masked when coming here
*/
static inline int
pfm_do_fasync(int fd, struct file *filp, pfm_context_t *ctx, int on)
pfm_fasync(int fd, struct file *filp, int on)
{
pfm_context_t *ctx;
- unsigned long flags;
int ret;
if (PFM_IS_FILE(filp) == 0) {
printk(KERN_ERR "perfmon: pfm_fasync NULL ctx [%d]\n", current->pid);
return -EBADF;
}
-
-
- PROTECT_CTX(ctx, flags);
-
+ /*
+	 * we cannot mask interrupts during this call because this
+	 * may go to sleep if memory is not readily available.
+	 *
+	 * We are protected from the context disappearing by the get_fd()/put_fd()
+ * done in caller. Serialization of this function is ensured by caller.
+ */
ret = pfm_do_fasync(fd, filp, ctx, on);
+
DPRINT(("pfm_fasync called on ctx_fd=%d on=%d async_queue=%p ret=%d\n",
fd,
on,
ctx->ctx_async_queue, ret));
- UNPROTECT_CTX(ctx, flags);
-
return ret;
}
pfm_syswide_force_stop(void *info)
{
pfm_context_t *ctx = (pfm_context_t *)info;
- struct pt_regs *regs = ia64_task_regs(current);
+ struct pt_regs *regs = task_pt_regs(current);
struct task_struct *owner;
unsigned long flags;
int ret;
is_system = ctx->ctx_fl_system;
task = PFM_CTX_TASK(ctx);
- regs = ia64_task_regs(task);
+ regs = task_pt_regs(task);
DPRINT(("ctx_state=%d is_current=%d\n",
state,
is_system = ctx->ctx_fl_system;
task = PFM_CTX_TASK(ctx);
- regs = ia64_task_regs(task);
+ regs = task_pt_regs(task);
DPRINT(("ctx_state=%d is_current=%d\n",
state,
/*
* force task to wake up from MASKED state
*/
- up(&ctx->ctx_restart_sem);
+ complete(&ctx->ctx_restart_done);
DPRINT(("waking up ctx_state=%d\n", state));
/*
* XXX: check for signals :
- * - ok of explicit close
+ * - ok for explicit close
* - not ok when coming from exit_files()
*/
schedule();
static struct file_operations pfm_file_ops = {
- .llseek = pfm_lseek,
+ .llseek = no_llseek,
.read = pfm_read,
.write = pfm_write,
.poll = pfm_poll,
DPRINT(("new inode ino=%ld @%p\n", inode->i_ino, inode));
- inode->i_sb = pfmfs_mnt->mnt_sb;
inode->i_mode = S_IFCHR|S_IRUGO;
- inode->i_sock = 0;
inode->i_uid = current->fsuid;
inode->i_gid = current->fsgid;
static void
pfm_free_fd(int fd, struct file *file)
{
- if (file) put_filp(file);
+ struct files_struct *files = current->files;
+ struct fdtable *fdt;
+
+ /*
+	 * there is no fd_uninstall(), so we do it here
+ */
+ spin_lock(&files->file_lock);
+ fdt = files_fdtable(files);
+ rcu_assign_pointer(fdt->fd[fd], NULL);
+ spin_unlock(&files->file_lock);
+
+ if (file)
+ put_filp(file);
put_unused_fd(fd);
}
static int
pfm_remap_buffer(struct vm_area_struct *vma, unsigned long buf, unsigned long addr, unsigned long size)
{
- unsigned long page;
-
DPRINT(("CPU%d buf=0x%lx addr=0x%lx size=%ld\n", smp_processor_id(), buf, addr, size));
while (size > 0) {
- page = pfm_kvirt_to_pa(buf);
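+		/* translate the kernel virtual address into a page frame number for remap_pfn_range() */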
+ unsigned long pfn = ia64_tpa(buf) >> PAGE_SHIFT;
+
- if (pfm_remap_page_range(vma, addr, page, PAGE_SIZE, PAGE_READONLY)) return -ENOMEM;
+ if (remap_pfn_range(vma, addr, pfn, PAGE_SIZE, PAGE_READONLY))
+ return -ENOMEM;
addr += PAGE_SIZE;
buf += PAGE_SIZE;
* if ((mm->total_vm << PAGE_SHIFT) + len> task->rlim[RLIMIT_AS].rlim_cur)
* return -ENOMEM;
*/
- if (size > task->rlim[RLIMIT_MEMLOCK].rlim_cur) return -EAGAIN;
+ if (size > task->signal->rlim[RLIMIT_MEMLOCK].rlim_cur)
+ return -ENOMEM;
/*
* We do the easy to undo allocations first.
DPRINT(("Cannot allocate vma\n"));
goto error_kmem;
}
+ memset(vma, 0, sizeof(*vma));
+
/*
* partially initialize the vma for the sampling buffer
- *
- * The VM_DONTCOPY flag is very important as it ensures that the mapping
- * will never be inherited for any child process (via fork()) which is always
- * what we want.
*/
vma->vm_mm = mm;
vma->vm_flags = VM_READ| VM_MAYREAD |VM_RESERVED;
vma->vm_page_prot = PAGE_READONLY; /* XXX may need to change */
- vma->vm_ops = NULL;
- vma->vm_pgoff = 0;
- vma->vm_file = NULL;
- vma->vm_private_data = NULL;
/*
* Now we have everything we need and we can initialize
goto error;
}
vma->vm_end = vma->vm_start + size;
+ vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
DPRINT(("aligned size=%ld, hdr=%p mapped @0x%lx\n", size, ctx->ctx_smpl_hdr, vma->vm_start));
*/
insert_vm_struct(mm, vma);
- mm->total_vm += size >> PAGE_SHIFT;
-
+ vx_vmpages_add(mm, size >> PAGE_SHIFT);
+ vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file,
+ vma_pages(vma));
up_write(&task->mm->mmap_sem);
/*
*
* PMC0 is treated differently.
*/
- ctx->ctx_all_pmcs[0] = pmu_conf.impl_pmcs[0] & ~0x1;
+ ctx->ctx_all_pmcs[0] = pmu_conf->impl_pmcs[0] & ~0x1;
/*
* bitmask of all PMDs that are accesible to this context
*/
- ctx->ctx_all_pmds[0] = pmu_conf.impl_pmds[0];
+ ctx->ctx_all_pmds[0] = pmu_conf->impl_pmds[0];
DPRINT(("<%d> all_pmcs=0x%lx all_pmds=0x%lx\n", ctx->ctx_fd, ctx->ctx_all_pmcs[0],ctx->ctx_all_pmds[0]));
return -EINVAL;
}
- if (task->state == TASK_ZOMBIE) {
+ if (task->exit_state == EXIT_ZOMBIE) {
DPRINT(("cannot attach to zombie task [%d]\n", task->pid));
return -EBUSY;
}
*/
if (task == current) return 0;
- if (task->state != TASK_STOPPED) {
+ if ((task->state != TASK_STOPPED) && (task->state != TASK_TRACED)) {
DPRINT(("cannot attach to non-stopped task [%d] state=%ld\n", task->pid, task->state));
return -EBUSY;
}
ctx = pfm_context_alloc();
if (!ctx) goto error;
- req->ctx_fd = ctx->ctx_fd = pfm_alloc_fd(&filp);
- if (req->ctx_fd < 0) goto error_file;
+ ret = pfm_alloc_fd(&filp);
+ if (ret < 0) goto error_file;
+
+ req->ctx_fd = ctx->ctx_fd = ret;
/*
* attach context to file
/*
* init restart semaphore to locked
*/
- sema_init(&ctx->ctx_restart_sem, 0);
+ init_completion(&ctx->ctx_restart_done);
/*
* activation is used in SMP only
unsigned long value, pmc_pm;
unsigned long smpl_pmds, reset_pmds, impl_pmds;
unsigned int cnum, reg_flags, flags, pmc_type;
- int i, can_access_pmu = 0, is_loaded, is_system;
+ int i, can_access_pmu = 0, is_loaded, is_system, expert_mode;
int is_monitor, is_counting, state;
int ret = -EINVAL;
+ pfm_reg_check_t wr_func;
#define PFM_CHECK_PMC_PM(x, y, z) ((x)->ctx_fl_system ^ PMC_PM(y, z))
state = ctx->ctx_state;
is_loaded = state == PFM_CTX_LOADED ? 1 : 0;
is_system = ctx->ctx_fl_system;
task = ctx->ctx_task;
- impl_pmds = pmu_conf.impl_pmds[0];
+ impl_pmds = pmu_conf->impl_pmds[0];
if (state == PFM_CTX_ZOMBIE) return -EINVAL;
}
can_access_pmu = GET_PMU_OWNER() == task || is_system ? 1 : 0;
}
+ expert_mode = pfm_sysctl.expert_mode;
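+	/* sample the sysctl once so every register write in this call sees the same mode */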
for (i = 0; i < count; i++, req++) {
goto error;
}
- pmc_type = pmu_conf.pmc_desc[cnum].type;
- pmc_pm = (value >> pmu_conf.pmc_desc[cnum].pm_pos) & 0x1;
+ pmc_type = pmu_conf->pmc_desc[cnum].type;
+ pmc_pm = (value >> pmu_conf->pmc_desc[cnum].pm_pos) & 0x1;
is_counting = (pmc_type & PFM_REG_COUNTING) == PFM_REG_COUNTING ? 1 : 0;
is_monitor = (pmc_type & PFM_REG_MONITOR) == PFM_REG_MONITOR ? 1 : 0;
DPRINT(("pmc%u is unimplemented or no-access pmc_type=%x\n", cnum, pmc_type));
goto error;
}
+ wr_func = pmu_conf->pmc_desc[cnum].write_check;
/*
* If the PMC is a monitor, then if the value is not the default:
* - system-wide session: PMCx.pm=1 (privileged monitor)
/*
* execute write checker, if any
*/
- if (pfm_sysctl.expert_mode == 0 && PMC_WR_FUNC(cnum)) {
- ret = PMC_WR_FUNC(cnum)(task, ctx, cnum, &value, regs);
+ if (likely(expert_mode == 0 && wr_func)) {
+ ret = (*wr_func)(task, ctx, cnum, &value, regs);
if (ret) goto error;
ret = -EINVAL;
}
* PMD. Clearing is done indirectly via pfm_reset_pmu_state() so there is no
* possible leak here.
*/
- CTX_USED_PMD(ctx, pmu_conf.pmc_desc[cnum].dep_pmd[0]);
+ CTX_USED_PMD(ctx, pmu_conf->pmc_desc[cnum].dep_pmd[0]);
/*
* keep track of the monitor PMC that we are using.
#endif
}
- DPRINT(("pmc[%u]=0x%lx loaded=%d access_pmu=%d all_pmcs=0x%lx used_pmds=0x%lx eventid=%ld smpl_pmds=0x%lx reset_pmds=0x%lx reloads_pmcs=0x%lx used_monitors=0x%lx ovfl_regs=0x%lx\n",
+ DPRINT(("pmc[%u]=0x%lx ld=%d apmu=%d flags=0x%x all_pmcs=0x%lx used_pmds=0x%lx eventid=%ld smpl_pmds=0x%lx reset_pmds=0x%lx reloads_pmcs=0x%lx used_monitors=0x%lx ovfl_regs=0x%lx\n",
cnum,
value,
is_loaded,
can_access_pmu,
+ flags,
ctx->ctx_all_pmcs[0],
ctx->ctx_used_pmds[0],
ctx->ctx_pmds[cnum].eventid,
unsigned long value, hw_value, ovfl_mask;
unsigned int cnum;
int i, can_access_pmu = 0, state;
- int is_counting, is_loaded, is_system;
+ int is_counting, is_loaded, is_system, expert_mode;
int ret = -EINVAL;
+ pfm_reg_check_t wr_func;
state = ctx->ctx_state;
is_loaded = state == PFM_CTX_LOADED ? 1 : 0;
is_system = ctx->ctx_fl_system;
- ovfl_mask = pmu_conf.ovfl_val;
+ ovfl_mask = pmu_conf->ovfl_val;
task = ctx->ctx_task;
if (unlikely(state == PFM_CTX_ZOMBIE)) return -EINVAL;
}
can_access_pmu = GET_PMU_OWNER() == task || is_system ? 1 : 0;
}
+ expert_mode = pfm_sysctl.expert_mode;
for (i = 0; i < count; i++, req++) {
goto abort_mission;
}
is_counting = PMD_IS_COUNTING(cnum);
+ wr_func = pmu_conf->pmd_desc[cnum].write_check;
/*
* execute write checker, if any
*/
- if (pfm_sysctl.expert_mode == 0 && PMD_WR_FUNC(cnum)) {
+ if (unlikely(expert_mode == 0 && wr_func)) {
unsigned long v = value;
- ret = PMD_WR_FUNC(cnum)(task, ctx, cnum, &v, regs);
+ ret = (*wr_func)(task, ctx, cnum, &v, regs);
if (ret) goto abort_mission;
value = v;
}
}
- DPRINT(("pmd[%u]=0x%lx loaded=%d access_pmu=%d, hw_value=0x%lx ctx_pmd=0x%lx short_reset=0x%lx "
- "long_reset=0x%lx notify=%c used_pmds=0x%lx reset_pmds=0x%lx reload_pmds=0x%lx all_pmds=0x%lx ovfl_regs=0x%lx\n",
+ DPRINT(("pmd[%u]=0x%lx ld=%d apmu=%d, hw_value=0x%lx ctx_pmd=0x%lx short_reset=0x%lx "
+ "long_reset=0x%lx notify=%c seed=0x%lx mask=0x%lx used_pmds=0x%lx reset_pmds=0x%lx reload_pmds=0x%lx all_pmds=0x%lx ovfl_regs=0x%lx\n",
cnum,
value,
is_loaded,
ctx->ctx_pmds[cnum].short_reset,
ctx->ctx_pmds[cnum].long_reset,
PMC_OVFL_NOTIFY(ctx, cnum) ? 'Y':'N',
+ ctx->ctx_pmds[cnum].seed,
+ ctx->ctx_pmds[cnum].mask,
ctx->ctx_used_pmds[0],
ctx->ctx_pmds[cnum].reset_pmds[0],
ctx->ctx_reload_pmds[0],
pfarg_reg_t *req = (pfarg_reg_t *)arg;
unsigned int cnum, reg_flags = 0;
int i, can_access_pmu = 0, state;
- int is_loaded, is_system, is_counting;
+ int is_loaded, is_system, is_counting, expert_mode;
int ret = -EINVAL;
+ pfm_reg_check_t rd_func;
/*
* access is possible when loaded only for
state = ctx->ctx_state;
is_loaded = state == PFM_CTX_LOADED ? 1 : 0;
is_system = ctx->ctx_fl_system;
- ovfl_mask = pmu_conf.ovfl_val;
+ ovfl_mask = pmu_conf->ovfl_val;
task = ctx->ctx_task;
if (state == PFM_CTX_ZOMBIE) return -EINVAL;
if (can_access_pmu) ia64_srlz_d();
}
+ expert_mode = pfm_sysctl.expert_mode;
- DPRINT(("loaded=%d access_pmu=%d ctx_state=%d\n",
+ DPRINT(("ld=%d apmu=%d ctx_state=%d\n",
is_loaded,
can_access_pmu,
state));
*/
val = is_loaded ? thread->pmds[cnum] : 0UL;
}
+ rd_func = pmu_conf->pmd_desc[cnum].read_check;
if (is_counting) {
/*
/*
* execute read checker, if any
*/
- if (unlikely(pfm_sysctl.expert_mode == 0 && PMD_RD_FUNC(cnum))) {
+ if (unlikely(expert_mode == 0 && rd_func)) {
unsigned long v = val;
- ret = PMD_RD_FUNC(cnum)(ctx->ctx_task, ctx, cnum, &v, regs);
+ ret = (*rd_func)(ctx->ctx_task, ctx, cnum, &v, regs);
if (ret) goto error;
val = v;
ret = -EINVAL;
unsigned long flags;
int ret = 0;
- if (pmu_conf.use_rr_dbregs == 0) return 0;
+ if (pmu_conf->use_rr_dbregs == 0) return 0;
DPRINT(("called for [%d]\n", task->pid));
unsigned long flags;
int ret;
- if (pmu_conf.use_rr_dbregs == 0) return 0;
+ if (pmu_conf->use_rr_dbregs == 0) return 0;
LOCK_PFS(flags);
if (pfm_sessions.pfs_ptrace_use_dbregs == 0) {
*/
if (CTX_OVFL_NOBLOCK(ctx) == 0 && state == PFM_CTX_MASKED) {
DPRINT(("unblocking [%d] \n", task->pid));
- up(&ctx->ctx_restart_sem);
+ complete(&ctx->ctx_restart_done);
} else {
DPRINT(("[%d] armed exit trap\n", task->pid));
pfm_sysctl.debug = m == 0 ? 0 : 1;
- pfm_debug_var = pfm_sysctl.debug;
-
printk(KERN_INFO "perfmon debugging %s (timing reset)\n", pfm_sysctl.debug ? "on" : "off");
if (m == 0) {
int i, can_access_pmu = 0;
int is_system, is_loaded;
- if (pmu_conf.use_rr_dbregs == 0) return -EINVAL;
+ if (pmu_conf->use_rr_dbregs == 0) return -EINVAL;
state = ctx->ctx_state;
is_loaded = state == PFM_CTX_LOADED ? 1 : 0;
*/
if (first_time && can_access_pmu) {
DPRINT(("[%d] clearing ibrs, dbrs\n", task->pid));
- for (i=0; i < pmu_conf.num_ibrs; i++) {
+ for (i=0; i < pmu_conf->num_ibrs; i++) {
ia64_set_ibr(i, 0UL);
- ia64_srlz_i();
+ ia64_dv_serialize_instruction();
}
ia64_srlz_i();
- for (i=0; i < pmu_conf.num_dbrs; i++) {
+ for (i=0; i < pmu_conf->num_dbrs; i++) {
ia64_set_dbr(i, 0UL);
- ia64_srlz_d();
+ ia64_dv_serialize_data();
}
ia64_srlz_d();
}
if (mode == PFM_CODE_RR) {
CTX_USED_IBR(ctx, rnum);
- if (can_access_pmu) ia64_set_ibr(rnum, dbreg.val);
+ if (can_access_pmu) {
+ ia64_set_ibr(rnum, dbreg.val);
+ ia64_dv_serialize_instruction();
+ }
ctx->ctx_ibrs[rnum] = dbreg.val;
- DPRINT(("write ibr%u=0x%lx used_ibrs=0x%x is_loaded=%d access_pmu=%d\n",
+ DPRINT(("write ibr%u=0x%lx used_ibrs=0x%x ld=%d apmu=%d\n",
rnum, dbreg.val, ctx->ctx_used_ibrs[0], is_loaded, can_access_pmu));
} else {
CTX_USED_DBR(ctx, rnum);
- if (can_access_pmu) ia64_set_dbr(rnum, dbreg.val);
-
+ if (can_access_pmu) {
+ ia64_set_dbr(rnum, dbreg.val);
+ ia64_dv_serialize_data();
+ }
ctx->ctx_dbrs[rnum] = dbreg.val;
- DPRINT(("write dbr%u=0x%lx used_dbrs=0x%x is_loaded=%d access_pmu=%d\n",
+ DPRINT(("write dbr%u=0x%lx used_dbrs=0x%x ld=%d apmu=%d\n",
rnum, dbreg.val, ctx->ctx_used_dbrs[0], is_loaded, can_access_pmu));
}
}
state = ctx->ctx_state;
is_system = ctx->ctx_fl_system;
- if (state != PFM_CTX_LOADED && state != PFM_CTX_MASKED) return -EINVAL;
+ /*
+ * context must be attached to issue the stop command (includes LOADED,MASKED,ZOMBIE)
+ */
+ if (state == PFM_CTX_UNLOADED) return -EINVAL;
/*
* In system wide and when the context is loaded, access can only happen
*/
ia64_psr(regs)->up = 0;
} else {
- tregs = ia64_task_regs(task);
+ tregs = task_pt_regs(task);
/*
* stop monitoring at the user level
ia64_psr(regs)->up = 1;
} else {
- tregs = ia64_task_regs(ctx->ctx_task);
+ tregs = task_pt_regs(ctx->ctx_task);
/*
* start monitoring at the kernel level the next
DPRINT(("cannot load to [%d], invalid ctx_state=%d\n",
req->load_pid,
ctx->ctx_state));
- return -EINVAL;
+ return -EBUSY;
}
DPRINT(("load_pid [%d] using_dbreg=%d\n", req->load_pid, ctx->ctx_fl_using_dbreg));
DPRINT(("before cmpxchg() old_ctx=%p new_ctx=%p\n",
thread->pfm_context, ctx));
+ ret = -EBUSY;
old = ia64_cmpxchg(acq, &thread->pfm_context, NULL, ctx, sizeof(pfm_context_t *));
if (old != NULL) {
DPRINT(("load_pid [%d] already has a context\n", req->load_pid));
* guaranteed safe by earlier check against DBG_VALID
*/
if (ctx->ctx_fl_using_dbreg) {
- pfm_restore_ibrs(ctx->ctx_ibrs, pmu_conf.num_ibrs);
- pfm_restore_dbrs(ctx->ctx_dbrs, pmu_conf.num_dbrs);
+ pfm_restore_ibrs(ctx->ctx_ibrs, pmu_conf->num_ibrs);
+ pfm_restore_dbrs(ctx->ctx_dbrs, pmu_conf->num_dbrs);
}
/*
* set new ownership
/*
* when not current, task MUST be stopped, so this is safe
*/
- regs = ia64_task_regs(task);
+ regs = task_pt_regs(task);
/* force a full reload */
ctx->ctx_last_activation = PFM_INVALID_ACTIVATION;
/*
* per-task mode
*/
- tregs = task == current ? regs : ia64_task_regs(task);
+ tregs = task == current ? regs : task_pt_regs(task);
if (task == current) {
/*
return 0;
}
-static void
-pfm_force_cleanup(pfm_context_t *ctx, struct pt_regs *regs)
-{
- struct task_struct *task = ctx->ctx_task;
-
- ia64_psr(regs)->up = 0;
- ia64_psr(regs)->sp = 1;
-
- if (GET_PMU_OWNER() == task) {
- DPRINT(("cleared ownership for [%d]\n", ctx->ctx_task->pid));
- SET_PMU_OWNER(NULL, NULL);
- }
-
- /*
- * disconnect the task from the context and vice-versa
- */
- PFM_SET_WORK_PENDING(task, 0);
-
- task->thread.pfm_context = NULL;
- task->thread.flags &= ~IA64_THREAD_PM_VALID;
-
- DPRINT(("force cleanupf for [%d]\n", task->pid));
-}
-
-
/*
* called only from exit_thread(): task == current
{
pfm_context_t *ctx;
unsigned long flags;
- struct pt_regs *regs = ia64_task_regs(task);
+ struct pt_regs *regs = task_pt_regs(task);
int ret, state;
int free_ok = 0;
pfm_check_task_state(pfm_context_t *ctx, int cmd, unsigned long flags)
{
struct task_struct *task;
- int state;
+ int state, old_state;
+recheck:
state = ctx->ctx_state;
+ task = ctx->ctx_task;
- task = PFM_CTX_TASK(ctx);
if (task == NULL) {
DPRINT(("context %d no task, state=%d\n", ctx->ctx_fd, state));
return 0;
}
DPRINT(("context %d state=%d [%d] task_state=%ld must_stop=%d\n",
- ctx->ctx_fd,
- state,
- task->pid,
- task->state, PFM_CMD_STOPPED(cmd)));
+ ctx->ctx_fd,
+ state,
+ task->pid,
+ task->state, PFM_CMD_STOPPED(cmd)));
/*
* self-monitoring always ok.
if (task == current || ctx->ctx_fl_system) return 0;
/*
- * context is UNLOADED, MASKED we are safe to go
+ * we are monitoring another thread
*/
- if (state != PFM_CTX_LOADED) return 0;
-
- if (state == PFM_CTX_ZOMBIE) return -EINVAL;
+ switch(state) {
+ case PFM_CTX_UNLOADED:
+ /*
+ * if context is UNLOADED we are safe to go
+ */
+ return 0;
+ case PFM_CTX_ZOMBIE:
+ /*
+ * no command can operate on a zombie context
+ */
+ DPRINT(("cmd %d state zombie cannot operate on context\n", cmd));
+ return -EINVAL;
+ case PFM_CTX_MASKED:
+ /*
+ * PMU state has been saved to software even though
+ * the thread may still be running.
+ */
+ if (cmd != PFM_UNLOAD_CONTEXT) return 0;
+ }
/*
- * context is loaded, we must make sure the task is stopped
+ * context is LOADED or MASKED. Some commands may need to have
+ * the task stopped.
+ *
* We could lift this restriction for UP but it would mean that
* the user has no guarantee the task would not run between
* two successive calls to perfmonctl(). That's probably OK.
* If this user wants to ensure the task does not run, then
* the task must be stopped.
*/
- if (PFM_CMD_STOPPED(cmd) && task->state != TASK_STOPPED) {
- DPRINT(("[%d] task not in stopped state\n", task->pid));
- return -EBUSY;
- }
+ if (PFM_CMD_STOPPED(cmd)) {
+ if ((task->state != TASK_STOPPED) && (task->state != TASK_TRACED)) {
+ DPRINT(("[%d] task not in stopped state\n", task->pid));
+ return -EBUSY;
+ }
+ /*
+ * task is now stopped, wait for ctxsw out
+ *
+ * This is an interesting point in the code.
+ * We need to unprotect the context because
+ * the pfm_save_regs() routines needs to grab
+ * the same lock. There are danger in doing
+ * this because it leaves a window open for
+ * another task to get access to the context
+ * and possibly change its state. The one thing
+ * that is not possible is for the context to disappear
+ * because we are protected by the VFS layer, i.e.,
+ * get_fd()/put_fd().
+ */
+ old_state = state;
- UNPROTECT_CTX(ctx, flags);
+ UNPROTECT_CTX(ctx, flags);
- wait_task_inactive(task);
+ wait_task_inactive(task);
- PROTECT_CTX(ctx, flags);
+ PROTECT_CTX(ctx, flags);
+ /*
+ * we must recheck to verify if state has changed
+ */
+ if (ctx->ctx_state != old_state) {
+ DPRINT(("old_state=%d new_state=%d\n", old_state, ctx->ctx_state));
+ goto recheck;
+ }
+ }
return 0;
}
* system-call entry point (must return long)
*/
asmlinkage long
-sys_perfmonctl (int fd, int cmd, void *arg, int count, long arg5, long arg6, long arg7,
- long arg8, long stack)
+sys_perfmonctl (int fd, int cmd, void __user *arg, int count)
{
- struct pt_regs *regs = (struct pt_regs *)&stack;
struct file *file = NULL;
pfm_context_t *ctx = NULL;
unsigned long flags = 0UL;
/*
* reject any call if perfmon was disabled at initialization
*/
- if (unlikely(PFM_IS_DISABLED())) return -ENOSYS;
+ if (unlikely(pmu_conf == NULL)) return -ENOSYS;
if (unlikely(cmd < 0 || cmd >= PFM_CMD_COUNT)) {
DPRINT(("invalid cmd=%d\n", cmd));
if (unlikely(ret)) goto abort_locked;
skip_fd:
- ret = (*func)(ctx, args_k, count, regs);
+ ret = (*func)(ctx, args_k, count, task_pt_regs(current));
call_made = 1;
if (likely(ctx)) {
DPRINT(("context unlocked\n"));
UNPROTECT_CTX(ctx, flags);
- fput(file);
}
/* copy argument back to user, if needed */
if (call_made && PFM_CMD_RW_ARG(cmd) && copy_to_user(arg, args_k, base_sz*count)) ret = -EFAULT;
error_args:
- if (args_k) kfree(args_k);
+ if (file)
+ fput(file);
+
+ kfree(args_k);
DPRINT(("cmd=%s ret=%ld\n", PFM_CMD_NAME(cmd), ret));
static void
pfm_context_force_terminate(pfm_context_t *ctx, struct pt_regs *regs)
{
- if (ctx->ctx_fl_system) {
- printk(KERN_ERR "perfmon: pfm_context_force_terminate [%d] is system-wide\n", current->pid);
- return;
- }
- /*
- * we stop the whole thing, we do no need to flush
- * we know we WERE masked
- */
- pfm_clear_psr_up();
- ia64_psr(regs)->up = 0;
- ia64_psr(regs)->sp = 1;
+ int ret;
- /*
- * disconnect the task from the context and vice-versa
- */
- current->thread.pfm_context = NULL;
- current->thread.flags &= ~IA64_THREAD_PM_VALID;
- ctx->ctx_task = NULL;
+ DPRINT(("entering for [%d]\n", current->pid));
- DPRINT(("context terminated\n"));
+ ret = pfm_context_unload(ctx, NULL, 0, regs);
+ if (ret) {
+		printk(KERN_ERR "pfm_context_force_terminate: [%d] unload failed with %d\n", current->pid, ret);
+ }
/*
* and wakeup controlling task, indicating we are now disconnected
}
static int pfm_ovfl_notify_user(pfm_context_t *ctx, unsigned long ovfl_pmds);
-
+ /*
+ * pfm_handle_work() can be called with interrupts enabled
+ * (TIF_NEED_RESCHED) or disabled. The wait_for_completion_interruptible()
+ * call may sleep, therefore we must re-enable interrupts
+ * to avoid deadlocks. It is safe to do so because this function
+ * is called ONLY when returning to user level (PUStk=1), in which case
+ * there is no risk of kernel stack overflow due to deep
+ * interrupt nesting.
+ */
void
pfm_handle_work(void)
{
pfm_context_t *ctx;
struct pt_regs *regs;
- unsigned long flags;
+ unsigned long flags, dummy_flags;
unsigned long ovfl_regs;
unsigned int reason;
int ret;
pfm_clear_task_notify();
- regs = ia64_task_regs(current);
+ regs = task_pt_regs(current);
/*
* extract reason for being here and clear
//if (CTX_OVFL_NOBLOCK(ctx)) goto skip_blocking;
if (reason == PFM_TRAP_REASON_RESET) goto skip_blocking;
+ /*
+ * restore interrupt mask to what it was on entry.
+	 * Could be enabled/disabled.
+ */
UNPROTECT_CTX(ctx, flags);
+ /*
+ * force interrupt enable because of down_interruptible()
+	 * force interrupt enable because of wait_for_completion_interruptible()
+ local_irq_enable();
+
DPRINT(("before block sleeping\n"));
/*
* may go through without blocking on SMP systems
 * if restart has been received already by the time we block
*/
- ret = down_interruptible(&ctx->ctx_restart_sem);
+ ret = wait_for_completion_interruptible(&ctx->ctx_restart_done);
DPRINT(("after block sleeping ret=%d\n", ret));
- PROTECT_CTX(ctx, flags);
+ /*
+ * lock context and mask interrupts again
+ * We save flags into a dummy because we may have
+ * altered interrupts mask compared to entry in this
+ * function.
+ */
+ PROTECT_CTX(ctx, dummy_flags);
/*
* we need to read the ovfl_regs only after wake-up
ctx->ctx_ovfl_regs[0] = 0UL;
nothing_to_do:
-
+ /*
+ * restore flags as they were upon entry
+ */
UNPROTECT_CTX(ctx, flags);
}
static void
pfm_overflow_handler(struct task_struct *task, pfm_context_t *ctx, u64 pmc0, struct pt_regs *regs)
{
- pfm_ovfl_arg_t ovfl_arg;
+ pfm_ovfl_arg_t *ovfl_arg;
unsigned long mask;
unsigned long old_val, ovfl_val, new_val;
unsigned long ovfl_notify = 0UL, ovfl_pmds = 0UL, smpl_pmds = 0UL, reset_pmds;
tstamp = ia64_get_itc();
mask = pmc0 >> PMU_FIRST_COUNTER;
- ovfl_val = pmu_conf.ovfl_val;
+ ovfl_val = pmu_conf->ovfl_val;
has_smpl = CTX_HAS_SMPL(ctx);
DPRINT_ovfl(("pmc0=0x%lx pid=%d iip=0x%lx, %s "
int j, k, ret = 0;
int this_cpu = smp_processor_id();
- pmd_mask = ovfl_pmds >> PMU_FIRST_COUNTER;
+ pmd_mask = ovfl_pmds >> PMU_FIRST_COUNTER;
+ ovfl_arg = &ctx->ctx_ovfl_arg;
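+	/* pfm_ovfl_arg_t is large, so use the preallocated per-context copy rather than the stack */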
prefetch(ctx->ctx_smpl_hdr);
if ((pmd_mask & 0x1) == 0) continue;
- ovfl_arg.ovfl_pmd = (unsigned char )i;
- ovfl_arg.ovfl_notify = ovfl_notify & mask ? 1 : 0;
- ovfl_arg.active_set = 0;
- ovfl_arg.ovfl_ctrl.val = 0; /* module must fill in all fields */
- ovfl_arg.smpl_pmds[0] = smpl_pmds = ctx->ctx_pmds[i].smpl_pmds[0];
+ ovfl_arg->ovfl_pmd = (unsigned char )i;
+ ovfl_arg->ovfl_notify = ovfl_notify & mask ? 1 : 0;
+ ovfl_arg->active_set = 0;
+ ovfl_arg->ovfl_ctrl.val = 0; /* module must fill in all fields */
+ ovfl_arg->smpl_pmds[0] = smpl_pmds = ctx->ctx_pmds[i].smpl_pmds[0];
- ovfl_arg.pmd_value = ctx->ctx_pmds[i].val;
- ovfl_arg.pmd_last_reset = ctx->ctx_pmds[i].lval;
- ovfl_arg.pmd_eventid = ctx->ctx_pmds[i].eventid;
+ ovfl_arg->pmd_value = ctx->ctx_pmds[i].val;
+ ovfl_arg->pmd_last_reset = ctx->ctx_pmds[i].lval;
+ ovfl_arg->pmd_eventid = ctx->ctx_pmds[i].eventid;
/*
* copy values of pmds of interest. Sampling format may copy them
if (smpl_pmds) {
for(j=0, k=0; smpl_pmds; j++, smpl_pmds >>=1) {
if ((smpl_pmds & 0x1) == 0) continue;
- ovfl_arg.smpl_pmds_values[k++] = PMD_IS_COUNTING(j) ? pfm_read_soft_counter(ctx, j) : ia64_get_pmd(j);
- DPRINT_ovfl(("smpl_pmd[%d]=pmd%u=0x%lx\n", k-1, j, ovfl_arg.smpl_pmds_values[k-1]));
+ ovfl_arg->smpl_pmds_values[k++] = PMD_IS_COUNTING(j) ? pfm_read_soft_counter(ctx, j) : ia64_get_pmd(j);
+ DPRINT_ovfl(("smpl_pmd[%d]=pmd%u=0x%lx\n", k-1, j, ovfl_arg->smpl_pmds_values[k-1]));
}
}
/*
* call custom buffer format record (handler) routine
*/
- ret = (*ctx->ctx_buf_fmt->fmt_handler)(task, ctx->ctx_smpl_hdr, &ovfl_arg, regs, tstamp);
+ ret = (*ctx->ctx_buf_fmt->fmt_handler)(task, ctx->ctx_smpl_hdr, ovfl_arg, regs, tstamp);
end_cycles = ia64_get_itc();
* For those controls, we take the union because they have
* an all or nothing behavior.
*/
- ovfl_ctrl.bits.notify_user |= ovfl_arg.ovfl_ctrl.bits.notify_user;
- ovfl_ctrl.bits.block_task |= ovfl_arg.ovfl_ctrl.bits.block_task;
- ovfl_ctrl.bits.mask_monitoring |= ovfl_arg.ovfl_ctrl.bits.mask_monitoring;
+ ovfl_ctrl.bits.notify_user |= ovfl_arg->ovfl_ctrl.bits.notify_user;
+ ovfl_ctrl.bits.block_task |= ovfl_arg->ovfl_ctrl.bits.block_task;
+ ovfl_ctrl.bits.mask_monitoring |= ovfl_arg->ovfl_ctrl.bits.mask_monitoring;
/*
* build the bitmask of pmds to reset now
*/
- if (ovfl_arg.ovfl_ctrl.bits.reset_ovfl_pmds) reset_pmds |= mask;
+ if (ovfl_arg->ovfl_ctrl.bits.reset_ovfl_pmds) reset_pmds |= mask;
pfm_stats[this_cpu].pfm_smpl_handler_cycles += end_cycles - start_cycles;
}
if (ovfl_notify == 0) reset_pmds = ovfl_pmds;
}
- DPRINT(("ovfl_pmds=0x%lx reset_pmds=0x%lx\n",
- ovfl_pmds,
- reset_pmds));
+ DPRINT_ovfl(("ovfl_pmds=0x%lx reset_pmds=0x%lx\n", ovfl_pmds, reset_pmds));
+
/*
* reset the requested PMD registers using the short reset values
*/
int ret;
this_cpu = get_cpu();
- min = pfm_stats[this_cpu].pfm_ovfl_intr_cycles_min;
- max = pfm_stats[this_cpu].pfm_ovfl_intr_cycles_max;
+ if (likely(!pfm_alt_intr_handler)) {
+ min = pfm_stats[this_cpu].pfm_ovfl_intr_cycles_min;
+ max = pfm_stats[this_cpu].pfm_ovfl_intr_cycles_max;
- start_cycles = ia64_get_itc();
+ start_cycles = ia64_get_itc();
- ret = pfm_do_interrupt_handler(irq, arg, regs);
+ ret = pfm_do_interrupt_handler(irq, arg, regs);
- total_cycles = ia64_get_itc();
+ total_cycles = ia64_get_itc();
- /*
- * don't measure spurious interrupts
- */
- if (likely(ret == 0)) {
- total_cycles -= start_cycles;
+ /*
+ * don't measure spurious interrupts
+ */
+ if (likely(ret == 0)) {
+ total_cycles -= start_cycles;
- if (total_cycles < min) pfm_stats[this_cpu].pfm_ovfl_intr_cycles_min = total_cycles;
- if (total_cycles > max) pfm_stats[this_cpu].pfm_ovfl_intr_cycles_max = total_cycles;
+ if (total_cycles < min) pfm_stats[this_cpu].pfm_ovfl_intr_cycles_min = total_cycles;
+ if (total_cycles > max) pfm_stats[this_cpu].pfm_ovfl_intr_cycles_max = total_cycles;
- pfm_stats[this_cpu].pfm_ovfl_intr_cycles += total_cycles;
+ pfm_stats[this_cpu].pfm_ovfl_intr_cycles += total_cycles;
+ }
}
+ else {
+ (*pfm_alt_intr_handler->handler)(irq, arg, regs);
+ }
+
put_cpu_no_resched();
return IRQ_HANDLED;
}
+/*
+ * /proc/perfmon interface, for debug only
+ */
-/* for debug only */
-static int
-pfm_proc_info(char *page)
+#define PFM_PROC_SHOW_HEADER ((void *)NR_CPUS+1)
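+/* iterator encoding: *pos 0 maps to the header sentinel, CPU n is returned as n+1 */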
+
+static void *
+pfm_proc_start(struct seq_file *m, loff_t *pos)
{
- char *p = page;
- struct list_head * pos;
- pfm_buffer_fmt_t * entry;
- unsigned long psr, flags;
- int online_cpus = 0;
- int i;
+ if (*pos == 0) {
+ return PFM_PROC_SHOW_HEADER;
+ }
- p += sprintf(p, "perfmon version : %u.%u\n", PFM_VERSION_MAJ, PFM_VERSION_MIN);
- p += sprintf(p, "model : %s\n", pmu_conf.pmu_name);
- p += sprintf(p, "fastctxsw : %s\n", pfm_sysctl.fastctxsw > 0 ? "Yes": "No");
- p += sprintf(p, "expert mode : %s\n", pfm_sysctl.expert_mode > 0 ? "Yes": "No");
- p += sprintf(p, "ovfl_mask : 0x%lx\n", pmu_conf.ovfl_val);
-
- for(i=0; i < NR_CPUS; i++) {
- if (cpu_online(i) == 0) continue;
- p += sprintf(p, "CPU%-2d overflow intrs : %lu\n", i, pfm_stats[i].pfm_ovfl_intr_count);
- p += sprintf(p, "CPU%-2d overflow cycles : %lu\n", i, pfm_stats[i].pfm_ovfl_intr_cycles);
- p += sprintf(p, "CPU%-2d overflow min : %lu\n", i, pfm_stats[i].pfm_ovfl_intr_cycles_min);
- p += sprintf(p, "CPU%-2d overflow max : %lu\n", i, pfm_stats[i].pfm_ovfl_intr_cycles_max);
- p += sprintf(p, "CPU%-2d smpl handler calls : %lu\n", i, pfm_stats[i].pfm_smpl_handler_calls);
- p += sprintf(p, "CPU%-2d smpl handler cycles : %lu\n", i, pfm_stats[i].pfm_smpl_handler_cycles);
- p += sprintf(p, "CPU%-2d spurious intrs : %lu\n", i, pfm_stats[i].pfm_spurious_ovfl_intr_count);
- p += sprintf(p, "CPU%-2d replay intrs : %lu\n", i, pfm_stats[i].pfm_replay_ovfl_intr_count);
- p += sprintf(p, "CPU%-2d syst_wide : %d\n" , i, pfm_get_cpu_data(pfm_syst_info, i) & PFM_CPUINFO_SYST_WIDE ? 1 : 0);
- p += sprintf(p, "CPU%-2d dcr_pp : %d\n" , i, pfm_get_cpu_data(pfm_syst_info, i) & PFM_CPUINFO_DCR_PP ? 1 : 0);
- p += sprintf(p, "CPU%-2d exclude idle : %d\n" , i, pfm_get_cpu_data(pfm_syst_info, i) & PFM_CPUINFO_EXCL_IDLE ? 1 : 0);
- p += sprintf(p, "CPU%-2d owner : %d\n" , i, pfm_get_cpu_data(pmu_owner, i) ? pfm_get_cpu_data(pmu_owner, i)->pid: -1);
- p += sprintf(p, "CPU%-2d context : %p\n" , i, pfm_get_cpu_data(pmu_ctx, i));
- p += sprintf(p, "CPU%-2d activations : %lu\n", i, pfm_get_cpu_data(pmu_activation_number,i));
- online_cpus++;
- }
-
- if (online_cpus == 1)
- {
- psr = pfm_get_psr();
- ia64_srlz_d();
- p += sprintf(p, "CPU%-2d psr : 0x%lx\n", smp_processor_id(), psr);
- p += sprintf(p, "CPU%-2d pmc0 : 0x%lx\n", smp_processor_id(), ia64_get_pmc(0));
- for(i=4; i < 8; i++) {
- p += sprintf(p, "CPU%-2d pmc%u : 0x%lx\n", smp_processor_id(), i, ia64_get_pmc(i));
- p += sprintf(p, "CPU%-2d pmd%u : 0x%lx\n", smp_processor_id(), i, ia64_get_pmd(i));
- }
+ while (*pos <= NR_CPUS) {
+ if (cpu_online(*pos - 1)) {
+ return (void *)*pos;
+ }
+ ++*pos;
}
+ return NULL;
+}
- LOCK_PFS(flags);
- p += sprintf(p, "proc_sessions : %u\n"
- "sys_sessions : %u\n"
- "sys_use_dbregs : %u\n"
- "ptrace_use_dbregs : %u\n",
- pfm_sessions.pfs_task_sessions,
- pfm_sessions.pfs_sys_sessions,
- pfm_sessions.pfs_sys_use_dbregs,
- pfm_sessions.pfs_ptrace_use_dbregs);
- UNLOCK_PFS(flags);
+static void *
+pfm_proc_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ ++*pos;
+ return pfm_proc_start(m, pos);
+}
+
+static void
+pfm_proc_stop(struct seq_file *m, void *v)
+{
+}
+
+static void
+pfm_proc_show_header(struct seq_file *m)
+{
+ struct list_head * pos;
+ pfm_buffer_fmt_t * entry;
+ unsigned long flags;
+
+ seq_printf(m,
+ "perfmon version : %u.%u\n"
+ "model : %s\n"
+ "fastctxsw : %s\n"
+ "expert mode : %s\n"
+ "ovfl_mask : 0x%lx\n"
+ "PMU flags : 0x%x\n",
+ PFM_VERSION_MAJ, PFM_VERSION_MIN,
+ pmu_conf->pmu_name,
+ pfm_sysctl.fastctxsw > 0 ? "Yes": "No",
+ pfm_sysctl.expert_mode > 0 ? "Yes": "No",
+ pmu_conf->ovfl_val,
+ pmu_conf->flags);
+
+ LOCK_PFS(flags);
+
+ seq_printf(m,
+ "proc_sessions : %u\n"
+ "sys_sessions : %u\n"
+ "sys_use_dbregs : %u\n"
+ "ptrace_use_dbregs : %u\n",
+ pfm_sessions.pfs_task_sessions,
+ pfm_sessions.pfs_sys_sessions,
+ pfm_sessions.pfs_sys_use_dbregs,
+ pfm_sessions.pfs_ptrace_use_dbregs);
+
+ UNLOCK_PFS(flags);
spin_lock(&pfm_buffer_fmt_lock);
list_for_each(pos, &pfm_buffer_fmt_list) {
entry = list_entry(pos, pfm_buffer_fmt_t, fmt_list);
- p += sprintf(p, "format : %02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x %s\n",
- entry->fmt_uuid[0],
- entry->fmt_uuid[1],
- entry->fmt_uuid[2],
- entry->fmt_uuid[3],
- entry->fmt_uuid[4],
- entry->fmt_uuid[5],
- entry->fmt_uuid[6],
- entry->fmt_uuid[7],
- entry->fmt_uuid[8],
- entry->fmt_uuid[9],
- entry->fmt_uuid[10],
- entry->fmt_uuid[11],
- entry->fmt_uuid[12],
- entry->fmt_uuid[13],
- entry->fmt_uuid[14],
- entry->fmt_uuid[15],
- entry->fmt_name);
+ seq_printf(m, "format : %02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x %s\n",
+ entry->fmt_uuid[0],
+ entry->fmt_uuid[1],
+ entry->fmt_uuid[2],
+ entry->fmt_uuid[3],
+ entry->fmt_uuid[4],
+ entry->fmt_uuid[5],
+ entry->fmt_uuid[6],
+ entry->fmt_uuid[7],
+ entry->fmt_uuid[8],
+ entry->fmt_uuid[9],
+ entry->fmt_uuid[10],
+ entry->fmt_uuid[11],
+ entry->fmt_uuid[12],
+ entry->fmt_uuid[13],
+ entry->fmt_uuid[14],
+ entry->fmt_uuid[15],
+ entry->fmt_name);
}
spin_unlock(&pfm_buffer_fmt_lock);
- return p - page;
}
-/* /proc interface, for debug only */
static int
-perfmon_read_entry(char *page, char **start, off_t off, int count, int *eof, void *data)
+pfm_proc_show(struct seq_file *m, void *v)
{
- int len = pfm_proc_info(page);
+ unsigned long psr;
+ unsigned int i;
+ int cpu;
+
+ if (v == PFM_PROC_SHOW_HEADER) {
+ pfm_proc_show_header(m);
+ return 0;
+ }
+
+ /* show info for CPU (v - 1) */
+
+ cpu = (long)v - 1;
+ seq_printf(m,
+ "CPU%-2d overflow intrs : %lu\n"
+ "CPU%-2d overflow cycles : %lu\n"
+ "CPU%-2d overflow min : %lu\n"
+ "CPU%-2d overflow max : %lu\n"
+ "CPU%-2d smpl handler calls : %lu\n"
+ "CPU%-2d smpl handler cycles : %lu\n"
+ "CPU%-2d spurious intrs : %lu\n"
+ "CPU%-2d replay intrs : %lu\n"
+ "CPU%-2d syst_wide : %d\n"
+ "CPU%-2d dcr_pp : %d\n"
+ "CPU%-2d exclude idle : %d\n"
+ "CPU%-2d owner : %d\n"
+ "CPU%-2d context : %p\n"
+ "CPU%-2d activations : %lu\n",
+ cpu, pfm_stats[cpu].pfm_ovfl_intr_count,
+ cpu, pfm_stats[cpu].pfm_ovfl_intr_cycles,
+ cpu, pfm_stats[cpu].pfm_ovfl_intr_cycles_min,
+ cpu, pfm_stats[cpu].pfm_ovfl_intr_cycles_max,
+ cpu, pfm_stats[cpu].pfm_smpl_handler_calls,
+ cpu, pfm_stats[cpu].pfm_smpl_handler_cycles,
+ cpu, pfm_stats[cpu].pfm_spurious_ovfl_intr_count,
+ cpu, pfm_stats[cpu].pfm_replay_ovfl_intr_count,
+ cpu, pfm_get_cpu_data(pfm_syst_info, cpu) & PFM_CPUINFO_SYST_WIDE ? 1 : 0,
+ cpu, pfm_get_cpu_data(pfm_syst_info, cpu) & PFM_CPUINFO_DCR_PP ? 1 : 0,
+ cpu, pfm_get_cpu_data(pfm_syst_info, cpu) & PFM_CPUINFO_EXCL_IDLE ? 1 : 0,
+ cpu, pfm_get_cpu_data(pmu_owner, cpu) ? pfm_get_cpu_data(pmu_owner, cpu)->pid: -1,
+ cpu, pfm_get_cpu_data(pmu_ctx, cpu),
+ cpu, pfm_get_cpu_data(pmu_activation_number, cpu));
+
+ if (num_online_cpus() == 1 && pfm_sysctl.debug > 0) {
+
+ psr = pfm_get_psr();
- if (len <= off+count) *eof = 1;
+ ia64_srlz_d();
- *start = page + off;
- len -= off;
+ seq_printf(m,
+ "CPU%-2d psr : 0x%lx\n"
+ "CPU%-2d pmc0 : 0x%lx\n",
+ cpu, psr,
+ cpu, ia64_get_pmc(0));
+
+ for (i=0; PMC_IS_LAST(i) == 0; i++) {
+ if (PMC_IS_COUNTING(i) == 0) continue;
+ seq_printf(m,
+ "CPU%-2d pmc%u : 0x%lx\n"
+ "CPU%-2d pmd%u : 0x%lx\n",
+ cpu, i, ia64_get_pmc(i),
+ cpu, i, ia64_get_pmd(i));
+ }
+ }
+ return 0;
+}
- if (len>count) len = count;
- if (len<0) len = 0;
+struct seq_operations pfm_seq_ops = {
+ .start = pfm_proc_start,
+ .next = pfm_proc_next,
+ .stop = pfm_proc_stop,
+ .show = pfm_proc_show
+};
- return len;
+static int
+pfm_proc_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &pfm_seq_ops);
}
+
/*
* we come here as soon as local_cpu_data->pfm_syst_wide is set. this happens
* during pfm_enable() hence before pfm_start(). We cannot assume monitoring
* on every CPU, so we can rely on the pid to identify the idle task.
*/
if ((info & PFM_CPUINFO_EXCL_IDLE) == 0 || task->pid) {
- regs = ia64_task_regs(task);
+ regs = task_pt_regs(task);
ia64_psr(regs)->pp = is_ctxswin ? dcr_pp : 0;
return;
}
}
#ifdef CONFIG_SMP
+
+static void
+pfm_force_cleanup(pfm_context_t *ctx, struct pt_regs *regs)
+{
+ struct task_struct *task = ctx->ctx_task;
+
+ ia64_psr(regs)->up = 0;
+ ia64_psr(regs)->sp = 1;
+
+ if (GET_PMU_OWNER() == task) {
+ DPRINT(("cleared ownership for [%d]\n", ctx->ctx_task->pid));
+ SET_PMU_OWNER(NULL, NULL);
+ }
+
+ /*
+ * disconnect the task from the context and vice-versa
+ */
+ PFM_SET_WORK_PENDING(task, 0);
+
+ task->thread.pfm_context = NULL;
+ task->thread.flags &= ~IA64_THREAD_PM_VALID;
+
+ DPRINT(("force cleanup for [%d]\n", task->pid));
+}
+
+
/*
* in 2.6, interrupts are masked when we come here and the runqueue lock is held
*/
flags = pfm_protect_ctx_ctxsw(ctx);
if (ctx->ctx_state == PFM_CTX_ZOMBIE) {
- struct pt_regs *regs = ia64_task_regs(task);
+ struct pt_regs *regs = task_pt_regs(task);
pfm_clear_psr_up();
return;
}
- /*
- * sanity check
- */
- if (ctx->ctx_last_activation != GET_ACTIVATION()) {
- pfm_unprotect_ctx_ctxsw(ctx, flags);
- return;
- }
-
/*
* save current PSR: needed because we modify it
*/
unsigned long pmc_mask = 0UL, pmd_mask = 0UL;
unsigned long flags;
u64 psr, psr_up;
+ int need_irq_resend;
ctx = PFM_GET_CTX(task);
if (unlikely(ctx == NULL)) return;
flags = pfm_protect_ctx_ctxsw(ctx);
psr = pfm_get_psr();
+ need_irq_resend = pmu_conf->flags & PFM_PMU_IRQ_RESEND;
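+	/* per-PMU flag replaces the old compile-time CONFIG_MCKINLEY test */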
+
BUG_ON(psr & (IA64_PSR_UP|IA64_PSR_PP));
BUG_ON(psr & IA64_PSR_I);
if (unlikely(ctx->ctx_state == PFM_CTX_ZOMBIE)) {
- struct pt_regs *regs = ia64_task_regs(task);
+ struct pt_regs *regs = task_pt_regs(task);
BUG_ON(ctx->ctx_smpl_hdr);
* stale state.
*/
if (ctx->ctx_fl_using_dbreg) {
- pfm_restore_ibrs(ctx->ctx_ibrs, pmu_conf.num_ibrs);
- pfm_restore_dbrs(ctx->ctx_dbrs, pmu_conf.num_dbrs);
+ pfm_restore_ibrs(ctx->ctx_ibrs, pmu_conf->num_ibrs);
+ pfm_restore_dbrs(ctx->ctx_dbrs, pmu_conf->num_dbrs);
}
/*
* retrieve saved psr.up
ia64_set_pmc(0, t->pmcs[0]);
ia64_srlz_d();
t->pmcs[0] = 0UL;
-#ifndef CONFIG_MCKINLEY
+
/*
* will replay the PMU interrupt
*/
- hw_resend_irq(NULL, IA64_PERFMON_VECTOR);
-#endif
+ if (need_irq_resend) hw_resend_irq(NULL, IA64_PERFMON_VECTOR);
+
pfm_stats[smp_processor_id()].pfm_replay_ovfl_intr_count++;
}
struct task_struct *owner;
unsigned long pmd_mask, pmc_mask;
u64 psr, psr_up;
+ int need_irq_resend;
owner = GET_PMU_OWNER();
ctx = PFM_GET_CTX(task);
* (not perfmon) by the previous task.
*/
if (ctx->ctx_fl_using_dbreg) {
- pfm_restore_ibrs(ctx->ctx_ibrs, pmu_conf.num_ibrs);
- pfm_restore_dbrs(ctx->ctx_dbrs, pmu_conf.num_dbrs);
+ pfm_restore_ibrs(ctx->ctx_ibrs, pmu_conf->num_ibrs);
+ pfm_restore_dbrs(ctx->ctx_dbrs, pmu_conf->num_dbrs);
}
/*
* retrieved saved psr.up
*/
psr_up = ctx->ctx_saved_psr_up;
+ need_irq_resend = pmu_conf->flags & PFM_PMU_IRQ_RESEND;
/*
* short path, our state is still there, just
t->pmcs[0] = 0UL;
-#ifndef CONFIG_MCKINLEY
/*
* will replay the PMU interrupt
*/
- hw_resend_irq(NULL, IA64_PERFMON_VECTOR);
-#endif
+ if (need_irq_resend) hw_resend_irq(NULL, IA64_PERFMON_VECTOR);
+
pfm_stats[smp_processor_id()].pfm_replay_ovfl_intr_count++;
}
*/
is_self = ctx->ctx_task == task ? 1 : 0;
-#ifdef CONFIG_SMP
- if (task == current) {
-#else
/*
- * in UP, the state can still be in the registers
+	 * can access PMU if task is the owner of the PMU state on the current CPU
+	 * or if we are running on the CPU bound to the context in system-wide mode
+	 * (that is not necessarily the task the context is attached to in this mode).
+	 * In system-wide mode, can_access_pmu is always true because a task running on an
+	 * invalid processor is flagged earlier in the call stack (see pfm_stop).
*/
- if (task == current || GET_PMU_OWNER() == task) {
-#endif
- can_access_pmu = 1;
+ can_access_pmu = (GET_PMU_OWNER() == task) || (ctx->ctx_fl_system && ctx->ctx_cpu == smp_processor_id());
+ if (can_access_pmu) {
/*
* Mark the PMU as not owned
* This will cause the interrupt handler to do nothing in case an overflow
* on.
*/
SET_PMU_OWNER(NULL, NULL);
+ DPRINT(("releasing ownership\n"));
/*
* read current overflow status:
*/
task->thread.pmcs[0] = 0;
}
- ovfl_val = pmu_conf.ovfl_val;
+ ovfl_val = pmu_conf->ovfl_val;
/*
* we save all the used pmds
* we take care of overflows for counting PMDs
* XXX: sampling situation is not taken into account here
*/
mask2 = ctx->ctx_used_pmds[0];
+
+ DPRINT(("is_self=%d ovfl_val=0x%lx mask2=0x%lx\n", is_self, ovfl_val, mask2));
+
for (i = 0; mask2; i++, mask2>>=1) {
/* skip non used pmds */
}
}
- DPRINT(("[%d] is_self=%d ctx_pmd[%d]=0x%lx pmd_val=0x%lx\n", task->pid, is_self, i, val, pmd_val));
+ DPRINT(("[%d] ctx_pmd[%d]=0x%lx pmd_val=0x%lx\n", task->pid, i, val, pmd_val));
if (is_self) task->thread.pmds[i] = pmd_val;
.name = "perfmon"
};
+static void
+pfm_alt_save_pmu_state(void *data)
+{
+ struct pt_regs *regs;
+
+ regs = task_pt_regs(current);
+
+ DPRINT(("called\n"));
+
+ /*
+ * should not be necessary but
+	 * let's not take any risk
+ */
+ pfm_clear_psr_up();
+ pfm_clear_psr_pp();
+ ia64_psr(regs)->pp = 0;
+
+ /*
+ * This call is required
+ * May cause a spurious interrupt on some processors
+ */
+ pfm_freeze_pmu();
+
+ ia64_srlz_d();
+}
+
+void
+pfm_alt_restore_pmu_state(void *data)
+{
+ struct pt_regs *regs;
+
+ regs = task_pt_regs(current);
+
+ DPRINT(("called\n"));
+
+ /*
+ * put PMU back in state expected
+ * by perfmon
+ */
+ pfm_clear_psr_up();
+ pfm_clear_psr_pp();
+ ia64_psr(regs)->pp = 0;
+
+ /*
+ * perfmon runs with PMU unfrozen at all times
+ */
+ pfm_unfreeze_pmu();
+
+ ia64_srlz_d();
+}
+
+int
+pfm_install_alt_pmu_interrupt(pfm_intr_handler_desc_t *hdl)
+{
+ int ret, i;
+ int reserve_cpu;
+
+ /* some sanity checks */
+ if (hdl == NULL || hdl->handler == NULL) return -EINVAL;
+
+ /* do the easy test first */
+ if (pfm_alt_intr_handler) return -EBUSY;
+
+ /* one at a time in the install or remove, just fail the others */
+ if (!spin_trylock(&pfm_alt_install_check)) {
+ return -EBUSY;
+ }
+
+ /* reserve our session */
+ for_each_online_cpu(reserve_cpu) {
+ ret = pfm_reserve_session(NULL, 1, reserve_cpu);
+ if (ret) goto cleanup_reserve;
+ }
+
+ /* save the current system wide pmu states */
+ ret = on_each_cpu(pfm_alt_save_pmu_state, NULL, 0, 1);
+ if (ret) {
+ DPRINT(("on_each_cpu() failed: %d\n", ret));
+ goto cleanup_reserve;
+ }
+
+ /* officially change to the alternate interrupt handler */
+ pfm_alt_intr_handler = hdl;
+
+ spin_unlock(&pfm_alt_install_check);
+
+ return 0;
+
+cleanup_reserve:
+ for_each_online_cpu(i) {
+ /* don't unreserve more than we reserved */
+ if (i >= reserve_cpu) break;
+
+ pfm_unreserve_session(NULL, 1, i);
+ }
+
+ spin_unlock(&pfm_alt_install_check);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(pfm_install_alt_pmu_interrupt);
+
+int
+pfm_remove_alt_pmu_interrupt(pfm_intr_handler_desc_t *hdl)
+{
+ int i;
+ int ret;
+
+ if (hdl == NULL) return -EINVAL;
+
+ /* cannot remove someone else's handler! */
+ if (pfm_alt_intr_handler != hdl) return -EINVAL;
+
+ /* one at a time in the install or remove, just fail the others */
+ if (!spin_trylock(&pfm_alt_install_check)) {
+ return -EBUSY;
+ }
+
+ pfm_alt_intr_handler = NULL;
+
+ ret = on_each_cpu(pfm_alt_restore_pmu_state, NULL, 0, 1);
+ if (ret) {
+ DPRINT(("on_each_cpu() failed: %d\n", ret));
+ }
+
+ for_each_online_cpu(i) {
+ pfm_unreserve_session(NULL, 1, i);
+ }
+
+ spin_unlock(&pfm_alt_install_check);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(pfm_remove_alt_pmu_interrupt);
+
/*
* perfmon initialization routine, called from the initcall() table
*/
static int init_pfm_fs(void);
+static int __init
+pfm_probe_pmu(void)
+{
+ pmu_config_t **p;
+ int family;
+
+ family = local_cpu_data->family;
+ p = pmu_confs;
+
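+	/* a pmu_family of 0xff (the generic entry) matches any CPU family */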
+ while(*p) {
+ if ((*p)->probe) {
+ if ((*p)->probe() == 0) goto found;
+ } else if ((*p)->pmu_family == family || (*p)->pmu_family == 0xff) {
+ goto found;
+ }
+ p++;
+ }
+ return -1;
+found:
+ pmu_conf = *p;
+ return 0;
+}
+
+static struct file_operations pfm_proc_fops = {
+ .open = pfm_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
int __init
pfm_init(void)
{
PFM_VERSION_MIN,
IA64_PERFMON_VECTOR);
- /*
- * PMU type sanity check
- * XXX: maybe better to implement autodetection (but then we have a larger kernel)
- */
- if (local_cpu_data->family != pmu_conf.pmu_family) {
- printk(KERN_INFO "perfmon: disabled, kernel only supports %s PMU family\n", pmu_conf.pmu_name);
+ if (pfm_probe_pmu()) {
+ printk(KERN_INFO "perfmon: disabled, there is no support for processor family %d\n",
+ local_cpu_data->family);
return -ENODEV;
}
n = 0;
for (i=0; PMC_IS_LAST(i) == 0; i++) {
if (PMC_IS_IMPL(i) == 0) continue;
- pmu_conf.impl_pmcs[i>>6] |= 1UL << (i&63);
+ pmu_conf->impl_pmcs[i>>6] |= 1UL << (i&63);
n++;
}
- pmu_conf.num_pmcs = n;
+ pmu_conf->num_pmcs = n;
n = 0; n_counters = 0;
for (i=0; PMD_IS_LAST(i) == 0; i++) {
if (PMD_IS_IMPL(i) == 0) continue;
- pmu_conf.impl_pmds[i>>6] |= 1UL << (i&63);
+ pmu_conf->impl_pmds[i>>6] |= 1UL << (i&63);
n++;
if (PMD_IS_COUNTING(i)) n_counters++;
}
- pmu_conf.num_pmds = n;
- pmu_conf.num_counters = n_counters;
+ pmu_conf->num_pmds = n;
+ pmu_conf->num_counters = n_counters;
/*
* sanity checks on the number of debug registers
*/
- if (pmu_conf.use_rr_dbregs) {
- if (pmu_conf.num_ibrs > IA64_NUM_DBG_REGS) {
- printk(KERN_INFO "perfmon: unsupported number of code debug registers (%u)\n", pmu_conf.num_ibrs);
+ if (pmu_conf->use_rr_dbregs) {
+ if (pmu_conf->num_ibrs > IA64_NUM_DBG_REGS) {
+ printk(KERN_INFO "perfmon: unsupported number of code debug registers (%u)\n", pmu_conf->num_ibrs);
+ pmu_conf = NULL;
return -1;
}
- if (pmu_conf.num_dbrs > IA64_NUM_DBG_REGS) {
- printk(KERN_INFO "perfmon: unsupported number of data debug registers (%u)\n", pmu_conf.num_ibrs);
+ if (pmu_conf->num_dbrs > IA64_NUM_DBG_REGS) {
+ printk(KERN_INFO "perfmon: unsupported number of data debug registers (%u)\n", pmu_conf->num_ibrs);
+ pmu_conf = NULL;
return -1;
}
}
printk("perfmon: %s PMU detected, %u PMCs, %u PMDs, %u counters (%lu bits)\n",
- pmu_conf.pmu_name,
- pmu_conf.num_pmcs,
- pmu_conf.num_pmds,
- pmu_conf.num_counters,
- ffz(pmu_conf.ovfl_val));
+ pmu_conf->pmu_name,
+ pmu_conf->num_pmcs,
+ pmu_conf->num_pmds,
+ pmu_conf->num_counters,
+ ffz(pmu_conf->ovfl_val));
/* sanity check */
- if (pmu_conf.num_pmds >= IA64_NUM_PMD_REGS || pmu_conf.num_pmcs >= IA64_NUM_PMC_REGS) {
+ if (pmu_conf->num_pmds >= IA64_NUM_PMD_REGS || pmu_conf->num_pmcs >= IA64_NUM_PMC_REGS) {
printk(KERN_ERR "perfmon: not enough pmc/pmd, perfmon disabled\n");
+ pmu_conf = NULL;
return -1;
}
/*
* create /proc/perfmon (mostly for debugging purposes)
*/
- perfmon_dir = create_proc_read_entry ("perfmon", 0, 0, perfmon_read_entry, NULL);
+ perfmon_dir = create_proc_entry("perfmon", S_IRUGO, NULL);
if (perfmon_dir == NULL) {
printk(KERN_ERR "perfmon: cannot create /proc entry, perfmon disabled\n");
+ pmu_conf = NULL;
return -1;
}
+ /*
+ * install customized file operations for /proc/perfmon entry
+ */
+ perfmon_dir->proc_fops = &pfm_proc_fops;
/*
* create /proc/sys/kernel/perfmon (for debugging purposes)
for(i=0; i < NR_CPUS; i++) pfm_stats[i].pfm_ovfl_intr_cycles_min = ~0UL;
- /* we are all set */
- pmu_conf.enabled = 1;
-
return 0;
}
void
pfm_init_percpu (void)
{
- int i;
-
/*
* make sure no measurement is active
* (may inherit programmed PMCs from EFI).
ia64_setreg(_IA64_REG_CR_PMV, IA64_PERFMON_VECTOR);
ia64_srlz_d();
-
- /*
- * we first initialize the PMU to a stable state.
- * the values may have been changed from their power-up
- * values by software executed before the kernel took over.
- *
- * At this point, pmu_conf has not yet been initialized
- *
- * On McKinley, this code is ineffective until PMC4 is initialized
- * but that's all right because we take care of pmc0 later.
- *
- * XXX: potential problems with pmc1.
- */
- for (i=1; PMC_IS_LAST(i) == 0; i++) {
- if (PMC_IS_IMPL(i) == 0) continue;
- ia64_set_pmc(i, PMC_DFL_VAL(i));
- }
-
- for (i=0; PMD_IS_LAST(i) == 0; i++) {
- if (PMD_IS_IMPL(i) == 0) continue;
- ia64_set_pmd(i, 0UL);
- }
}
/*
local_irq_save(flags);
this_cpu = smp_processor_id();
- regs = ia64_task_regs(current);
+ regs = task_pt_regs(current);
info = PFM_CPUINFO_GET();
dcr = ia64_getreg(_IA64_REG_CR_DCR);
}
#else /* !CONFIG_PERFMON */
asmlinkage long
-sys_perfmonctl (int fd, int cmd, void *arg, int count, long arg5, long arg6, long arg7,
- long arg8, long stack)
+sys_perfmonctl (int fd, int cmd, void *arg, int count)
{
return -ENOSYS;
}