X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=arch%2Fia64%2Fkernel%2Fperfmon.c;h=4f1543cdafec51d8f9651df34a4476d80e86224e;hb=8e8ece46a861c84343256819eaec77e608ff9217;hp=ae2eb13f94255e4ec88e98e96730e46e301af263;hpb=5273a3df6485dc2ad6aa7ddd441b9a21970f003b;p=linux-2.6.git diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index ae2eb13f9..4f1543cda 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -5,13 +5,13 @@ * The initial version of perfmon.c was written by * Ganesh Venkitachalam, IBM Corp. * - * Then it was modified for perfmon-1.x by Stephane Eranian and + * Then it was modified for perfmon-1.x by Stephane Eranian and * David Mosberger, Hewlett Packard Co. - * + * * Version Perfmon-2.x is a rewrite of perfmon-1.x - * by Stephane Eranian, Hewlett Packard Co. + * by Stephane Eranian, Hewlett Packard Co. * - * Copyright (C) 1999-2003 Hewlett Packard Co + * Copyright (C) 1999-2003, 2005 Hewlett Packard Co * Stephane Eranian * David Mosberger-Tang * @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -37,8 +38,10 @@ #include #include #include +#include +#include +#include -#include #include #include #include @@ -86,27 +89,25 @@ #define PFM_REG_CONFIG (0x8<<4|PFM_REG_IMPL) /* configuration register */ #define PFM_REG_BUFFER (0xc<<4|PFM_REG_IMPL) /* PMD used as buffer */ -#define PMC_IS_LAST(i) (pmu_conf.pmc_desc[i].type & PFM_REG_END) -#define PMD_IS_LAST(i) (pmu_conf.pmd_desc[i].type & PFM_REG_END) - -#define PFM_IS_DISABLED() (pmu_conf.enabled == 0) +#define PMC_IS_LAST(i) (pmu_conf->pmc_desc[i].type & PFM_REG_END) +#define PMD_IS_LAST(i) (pmu_conf->pmd_desc[i].type & PFM_REG_END) #define PMC_OVFL_NOTIFY(ctx, i) ((ctx)->ctx_pmds[i].flags & PFM_REGFL_OVFL_NOTIFY) /* i assumed unsigned */ -#define PMC_IS_IMPL(i) (i< PMU_MAX_PMCS && (pmu_conf.pmc_desc[i].type & PFM_REG_IMPL)) -#define PMD_IS_IMPL(i) (i< PMU_MAX_PMDS && (pmu_conf.pmd_desc[i].type & PFM_REG_IMPL)) +#define PMC_IS_IMPL(i) (i< PMU_MAX_PMCS && (pmu_conf->pmc_desc[i].type & PFM_REG_IMPL)) +#define PMD_IS_IMPL(i) (i< PMU_MAX_PMDS && (pmu_conf->pmd_desc[i].type & PFM_REG_IMPL)) /* XXX: these assume that register i is implemented */ -#define PMD_IS_COUNTING(i) ((pmu_conf.pmd_desc[i].type & PFM_REG_COUNTING) == PFM_REG_COUNTING) -#define PMC_IS_COUNTING(i) ((pmu_conf.pmc_desc[i].type & PFM_REG_COUNTING) == PFM_REG_COUNTING) -#define PMC_IS_MONITOR(i) ((pmu_conf.pmc_desc[i].type & PFM_REG_MONITOR) == PFM_REG_MONITOR) -#define PMC_IS_CONTROL(i) ((pmu_conf.pmc_desc[i].type & PFM_REG_CONTROL) == PFM_REG_CONTROL) +#define PMD_IS_COUNTING(i) ((pmu_conf->pmd_desc[i].type & PFM_REG_COUNTING) == PFM_REG_COUNTING) +#define PMC_IS_COUNTING(i) ((pmu_conf->pmc_desc[i].type & PFM_REG_COUNTING) == PFM_REG_COUNTING) +#define PMC_IS_MONITOR(i) ((pmu_conf->pmc_desc[i].type & PFM_REG_MONITOR) == PFM_REG_MONITOR) +#define PMC_IS_CONTROL(i) ((pmu_conf->pmc_desc[i].type & PFM_REG_CONTROL) == PFM_REG_CONTROL) -#define PMC_DFL_VAL(i) pmu_conf.pmc_desc[i].default_value -#define PMC_RSVD_MASK(i) pmu_conf.pmc_desc[i].reserved_mask -#define PMD_PMD_DEP(i) pmu_conf.pmd_desc[i].dep_pmd[0] -#define PMC_PMD_DEP(i) pmu_conf.pmc_desc[i].dep_pmd[0] +#define PMC_DFL_VAL(i) pmu_conf->pmc_desc[i].default_value +#define PMC_RSVD_MASK(i) pmu_conf->pmc_desc[i].reserved_mask +#define PMD_PMD_DEP(i) pmu_conf->pmd_desc[i].dep_pmd[0] +#define PMC_PMD_DEP(i) pmu_conf->pmc_desc[i].dep_pmd[0] #define PFM_NUM_IBRS IA64_NUM_DBG_REGS #define PFM_NUM_DBRS IA64_NUM_DBG_REGS @@ -133,6 +134,8 @@ #define 
PFM_CPUINFO_SET(v) pfm_get_cpu_var(pfm_syst_info) |= (v) #define PFM_CPUINFO_GET() pfm_get_cpu_var(pfm_syst_info) +#define RDEP(x) (1UL<<(x)) + /* * context protection macros * in SMP: @@ -310,6 +313,7 @@ typedef struct pfm_context { unsigned int ctx_cpu; /* cpu to which perfmon is applied (system wide) */ int ctx_fd; /* file descriptor used my this context */ + pfm_ovfl_arg_t ctx_ovfl_arg; /* argument to custom buffer format handler */ pfm_buffer_fmt_t *ctx_buf_fmt; /* buffer format callbacks */ void *ctx_smpl_hdr; /* points to sampling buffer header kernel vaddr */ @@ -374,26 +378,32 @@ typedef struct { * dep_pmd[]: a bitmask of dependent PMD registers * dep_pmc[]: a bitmask of dependent PMC registers */ +typedef int (*pfm_reg_check_t)(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs); typedef struct { unsigned int type; int pm_pos; unsigned long default_value; /* power-on default value */ unsigned long reserved_mask; /* bitmask of reserved bits */ - int (*read_check)(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs); - int (*write_check)(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs); + pfm_reg_check_t read_check; + pfm_reg_check_t write_check; unsigned long dep_pmd[4]; unsigned long dep_pmc[4]; } pfm_reg_desc_t; /* assume cnum is a valid monitor */ -#define PMC_PM(cnum, val) (((val) >> (pmu_conf.pmc_desc[cnum].pm_pos)) & 0x1) -#define PMC_WR_FUNC(cnum) (pmu_conf.pmc_desc[cnum].write_check) -#define PMD_WR_FUNC(cnum) (pmu_conf.pmd_desc[cnum].write_check) -#define PMD_RD_FUNC(cnum) (pmu_conf.pmd_desc[cnum].read_check) +#define PMC_PM(cnum, val) (((val) >> (pmu_conf->pmc_desc[cnum].pm_pos)) & 0x1) /* * This structure is initialized at boot time and contains * a description of the PMU main characteristics. 
+ * + * If the probe function is defined, detection is based + * on its return value: + * - 0 means recognized PMU + * - anything else means not supported + * When the probe function is not defined, then the pmu_family field + * is used and it must match the host CPU family such that: + * - cpu->family & config->pmu_family != 0 */ typedef struct { unsigned long ovfl_val; /* overflow value for counters */ @@ -407,15 +417,18 @@ typedef struct { unsigned long impl_pmds[4]; /* bitmask of implemented PMDS */ char *pmu_name; /* PMU family name */ - unsigned int enabled; /* indicates if perfmon initialized properly */ unsigned int pmu_family; /* cpuid family pattern used to identify pmu */ - + unsigned int flags; /* pmu specific flags */ unsigned int num_ibrs; /* number of IBRS: computed at init time */ unsigned int num_dbrs; /* number of DBRS: computed at init time */ unsigned int num_counters; /* PMC/PMD counting pairs : computed at init time */ - + int (*probe)(void); /* customized probe routine */ unsigned int use_rr_dbregs:1; /* set if debug registers used for range restriction */ } pmu_config_t; +/* + * PMU specific flags + */ +#define PFM_PMU_IRQ_RESEND 1 /* PMU needs explicit IRQ resend */ /* * debug register related type definitions @@ -500,6 +513,8 @@ static pfm_uuid_t pfm_null_uuid = {0,}; static spinlock_t pfm_buffer_fmt_lock; static LIST_HEAD(pfm_buffer_fmt_list); +static pmu_config_t *pmu_conf; + /* sysctl() controls */ static pfm_sysctl_t pfm_sysctl; int pfm_debug_var; @@ -559,12 +574,6 @@ pfm_unreserve_page(unsigned long a) ClearPageReserved(vmalloc_to_page((void*)a)); } -static inline int -pfm_remap_page_range(struct vm_area_struct *vma, unsigned long from, unsigned long phys_addr, unsigned long size, pgprot_t prot) -{ - return remap_page_range(vma, from, phys_addr, size, prot); -} - static inline unsigned long pfm_protect_ctx_ctxsw(pfm_context_t *x) { @@ -620,20 +629,19 @@ static void pfm_lazy_save_regs (struct task_struct *ta); #endif void dump_pmu_state(const char *); +static int pfm_write_ibr_dbr(int mode, pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs); -/* - * the HP simulator must be first because - * CONFIG_IA64_HP_SIM is independent of CONFIG_MCKINLEY or CONFIG_ITANIUM - */ -#if defined(CONFIG_IA64_HP_SIM) -#include "perfmon_hpsim.h" -#elif defined(CONFIG_ITANIUM) #include "perfmon_itanium.h" -#elif defined(CONFIG_MCKINLEY) #include "perfmon_mckinley.h" -#else #include "perfmon_generic.h" -#endif + +static pmu_config_t *pmu_confs[]={ + &pmu_conf_mck, + &pmu_conf_ita, + &pmu_conf_gen, /* must be last */ + NULL +}; + static int pfm_end_notify_user(pfm_context_t *ctx); @@ -702,6 +710,7 @@ pfm_restore_ibrs(unsigned long *ibrs, unsigned int nibrs) for (i=0; i < nibrs; i++) { ia64_set_ibr(i, ibrs[i]); + ia64_dv_serialize_instruction(); } ia64_srlz_i(); } @@ -713,6 +722,7 @@ pfm_restore_dbrs(unsigned long *dbrs, unsigned int ndbrs) for (i=0; i < ndbrs; i++) { ia64_set_dbr(i, dbrs[i]); + ia64_dv_serialize_data(); } ia64_srlz_d(); } @@ -723,7 +733,7 @@ pfm_restore_dbrs(unsigned long *dbrs, unsigned int ndbrs) static inline unsigned long pfm_read_soft_counter(pfm_context_t *ctx, int i) { - return ctx->ctx_pmds[i].val + (ia64_get_pmd(i) & pmu_conf.ovfl_val); + return ctx->ctx_pmds[i].val + (ia64_get_pmd(i) & pmu_conf->ovfl_val); } /* @@ -732,7 +742,7 @@ pfm_read_soft_counter(pfm_context_t *ctx, int i) static inline void pfm_write_soft_counter(pfm_context_t *ctx, int i, unsigned long val) { - unsigned long ovfl_val = pmu_conf.ovfl_val; + unsigned long ovfl_val = 
pmu_conf->ovfl_val; ctx->ctx_pmds[i].val = val & ~ovfl_val; /* @@ -791,18 +801,6 @@ pfm_reset_msgq(pfm_context_t *ctx) DPRINT(("ctx=%p msgq reset\n", ctx)); } - -/* Here we want the physical address of the memory. - * This is used when initializing the contents of the - * area and marking the pages as reserved. - */ -static inline unsigned long -pfm_kvirt_to_pa(unsigned long adr) -{ - __u64 pa = ia64_tpa(adr); - return pa; -} - static void * pfm_rvmalloc(unsigned long size) { @@ -878,7 +876,7 @@ pfm_mask_monitoring(struct task_struct *task) DPRINT_ovfl(("masking monitoring for [%d]\n", task->pid)); - ovfl_mask = pmu_conf.ovfl_val; + ovfl_mask = pmu_conf->ovfl_val; /* * monitoring can only be masked as a result of a valid * counter overflow. In UP, it means that the PMU still @@ -953,7 +951,7 @@ pfm_restore_monitoring(struct task_struct *task) int i, is_system; is_system = ctx->ctx_fl_system; - ovfl_mask = pmu_conf.ovfl_val; + ovfl_mask = pmu_conf->ovfl_val; if (task != current) { printk(KERN_ERR "perfmon.%d: invalid task[%d] current[%d]\n", __LINE__, task->pid, current->pid); @@ -1024,8 +1022,8 @@ pfm_restore_monitoring(struct task_struct *task) * XXX: need to optimize */ if (ctx->ctx_fl_using_dbreg) { - pfm_restore_ibrs(ctx->ctx_ibrs, pmu_conf.num_ibrs); - pfm_restore_dbrs(ctx->ctx_dbrs, pmu_conf.num_dbrs); + pfm_restore_ibrs(ctx->ctx_ibrs, pmu_conf->num_ibrs); + pfm_restore_dbrs(ctx->ctx_dbrs, pmu_conf->num_dbrs); } /* @@ -1058,7 +1056,7 @@ static inline void pfm_restore_pmds(unsigned long *pmds, unsigned long mask) { int i; - unsigned long val, ovfl_val = pmu_conf.ovfl_val; + unsigned long val, ovfl_val = pmu_conf->ovfl_val; for (i=0; mask; i++, mask>>=1) { if ((mask & 0x1) == 0) continue; @@ -1075,7 +1073,7 @@ static inline void pfm_copy_pmds(struct task_struct *task, pfm_context_t *ctx) { struct thread_struct *thread = &task->thread; - unsigned long ovfl_val = pmu_conf.ovfl_val; + unsigned long ovfl_val = pmu_conf->ovfl_val; unsigned long mask = ctx->ctx_all_pmds[0]; unsigned long val; int i; @@ -1498,15 +1496,8 @@ exit_pfm_fs(void) mntput(pfmfs_mnt); } -static loff_t -pfm_lseek(struct file *file, loff_t offset, int whence) -{ - DPRINT(("pfm_lseek called\n")); - return -ESPIPE; -} - static ssize_t -pfm_read(struct file *filp, char *buf, size_t size, loff_t *ppos) +pfm_read(struct file *filp, char __user *buf, size_t size, loff_t *ppos) { pfm_context_t *ctx; pfm_msg_t *msg; @@ -1531,10 +1522,6 @@ pfm_read(struct file *filp, char *buf, size_t size, loff_t *ppos) DPRINT(("message is too small ctx=%p (>=%ld)\n", ctx, sizeof(pfm_msg_t))); return -EINVAL; } - /* - * seeks are not allowed on message queues - */ - if (ppos != &filp->f_pos) return -ESPIPE; PROTECT_CTX(ctx, flags); @@ -1603,7 +1590,7 @@ abort: } static ssize_t -pfm_write(struct file *file, const char *ubuf, +pfm_write(struct file *file, const char __user *ubuf, size_t size, loff_t *ppos) { DPRINT(("pfm_write called\n")); @@ -1653,7 +1640,7 @@ pfm_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned lon } /* - * context is locked when coming here and interrupts are disabled + * interrupt cannot be masked when coming here */ static inline int pfm_do_fasync(int fd, struct file *filp, pfm_context_t *ctx, int on) @@ -1675,7 +1662,6 @@ static int pfm_fasync(int fd, struct file *filp, int on) { pfm_context_t *ctx; - unsigned long flags; int ret; if (PFM_IS_FILE(filp) == 0) { @@ -1688,19 +1674,21 @@ pfm_fasync(int fd, struct file *filp, int on) printk(KERN_ERR "perfmon: pfm_fasync NULL ctx [%d]\n", current->pid); 
return -EBADF; } - - - PROTECT_CTX(ctx, flags); - + /* + * we cannot mask interrupts during this call because this may + * may go to sleep if memory is not readily avalaible. + * + * We are protected from the conetxt disappearing by the get_fd()/put_fd() + * done in caller. Serialization of this function is ensured by caller. + */ ret = pfm_do_fasync(fd, filp, ctx, on); + DPRINT(("pfm_fasync called on ctx_fd=%d on=%d async_queue=%p ret=%d\n", fd, on, ctx->ctx_async_queue, ret)); - UNPROTECT_CTX(ctx, flags); - return ret; } @@ -2012,7 +2000,7 @@ pfm_close(struct inode *inode, struct file *filp) /* * XXX: check for signals : - * - ok of explicit close + * - ok for explicit close * - not ok when coming from exit_files() */ schedule(); @@ -2127,7 +2115,7 @@ pfm_no_open(struct inode *irrelevant, struct file *dontcare) static struct file_operations pfm_file_ops = { - .llseek = pfm_lseek, + .llseek = no_llseek, .read = pfm_read, .write = pfm_write, .poll = pfm_poll, @@ -2174,9 +2162,7 @@ pfm_alloc_fd(struct file **cfile) DPRINT(("new inode ino=%ld @%p\n", inode->i_ino, inode)); - inode->i_sb = pfmfs_mnt->mnt_sb; inode->i_mode = S_IFCHR|S_IRUGO; - inode->i_sock = 0; inode->i_uid = current->fsuid; inode->i_gid = current->fsgid; @@ -2224,6 +2210,15 @@ out: static void pfm_free_fd(int fd, struct file *file) { + struct files_struct *files = current->files; + + /* + * there ie no fd_uninstall(), so we do it here + */ + spin_lock(&files->file_lock); + files->fd[fd] = NULL; + spin_unlock(&files->file_lock); + if (file) put_filp(file); put_unused_fd(fd); } @@ -2231,14 +2226,14 @@ pfm_free_fd(int fd, struct file *file) static int pfm_remap_buffer(struct vm_area_struct *vma, unsigned long buf, unsigned long addr, unsigned long size) { - unsigned long page; - DPRINT(("CPU%d buf=0x%lx addr=0x%lx size=%ld\n", smp_processor_id(), buf, addr, size)); while (size > 0) { - page = pfm_kvirt_to_pa(buf); + unsigned long pfn = ia64_tpa(buf) >> PAGE_SHIFT; + - if (pfm_remap_page_range(vma, addr, page, PAGE_SIZE, PAGE_READONLY)) return -ENOMEM; + if (remap_pfn_range(vma, addr, pfn, PAGE_SIZE, PAGE_READONLY)) + return -ENOMEM; addr += PAGE_SIZE; buf += PAGE_SIZE; @@ -2274,7 +2269,8 @@ pfm_smpl_buffer_alloc(struct task_struct *task, pfm_context_t *ctx, unsigned lon * if ((mm->total_vm << PAGE_SHIFT) + len> task->rlim[RLIMIT_AS].rlim_cur) * return -ENOMEM; */ - if (size > task->rlim[RLIMIT_MEMLOCK].rlim_cur) return -EAGAIN; + if (size > task->signal->rlim[RLIMIT_MEMLOCK].rlim_cur) + return -ENOMEM; /* * We do the easy to undo allocations first. @@ -2295,20 +2291,14 @@ pfm_smpl_buffer_alloc(struct task_struct *task, pfm_context_t *ctx, unsigned lon DPRINT(("Cannot allocate vma\n")); goto error_kmem; } + memset(vma, 0, sizeof(*vma)); + /* * partially initialize the vma for the sampling buffer - * - * The VM_DONTCOPY flag is very important as it ensures that the mapping - * will never be inherited for any child process (via fork()) which is always - * what we want. 
*/ vma->vm_mm = mm; vma->vm_flags = VM_READ| VM_MAYREAD |VM_RESERVED; vma->vm_page_prot = PAGE_READONLY; /* XXX may need to change */ - vma->vm_ops = NULL; - vma->vm_pgoff = 0; - vma->vm_file = NULL; - vma->vm_private_data = NULL; /* * Now we have everything we need and we can initialize @@ -2334,6 +2324,7 @@ pfm_smpl_buffer_alloc(struct task_struct *task, pfm_context_t *ctx, unsigned lon goto error; } vma->vm_end = vma->vm_start + size; + vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; DPRINT(("aligned size=%ld, hdr=%p mapped @0x%lx\n", size, ctx->ctx_smpl_hdr, vma->vm_start)); @@ -2350,8 +2341,8 @@ pfm_smpl_buffer_alloc(struct task_struct *task, pfm_context_t *ctx, unsigned lon */ insert_vm_struct(mm, vma); - mm->total_vm += size >> PAGE_SHIFT; - + vx_vmpages_add(mm, size >> PAGE_SHIFT); + vm_stat_account(vma); up_write(&task->mm->mmap_sem); /* @@ -2513,12 +2504,12 @@ pfm_reset_pmu_state(pfm_context_t *ctx) * * PMC0 is treated differently. */ - ctx->ctx_all_pmcs[0] = pmu_conf.impl_pmcs[0] & ~0x1; + ctx->ctx_all_pmcs[0] = pmu_conf->impl_pmcs[0] & ~0x1; /* * bitmask of all PMDs that are accesible to this context */ - ctx->ctx_all_pmds[0] = pmu_conf.impl_pmds[0]; + ctx->ctx_all_pmds[0] = pmu_conf->impl_pmds[0]; DPRINT(("<%d> all_pmcs=0x%lx all_pmds=0x%lx\n", ctx->ctx_fd, ctx->ctx_all_pmcs[0],ctx->ctx_all_pmds[0])); @@ -2581,7 +2572,7 @@ pfm_task_incompatible(pfm_context_t *ctx, struct task_struct *task) return -EINVAL; } - if (task->state == TASK_ZOMBIE) { + if (task->exit_state == EXIT_ZOMBIE) { DPRINT(("cannot attach to zombie task [%d]\n", task->pid)); return -EBUSY; } @@ -2591,7 +2582,7 @@ pfm_task_incompatible(pfm_context_t *ctx, struct task_struct *task) */ if (task == current) return 0; - if (task->state != TASK_STOPPED) { + if ((task->state != TASK_STOPPED) && (task->state != TASK_TRACED)) { DPRINT(("cannot attach to non-stopped task [%d] state=%ld\n", task->pid, task->state)); return -EBUSY; } @@ -2658,8 +2649,10 @@ pfm_context_create(pfm_context_t *ctx, void *arg, int count, struct pt_regs *reg ctx = pfm_context_alloc(); if (!ctx) goto error; - req->ctx_fd = ctx->ctx_fd = pfm_alloc_fd(&filp); - if (req->ctx_fd < 0) goto error_file; + ret = pfm_alloc_fd(&filp); + if (ret < 0) goto error_file; + + req->ctx_fd = ctx->ctx_fd = ret; /* * attach context to file @@ -2858,16 +2851,17 @@ pfm_write_pmcs(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) unsigned long value, pmc_pm; unsigned long smpl_pmds, reset_pmds, impl_pmds; unsigned int cnum, reg_flags, flags, pmc_type; - int i, can_access_pmu = 0, is_loaded, is_system; + int i, can_access_pmu = 0, is_loaded, is_system, expert_mode; int is_monitor, is_counting, state; int ret = -EINVAL; + pfm_reg_check_t wr_func; #define PFM_CHECK_PMC_PM(x, y, z) ((x)->ctx_fl_system ^ PMC_PM(y, z)) state = ctx->ctx_state; is_loaded = state == PFM_CTX_LOADED ? 1 : 0; is_system = ctx->ctx_fl_system; task = ctx->ctx_task; - impl_pmds = pmu_conf.impl_pmds[0]; + impl_pmds = pmu_conf->impl_pmds[0]; if (state == PFM_CTX_ZOMBIE) return -EINVAL; @@ -2884,6 +2878,7 @@ pfm_write_pmcs(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) } can_access_pmu = GET_PMU_OWNER() == task || is_system ? 
1 : 0; } + expert_mode = pfm_sysctl.expert_mode; for (i = 0; i < count; i++, req++) { @@ -2900,8 +2895,8 @@ pfm_write_pmcs(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) goto error; } - pmc_type = pmu_conf.pmc_desc[cnum].type; - pmc_pm = (value >> pmu_conf.pmc_desc[cnum].pm_pos) & 0x1; + pmc_type = pmu_conf->pmc_desc[cnum].type; + pmc_pm = (value >> pmu_conf->pmc_desc[cnum].pm_pos) & 0x1; is_counting = (pmc_type & PFM_REG_COUNTING) == PFM_REG_COUNTING ? 1 : 0; is_monitor = (pmc_type & PFM_REG_MONITOR) == PFM_REG_MONITOR ? 1 : 0; @@ -2914,6 +2909,7 @@ pfm_write_pmcs(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) DPRINT(("pmc%u is unimplemented or no-access pmc_type=%x\n", cnum, pmc_type)); goto error; } + wr_func = pmu_conf->pmc_desc[cnum].write_check; /* * If the PMC is a monitor, then if the value is not the default: * - system-wide session: PMCx.pm=1 (privileged monitor) @@ -2962,8 +2958,8 @@ pfm_write_pmcs(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) /* * execute write checker, if any */ - if (pfm_sysctl.expert_mode == 0 && PMC_WR_FUNC(cnum)) { - ret = PMC_WR_FUNC(cnum)(task, ctx, cnum, &value, regs); + if (likely(expert_mode == 0 && wr_func)) { + ret = (*wr_func)(task, ctx, cnum, &value, regs); if (ret) goto error; ret = -EINVAL; } @@ -3014,7 +3010,7 @@ pfm_write_pmcs(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) * PMD. Clearing is done indirectly via pfm_reset_pmu_state() so there is no * possible leak here. */ - CTX_USED_PMD(ctx, pmu_conf.pmc_desc[cnum].dep_pmd[0]); + CTX_USED_PMD(ctx, pmu_conf->pmc_desc[cnum].dep_pmd[0]); /* * keep track of the monitor PMC that we are using. @@ -3061,11 +3057,12 @@ pfm_write_pmcs(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) #endif } - DPRINT(("pmc[%u]=0x%lx loaded=%d access_pmu=%d all_pmcs=0x%lx used_pmds=0x%lx eventid=%ld smpl_pmds=0x%lx reset_pmds=0x%lx reloads_pmcs=0x%lx used_monitors=0x%lx ovfl_regs=0x%lx\n", + DPRINT(("pmc[%u]=0x%lx ld=%d apmu=%d flags=0x%x all_pmcs=0x%lx used_pmds=0x%lx eventid=%ld smpl_pmds=0x%lx reset_pmds=0x%lx reloads_pmcs=0x%lx used_monitors=0x%lx ovfl_regs=0x%lx\n", cnum, value, is_loaded, can_access_pmu, + flags, ctx->ctx_all_pmcs[0], ctx->ctx_used_pmds[0], ctx->ctx_pmds[cnum].eventid, @@ -3096,14 +3093,15 @@ pfm_write_pmds(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) unsigned long value, hw_value, ovfl_mask; unsigned int cnum; int i, can_access_pmu = 0, state; - int is_counting, is_loaded, is_system; + int is_counting, is_loaded, is_system, expert_mode; int ret = -EINVAL; + pfm_reg_check_t wr_func; state = ctx->ctx_state; is_loaded = state == PFM_CTX_LOADED ? 1 : 0; is_system = ctx->ctx_fl_system; - ovfl_mask = pmu_conf.ovfl_val; + ovfl_mask = pmu_conf->ovfl_val; task = ctx->ctx_task; if (unlikely(state == PFM_CTX_ZOMBIE)) return -EINVAL; @@ -3125,6 +3123,7 @@ pfm_write_pmds(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) } can_access_pmu = GET_PMU_OWNER() == task || is_system ? 
1 : 0; } + expert_mode = pfm_sysctl.expert_mode; for (i = 0; i < count; i++, req++) { @@ -3136,14 +3135,15 @@ pfm_write_pmds(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) goto abort_mission; } is_counting = PMD_IS_COUNTING(cnum); + wr_func = pmu_conf->pmd_desc[cnum].write_check; /* * execute write checker, if any */ - if (pfm_sysctl.expert_mode == 0 && PMD_WR_FUNC(cnum)) { + if (unlikely(expert_mode == 0 && wr_func)) { unsigned long v = value; - ret = PMD_WR_FUNC(cnum)(task, ctx, cnum, &v, regs); + ret = (*wr_func)(task, ctx, cnum, &v, regs); if (ret) goto abort_mission; value = v; @@ -3238,8 +3238,8 @@ pfm_write_pmds(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) } } - DPRINT(("pmd[%u]=0x%lx loaded=%d access_pmu=%d, hw_value=0x%lx ctx_pmd=0x%lx short_reset=0x%lx " - "long_reset=0x%lx notify=%c used_pmds=0x%lx reset_pmds=0x%lx reload_pmds=0x%lx all_pmds=0x%lx ovfl_regs=0x%lx\n", + DPRINT(("pmd[%u]=0x%lx ld=%d apmu=%d, hw_value=0x%lx ctx_pmd=0x%lx short_reset=0x%lx " + "long_reset=0x%lx notify=%c seed=0x%lx mask=0x%lx used_pmds=0x%lx reset_pmds=0x%lx reload_pmds=0x%lx all_pmds=0x%lx ovfl_regs=0x%lx\n", cnum, value, is_loaded, @@ -3249,6 +3249,8 @@ pfm_write_pmds(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) ctx->ctx_pmds[cnum].short_reset, ctx->ctx_pmds[cnum].long_reset, PMC_OVFL_NOTIFY(ctx, cnum) ? 'Y':'N', + ctx->ctx_pmds[cnum].seed, + ctx->ctx_pmds[cnum].mask, ctx->ctx_used_pmds[0], ctx->ctx_pmds[cnum].reset_pmds[0], ctx->ctx_reload_pmds[0], @@ -3289,8 +3291,9 @@ pfm_read_pmds(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) pfarg_reg_t *req = (pfarg_reg_t *)arg; unsigned int cnum, reg_flags = 0; int i, can_access_pmu = 0, state; - int is_loaded, is_system, is_counting; + int is_loaded, is_system, is_counting, expert_mode; int ret = -EINVAL; + pfm_reg_check_t rd_func; /* * access is possible when loaded only for @@ -3300,7 +3303,7 @@ pfm_read_pmds(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) state = ctx->ctx_state; is_loaded = state == PFM_CTX_LOADED ? 1 : 0; is_system = ctx->ctx_fl_system; - ovfl_mask = pmu_conf.ovfl_val; + ovfl_mask = pmu_conf->ovfl_val; task = ctx->ctx_task; if (state == PFM_CTX_ZOMBIE) return -EINVAL; @@ -3323,8 +3326,9 @@ pfm_read_pmds(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) if (can_access_pmu) ia64_srlz_d(); } + expert_mode = pfm_sysctl.expert_mode; - DPRINT(("loaded=%d access_pmu=%d ctx_state=%d\n", + DPRINT(("ld=%d apmu=%d ctx_state=%d\n", is_loaded, can_access_pmu, state)); @@ -3369,6 +3373,7 @@ pfm_read_pmds(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) */ val = is_loaded ? 
thread->pmds[cnum] : 0UL; } + rd_func = pmu_conf->pmd_desc[cnum].read_check; if (is_counting) { /* @@ -3381,9 +3386,9 @@ pfm_read_pmds(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) /* * execute read checker, if any */ - if (unlikely(pfm_sysctl.expert_mode == 0 && PMD_RD_FUNC(cnum))) { + if (unlikely(expert_mode == 0 && rd_func)) { unsigned long v = val; - ret = PMD_RD_FUNC(cnum)(ctx->ctx_task, ctx, cnum, &v, regs); + ret = (*rd_func)(ctx->ctx_task, ctx, cnum, &v, regs); if (ret) goto error; val = v; ret = -EINVAL; @@ -3463,7 +3468,7 @@ pfm_use_debug_registers(struct task_struct *task) unsigned long flags; int ret = 0; - if (pmu_conf.use_rr_dbregs == 0) return 0; + if (pmu_conf->use_rr_dbregs == 0) return 0; DPRINT(("called for [%d]\n", task->pid)); @@ -3517,7 +3522,7 @@ pfm_release_debug_registers(struct task_struct *task) unsigned long flags; int ret; - if (pmu_conf.use_rr_dbregs == 0) return 0; + if (pmu_conf->use_rr_dbregs == 0) return 0; LOCK_PFS(flags); if (pfm_sessions.pfs_ptrace_use_dbregs == 0) { @@ -3720,7 +3725,7 @@ pfm_write_ibr_dbr(int mode, pfm_context_t *ctx, void *arg, int count, struct pt_ int i, can_access_pmu = 0; int is_system, is_loaded; - if (pmu_conf.use_rr_dbregs == 0) return -EINVAL; + if (pmu_conf->use_rr_dbregs == 0) return -EINVAL; state = ctx->ctx_state; is_loaded = state == PFM_CTX_LOADED ? 1 : 0; @@ -3802,14 +3807,14 @@ pfm_write_ibr_dbr(int mode, pfm_context_t *ctx, void *arg, int count, struct pt_ */ if (first_time && can_access_pmu) { DPRINT(("[%d] clearing ibrs, dbrs\n", task->pid)); - for (i=0; i < pmu_conf.num_ibrs; i++) { + for (i=0; i < pmu_conf->num_ibrs; i++) { ia64_set_ibr(i, 0UL); - ia64_srlz_i(); + ia64_dv_serialize_instruction(); } ia64_srlz_i(); - for (i=0; i < pmu_conf.num_dbrs; i++) { + for (i=0; i < pmu_conf->num_dbrs; i++) { ia64_set_dbr(i, 0UL); - ia64_srlz_d(); + ia64_dv_serialize_data(); } ia64_srlz_d(); } @@ -3856,20 +3861,25 @@ pfm_write_ibr_dbr(int mode, pfm_context_t *ctx, void *arg, int count, struct pt_ if (mode == PFM_CODE_RR) { CTX_USED_IBR(ctx, rnum); - if (can_access_pmu) ia64_set_ibr(rnum, dbreg.val); + if (can_access_pmu) { + ia64_set_ibr(rnum, dbreg.val); + ia64_dv_serialize_instruction(); + } ctx->ctx_ibrs[rnum] = dbreg.val; - DPRINT(("write ibr%u=0x%lx used_ibrs=0x%x is_loaded=%d access_pmu=%d\n", + DPRINT(("write ibr%u=0x%lx used_ibrs=0x%x ld=%d apmu=%d\n", rnum, dbreg.val, ctx->ctx_used_ibrs[0], is_loaded, can_access_pmu)); } else { CTX_USED_DBR(ctx, rnum); - if (can_access_pmu) ia64_set_dbr(rnum, dbreg.val); - + if (can_access_pmu) { + ia64_set_dbr(rnum, dbreg.val); + ia64_dv_serialize_data(); + } ctx->ctx_dbrs[rnum] = dbreg.val; - DPRINT(("write dbr%u=0x%lx used_dbrs=0x%x is_loaded=%d access_pmu=%d\n", + DPRINT(("write dbr%u=0x%lx used_dbrs=0x%x ld=%d apmu=%d\n", rnum, dbreg.val, ctx->ctx_used_dbrs[0], is_loaded, can_access_pmu)); } } @@ -3970,7 +3980,10 @@ pfm_stop(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) state = ctx->ctx_state; is_system = ctx->ctx_fl_system; - if (state != PFM_CTX_LOADED && state != PFM_CTX_MASKED) return -EINVAL; + /* + * context must be attached to issue the stop command (includes LOADED,MASKED,ZOMBIE) + */ + if (state == PFM_CTX_UNLOADED) return -EINVAL; /* * In system wide and when the context is loaded, access can only happen @@ -4367,8 +4380,8 @@ pfm_context_load(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) * guaranteed safe by earlier check against DBG_VALID */ if (ctx->ctx_fl_using_dbreg) { - pfm_restore_ibrs(ctx->ctx_ibrs, 
pmu_conf.num_ibrs); - pfm_restore_dbrs(ctx->ctx_dbrs, pmu_conf.num_dbrs); + pfm_restore_ibrs(ctx->ctx_ibrs, pmu_conf->num_ibrs); + pfm_restore_dbrs(ctx->ctx_dbrs, pmu_conf->num_dbrs); } /* * set new ownership @@ -4559,31 +4572,6 @@ pfm_context_unload(pfm_context_t *ctx, void *arg, int count, struct pt_regs *reg return 0; } -static void -pfm_force_cleanup(pfm_context_t *ctx, struct pt_regs *regs) -{ - struct task_struct *task = ctx->ctx_task; - - ia64_psr(regs)->up = 0; - ia64_psr(regs)->sp = 1; - - if (GET_PMU_OWNER() == task) { - DPRINT(("cleared ownership for [%d]\n", ctx->ctx_task->pid)); - SET_PMU_OWNER(NULL, NULL); - } - - /* - * disconnect the task from the context and vice-versa - */ - PFM_SET_WORK_PENDING(task, 0); - - task->thread.pfm_context = NULL; - task->thread.flags &= ~IA64_THREAD_PM_VALID; - - DPRINT(("force cleanupf for [%d]\n", task->pid)); -} - - /* * called only from exit_thread(): task == current @@ -4701,21 +4689,22 @@ static int pfm_check_task_state(pfm_context_t *ctx, int cmd, unsigned long flags) { struct task_struct *task; - int state; + int state, old_state; +recheck: state = ctx->ctx_state; + task = ctx->ctx_task; - task = PFM_CTX_TASK(ctx); if (task == NULL) { DPRINT(("context %d no task, state=%d\n", ctx->ctx_fd, state)); return 0; } DPRINT(("context %d state=%d [%d] task_state=%ld must_stop=%d\n", - ctx->ctx_fd, - state, - task->pid, - task->state, PFM_CMD_STOPPED(cmd))); + ctx->ctx_fd, + state, + task->pid, + task->state, PFM_CMD_STOPPED(cmd))); /* * self-monitoring always ok. @@ -4727,31 +4716,63 @@ pfm_check_task_state(pfm_context_t *ctx, int cmd, unsigned long flags) if (task == current || ctx->ctx_fl_system) return 0; /* - * context is UNLOADED, MASKED we are safe to go + * if context is UNLOADED we are safe to go */ - if (state != PFM_CTX_LOADED) return 0; + if (state == PFM_CTX_UNLOADED) return 0; - if (state == PFM_CTX_ZOMBIE) return -EINVAL; + /* + * no command can operate on a zombie context + */ + if (state == PFM_CTX_ZOMBIE) { + DPRINT(("cmd %d state zombie cannot operate on context\n", cmd)); + return -EINVAL; + } /* - * context is loaded, we must make sure the task is stopped + * context is LOADED or MASKED. Some commands may need to have + * the task stopped. + * * We could lift this restriction for UP but it would mean that * the user has no guarantee the task would not run between * two successive calls to perfmonctl(). That's probably OK. * If this user wants to ensure the task does not run, then * the task must be stopped. */ - if (PFM_CMD_STOPPED(cmd) && task->state != TASK_STOPPED) { - DPRINT(("[%d] task not in stopped state\n", task->pid)); - return -EBUSY; - } + if (PFM_CMD_STOPPED(cmd)) { + if ((task->state != TASK_STOPPED) && (task->state != TASK_TRACED)) { + DPRINT(("[%d] task not in stopped state\n", task->pid)); + return -EBUSY; + } + /* + * task is now stopped, wait for ctxsw out + * + * This is an interesting point in the code. + * We need to unprotect the context because + * the pfm_save_regs() routines needs to grab + * the same lock. There are danger in doing + * this because it leaves a window open for + * another task to get access to the context + * and possibly change its state. The one thing + * that is not possible is for the context to disappear + * because we are protected by the VFS layer, i.e., + * get_fd()/put_fd(). 
+ */ + old_state = state; - UNPROTECT_CTX(ctx, flags); + UNPROTECT_CTX(ctx, flags); - wait_task_inactive(task); + wait_task_inactive(task); - PROTECT_CTX(ctx, flags); + PROTECT_CTX(ctx, flags); + /* + * we must recheck to verify if state has changed + */ + if (ctx->ctx_state != old_state) { + DPRINT(("old_state=%d new_state=%d\n", old_state, ctx->ctx_state)); + goto recheck; + } + } return 0; } @@ -4759,10 +4780,8 @@ pfm_check_task_state(pfm_context_t *ctx, int cmd, unsigned long flags) * system-call entry point (must return long) */ asmlinkage long -sys_perfmonctl (int fd, int cmd, void *arg, int count, long arg5, long arg6, long arg7, - long arg8, long stack) +sys_perfmonctl (int fd, int cmd, void __user *arg, int count) { - struct pt_regs *regs = (struct pt_regs *)&stack; struct file *file = NULL; pfm_context_t *ctx = NULL; unsigned long flags = 0UL; @@ -4777,7 +4796,7 @@ sys_perfmonctl (int fd, int cmd, void *arg, int count, long arg5, long arg6, lon /* * reject any call if perfmon was disabled at initialization */ - if (unlikely(PFM_IS_DISABLED())) return -ENOSYS; + if (unlikely(pmu_conf == NULL)) return -ENOSYS; if (unlikely(cmd < 0 || cmd >= PFM_CMD_COUNT)) { DPRINT(("invalid cmd=%d\n", cmd)); @@ -4886,7 +4905,7 @@ restart_args: if (unlikely(ret)) goto abort_locked; skip_fd: - ret = (*func)(ctx, args_k, count, regs); + ret = (*func)(ctx, args_k, count, ia64_task_regs(current)); call_made = 1; @@ -4957,26 +4976,14 @@ pfm_resume_after_ovfl(pfm_context_t *ctx, unsigned long ovfl_regs, struct pt_reg static void pfm_context_force_terminate(pfm_context_t *ctx, struct pt_regs *regs) { - if (ctx->ctx_fl_system) { - printk(KERN_ERR "perfmon: pfm_context_force_terminate [%d] is system-wide\n", current->pid); - return; - } - /* - * we stop the whole thing, we do no need to flush - * we know we WERE masked - */ - pfm_clear_psr_up(); - ia64_psr(regs)->up = 0; - ia64_psr(regs)->sp = 1; + int ret; - /* - * disconnect the task from the context and vice-versa - */ - current->thread.pfm_context = NULL; - current->thread.flags &= ~IA64_THREAD_PM_VALID; - ctx->ctx_task = NULL; + DPRINT(("entering for [%d]\n", current->pid)); - DPRINT(("context terminated\n")); + ret = pfm_context_unload(ctx, NULL, 0, regs); + if (ret) { + printk(KERN_ERR "pfm_context_force_terminate: [%d] unloaded failed with %d\n", current->pid, ret); + } /* * and wakeup controlling task, indicating we are now disconnected @@ -5036,6 +5043,18 @@ pfm_handle_work(void) UNPROTECT_CTX(ctx, flags); + /* + * pfm_handle_work() is currently called with interrupts disabled. + * The down_interruptible call may sleep, therefore we + * must re-enable interrupts to avoid deadlocks. It is + * safe to do so because this function is called ONLY + * when returning to user level (PUStk=1), in which case + * there is no risk of kernel stack overflow due to deep + * interrupt nesting. 
+ */ + BUG_ON(flags & IA64_PSR_I); + local_irq_enable(); + DPRINT(("before block sleeping\n")); /* @@ -5046,6 +5065,12 @@ pfm_handle_work(void) DPRINT(("after block sleeping ret=%d\n", ret)); + /* + * disable interrupts to restore state we had upon entering + * this function + */ + local_irq_disable(); + PROTECT_CTX(ctx, flags); /* @@ -5160,7 +5185,7 @@ pfm_end_notify_user(pfm_context_t *ctx) static void pfm_overflow_handler(struct task_struct *task, pfm_context_t *ctx, u64 pmc0, struct pt_regs *regs) { - pfm_ovfl_arg_t ovfl_arg; + pfm_ovfl_arg_t *ovfl_arg; unsigned long mask; unsigned long old_val, ovfl_val, new_val; unsigned long ovfl_notify = 0UL, ovfl_pmds = 0UL, smpl_pmds = 0UL, reset_pmds; @@ -5178,7 +5203,7 @@ pfm_overflow_handler(struct task_struct *task, pfm_context_t *ctx, u64 pmc0, str tstamp = ia64_get_itc(); mask = pmc0 >> PMU_FIRST_COUNTER; - ovfl_val = pmu_conf.ovfl_val; + ovfl_val = pmu_conf->ovfl_val; has_smpl = CTX_HAS_SMPL(ctx); DPRINT_ovfl(("pmc0=0x%lx pid=%d iip=0x%lx, %s " @@ -5247,7 +5272,8 @@ pfm_overflow_handler(struct task_struct *task, pfm_context_t *ctx, u64 pmc0, str int j, k, ret = 0; int this_cpu = smp_processor_id(); - pmd_mask = ovfl_pmds >> PMU_FIRST_COUNTER; + pmd_mask = ovfl_pmds >> PMU_FIRST_COUNTER; + ovfl_arg = &ctx->ctx_ovfl_arg; prefetch(ctx->ctx_smpl_hdr); @@ -5257,15 +5283,15 @@ pfm_overflow_handler(struct task_struct *task, pfm_context_t *ctx, u64 pmc0, str if ((pmd_mask & 0x1) == 0) continue; - ovfl_arg.ovfl_pmd = (unsigned char )i; - ovfl_arg.ovfl_notify = ovfl_notify & mask ? 1 : 0; - ovfl_arg.active_set = 0; - ovfl_arg.ovfl_ctrl.val = 0; /* module must fill in all fields */ - ovfl_arg.smpl_pmds[0] = smpl_pmds = ctx->ctx_pmds[i].smpl_pmds[0]; + ovfl_arg->ovfl_pmd = (unsigned char )i; + ovfl_arg->ovfl_notify = ovfl_notify & mask ? 1 : 0; + ovfl_arg->active_set = 0; + ovfl_arg->ovfl_ctrl.val = 0; /* module must fill in all fields */ + ovfl_arg->smpl_pmds[0] = smpl_pmds = ctx->ctx_pmds[i].smpl_pmds[0]; - ovfl_arg.pmd_value = ctx->ctx_pmds[i].val; - ovfl_arg.pmd_last_reset = ctx->ctx_pmds[i].lval; - ovfl_arg.pmd_eventid = ctx->ctx_pmds[i].eventid; + ovfl_arg->pmd_value = ctx->ctx_pmds[i].val; + ovfl_arg->pmd_last_reset = ctx->ctx_pmds[i].lval; + ovfl_arg->pmd_eventid = ctx->ctx_pmds[i].eventid; /* * copy values of pmds of interest. Sampling format may copy them @@ -5274,8 +5300,8 @@ pfm_overflow_handler(struct task_struct *task, pfm_context_t *ctx, u64 pmc0, str if (smpl_pmds) { for(j=0, k=0; smpl_pmds; j++, smpl_pmds >>=1) { if ((smpl_pmds & 0x1) == 0) continue; - ovfl_arg.smpl_pmds_values[k++] = PMD_IS_COUNTING(j) ? pfm_read_soft_counter(ctx, j) : ia64_get_pmd(j); - DPRINT_ovfl(("smpl_pmd[%d]=pmd%u=0x%lx\n", k-1, j, ovfl_arg.smpl_pmds_values[k-1])); + ovfl_arg->smpl_pmds_values[k++] = PMD_IS_COUNTING(j) ? pfm_read_soft_counter(ctx, j) : ia64_get_pmd(j); + DPRINT_ovfl(("smpl_pmd[%d]=pmd%u=0x%lx\n", k-1, j, ovfl_arg->smpl_pmds_values[k-1])); } } @@ -5286,7 +5312,7 @@ pfm_overflow_handler(struct task_struct *task, pfm_context_t *ctx, u64 pmc0, str /* * call custom buffer format record (handler) routine */ - ret = (*ctx->ctx_buf_fmt->fmt_handler)(task, ctx->ctx_smpl_hdr, &ovfl_arg, regs, tstamp); + ret = (*ctx->ctx_buf_fmt->fmt_handler)(task, ctx->ctx_smpl_hdr, ovfl_arg, regs, tstamp); end_cycles = ia64_get_itc(); @@ -5294,13 +5320,13 @@ pfm_overflow_handler(struct task_struct *task, pfm_context_t *ctx, u64 pmc0, str * For those controls, we take the union because they have * an all or nothing behavior. 
*/ - ovfl_ctrl.bits.notify_user |= ovfl_arg.ovfl_ctrl.bits.notify_user; - ovfl_ctrl.bits.block_task |= ovfl_arg.ovfl_ctrl.bits.block_task; - ovfl_ctrl.bits.mask_monitoring |= ovfl_arg.ovfl_ctrl.bits.mask_monitoring; + ovfl_ctrl.bits.notify_user |= ovfl_arg->ovfl_ctrl.bits.notify_user; + ovfl_ctrl.bits.block_task |= ovfl_arg->ovfl_ctrl.bits.block_task; + ovfl_ctrl.bits.mask_monitoring |= ovfl_arg->ovfl_ctrl.bits.mask_monitoring; /* * build the bitmask of pmds to reset now */ - if (ovfl_arg.ovfl_ctrl.bits.reset_ovfl_pmds) reset_pmds |= mask; + if (ovfl_arg->ovfl_ctrl.bits.reset_ovfl_pmds) reset_pmds |= mask; pfm_stats[this_cpu].pfm_smpl_handler_cycles += end_cycles - start_cycles; } @@ -5330,9 +5356,8 @@ pfm_overflow_handler(struct task_struct *task, pfm_context_t *ctx, u64 pmc0, str if (ovfl_notify == 0) reset_pmds = ovfl_pmds; } - DPRINT(("ovfl_pmds=0x%lx reset_pmds=0x%lx\n", - ovfl_pmds, - reset_pmds)); + DPRINT_ovfl(("ovfl_pmds=0x%lx reset_pmds=0x%lx\n", ovfl_pmds, reset_pmds)); + /* * reset the requested PMD registers using the short reset values */ @@ -5536,111 +5561,185 @@ pfm_interrupt_handler(int irq, void *arg, struct pt_regs *regs) return IRQ_HANDLED; } +/* + * /proc/perfmon interface, for debug only + */ + +#define PFM_PROC_SHOW_HEADER ((void *)NR_CPUS+1) -/* for debug only */ -static int -pfm_proc_info(char *page) +static void * +pfm_proc_start(struct seq_file *m, loff_t *pos) { - char *p = page; - struct list_head * pos; - pfm_buffer_fmt_t * entry; - unsigned long psr, flags; - int online_cpus = 0; - int i; + if (*pos == 0) { + return PFM_PROC_SHOW_HEADER; + } - p += sprintf(p, "perfmon version : %u.%u\n", PFM_VERSION_MAJ, PFM_VERSION_MIN); - p += sprintf(p, "model : %s\n", pmu_conf.pmu_name); - p += sprintf(p, "fastctxsw : %s\n", pfm_sysctl.fastctxsw > 0 ? "Yes": "No"); - p += sprintf(p, "expert mode : %s\n", pfm_sysctl.expert_mode > 0 ? "Yes": "No"); - p += sprintf(p, "ovfl_mask : 0x%lx\n", pmu_conf.ovfl_val); - - for(i=0; i < NR_CPUS; i++) { - if (cpu_online(i) == 0) continue; - p += sprintf(p, "CPU%-2d overflow intrs : %lu\n", i, pfm_stats[i].pfm_ovfl_intr_count); - p += sprintf(p, "CPU%-2d overflow cycles : %lu\n", i, pfm_stats[i].pfm_ovfl_intr_cycles); - p += sprintf(p, "CPU%-2d overflow min : %lu\n", i, pfm_stats[i].pfm_ovfl_intr_cycles_min); - p += sprintf(p, "CPU%-2d overflow max : %lu\n", i, pfm_stats[i].pfm_ovfl_intr_cycles_max); - p += sprintf(p, "CPU%-2d smpl handler calls : %lu\n", i, pfm_stats[i].pfm_smpl_handler_calls); - p += sprintf(p, "CPU%-2d smpl handler cycles : %lu\n", i, pfm_stats[i].pfm_smpl_handler_cycles); - p += sprintf(p, "CPU%-2d spurious intrs : %lu\n", i, pfm_stats[i].pfm_spurious_ovfl_intr_count); - p += sprintf(p, "CPU%-2d replay intrs : %lu\n", i, pfm_stats[i].pfm_replay_ovfl_intr_count); - p += sprintf(p, "CPU%-2d syst_wide : %d\n" , i, pfm_get_cpu_data(pfm_syst_info, i) & PFM_CPUINFO_SYST_WIDE ? 1 : 0); - p += sprintf(p, "CPU%-2d dcr_pp : %d\n" , i, pfm_get_cpu_data(pfm_syst_info, i) & PFM_CPUINFO_DCR_PP ? 1 : 0); - p += sprintf(p, "CPU%-2d exclude idle : %d\n" , i, pfm_get_cpu_data(pfm_syst_info, i) & PFM_CPUINFO_EXCL_IDLE ? 1 : 0); - p += sprintf(p, "CPU%-2d owner : %d\n" , i, pfm_get_cpu_data(pmu_owner, i) ? 
pfm_get_cpu_data(pmu_owner, i)->pid: -1); - p += sprintf(p, "CPU%-2d context : %p\n" , i, pfm_get_cpu_data(pmu_ctx, i)); - p += sprintf(p, "CPU%-2d activations : %lu\n", i, pfm_get_cpu_data(pmu_activation_number,i)); - online_cpus++; - } - - if (online_cpus == 1) - { - psr = pfm_get_psr(); - ia64_srlz_d(); - p += sprintf(p, "CPU%-2d psr : 0x%lx\n", smp_processor_id(), psr); - p += sprintf(p, "CPU%-2d pmc0 : 0x%lx\n", smp_processor_id(), ia64_get_pmc(0)); - for(i=4; i < 8; i++) { - p += sprintf(p, "CPU%-2d pmc%u : 0x%lx\n", smp_processor_id(), i, ia64_get_pmc(i)); - p += sprintf(p, "CPU%-2d pmd%u : 0x%lx\n", smp_processor_id(), i, ia64_get_pmd(i)); - } + while (*pos <= NR_CPUS) { + if (cpu_online(*pos - 1)) { + return (void *)*pos; + } + ++*pos; } + return NULL; +} - LOCK_PFS(flags); - p += sprintf(p, "proc_sessions : %u\n" - "sys_sessions : %u\n" - "sys_use_dbregs : %u\n" - "ptrace_use_dbregs : %u\n", - pfm_sessions.pfs_task_sessions, - pfm_sessions.pfs_sys_sessions, - pfm_sessions.pfs_sys_use_dbregs, - pfm_sessions.pfs_ptrace_use_dbregs); - UNLOCK_PFS(flags); +static void * +pfm_proc_next(struct seq_file *m, void *v, loff_t *pos) +{ + ++*pos; + return pfm_proc_start(m, pos); +} + +static void +pfm_proc_stop(struct seq_file *m, void *v) +{ +} + +static void +pfm_proc_show_header(struct seq_file *m) +{ + struct list_head * pos; + pfm_buffer_fmt_t * entry; + unsigned long flags; + + seq_printf(m, + "perfmon version : %u.%u\n" + "model : %s\n" + "fastctxsw : %s\n" + "expert mode : %s\n" + "ovfl_mask : 0x%lx\n" + "PMU flags : 0x%x\n", + PFM_VERSION_MAJ, PFM_VERSION_MIN, + pmu_conf->pmu_name, + pfm_sysctl.fastctxsw > 0 ? "Yes": "No", + pfm_sysctl.expert_mode > 0 ? "Yes": "No", + pmu_conf->ovfl_val, + pmu_conf->flags); + + LOCK_PFS(flags); + + seq_printf(m, + "proc_sessions : %u\n" + "sys_sessions : %u\n" + "sys_use_dbregs : %u\n" + "ptrace_use_dbregs : %u\n", + pfm_sessions.pfs_task_sessions, + pfm_sessions.pfs_sys_sessions, + pfm_sessions.pfs_sys_use_dbregs, + pfm_sessions.pfs_ptrace_use_dbregs); + + UNLOCK_PFS(flags); spin_lock(&pfm_buffer_fmt_lock); list_for_each(pos, &pfm_buffer_fmt_list) { entry = list_entry(pos, pfm_buffer_fmt_t, fmt_list); - p += sprintf(p, "format : %02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x %s\n", - entry->fmt_uuid[0], - entry->fmt_uuid[1], - entry->fmt_uuid[2], - entry->fmt_uuid[3], - entry->fmt_uuid[4], - entry->fmt_uuid[5], - entry->fmt_uuid[6], - entry->fmt_uuid[7], - entry->fmt_uuid[8], - entry->fmt_uuid[9], - entry->fmt_uuid[10], - entry->fmt_uuid[11], - entry->fmt_uuid[12], - entry->fmt_uuid[13], - entry->fmt_uuid[14], - entry->fmt_uuid[15], - entry->fmt_name); + seq_printf(m, "format : %02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x %s\n", + entry->fmt_uuid[0], + entry->fmt_uuid[1], + entry->fmt_uuid[2], + entry->fmt_uuid[3], + entry->fmt_uuid[4], + entry->fmt_uuid[5], + entry->fmt_uuid[6], + entry->fmt_uuid[7], + entry->fmt_uuid[8], + entry->fmt_uuid[9], + entry->fmt_uuid[10], + entry->fmt_uuid[11], + entry->fmt_uuid[12], + entry->fmt_uuid[13], + entry->fmt_uuid[14], + entry->fmt_uuid[15], + entry->fmt_name); } spin_unlock(&pfm_buffer_fmt_lock); - return p - page; } -/* /proc interface, for debug only */ static int -perfmon_read_entry(char *page, char **start, off_t off, int count, int *eof, void *data) +pfm_proc_show(struct seq_file *m, void *v) { - int len = pfm_proc_info(page); + unsigned long psr; + unsigned int i; + int cpu; + + if (v == PFM_PROC_SHOW_HEADER) { + pfm_proc_show_header(m); 
+ return 0; + } - if (len <= off+count) *eof = 1; + /* show info for CPU (v - 1) */ + + cpu = (long)v - 1; + seq_printf(m, + "CPU%-2d overflow intrs : %lu\n" + "CPU%-2d overflow cycles : %lu\n" + "CPU%-2d overflow min : %lu\n" + "CPU%-2d overflow max : %lu\n" + "CPU%-2d smpl handler calls : %lu\n" + "CPU%-2d smpl handler cycles : %lu\n" + "CPU%-2d spurious intrs : %lu\n" + "CPU%-2d replay intrs : %lu\n" + "CPU%-2d syst_wide : %d\n" + "CPU%-2d dcr_pp : %d\n" + "CPU%-2d exclude idle : %d\n" + "CPU%-2d owner : %d\n" + "CPU%-2d context : %p\n" + "CPU%-2d activations : %lu\n", + cpu, pfm_stats[cpu].pfm_ovfl_intr_count, + cpu, pfm_stats[cpu].pfm_ovfl_intr_cycles, + cpu, pfm_stats[cpu].pfm_ovfl_intr_cycles_min, + cpu, pfm_stats[cpu].pfm_ovfl_intr_cycles_max, + cpu, pfm_stats[cpu].pfm_smpl_handler_calls, + cpu, pfm_stats[cpu].pfm_smpl_handler_cycles, + cpu, pfm_stats[cpu].pfm_spurious_ovfl_intr_count, + cpu, pfm_stats[cpu].pfm_replay_ovfl_intr_count, + cpu, pfm_get_cpu_data(pfm_syst_info, cpu) & PFM_CPUINFO_SYST_WIDE ? 1 : 0, + cpu, pfm_get_cpu_data(pfm_syst_info, cpu) & PFM_CPUINFO_DCR_PP ? 1 : 0, + cpu, pfm_get_cpu_data(pfm_syst_info, cpu) & PFM_CPUINFO_EXCL_IDLE ? 1 : 0, + cpu, pfm_get_cpu_data(pmu_owner, cpu) ? pfm_get_cpu_data(pmu_owner, cpu)->pid: -1, + cpu, pfm_get_cpu_data(pmu_ctx, cpu), + cpu, pfm_get_cpu_data(pmu_activation_number, cpu)); + + if (num_online_cpus() == 1 && pfm_sysctl.debug > 0) { - *start = page + off; - len -= off; + psr = pfm_get_psr(); - if (len>count) len = count; - if (len<0) len = 0; + ia64_srlz_d(); - return len; + seq_printf(m, + "CPU%-2d psr : 0x%lx\n" + "CPU%-2d pmc0 : 0x%lx\n", + cpu, psr, + cpu, ia64_get_pmc(0)); + + for (i=0; PMC_IS_LAST(i) == 0; i++) { + if (PMC_IS_COUNTING(i) == 0) continue; + seq_printf(m, + "CPU%-2d pmc%u : 0x%lx\n" + "CPU%-2d pmd%u : 0x%lx\n", + cpu, i, ia64_get_pmc(i), + cpu, i, ia64_get_pmd(i)); + } + } + return 0; } +struct seq_operations pfm_seq_ops = { + .start = pfm_proc_start, + .next = pfm_proc_next, + .stop = pfm_proc_stop, + .show = pfm_proc_show +}; + +static int +pfm_proc_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &pfm_seq_ops); +} + + /* * we come here as soon as local_cpu_data->pfm_syst_wide is set. this happens * during pfm_enable() hence before pfm_start(). 
We cannot assume monitoring @@ -5694,6 +5793,32 @@ pfm_syst_wide_update_task(struct task_struct *task, unsigned long info, int is_c } #ifdef CONFIG_SMP + +static void +pfm_force_cleanup(pfm_context_t *ctx, struct pt_regs *regs) +{ + struct task_struct *task = ctx->ctx_task; + + ia64_psr(regs)->up = 0; + ia64_psr(regs)->sp = 1; + + if (GET_PMU_OWNER() == task) { + DPRINT(("cleared ownership for [%d]\n", ctx->ctx_task->pid)); + SET_PMU_OWNER(NULL, NULL); + } + + /* + * disconnect the task from the context and vice-versa + */ + PFM_SET_WORK_PENDING(task, 0); + + task->thread.pfm_context = NULL; + task->thread.flags &= ~IA64_THREAD_PM_VALID; + + DPRINT(("force cleanup for [%d]\n", task->pid)); +} + + /* * in 2.6, interrupts are masked when we come here and the runqueue lock is held */ @@ -5732,14 +5857,6 @@ pfm_save_regs(struct task_struct *task) return; } - /* - * sanity check - */ - if (ctx->ctx_last_activation != GET_ACTIVATION()) { - pfm_unprotect_ctx_ctxsw(ctx, flags); - return; - } - /* * save current PSR: needed because we modify it */ @@ -5899,6 +6016,7 @@ pfm_load_regs (struct task_struct *task) unsigned long pmc_mask = 0UL, pmd_mask = 0UL; unsigned long flags; u64 psr, psr_up; + int need_irq_resend; ctx = PFM_GET_CTX(task); if (unlikely(ctx == NULL)) return; @@ -5919,6 +6037,8 @@ pfm_load_regs (struct task_struct *task) flags = pfm_protect_ctx_ctxsw(ctx); psr = pfm_get_psr(); + need_irq_resend = pmu_conf->flags & PFM_PMU_IRQ_RESEND; + BUG_ON(psr & (IA64_PSR_UP|IA64_PSR_PP)); BUG_ON(psr & IA64_PSR_I); @@ -5944,8 +6064,8 @@ pfm_load_regs (struct task_struct *task) * stale state. */ if (ctx->ctx_fl_using_dbreg) { - pfm_restore_ibrs(ctx->ctx_ibrs, pmu_conf.num_ibrs); - pfm_restore_dbrs(ctx->ctx_dbrs, pmu_conf.num_dbrs); + pfm_restore_ibrs(ctx->ctx_ibrs, pmu_conf->num_ibrs); + pfm_restore_dbrs(ctx->ctx_dbrs, pmu_conf->num_dbrs); } /* * retrieve saved psr.up @@ -6004,12 +6124,12 @@ pfm_load_regs (struct task_struct *task) ia64_set_pmc(0, t->pmcs[0]); ia64_srlz_d(); t->pmcs[0] = 0UL; -#ifndef CONFIG_MCKINLEY + /* * will replay the PMU interrupt */ - hw_resend_irq(NULL, IA64_PERFMON_VECTOR); -#endif + if (need_irq_resend) hw_resend_irq(NULL, IA64_PERFMON_VECTOR); + pfm_stats[smp_processor_id()].pfm_replay_ovfl_intr_count++; } @@ -6061,6 +6181,7 @@ pfm_load_regs (struct task_struct *task) struct task_struct *owner; unsigned long pmd_mask, pmc_mask; u64 psr, psr_up; + int need_irq_resend; owner = GET_PMU_OWNER(); ctx = PFM_GET_CTX(task); @@ -6079,14 +6200,15 @@ pfm_load_regs (struct task_struct *task) * (not perfmon) by the previous task. */ if (ctx->ctx_fl_using_dbreg) { - pfm_restore_ibrs(ctx->ctx_ibrs, pmu_conf.num_ibrs); - pfm_restore_dbrs(ctx->ctx_dbrs, pmu_conf.num_dbrs); + pfm_restore_ibrs(ctx->ctx_ibrs, pmu_conf->num_ibrs); + pfm_restore_dbrs(ctx->ctx_dbrs, pmu_conf->num_dbrs); } /* * retrieved saved psr.up */ psr_up = ctx->ctx_saved_psr_up; + need_irq_resend = pmu_conf->flags & PFM_PMU_IRQ_RESEND; /* * short path, our state is still there, just @@ -6143,12 +6265,11 @@ pfm_load_regs (struct task_struct *task) t->pmcs[0] = 0UL; -#ifndef CONFIG_MCKINLEY /* * will replay the PMU interrupt */ - hw_resend_irq(NULL, IA64_PERFMON_VECTOR); -#endif + if (need_irq_resend) hw_resend_irq(NULL, IA64_PERFMON_VECTOR); + pfm_stats[smp_processor_id()].pfm_replay_ovfl_intr_count++; } @@ -6184,15 +6305,15 @@ pfm_flush_pmds(struct task_struct *task, pfm_context_t *ctx) */ is_self = ctx->ctx_task == task ? 
1 : 0; -#ifdef CONFIG_SMP - if (task == current) { -#else /* - * in UP, the state can still be in the registers + * can access PMU is task is the owner of the PMU state on the current CPU + * or if we are running on the CPU bound to the context in system-wide mode + * (that is not necessarily the task the context is attached to in this mode). + * In system-wide we always have can_access_pmu true because a task running on an + * invalid processor is flagged earlier in the call stack (see pfm_stop). */ - if (task == current || GET_PMU_OWNER() == task) { -#endif - can_access_pmu = 1; + can_access_pmu = (GET_PMU_OWNER() == task) || (ctx->ctx_fl_system && ctx->ctx_cpu == smp_processor_id()); + if (can_access_pmu) { /* * Mark the PMU as not owned * This will cause the interrupt handler to do nothing in case an overflow @@ -6202,6 +6323,7 @@ pfm_flush_pmds(struct task_struct *task, pfm_context_t *ctx) * on. */ SET_PMU_OWNER(NULL, NULL); + DPRINT(("releasing ownership\n")); /* * read current overflow status: @@ -6222,7 +6344,7 @@ pfm_flush_pmds(struct task_struct *task, pfm_context_t *ctx) */ task->thread.pmcs[0] = 0; } - ovfl_val = pmu_conf.ovfl_val; + ovfl_val = pmu_conf->ovfl_val; /* * we save all the used pmds * we take care of overflows for counting PMDs @@ -6230,6 +6352,9 @@ pfm_flush_pmds(struct task_struct *task, pfm_context_t *ctx) * XXX: sampling situation is not taken into account here */ mask2 = ctx->ctx_used_pmds[0]; + + DPRINT(("is_self=%d ovfl_val=0x%lx mask2=0x%lx\n", is_self, ovfl_val, mask2)); + for (i = 0; mask2; i++, mask2>>=1) { /* skip non used pmds */ @@ -6268,7 +6393,7 @@ pfm_flush_pmds(struct task_struct *task, pfm_context_t *ctx) } } - DPRINT(("[%d] is_self=%d ctx_pmd[%d]=0x%lx pmd_val=0x%lx\n", task->pid, is_self, i, val, pmd_val)); + DPRINT(("[%d] ctx_pmd[%d]=0x%lx pmd_val=0x%lx\n", task->pid, i, val, pmd_val)); if (is_self) task->thread.pmds[i] = pmd_val; @@ -6287,6 +6412,36 @@ static struct irqaction perfmon_irqaction = { */ static int init_pfm_fs(void); +static int __init +pfm_probe_pmu(void) +{ + pmu_config_t **p; + int family; + + family = local_cpu_data->family; + p = pmu_confs; + + while(*p) { + if ((*p)->probe) { + if ((*p)->probe() == 0) goto found; + } else if ((*p)->pmu_family == family || (*p)->pmu_family == 0xff) { + goto found; + } + p++; + } + return -1; +found: + pmu_conf = *p; + return 0; +} + +static struct file_operations pfm_proc_fops = { + .open = pfm_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + int __init pfm_init(void) { @@ -6297,12 +6452,9 @@ pfm_init(void) PFM_VERSION_MIN, IA64_PERFMON_VECTOR); - /* - * PMU type sanity check - * XXX: maybe better to implement autodetection (but then we have a larger kernel) - */ - if (local_cpu_data->family != pmu_conf.pmu_family) { - printk(KERN_INFO "perfmon: disabled, kernel only supports %s PMU family\n", pmu_conf.pmu_name); + if (pfm_probe_pmu()) { + printk(KERN_INFO "perfmon: disabled, there is no support for processor family %d\n", + local_cpu_data->family); return -ENODEV; } @@ -6313,56 +6465,64 @@ pfm_init(void) n = 0; for (i=0; PMC_IS_LAST(i) == 0; i++) { if (PMC_IS_IMPL(i) == 0) continue; - pmu_conf.impl_pmcs[i>>6] |= 1UL << (i&63); + pmu_conf->impl_pmcs[i>>6] |= 1UL << (i&63); n++; } - pmu_conf.num_pmcs = n; + pmu_conf->num_pmcs = n; n = 0; n_counters = 0; for (i=0; PMD_IS_LAST(i) == 0; i++) { if (PMD_IS_IMPL(i) == 0) continue; - pmu_conf.impl_pmds[i>>6] |= 1UL << (i&63); + pmu_conf->impl_pmds[i>>6] |= 1UL << (i&63); n++; if (PMD_IS_COUNTING(i)) n_counters++; 
} - pmu_conf.num_pmds = n; - pmu_conf.num_counters = n_counters; + pmu_conf->num_pmds = n; + pmu_conf->num_counters = n_counters; /* * sanity checks on the number of debug registers */ - if (pmu_conf.use_rr_dbregs) { - if (pmu_conf.num_ibrs > IA64_NUM_DBG_REGS) { - printk(KERN_INFO "perfmon: unsupported number of code debug registers (%u)\n", pmu_conf.num_ibrs); + if (pmu_conf->use_rr_dbregs) { + if (pmu_conf->num_ibrs > IA64_NUM_DBG_REGS) { + printk(KERN_INFO "perfmon: unsupported number of code debug registers (%u)\n", pmu_conf->num_ibrs); + pmu_conf = NULL; return -1; } - if (pmu_conf.num_dbrs > IA64_NUM_DBG_REGS) { - printk(KERN_INFO "perfmon: unsupported number of data debug registers (%u)\n", pmu_conf.num_ibrs); + if (pmu_conf->num_dbrs > IA64_NUM_DBG_REGS) { + printk(KERN_INFO "perfmon: unsupported number of data debug registers (%u)\n", pmu_conf->num_ibrs); + pmu_conf = NULL; return -1; } } printk("perfmon: %s PMU detected, %u PMCs, %u PMDs, %u counters (%lu bits)\n", - pmu_conf.pmu_name, - pmu_conf.num_pmcs, - pmu_conf.num_pmds, - pmu_conf.num_counters, - ffz(pmu_conf.ovfl_val)); + pmu_conf->pmu_name, + pmu_conf->num_pmcs, + pmu_conf->num_pmds, + pmu_conf->num_counters, + ffz(pmu_conf->ovfl_val)); /* sanity check */ - if (pmu_conf.num_pmds >= IA64_NUM_PMD_REGS || pmu_conf.num_pmcs >= IA64_NUM_PMC_REGS) { + if (pmu_conf->num_pmds >= IA64_NUM_PMD_REGS || pmu_conf->num_pmcs >= IA64_NUM_PMC_REGS) { printk(KERN_ERR "perfmon: not enough pmc/pmd, perfmon disabled\n"); + pmu_conf = NULL; return -1; } /* * create /proc/perfmon (mostly for debugging purposes) */ - perfmon_dir = create_proc_read_entry ("perfmon", 0, 0, perfmon_read_entry, NULL); + perfmon_dir = create_proc_entry("perfmon", S_IRUGO, NULL); if (perfmon_dir == NULL) { printk(KERN_ERR "perfmon: cannot create /proc entry, perfmon disabled\n"); + pmu_conf = NULL; return -1; } + /* + * install customized file operations for /proc/perfmon entry + */ + perfmon_dir->proc_fops = &pfm_proc_fops; /* * create /proc/sys/kernel/perfmon (for debugging purposes) @@ -6379,9 +6539,6 @@ pfm_init(void) for(i=0; i < NR_CPUS; i++) pfm_stats[i].pfm_ovfl_intr_cycles_min = ~0UL; - /* we are all set */ - pmu_conf.enabled = 1; - return 0; } @@ -6393,8 +6550,6 @@ __initcall(pfm_init); void pfm_init_percpu (void) { - int i; - /* * make sure no measurement is active * (may inherit programmed PMCs from EFI). @@ -6412,28 +6567,6 @@ pfm_init_percpu (void) ia64_setreg(_IA64_REG_CR_PMV, IA64_PERFMON_VECTOR); ia64_srlz_d(); - - /* - * we first initialize the PMU to a stable state. - * the values may have been changed from their power-up - * values by software executed before the kernel took over. - * - * At this point, pmu_conf has not yet been initialized - * - * On McKinley, this code is ineffective until PMC4 is initialized - * but that's all right because we take care of pmc0 later. - * - * XXX: potential problems with pmc1. - */ - for (i=1; PMC_IS_LAST(i) == 0; i++) { - if (PMC_IS_IMPL(i) == 0) continue; - ia64_set_pmc(i, PMC_DFL_VAL(i)); - } - - for (i=0; PMD_IS_LAST(i) == 0; i++) { - if (PMD_IS_IMPL(i) == 0) continue; - ia64_set_pmd(i, 0UL); - } } /* @@ -6538,8 +6671,7 @@ pfm_inherit(struct task_struct *task, struct pt_regs *regs) } #else /* !CONFIG_PERFMON */ asmlinkage long -sys_perfmonctl (int fd, int cmd, void *arg, int count, long arg5, long arg6, long arg7, - long arg8, long stack) +sys_perfmonctl (int fd, int cmd, void *arg, int count) { return -ENOSYS; }
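
For reference, the reworked entry point above takes only the four arguments user space actually supplies (fd, cmd, arg, count) and obtains the register frame with ia64_task_regs(current) instead of reading it off the stack. A minimal user-space sketch of that calling convention follows; it is an illustration only, not part of the patch, and it assumes the ia64 perfmon-2.x headers (pfarg_context_t and PFM_CREATE_CONTEXT, typically via <perfmon.h>) plus __NR_perfmonctl from the ia64 unistd.h, since glibc ships no dedicated wrapper for this syscall.

/*
 * Illustration only: create a perfmon-2.x context through the
 * 4-argument sys_perfmonctl() entry point shown in this patch.
 * Assumes <perfmon.h> declares pfarg_context_t and PFM_CREATE_CONTEXT,
 * and that __NR_perfmonctl is defined on ia64.
 */
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <perfmon.h>

int create_pfm_context(void)
{
	pfarg_context_t ctx;

	memset(&ctx, 0, sizeof(ctx));

	/* fd is not used by PFM_CREATE_CONTEXT; count is one pfarg_context_t */
	if (syscall(__NR_perfmonctl, 0, PFM_CREATE_CONTEXT, &ctx, 1) == -1)
		return -1;

	/* the kernel hands back the new context file descriptor in ctx_fd */
	return ctx.ctx_fd;
}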