Index: linux-2.6.27.y/arch/Kconfig
===================================================================
--- linux-2.6.27.y.orig/arch/Kconfig
+++ linux-2.6.27.y/arch/Kconfig
@@ -13,9 +13,18 @@ config OPROFILE
 
 	  If unsure, say N.
 
+config CHOPSTIX
+	bool "Chopstix (PlanetLab)"
+	depends on MODULES && OPROFILE
+	help
+	  Chopstix allows you to monitor various events by summarizing them
+	  in lossy data structures and transferring these data structures
+	  into user space. If in doubt, say "N".
+
 config HAVE_OPROFILE
 	def_bool n
 
+
 config KPROBES
 	bool "Kprobes"
 	depends on KALLSYMS && MODULES
Index: linux-2.6.27.y/arch/x86/kernel/asm-offsets_32.c
===================================================================
--- linux-2.6.27.y.orig/arch/x86/kernel/asm-offsets_32.c
+++ linux-2.6.27.y/arch/x86/kernel/asm-offsets_32.c
@@ -9,6 +9,7 @@
 #include
 #include
 #include
+#include <linux/arrays.h>
 #include
 #include
 #include "sigframe.h"
@@ -24,9 +25,20 @@
 #include
 #include "../../../drivers/lguest/lg.h"
 
+
+#define STACKOFFSET(sym, str, mem) \
+	DEFINE(sym, offsetof(struct str, mem)-sizeof(struct str));
+
 /* workaround for a warning with -Wmissing-prototypes */
 void foo(void);
 
+struct event_spec {
+	unsigned long pc;
+	unsigned long dcookie;
+	unsigned count;
+	unsigned int number;
+};
+
 void foo(void)
 {
 	OFFSET(IA32_SIGCONTEXT_ax, sigcontext, ax);
@@ -50,6 +62,16 @@ void foo(void)
 	OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id);
 	BLANK();
 
+	STACKOFFSET(TASK_thread, task_struct, thread);
+	STACKOFFSET(THREAD_esp, thread_struct, sp);
+	STACKOFFSET(EVENT_event_data, event, event_data);
+	STACKOFFSET(EVENT_task, event, task);
+	STACKOFFSET(EVENT_event_type, event, event_type);
+	STACKOFFSET(SPEC_number, event_spec, number);
+	DEFINE(EVENT_SIZE, sizeof(struct event));
+	DEFINE(SPEC_SIZE, sizeof(struct event_spec));
+	DEFINE(SPEC_EVENT_SIZE, sizeof(struct event_spec)+sizeof(struct event));
+
 	OFFSET(TI_task, thread_info, task);
 	OFFSET(TI_exec_domain, thread_info, exec_domain);
 	OFFSET(TI_flags, thread_info, flags);
Index: linux-2.6.27.y/arch/x86/kernel/entry_32.S
===================================================================
--- linux-2.6.27.y.orig/arch/x86/kernel/entry_32.S
+++ linux-2.6.27.y/arch/x86/kernel/entry_32.S
@@ -426,6 +426,33 @@ ENTRY(system_call)
 	cmpl $(nr_syscalls), %eax
 	jae syscall_badsys
 syscall_call:
+	/* Move Chopstix syscall probe here */
+	/* Save and clobber: eax, ecx, ebp */
+	pushl %eax
+	pushl %ecx
+	pushl %ebp
+	movl %esp, %ebp
+	subl $SPEC_EVENT_SIZE, %esp
+	movl rec_event, %ecx
+	testl %ecx, %ecx
+	jz carry_on
+	# struct event is first, just below %ebp
+	movl %eax, (SPEC_number-EVENT_SIZE)(%ebp)
+	leal -SPEC_EVENT_SIZE(%ebp), %eax
+	movl %eax, EVENT_event_data(%ebp)
+	movl $6, EVENT_event_type(%ebp)
+	movl rec_event, %edx
+	movl $1, 4(%esp)
+	leal -EVENT_SIZE(%ebp), %eax
+	movl %eax, (%esp)
+	call rec_event_asm
+carry_on:
+	addl $SPEC_EVENT_SIZE, %esp
+	popl %ebp
+	popl %ecx
+	popl %eax
+	/* End chopstix */
+
 	call *sys_call_table(,%eax,4)
 	movl %eax,PT_EAX(%esp)		# store the return value
 syscall_exit:
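The assembly probe above is the system-call event source: when a consumer has installed rec_event, it builds a struct event and a struct event_spec on the stack, stores the syscall number from %eax into the spec, tags the event with type 6, and calls rec_event_asm; otherwise it falls straight through to the dispatch. A rough C sketch of the same control flow follows. The helper name chopstix_syscall_probe and the local event_spec layout (mirrored from asm-offsets_32.c) are illustrative only; the patch performs these steps directly in assembly so they run before the syscall table lookup.

#include <linux/arrays.h>

extern void (*rec_event)(void *, unsigned int);
asmlinkage void rec_event_asm(struct event *event, unsigned int count);

struct event_spec {			/* layout mirrored from asm-offsets_32.c */
	unsigned long pc;
	unsigned long dcookie;
	unsigned count;
	unsigned int number;
};

static void chopstix_syscall_probe(unsigned long syscall_nr)	/* illustrative name */
{
	struct event_spec spec;
	struct event event;

	if (!rec_event)			/* no consumer loaded: skip straight to the syscall */
		return;

	spec.number = syscall_nr;	/* %eax at system-call entry */
	event.event_data = &spec;
	event.event_type = 6;		/* event slot used for system calls */
	rec_event_asm(&event, 1);	/* fills in task and pc, then calls *rec_event */
}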
Index: linux-2.6.27.y/arch/x86/mm/fault.c
===================================================================
--- linux-2.6.27.y.orig/arch/x86/mm/fault.c
+++ linux-2.6.27.y/arch/x86/mm/fault.c
@@ -79,6 +79,15 @@ static inline int notify_page_fault(stru
 #endif
 }
 
+
+extern void (*rec_event)(void *,unsigned int);
+struct event_spec {
+	unsigned long pc;
+	unsigned long dcookie;
+	unsigned count;
+	unsigned char reason;
+};
+
 /*
  * X86_32
  * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
Index: linux-2.6.27.y/drivers/oprofile/cpu_buffer.c
===================================================================
--- linux-2.6.27.y.orig/drivers/oprofile/cpu_buffer.c
+++ linux-2.6.27.y/drivers/oprofile/cpu_buffer.c
@@ -21,6 +21,7 @@
 #include
 #include
 #include
+#include <linux/arrays.h>
 
 #include "event_buffer.h"
 #include "cpu_buffer.h"
@@ -147,6 +148,17 @@ static void increment_head(struct oprofi
 		b->head_pos = 0;
 }
 
+#ifdef CONFIG_CHOPSTIX
+
+struct event_spec {
+	unsigned int pc;
+	unsigned long dcookie;
+	unsigned count;
+};
+
+extern void (*rec_event)(void *,unsigned int);
+#endif
+
 static inline void
 add_sample(struct oprofile_cpu_buffer * cpu_buf,
 	   unsigned long pc, unsigned long event)
@@ -155,6 +167,7 @@ add_sample(struct oprofile_cpu_buffer *
 	entry->eip = pc;
 	entry->event = event;
 	increment_head(cpu_buf);
+
 }
 
 static inline void
@@ -250,8 +263,28 @@ void oprofile_add_sample(struct pt_regs
 {
 	int is_kernel = !user_mode(regs);
 	unsigned long pc = profile_pc(regs);
+	int res=0;
 
+#ifdef CONFIG_CHOPSTIX
+	if (rec_event) {
+		struct event esig;
+		struct event_spec espec;
+		esig.task = current;
+		espec.pc=pc;
+		espec.count=1;
+		esig.event_data=&espec;
+		esig.event_type=event; /* index in the event array currently set up */
+		/* make sure the counters are loaded in the order we want them to show up*/
+		(*rec_event)(&esig, 1);
+	}
+	else {
 	oprofile_add_ext_sample(pc, regs, event, is_kernel);
+	}
+#else
+	oprofile_add_ext_sample(pc, regs, event, is_kernel);
+#endif
+
+
 }
 
 void oprofile_add_pc(unsigned long pc, int is_kernel, unsigned long event)
Index: linux-2.6.27.y/fs/bio.c
===================================================================
--- linux-2.6.27.y.orig/fs/bio.c
+++ linux-2.6.27.y/fs/bio.c
@@ -27,6 +27,7 @@
 #include
 #include
 #include		/* for struct sg_iovec */
+#include <linux/arrays.h>
 
 static struct kmem_cache *bio_slab __read_mostly;
 
@@ -44,6 +45,7 @@ static struct biovec_slab bvec_slabs[BIO
 };
 #undef BV
 
+
 /*
  * fs_bio_set is the bio_set containing bio and iovec memory pools used by
  * IO code that does not need private memory pools.
@@ -1171,6 +1173,14 @@ void bio_check_pages_dirty(struct bio *b
 	}
 }
 
+struct event_spec {
+	unsigned long pc;
+	unsigned long dcookie;
+	unsigned count;
+	unsigned char reason;
+};
+
+extern void (*rec_event)(void *,unsigned int);
 /**
  * bio_endio - end I/O on a bio
  * @bio:	bio
@@ -1192,6 +1202,24 @@ void bio_endio(struct bio *bio, int erro
 	else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
 		error = -EIO;
 
+#if 0
+	if (rec_event) {
+		struct event event;
+		struct event_spec espec;
+		unsigned long eip;
+
+		espec.reason = 1;/*response */
+
+		eip = bio->bi_end_io;
+		event.event_data=&espec;
+		espec.pc=eip;
+		event.event_type=3;
+		/* index in the event array currently set up */
+		/* make sure the counters are loaded in the order we want them to show up*/
+		(*rec_event)(&event, bytes_done);
+	}
+#endif
+
 	if (bio->bi_end_io)
 		bio->bi_end_io(bio, error);
 }
Index: linux-2.6.27.y/fs/exec.c
===================================================================
--- linux-2.6.27.y.orig/fs/exec.c
+++ linux-2.6.27.y/fs/exec.c
@@ -27,6 +27,7 @@
 #include
 #include
 #include
+#include <linux/dcookies.h>
 #include
 #include
 #include
@@ -698,6 +699,13 @@ struct file *open_exec(const char *name)
 		goto out;
 	}
 
+	#ifdef CONFIG_CHOPSTIX
+	unsigned long cookie;
+	extern void (*rec_event)(void *, unsigned int);
+	if (rec_event && !nd.path.dentry->d_cookie)
+		get_dcookie(&nd.path, &cookie);
+	#endif
+
 	return file;
 
  out_path_put:
Index: linux-2.6.27.y/include/linux/arrays.h
===================================================================
--- /dev/null
+++ linux-2.6.27.y/include/linux/arrays.h
@@ -0,0 +1,36 @@
+#ifndef __ARRAYS_H__
+#define __ARRAYS_H__
+#include <linux/list.h>
+
+#define SAMPLING_METHOD_DEFAULT 0
+#define SAMPLING_METHOD_LOG 1
+
+/* Every probe has an array handler */
+
+/* XXX - Optimize this structure */
+
+extern void (*rec_event)(void *,unsigned int);
+struct array_handler {
+	struct list_head link;
+	unsigned int (*hash_func)(void *);
+	unsigned int (*sampling_func)(void *,int,void *);
+	unsigned short size;
+	unsigned int threshold;
+	unsigned char **expcount;
+	unsigned int sampling_method;
+	unsigned int **arrays;
+	unsigned int arraysize;
+	unsigned int num_samples[2];
+	void **epoch_samples; /* size-sized lists of samples */
+	unsigned int (*serialize)(void *, void *);
+	unsigned char code[5];
+};
+
+struct event {
+	struct list_head link;
+	void *event_data;
+	unsigned int count;
+	unsigned int event_type;
+	struct task_struct *task;
+};
+#endif
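The new include/linux/arrays.h is the contract between these in-kernel probes and the out-of-tree Chopstix module: every probe fills in a struct event (usually pointing at a probe-specific struct event_spec) and calls through the rec_event function pointer, which the module installs at load time and which kernel/sched.c exports. A minimal consumer could look like the sketch below; the module name and the flat per-type counters are illustrative only, since the real module feeds samples into the lossy struct array_handler structures instead.

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/arrays.h>

extern void (*rec_event)(void *, unsigned int);

static unsigned long event_counts[16];	/* illustrative: one counter per event_type */

static void demo_rec_event(void *data, unsigned int count)
{
	struct event *ev = data;

	if (ev->event_type < ARRAY_SIZE(event_counts))
		event_counts[ev->event_type] += count;
}

static int __init demo_init(void)
{
	rec_event = demo_rec_event;	/* probes start firing once this is non-NULL */
	return 0;
}

static void __exit demo_exit(void)
{
	rec_event = NULL;		/* every probe checks for NULL before recording */
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");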
Index: linux-2.6.27.y/include/linux/sched.h.rej
===================================================================
--- /dev/null
+++ linux-2.6.27.y/include/linux/sched.h.rej
@@ -0,0 +1,19 @@
+***************
+*** 850,855 ****
+  #endif
+  	unsigned long sleep_avg;
+  	unsigned long long timestamp, last_ran;
+  	unsigned long long sched_time; /* sched_clock time spent running */
+  	enum sleep_type sleep_type;
+  
+--- 850,859 ----
+  #endif
+  	unsigned long sleep_avg;
+  	unsigned long long timestamp, last_ran;
++ #ifdef CONFIG_CHOPSTIX
++ 	unsigned long last_interrupted, last_ran_j;
++ #endif
++ 
+  	unsigned long long sched_time; /* sched_clock time spent running */
+  	enum sleep_type sleep_type;
+  
Index: linux-2.6.27.y/kernel/sched.c
===================================================================
--- linux-2.6.27.y.orig/kernel/sched.c
+++ linux-2.6.27.y/kernel/sched.c
@@ -10,7 +10,7 @@
  *  1998-11-19	Implemented schedule_timeout() and related stuff
  *		by Andrea Arcangeli
  *  2002-01-04	New ultra-scalable O(1) scheduler by Ingo Molnar:
- *		hybrid priority-list and round-robin design with
+ *		hybrid priority-list and round-robin design with
  *		an array-switch method of distributing timeslices
  *		and per-CPU runqueues.  Cleanups and useful suggestions
  *		by Davide Libenzi, preemptible kernel bits by Robert Love.
@@ -73,12 +73,16 @@
 #include
 #include
 #include
+#include <linux/arrays.h>
 #include
 #include
 #include
 
 #include "sched_cpupri.h"
 
+#define INTERRUPTIBLE   -1
+#define RUNNING         0
+
 /*
  * Convert user-nice values [ -20 ... 0 ... 19 ]
  * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@ -4436,6 +4440,29 @@ pick_next_task(struct rq *rq, struct tas
 	}
 }
 
+void (*rec_event)(void *,unsigned int) = NULL;
+EXPORT_SYMBOL(rec_event);
+#ifdef CONFIG_CHOPSTIX
+
+struct event_spec {
+	unsigned long pc;
+	unsigned long dcookie;
+	unsigned int count;
+	unsigned int reason;
+};
+
+/* To support safe calling from asm */
+asmlinkage void rec_event_asm (struct event *event_signature_in, unsigned int count) {
+	struct pt_regs *regs;
+	struct event_spec *es = event_signature_in->event_data;
+	regs = task_pt_regs(current);
+	event_signature_in->task=current;
+	es->pc=regs->ip;
+	event_signature_in->count=1;
+	(*rec_event)(event_signature_in, count);
+}
+#endif
+
 /*
  * schedule() is the main scheduler function.
  */
@@ -5382,6 +5409,7 @@ long sched_setaffinity(pid_t pid, const
 	get_task_struct(p);
 	read_unlock(&tasklist_lock);
 
+	retval = -EPERM;
 	if ((current->euid != p->euid) && (current->euid != p->uid) &&
 			!capable(CAP_SYS_NICE))
Index: linux-2.6.27.y/kernel/sched.c.rej
===================================================================
--- /dev/null
+++ linux-2.6.27.y/kernel/sched.c.rej
@@ -0,0 +1,258 @@
+***************
+*** 23,28 ****
+  #include
+  #include
+  #include
+  #include
+  #include
+  #include
+--- 23,29 ----
+  #include
+  #include
+  #include
++ #include <linux/arrays.h>
+  #include
+  #include
+  #include
+***************
+*** 451,456 ****
+  
+  repeat_lock_task:
+  	rq = task_rq(p);
+  	spin_lock(&rq->lock);
+  	if (unlikely(rq != task_rq(p))) {
+  		spin_unlock(&rq->lock);
+--- 455,461 ----
+  
+  repeat_lock_task:
+  	rq = task_rq(p);
++ 
+  	spin_lock(&rq->lock);
+  	if (unlikely(rq != task_rq(p))) {
+  		spin_unlock(&rq->lock);
+***************
+*** 1761,1766 ****
+  	 * event cannot wake it up and insert it on the runqueue either.
+  	 */
+  	p->state = TASK_RUNNING;
+  
+  	/*
+  	 * Make sure we do not leak PI boosting priority to the child:
+--- 1766,1786 ----
+  	 * event cannot wake it up and insert it on the runqueue either.
+  	 */
+  	p->state = TASK_RUNNING;
++ #ifdef CONFIG_CHOPSTIX
++ 	/* The jiffy of last interruption */
++ 	if (p->state & TASK_UNINTERRUPTIBLE) {
++ 		p->last_interrupted=jiffies;
++ 	}
++ 	else
++ 	if (p->state & TASK_INTERRUPTIBLE) {
++ 		p->last_interrupted=INTERRUPTIBLE;
++ 	}
++ 	else
++ 		p->last_interrupted=RUNNING;
++ 
++ 	/* The jiffy of last execution */
++ 	p->last_ran_j=jiffies;
++ #endif
+  
+  	/*
+  	 * Make sure we do not leak PI boosting priority to the child:
+***************
+*** 3628,3633 ****
+  
+  #endif
+  
+  static inline int interactive_sleep(enum sleep_type sleep_type)
+  {
+  	return (sleep_type == SLEEP_INTERACTIVE ||
+--- 3648,3654 ----
+  
+  #endif
+  
++ 
+  static inline int interactive_sleep(enum sleep_type sleep_type)
+  {
+  	return (sleep_type == SLEEP_INTERACTIVE ||
+***************
+*** 3637,3652 ****
+  /*
+   * schedule() is the main scheduler function.
+   */
+  asmlinkage void __sched schedule(void)
+  {
+  	struct task_struct *prev, *next;
+  	struct prio_array *array;
+  	struct list_head *queue;
+  	unsigned long long now;
+- 	unsigned long run_time;
+  	int cpu, idx, new_prio;
+  	long *switch_count;
+  	struct rq *rq;
+  
+  	/*
+  	 * Test if we are atomic.  Since do_exit() needs to call into
+--- 3658,3685 ----
+  /*
+   * schedule() is the main scheduler function.
+   */
++ 
++ #ifdef CONFIG_CHOPSTIX
++ extern void (*rec_event)(void *,unsigned int);
++ struct event_spec {
++ 	unsigned long pc;
++ 	unsigned long dcookie;
++ 	unsigned int count;
++ 	unsigned int reason;
++ };
++ #endif
++ 
+  asmlinkage void __sched schedule(void)
+  {
+  	struct task_struct *prev, *next;
+  	struct prio_array *array;
+  	struct list_head *queue;
+  	unsigned long long now;
++ 	unsigned long run_time, diff;
+  	int cpu, idx, new_prio;
+  	long *switch_count;
+  	struct rq *rq;
++ 	int sampling_reason;
+  
+  	/*
+  	 * Test if we are atomic.  Since do_exit() needs to call into
+***************
+*** 3700,3705 ****
+  	switch_count = &prev->nivcsw;
+  	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
+  		switch_count = &prev->nvcsw;
+  		if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
+  				unlikely(signal_pending(prev))))
+  			prev->state = TASK_RUNNING;
+--- 3733,3739 ----
+  	switch_count = &prev->nivcsw;
+  	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
+  		switch_count = &prev->nvcsw;
++ 
+  		if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
+  				unlikely(signal_pending(prev))))
+  			prev->state = TASK_RUNNING;
+***************
+*** 3709,3714 ****
+  				vx_uninterruptible_inc(prev);
+  			}
+  			deactivate_task(prev, rq);
+  		}
+  	}
+  
+--- 3743,3759 ----
+  				vx_uninterruptible_inc(prev);
+  			}
+  			deactivate_task(prev, rq);
++ #ifdef CONFIG_CHOPSTIX
++ 			/* An uninterruptible process just yielded. Record the current jiffy */
++ 			if (prev->state & TASK_UNINTERRUPTIBLE) {
++ 				prev->last_interrupted=jiffies;
++ 			}
++ 			/* An interruptible process just yielded, or it got preempted.
++ 			 * Mark it as interruptible */
++ 			else if (prev->state & TASK_INTERRUPTIBLE) {
++ 				prev->last_interrupted=INTERRUPTIBLE;
++ 			}
++ #endif
+  		}
+  	}
+  
+***************
+*** 3785,3790 ****
+  	prev->sleep_avg = 0;
+  	prev->timestamp = prev->last_ran = now;
+  
+  	sched_info_switch(prev, next);
+  	if (likely(prev != next)) {
+  		next->timestamp = next->last_ran = now;
+--- 3830,3869 ----
+  	prev->sleep_avg = 0;
+  	prev->timestamp = prev->last_ran = now;
+  
++ #ifdef CONFIG_CHOPSTIX
++ 	/* Run only if the Chopstix module so decrees it */
++ 	if (rec_event) {
++ 		prev->last_ran_j = jiffies;
++ 		if (next->last_interrupted!=INTERRUPTIBLE) {
++ 			if (next->last_interrupted!=RUNNING) {
++ 				diff = (jiffies-next->last_interrupted);
++ 				sampling_reason = 0;/* BLOCKING */
++ 			}
++ 			else {
++ 				diff = jiffies-next->last_ran_j;
++ 				sampling_reason = 1;/* PREEMPTION */
++ 			}
++ 
++ 			if (diff >= HZ/10) {
++ 				struct event event;
++ 				struct event_spec espec;
++ 				struct pt_regs *regs;
++ 				regs = task_pt_regs(current);
++ 
++ 				espec.reason = sampling_reason;
++ 				event.event_data=&espec;
++ 				event.task=next;
++ 				espec.pc=regs->eip;
++ 				event.event_type=2;
++ 				/* index in the event array currently set up */
++ 				/* make sure the counters are loaded in the order we want them to show up*/
++ 				(*rec_event)(&event, diff);
++ 			}
++ 		}
++ 		/* next has been elected to run */
++ 		next->last_interrupted=0;
++ 	}
++ #endif
+  	sched_info_switch(prev, next);
+  	if (likely(prev != next)) {
+  		next->timestamp = next->last_ran = now;
+***************
+*** 5737,5742 ****
+  	jiffies_to_timespec(p->policy == SCHED_FIFO ?
+  				0 : task_timeslice(p), &t);
+  	read_unlock(&tasklist_lock);
+  	retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
+  out_nounlock:
+  	return retval;
+--- 5817,5823 ----
+  	jiffies_to_timespec(p->policy == SCHED_FIFO ?
+  				0 : task_timeslice(p), &t);
+  	read_unlock(&tasklist_lock);
++ 
+  	retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
+  out_nounlock:
+  	return retval;
+***************
+*** 7980,7982 ****
+  }
+  
+  #endif
+--- 8061,8080 ----
+  }
+  
+  #endif
++ 
++ #ifdef CONFIG_CHOPSTIX
++ void (*rec_event)(void *,unsigned int) = NULL;
++ 
++ /* To support safe calling from asm */
++ asmlinkage void rec_event_asm (struct event *event_signature_in, unsigned int count) {
++ 	struct pt_regs *regs;
++ 	struct event_spec *es = event_signature_in->event_data;
++ 	regs = task_pt_regs(current);
++ 	event_signature_in->task=current;
++ 	es->pc=regs->eip;
++ 	event_signature_in->count=1;
++ 	(*rec_event)(event_signature_in, count);
++ }
++ EXPORT_SYMBOL(rec_event);
++ EXPORT_SYMBOL(in_sched_functions);
++ #endif
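The scheduler hunks above implement delay sampling: the fork path and schedule() stamp each task with last_interrupted (the jiffy at which it blocked, or the INTERRUPTIBLE/RUNNING sentinels) and last_ran_j (the jiffy at which it last ran), and when a task is switched back in, the scheduler computes how long it waited, classifies the wait as blocking or preemption, and records an event of type 2 only if the delay reaches HZ/10 (100 ms). The decision logic, distilled into a stand-alone C sketch (the function name is illustrative; the real code lives inline in schedule() as shown above):

/* Returns the delay in jiffies to report for 'next', or 0 if nothing should
 * be recorded.  *reason is 0 for a blocking delay, 1 for preemption. */
static unsigned long chopstix_sched_delay(struct task_struct *next, int *reason)
{
	unsigned long diff;

	if (next->last_interrupted == INTERRUPTIBLE)
		return 0;				/* voluntarily sleeping: not a tracked delay */

	if (next->last_interrupted != RUNNING) {
		diff = jiffies - next->last_interrupted;	/* time spent blocked */
		*reason = 0;				/* BLOCKING */
	} else {
		diff = jiffies - next->last_ran_j;	/* time spent runnable but off-CPU */
		*reason = 1;				/* PREEMPTION */
	}

	return diff >= HZ / 10 ? diff : 0;		/* only delays of at least 100 ms are sampled */
}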
Index: linux-2.6.27.y/mm/memory.c
===================================================================
--- linux-2.6.27.y.orig/mm/memory.c
+++ linux-2.6.27.y/mm/memory.c
@@ -61,6 +61,7 @@
 #include
 
 #include
+#include <linux/arrays.h>
 
 #include "internal.h"
 
@@ -2753,6 +2754,15 @@ out:
 	return ret;
 }
 
+extern void (*rec_event)(void *,unsigned int);
+struct event_spec {
+	unsigned long pc;
+	unsigned long dcookie;
+	unsigned count;
+	unsigned char reason;
+};
+
+
 /*
  * By the time we get here, we already hold the mm semaphore
  */
@@ -2782,6 +2792,24 @@ int handle_mm_fault(struct mm_struct *mm
 	if (!pte)
 		return VM_FAULT_OOM;
 
+#ifdef CONFIG_CHOPSTIX
+	if (rec_event) {
+		struct event event;
+		struct event_spec espec;
+		struct pt_regs *regs;
+		unsigned int pc;
+		regs = task_pt_regs(current);
+		pc = regs->ip & (unsigned int) ~4095;
+
+		espec.reason = 0; /* alloc */
+		event.event_data=&espec;
+		event.task = current;
+		espec.pc=pc;
+		event.event_type=5;
+		(*rec_event)(&event, 1);
+	}
+#endif
+
 	return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
 }
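In the page-fault probe above, the program counter is coarsened before it is recorded: masking off the low 12 bits (regs->ip & ~4095) keeps only the 4 KiB code page of the faulting instruction, so samples aggregate per page rather than per instruction. A one-line illustration of that masking (the helper name is illustrative, not part of the patch):

static inline unsigned long chopstix_fault_page(unsigned long ip)
{
	return ip & ~4095UL;	/* 4095 == PAGE_SIZE - 1 on x86 */
}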
Index: linux-2.6.27.y/mm/slab.c
===================================================================
--- linux-2.6.27.y.orig/mm/slab.c
+++ linux-2.6.27.y/mm/slab.c
@@ -110,6 +110,7 @@
 #include
 #include
 #include
+#include <linux/arrays.h>
 #include
 #include
 
@@ -248,6 +249,14 @@ struct slab_rcu {
 	void *addr;
 };
 
+extern void (*rec_event)(void *,unsigned int);
+struct event_spec {
+	unsigned long pc;
+	unsigned long dcookie;
+	unsigned count;
+	unsigned char reason;
+};
+
 /*
  * struct array_cache
  *
@@ -3469,6 +3478,19 @@ __cache_alloc(struct kmem_cache *cachep,
 	local_irq_restore(save_flags);
 	objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
 	prefetchw(objp);
+#ifdef CONFIG_CHOPSTIX
+	if (rec_event && objp) {
+		struct event event;
+		struct event_spec espec;
+
+		espec.reason = 0; /* alloc */
+		event.event_data=&espec;
+		event.task = current;
+		espec.pc=caller;
+		event.event_type=5;
+		(*rec_event)(&event, cachep->buffer_size);
+	}
+#endif
 
 	if (unlikely((flags & __GFP_ZERO) && objp))
 		memset(objp, 0, obj_size(cachep));
@@ -3578,12 +3600,26 @@ free_done:
  * Release an obj back to its cache. If the obj has a constructed state, it must
  * be in this state _before_ it is released.  Called with disabled ints.
  */
-static inline void __cache_free(struct kmem_cache *cachep, void *objp)
+static inline void __cache_free(struct kmem_cache *cachep, void *objp, void *caller)
 {
 	struct array_cache *ac = cpu_cache_get(cachep);
 
 	check_irq_off();
-	objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
+	objp = cache_free_debugcheck(cachep, objp, caller);
+	#ifdef CONFIG_CHOPSTIX
+	if (rec_event && objp) {
+		struct event event;
+		struct event_spec espec;
+
+		espec.reason = 1; /* free */
+		event.event_data=&espec;
+		event.task = current;
+		espec.pc=caller;
+		event.event_type=4;
+		(*rec_event)(&event, cachep->buffer_size);
+	}
+	#endif
+
 	vx_slab_free(cachep);
 
 	/*
@@ -3714,6 +3750,7 @@ static __always_inline void *__do_kmallo
 					  void *caller)
 {
 	struct kmem_cache *cachep;
+	void *ret;
 
 	/* If you want to save a few bytes .text space: replace
 	 * __ with kmem_.
@@ -3741,10 +3778,17 @@ void *__kmalloc_track_caller(size_t size
 EXPORT_SYMBOL(__kmalloc_track_caller);
 
 #else
+#ifdef CONFIG_CHOPSTIX
+void *__kmalloc(size_t size, gfp_t flags)
+{
+	return __do_kmalloc(size, flags, __builtin_return_address(0));
+}
+#else
 void *__kmalloc(size_t size, gfp_t flags)
 {
 	return __do_kmalloc(size, flags, NULL);
 }
+#endif
 EXPORT_SYMBOL(__kmalloc);
 #endif
 
@@ -3764,7 +3808,7 @@ void kmem_cache_free(struct kmem_cache *
 	debug_check_no_locks_freed(objp, obj_size(cachep));
 	if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
 		debug_check_no_obj_freed(objp, obj_size(cachep));
-	__cache_free(cachep, objp);
+	__cache_free(cachep, objp, __builtin_return_address(0));
 	local_irq_restore(flags);
 }
 EXPORT_SYMBOL(kmem_cache_free);
@@ -3790,7 +3834,7 @@ void kfree(const void *objp)
 	c = virt_to_cache(objp);
 	debug_check_no_locks_freed(objp, obj_size(c));
 	debug_check_no_obj_freed(objp, obj_size(c));
-	__cache_free(c, (void *)objp);
+	__cache_free(c, (void *)objp, __builtin_return_address(0));
 	local_irq_restore(flags);
 }
 EXPORT_SYMBOL(kfree);
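Taken together, the probes in this patch use a small, fixed set of event_type codes, while the oprofile hook forwards the hardware counter index straight through as the type. The names below are illustrative only; the patch itself uses bare integers:

enum chopstix_event_type {			/* illustrative names */
	CHOPSTIX_EVENT_SCHED_DELAY  = 2,	/* kernel/sched.c.rej: blocked or preempted for >= HZ/10 */
	CHOPSTIX_EVENT_BIO_RESPONSE = 3,	/* fs/bio.c: bio_endio probe, currently disabled with #if 0 */
	CHOPSTIX_EVENT_SLAB_FREE    = 4,	/* mm/slab.c: __cache_free */
	CHOPSTIX_EVENT_ALLOC        = 5,	/* mm/slab.c: __cache_alloc and mm/memory.c: handle_mm_fault */
	CHOPSTIX_EVENT_SYSCALL      = 6,	/* arch/x86/kernel/entry_32.S: system_call entry */
};
/* drivers/oprofile/cpu_buffer.c passes the oprofile counter number as the event_type. */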