drivers/oprofile/buffer_sync.c

   1 /**
   2  * @file buffer_sync.c
   3  *
   4  * @remark Copyright 2002 OProfile authors
   5  * @remark Read the file COPYING
   6  *
   7  * @author John Levon <levon@movementarian.org>
   8  *
   9  * This is the core of the buffer management. Each
  10  * CPU buffer is processed and entered into the
  11  * global event buffer. Such processing is necessary
  12  * in several circumstances, mentioned below.
  13  *
  14  * The processing does the job of converting the
  15  * transitory EIP value into a persistent dentry/offset
  16  * value that the profiler can record at its leisure.
  17  *
  18  * See fs/dcookies.c for a description of the dentry/offset
  19  * objects.
  20  */
  21
  22 #include <linux/mm.h>
  23 #include <linux/workqueue.h>
  24 #include <linux/notifier.h>
  25 #include <linux/dcookies.h>
  26 #include <linux/profile.h>
  27 #include <linux/module.h>
  28 #include <linux/fs.h>
  29
  30 #include "oprofile_stats.h"
  31 #include "event_buffer.h"
  32 #include "cpu_buffer.h"
  33 #include "buffer_sync.h"
  34
  35 #define DEFAULT_EXPIRE (HZ / 4)
  36
  37 static void wq_sync_buffers(void *);
  38 static DECLARE_WORK(sync_wq, wq_sync_buffers, NULL);
  39
  40 static struct timer_list sync_timer;
  41 static void timer_ping(unsigned long data);
  42 static void sync_cpu_buffers(void);
  43
  44
  45 /* We must make sure to process every entry in the CPU buffers
  46  * before a task got the PF_EXITING flag, otherwise we will hold
  47  * references to a possibly freed task_struct. We are safe with
  48  * samples past the PF_EXITING point in do_exit(), because we
  49  * explicitly check for that in cpu_buffer.c
  50  */
  51 static int exit_task_notify(struct notifier_block * self, unsigned long val, void * data)
  52 {
  53         sync_cpu_buffers();
  54         return 0;
  55 }
  56
  57 /* There are two cases of tasks modifying task->mm->mmap list we
  58  * must concern ourselves with. First, when a task is about to
  59  * exit (exit_mmap()), we should process the buffer to deal with
  60  * any samples in the CPU buffer, before we lose the ->mmap information
  61  * we need. It is vital to get this case correct, otherwise we can
  62  * end up trying to access a freed task_struct.
  63  */
  64 static int mm_notify(struct notifier_block * self, unsigned long val, void * data)
  65 {
  66         sync_cpu_buffers();
  67         return 0;
  68 }
  69
  70
  71 /* Second, a task may unmap (part of) an executable mmap,
  72  * so we want to process samples before that happens too. This is merely
  73  * a QOI issue not a correctness one.
  74  */
  75 static int munmap_notify(struct notifier_block * self, unsigned long val, void * data)
  76 {
  77         /* Note that we cannot sync the buffers directly, because we might end up
  78          * taking the the mmap_sem that we hold now inside of event_buffer_read()
  79          * on a page fault, whilst holding buffer_sem - deadlock.
  80          *
  81          * This would mean a threaded reader of the event buffer, but we should
  82          * prevent it anyway.
  83          *
  84          * Delaying the work in a context that doesn't hold the mmap_sem means
  85          * that we won't lose samples from other mappings that current() may
  86          * have. Note that either way, we lose any pending samples for what is
  87          * being unmapped.
  88          */
  89         schedule_work(&sync_wq);
  90         return 0;
  91 }
  92
  93
  94 /* We need to be told about new modules so we don't attribute to a previously
  95  * loaded module, or drop the samples on the floor.
  96  */
  97 static int module_load_notify(struct notifier_block * self, unsigned long val, void * data)
  98 {
  99 #ifdef CONFIG_MODULES
 100         if (val != MODULE_STATE_COMING)
 101                 return 0;
 102
 103         sync_cpu_buffers();
 104         down(&buffer_sem);
 105         add_event_entry(ESCAPE_CODE);
 106         add_event_entry(MODULE_LOADED_CODE);
 107         up(&buffer_sem);
 108 #endif
 109         return 0;
 110 }
 111
 112
 113 static struct notifier_block exit_task_nb = {
 114         .notifier_call  = exit_task_notify,
 115 };
 116
 117 static struct notifier_block exec_unmap_nb = {
 118         .notifier_call  = munmap_notify,
 119 };
 120
 121 static struct notifier_block exit_mmap_nb = {
 122         .notifier_call  = mm_notify,
 123 };
 124
 125 static struct notifier_block module_load_nb = {
 126         .notifier_call = module_load_notify,
 127 };
 128
 129
 130 static void end_sync_timer(void)
 131 {
 132         del_timer_sync(&sync_timer);
 133         /* timer might have queued work, make sure it's completed. */
 134         flush_scheduled_work();
 135 }
 136
 137
 138 int sync_start(void)
 139 {
 140         int err;
 141
 142         init_timer(&sync_timer);
 143         sync_timer.function = timer_ping;
 144         sync_timer.expires = jiffies + DEFAULT_EXPIRE;
 145         add_timer(&sync_timer);
 146
 147         err = profile_event_register(EXIT_TASK, &exit_task_nb);
 148         if (err)
 149                 goto out1;
 150         err = profile_event_register(EXIT_MMAP, &exit_mmap_nb);
 151         if (err)
 152                 goto out2;
 153         err = profile_event_register(EXEC_UNMAP, &exec_unmap_nb);
 154         if (err)
 155                 goto out3;
 156         err = register_module_notifier(&module_load_nb);
 157         if (err)
 158                 goto out4;
 159
 160 out:
 161         return err;
 162 out4:
 163         profile_event_unregister(EXEC_UNMAP, &exec_unmap_nb);
 164 out3:
 165         profile_event_unregister(EXIT_MMAP, &exit_mmap_nb);
 166 out2:
 167         profile_event_unregister(EXIT_TASK, &exit_task_nb);
 168 out1:
 169         end_sync_timer();
 170         goto out;
 171 }
 172
 173
 174 void sync_stop(void)
 175 {
 176         unregister_module_notifier(&module_load_nb);
 177         profile_event_unregister(EXIT_TASK, &exit_task_nb);
 178         profile_event_unregister(EXIT_MMAP, &exit_mmap_nb);
 179         profile_event_unregister(EXEC_UNMAP, &exec_unmap_nb);
 180         end_sync_timer();
 181 }
 182
 183
 184 /* Optimisation. We can manage without taking the dcookie sem
 185  * because we cannot reach this code without at least one
 186  * dcookie user still being registered (namely, the reader
 187  * of the event buffer). */
 188 static inline unsigned long fast_get_dcookie(struct dentry * dentry,
 189         struct vfsmount * vfsmnt)
 190 {
 191         unsigned long cookie;
 192
 193         if (dentry->d_cookie)
 194                 return (unsigned long)dentry;
 195         get_dcookie(dentry, vfsmnt, &cookie);
 196         return cookie;
 197 }
 198
 199
 200 /* Look up the dcookie for the task's first VM_EXECUTABLE mapping,
 201  * which corresponds loosely to "application name". This is
 202  * not strictly necessary but allows oprofile to associate
 203  * shared-library samples with particular applications
 204  */
 205 static unsigned long get_exec_dcookie(struct mm_struct * mm)
 206 {
 207         unsigned long cookie = 0;
 208         struct vm_area_struct * vma;
 209
 210         if (!mm)
 211                 goto out;
 212
 213         for (vma = mm->mmap; vma; vma = vma->vm_next) {
 214                 if (!vma->vm_file)
 215                         continue;
 216                 if (!(vma->vm_flags & VM_EXECUTABLE))
 217                         continue;
 218                 cookie = fast_get_dcookie(vma->vm_file->f_dentry,
 219                         vma->vm_file->f_vfsmnt);
 220                 break;
 221         }
 222
 223 out:
 224         return cookie;
 225 }
 226
 227
 228 /* Convert the EIP value of a sample into a persistent dentry/offset
 229  * pair that can then be added to the global event buffer. We make
 230  * sure to do this lookup before a mm->mmap modification happens so
 231  * we don't lose track.
 232  */
 233 static unsigned long lookup_dcookie(struct mm_struct * mm, unsigned long addr, off_t * offset)
 234 {
 235         unsigned long cookie = 0;
 236         struct vm_area_struct * vma;
 237
 238         for (vma = find_vma(mm, addr); vma; vma = vma->vm_next) {
 239
 240                 if (!vma->vm_file)
 241                         continue;
 242
 243                 if (addr < vma->vm_start || addr >= vma->vm_end)
 244                         continue;
 245
 246                 cookie = fast_get_dcookie(vma->vm_file->f_dentry,
 247                         vma->vm_file->f_vfsmnt);
 248                 *offset = (vma->vm_pgoff << PAGE_SHIFT) + addr - vma->vm_start;
 249                 break;
 250         }
 251
 252         return cookie;
 253 }
 254
 255
 256 static unsigned long last_cookie = ~0UL;
 257
 258 static void add_cpu_switch(int i)
 259 {
 260         add_event_entry(ESCAPE_CODE);
 261         add_event_entry(CPU_SWITCH_CODE);
 262         add_event_entry(i);
 263         last_cookie = ~0UL;
 264 }
 265
 266 static void add_kernel_ctx_switch(unsigned int in_kernel)
 267 {
 268         add_event_entry(ESCAPE_CODE);
 269         if (in_kernel)
 270                 add_event_entry(KERNEL_ENTER_SWITCH_CODE);
 271         else
 272                 add_event_entry(KERNEL_EXIT_SWITCH_CODE);
 273 }
 274
 275 static void
 276 add_user_ctx_switch(struct task_struct const * task, unsigned long cookie)
 277 {
 278         add_event_entry(ESCAPE_CODE);
 279         add_event_entry(CTX_SWITCH_CODE);
 280         add_event_entry(task->pid);
 281         add_event_entry(cookie);
 282         /* Another code for daemon back-compat */
 283         add_event_entry(ESCAPE_CODE);
 284         add_event_entry(CTX_TGID_CODE);
 285         add_event_entry(task->tgid);
 286 }
 287
 288
 289 static void add_cookie_switch(unsigned long cookie)
 290 {
 291         add_event_entry(ESCAPE_CODE);
 292         add_event_entry(COOKIE_SWITCH_CODE);
 293         add_event_entry(cookie);
 294 }
 295
 296
 297 static void add_sample_entry(unsigned long offset, unsigned long event)
 298 {
 299         add_event_entry(offset);
 300         add_event_entry(event);
 301 }
 302
 303
 304 static void add_us_sample(struct mm_struct * mm, struct op_sample * s)
 305 {
 306         unsigned long cookie;
 307         off_t offset;
 308
 309         cookie = lookup_dcookie(mm, s->eip, &offset);
 310
 311         if (!cookie) {
 312                 atomic_inc(&oprofile_stats.sample_lost_no_mapping);
 313                 return;
 314         }
 315
 316         if (cookie != last_cookie) {
 317                 add_cookie_switch(cookie);
 318                 last_cookie = cookie;
 319         }
 320
 321         add_sample_entry(offset, s->event);
 322 }
 323
 324
 325 /* Add a sample to the global event buffer. If possible the
 326  * sample is converted into a persistent dentry/offset pair
 327  * for later lookup from userspace.
 328  */
 329 static void add_sample(struct mm_struct * mm, struct op_sample * s, int in_kernel)
 330 {
 331         if (in_kernel) {
 332                 add_sample_entry(s->eip, s->event);
 333         } else if (mm) {
 334                 add_us_sample(mm, s);
 335         } else {
 336                 atomic_inc(&oprofile_stats.sample_lost_no_mm);
 337         }
 338 }
 339
 340
 341 static void release_mm(struct mm_struct * mm)
 342 {
 343         if (mm)
 344                 up_read(&mm->mmap_sem);
 345 }
 346
 347
 348 /* Take the task's mmap_sem to protect ourselves from
 349  * races when we do lookup_dcookie().
 350  */
 351 static struct mm_struct * take_tasks_mm(struct task_struct * task)
 352 {
 353         struct mm_struct * mm;
 354
 355         /* Subtle. We don't need to keep a reference to this task's mm,
 356          * because, for the mm to be freed on another CPU, that would have
 357          * to go through the task exit notifier, which ends up sleeping
 358          * on the buffer_sem we hold, so we end up with mutual exclusion
 359          * anyway.
 360          */
 361         task_lock(task);
 362         mm = task->mm;
 363         task_unlock(task);
 364
 365         if (mm) {
 366                 /* needed to walk the task's VMAs */
 367                 down_read(&mm->mmap_sem);
 368         }
 369
 370         return mm;
 371 }
 372
 373
 374 static inline int is_ctx_switch(unsigned long val)
 375 {
 376         return val == ~0UL;
 377 }
 378
 379
 380 /* "acquire" as many cpu buffer slots as we can */
 381 static unsigned long get_slots(struct oprofile_cpu_buffer * b)
 382 {
 383         unsigned long head = b->head_pos;
 384         unsigned long tail = b->tail_pos;
 385
 386         /*
 387          * Subtle. This resets the persistent last_task
 388          * and in_kernel values used for switching notes.
 389          * BUT, there is a small window between reading
 390          * head_pos, and this call, that means samples
 391          * can appear at the new head position, but not
 392          * be prefixed with the notes for switching
 393          * kernel mode or a task switch. This small hole
 394          * can lead to mis-attribution or samples where
 395          * we don't know if it's in the kernel or not,
 396          * at the start of an event buffer.
 397          */
 398         cpu_buffer_reset(b);
 399
 400         if (head >= tail)
 401                 return head - tail;
 402
 403         return head + (b->buffer_size - tail);
 404 }
 405
 406
 407 static void increment_tail(struct oprofile_cpu_buffer * b)
 408 {
 409         unsigned long new_tail = b->tail_pos + 1;
 410
 411         rmb();
 412
 413         if (new_tail < (b->buffer_size))
 414                 b->tail_pos = new_tail;
 415         else
 416                 b->tail_pos = 0;
 417 }
 418
 419
 420 /* Sync one of the CPU's buffers into the global event buffer.
 421  * Here we need to go through each batch of samples punctuated
 422  * by context switch notes, taking the task's mmap_sem and doing
 423  * lookup in task->mm->mmap to convert EIP into dcookie/offset
 424  * value.
 425  */
 426 static void sync_buffer(struct oprofile_cpu_buffer * cpu_buf)
 427 {
 428         struct mm_struct *mm = NULL;
 429         struct task_struct * new;
 430         unsigned long cookie = 0;
 431         int in_kernel = 1;
 432         unsigned int i;
 433
 434         /* Remember, only we can modify tail_pos */
 435
 436         unsigned long const available = get_slots(cpu_buf);
 437
 438         for (i=0; i < available; ++i) {
 439                 struct op_sample * s = &cpu_buf->buffer[cpu_buf->tail_pos];
 440
 441                 if (is_ctx_switch(s->eip)) {
 442                         if (s->event <= 1) {
 443                                 /* kernel/userspace switch */
 444                                 in_kernel = s->event;
 445                                 add_kernel_ctx_switch(s->event);
 446                         } else {
 447                                 struct mm_struct * oldmm = mm;
 448
 449                                 /* userspace context switch */
 450                                 new = (struct task_struct *)s->event;
 451
 452                                 release_mm(oldmm);
 453                                 mm = take_tasks_mm(new);
 454                                 if (mm != oldmm)
 455                                         cookie = get_exec_dcookie(mm);
 456                                 add_user_ctx_switch(new, cookie);
 457                         }
 458                 } else {
 459                         add_sample(mm, s, in_kernel);
 460                 }
 461
 462                 increment_tail(cpu_buf);
 463         }
 464         release_mm(mm);
 465 }
 466
 467
 468 /* Process each CPU's local buffer into the global
 469  * event buffer.
 470  */
 471 static void sync_cpu_buffers(void)
 472 {
 473         int i;
 474
 475         down(&buffer_sem);
 476
 477         for (i = 0; i < NR_CPUS; ++i) {
 478                 struct oprofile_cpu_buffer * cpu_buf;
 479
 480                 if (!cpu_possible(i))
 481                         continue;
 482
 483                 cpu_buf = &cpu_buffer[i];
 484
 485                 add_cpu_switch(i);
 486                 sync_buffer(cpu_buf);
 487         }
 488
 489         up(&buffer_sem);
 490
 491         mod_timer(&sync_timer, jiffies + DEFAULT_EXPIRE);
 492 }
 493
 494
 495 static void wq_sync_buffers(void * data)
 496 {
 497         sync_cpu_buffers();
 498 }
 499
 500
 501 /* It is possible that we could have no munmap() or
 502  * other events for a period of time. This will lead
 503  * the CPU buffers to overflow and lose samples and
 504  * context switches. We try to reduce the problem
 505  * by timing out when nothing happens for a while.
 506  */
 507 static void timer_ping(unsigned long data)
 508 {
 509         schedule_work(&sync_wq);
 510         /* timer is re-added by the scheduled task */
 511 }