// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/kernel/profile.c
 *  Simple profiling. Manages a direct-mapped profile hit count buffer,
 *  with configurable resolution, support for restricting the cpus on
 *  which profiling is done, and switching between cpu time and
 *  schedule() calls via kernel command line parameters passed at boot.
 *
 *  Scheduler profiling support, Arjan van de Ven and Ingo Molnar,
 *	Red Hat, July 2004
 *  Consolidation of architecture support code for profiling,
 *	Nadia Yvette Chambers, Oracle, July 2004
 *  Amortized hit count accounting via per-cpu open-addressed hashtables
 *	to resolve timer interrupt livelocks, Nadia Yvette Chambers,
 *	Oracle, 2004
 */
#include <linux/export.h>
#include <linux/profile.h>
#include <linux/memblock.h>
#include <linux/notifier.h>
#include <linux/mm.h>
#include <linux/cpumask.h>
#include <linux/cpu.h>
#include <linux/highmem.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/sched/stat.h>

#include <asm/sections.h>
#include <asm/irq_regs.h>
#include <asm/ptrace.h>
struct profile_hit {
	u32 pc, hits;
};
#define PROFILE_GRPSHIFT	3
#define PROFILE_GRPSZ		(1 << PROFILE_GRPSHIFT)
#define NR_PROFILE_HIT		(PAGE_SIZE/sizeof(struct profile_hit))
#define NR_PROFILE_GRP		(NR_PROFILE_HIT/PROFILE_GRPSZ)
static atomic_t *prof_buffer;
static unsigned long prof_len;
static unsigned short int prof_shift;

int prof_on __read_mostly;
EXPORT_SYMBOL_GPL(prof_on);

static cpumask_var_t prof_cpu_mask;
#if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS)
static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
static DEFINE_PER_CPU(int, cpu_profile_flip);
static DEFINE_MUTEX(profile_flip_mutex);
#endif /* CONFIG_SMP */
int profile_setup(char *str)
{
	static const char schedstr[] = "schedule";
	static const char sleepstr[] = "sleep";
	static const char kvmstr[] = "kvm";
	int par;

	if (!strncmp(str, sleepstr, strlen(sleepstr))) {
#ifdef CONFIG_SCHEDSTATS
		force_schedstat_enabled();
		prof_on = SLEEP_PROFILING;
		if (str[strlen(sleepstr)] == ',')
			str += strlen(sleepstr) + 1;
		if (get_option(&str, &par))
			prof_shift = clamp(par, 0, BITS_PER_LONG - 1);
		pr_info("kernel sleep profiling enabled (shift: %u)\n",
			prof_shift);
#else
		pr_warn("kernel sleep profiling requires CONFIG_SCHEDSTATS\n");
#endif /* CONFIG_SCHEDSTATS */
	} else if (!strncmp(str, schedstr, strlen(schedstr))) {
		prof_on = SCHED_PROFILING;
		if (str[strlen(schedstr)] == ',')
			str += strlen(schedstr) + 1;
		if (get_option(&str, &par))
			prof_shift = clamp(par, 0, BITS_PER_LONG - 1);
		pr_info("kernel schedule profiling enabled (shift: %u)\n",
			prof_shift);
	} else if (!strncmp(str, kvmstr, strlen(kvmstr))) {
		prof_on = KVM_PROFILING;
		if (str[strlen(kvmstr)] == ',')
			str += strlen(kvmstr) + 1;
		if (get_option(&str, &par))
			prof_shift = clamp(par, 0, BITS_PER_LONG - 1);
		pr_info("kernel KVM profiling enabled (shift: %u)\n",
			prof_shift);
	} else if (get_option(&str, &par)) {
		prof_shift = clamp(par, 0, BITS_PER_LONG - 1);
		prof_on = CPU_PROFILING;
		pr_info("kernel profiling enabled (shift: %u)\n",
			prof_shift);
	}
	return 1;
}
__setup("profile=", profile_setup);
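
/*
 * Boot-time usage, per Documentation/admin-guide/kernel-parameters.txt
 * (the example values below are illustrative):
 *
 *	profile=2		CPU-time profiling, one counter per 4 bytes
 *	profile=schedule,5	profile schedule() calls instead of CPU time
 *	profile=kvm,4		profile KVM guest exit points
 *	profile=sleep,0		profile D-state sleeping (needs CONFIG_SCHEDSTATS)
 *
 * The trailing number becomes prof_shift: each counter in prof_buffer
 * covers (1 << prof_shift) bytes of kernel text.
 */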
int __ref profile_init(void)
{
	int buffer_bytes;

	if (!prof_on)
		return 0;

	/* only text is profiled */
	prof_len = (_etext - _stext) >> prof_shift;

	if (!prof_len) {
		pr_warn("profiling shift: %u too large\n", prof_shift);
		prof_on = 0;
		return -EINVAL;
	}

	buffer_bytes = prof_len * sizeof(atomic_t);

	if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_copy(prof_cpu_mask, cpu_possible_mask);
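
	/*
	 * Try allocators in decreasing order of preference: slab first,
	 * then physically contiguous pages, and finally vmalloc space
	 * when the buffer is too large or memory is too fragmented for
	 * the contiguous allocators.
	 */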
	prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL|__GFP_NOWARN);
	if (prof_buffer)
		return 0;

	prof_buffer = alloc_pages_exact(buffer_bytes,
					GFP_KERNEL|__GFP_ZERO|__GFP_NOWARN);
	if (prof_buffer)
		return 0;

	prof_buffer = vzalloc(buffer_bytes);
	if (prof_buffer)
		return 0;

	free_cpumask_var(prof_cpu_mask);
	return -ENOMEM;
}
/* Profile event notifications */

static BLOCKING_NOTIFIER_HEAD(task_exit_notifier);
static ATOMIC_NOTIFIER_HEAD(task_free_notifier);
static BLOCKING_NOTIFIER_HEAD(munmap_notifier);
void profile_task_exit(struct task_struct *task)
{
	blocking_notifier_call_chain(&task_exit_notifier, 0, task);
}

int profile_handoff_task(struct task_struct *task)
{
	int ret;

	ret = atomic_notifier_call_chain(&task_free_notifier, 0, task);
	return (ret == NOTIFY_OK) ? 1 : 0;
}

void profile_munmap(unsigned long addr)
{
	blocking_notifier_call_chain(&munmap_notifier, 0, (void *)addr);
}

int task_handoff_register(struct notifier_block *n)
{
	return atomic_notifier_chain_register(&task_free_notifier, n);
}
EXPORT_SYMBOL_GPL(task_handoff_register);

int task_handoff_unregister(struct notifier_block *n)
{
	return atomic_notifier_chain_unregister(&task_free_notifier, n);
}
EXPORT_SYMBOL_GPL(task_handoff_unregister);
int profile_event_register(enum profile_type type, struct notifier_block *n)
{
	int err = -EINVAL;

	switch (type) {
	case PROFILE_TASK_EXIT:
		err = blocking_notifier_chain_register(
				&task_exit_notifier, n);
		break;
	case PROFILE_MUNMAP:
		err = blocking_notifier_chain_register(
				&munmap_notifier, n);
		break;
	}

	return err;
}
EXPORT_SYMBOL_GPL(profile_event_register);

int profile_event_unregister(enum profile_type type, struct notifier_block *n)
{
	int err = -EINVAL;

	switch (type) {
	case PROFILE_TASK_EXIT:
		err = blocking_notifier_chain_unregister(
				&task_exit_notifier, n);
		break;
	case PROFILE_MUNMAP:
		err = blocking_notifier_chain_unregister(
				&munmap_notifier, n);
		break;
	}

	return err;
}
EXPORT_SYMBOL_GPL(profile_event_unregister);
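
/*
 * A minimal sketch of a client of the event API above; the callback and
 * notifier_block names are hypothetical:
 *
 *	static int my_task_exit_cb(struct notifier_block *nb,
 *				   unsigned long action, void *data)
 *	{
 *		struct task_struct *task = data;
 *
 *		pr_debug("pid %d exiting\n", task->pid);
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block my_nb = {
 *		.notifier_call = my_task_exit_cb,
 *	};
 *
 *	profile_event_register(PROFILE_TASK_EXIT, &my_nb);
 *	...
 *	profile_event_unregister(PROFILE_TASK_EXIT, &my_nb);
 */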
#if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS)
/*
 * Each cpu has a pair of open-addressed hashtables for pending
 * profile hits. read_profile() IPI's all cpus to request them
 * to flip buffers and flushes their contents to prof_buffer itself.
 * Flip requests are serialized by the profile_flip_mutex. The sole
 * use of having a second hashtable is for avoiding cacheline
 * contention that would otherwise happen during flushes of pending
 * profile hits required for the accuracy of reported profile hits
 * and so resurrect the interrupt livelock issue.
 *
 * The open-addressed hashtables are indexed by profile buffer slot
 * and hold the number of pending hits to that profile buffer slot on
 * a cpu in an entry. When the hashtable overflows, all pending hits
 * are accounted to their corresponding profile buffer slots with
 * atomic_add() and the hashtable emptied. As numerous pending hits
 * may be accounted to a profile buffer slot in a hashtable entry,
 * this amortizes a number of atomic profile buffer increments likely
 * to be far larger than the number of entries in the hashtable,
 * particularly given that the number of distinct profile buffer
 * positions to which hits are accounted during short intervals (e.g.
 * several seconds) is usually very small. Exclusion from buffer
 * flipping is provided by interrupt disablement (note that for
 * SCHED_PROFILING or SLEEP_PROFILING profile_hit() may be called from
 * process context).
 *
 * The hash function is meant to be lightweight as opposed to strong,
 * and was vaguely inspired by ppc64 firmware-supported inverted
 * pagetable hash functions, but uses a full hashtable full of finite
 * collision chains, not just pairs of them.
 *
 * -- nyc
 */
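
/*
 * Worked example of the probe sequence (assuming 4 KiB pages and an
 * 8-byte struct profile_hit, so NR_PROFILE_HIT = 512 and
 * NR_PROFILE_GRP = 64): for pc = 0x123, the primary group starts at
 * slot (0x123 & 63) << 3 = 280 and the probe stride is
 * (~(0x123 << 1) & 63) << 3 = 456. Since pc << 1 has bit 0 clear, the
 * complemented stride is always odd in group units and hence coprime
 * with the power-of-2 group count, so the do/while loop in
 * do_profile_hits() visits every group before wrapping back to the
 * primary group.
 */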
static void __profile_flip_buffers(void *unused)
{
	int cpu = smp_processor_id();

	per_cpu(cpu_profile_flip, cpu) = !per_cpu(cpu_profile_flip, cpu);
}
static void profile_flip_buffers(void)
{
	int i, j, cpu;

	mutex_lock(&profile_flip_mutex);
	j = per_cpu(cpu_profile_flip, get_cpu());
	put_cpu();
	on_each_cpu(__profile_flip_buffers, NULL, 1);
	for_each_online_cpu(cpu) {
		struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[j];
		for (i = 0; i < NR_PROFILE_HIT; ++i) {
			if (!hits[i].hits) {
				if (hits[i].pc)
					hits[i].pc = 0;
				continue;
			}
			atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
			hits[i].hits = hits[i].pc = 0;
		}
	}
	mutex_unlock(&profile_flip_mutex);
}
static void profile_discard_flip_buffers(void)
{
	int i, cpu;

	mutex_lock(&profile_flip_mutex);
	i = per_cpu(cpu_profile_flip, get_cpu());
	put_cpu();
	on_each_cpu(__profile_flip_buffers, NULL, 1);
	for_each_online_cpu(cpu) {
		struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i];

		memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit));
	}
	mutex_unlock(&profile_flip_mutex);
}
static void do_profile_hits(int type, void *__pc, unsigned int nr_hits)
{
	unsigned long primary, secondary, flags, pc = (unsigned long)__pc;
	int i, j, cpu;
	struct profile_hit *hits;

	pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1);
	i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
	secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
	cpu = get_cpu();
	hits = per_cpu(cpu_profile_hits, cpu)[per_cpu(cpu_profile_flip, cpu)];
	if (!hits) {
		put_cpu();
		return;
	}
	/*
	 * We buffer the global profiler buffer into a per-CPU
	 * queue and thus reduce the number of global (and possibly
	 * NUMA-alien) accesses. The write-queue is self-coalescing:
	 */
	local_irq_save(flags);
	do {
		for (j = 0; j < PROFILE_GRPSZ; ++j) {
			if (hits[i + j].pc == pc) {
				hits[i + j].hits += nr_hits;
				goto out;
			} else if (!hits[i + j].hits) {
				hits[i + j].pc = pc;
				hits[i + j].hits = nr_hits;
				goto out;
			}
		}
		i = (i + secondary) & (NR_PROFILE_HIT - 1);
	} while (i != primary);

	/*
	 * Add the current hit(s) and flush the write-queue out
	 * to the global buffer:
	 */
	atomic_add(nr_hits, &prof_buffer[pc]);
	for (i = 0; i < NR_PROFILE_HIT; ++i) {
		atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
		hits[i].pc = hits[i].hits = 0;
	}
out:
	local_irq_restore(flags);
	put_cpu();
}
static int profile_dead_cpu(unsigned int cpu)
{
	struct page *page;
	int i;

	if (cpumask_available(prof_cpu_mask))
		cpumask_clear_cpu(cpu, prof_cpu_mask);

	for (i = 0; i < 2; i++) {
		if (per_cpu(cpu_profile_hits, cpu)[i]) {
			page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[i]);
			per_cpu(cpu_profile_hits, cpu)[i] = NULL;
			__free_page(page);
		}
	}
	return 0;
}
static int profile_prepare_cpu(unsigned int cpu)
{
	int i, node = cpu_to_mem(cpu);
	struct page *page;

	per_cpu(cpu_profile_flip, cpu) = 0;

	for (i = 0; i < 2; i++) {
		if (per_cpu(cpu_profile_hits, cpu)[i])
			continue;

		page = __alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
		if (!page) {
			profile_dead_cpu(cpu);
			return -ENOMEM;
		}
		per_cpu(cpu_profile_hits, cpu)[i] = page_address(page);

	}
	return 0;
}
static int profile_online_cpu(unsigned int cpu)
{
	if (cpumask_available(prof_cpu_mask))
		cpumask_set_cpu(cpu, prof_cpu_mask);

	return 0;
}
#else /* !CONFIG_SMP */
#define profile_flip_buffers()		do { } while (0)
#define profile_discard_flip_buffers()	do { } while (0)

static void do_profile_hits(int type, void *__pc, unsigned int nr_hits)
{
	unsigned long pc;

	pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift;
	atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]);
}
#endif /* !CONFIG_SMP */
void profile_hits(int type, void *__pc, unsigned int nr_hits)
{
	if (prof_on != type || !prof_buffer)
		return;
	do_profile_hits(type, __pc, nr_hits);
}
EXPORT_SYMBOL_GPL(profile_hits);
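
/*
 * Illustrative call site (not from this file): an event source that
 * feeds one of the profiling modes can account a sample against a
 * kernel text address, e.g.
 *
 *	profile_hits(KVM_PROFILING, (void *)instruction_pointer(regs), 1);
 *
 * The single-hit wrapper profile_hit() in <linux/profile.h> does the
 * same with nr_hits == 1.
 */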
void profile_tick(int type)
{
	struct pt_regs *regs = get_irq_regs();

	if (!user_mode(regs) && cpumask_available(prof_cpu_mask) &&
	    cpumask_test_cpu(smp_processor_id(), prof_cpu_mask))
		profile_hit(type, (void *)profile_pc(regs));
}
#ifdef CONFIG_PROC_FS
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/uaccess.h>
static int prof_cpu_mask_proc_show(struct seq_file *m, void *v)
{
	seq_printf(m, "%*pb\n", cpumask_pr_args(prof_cpu_mask));
	return 0;
}

static int prof_cpu_mask_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, prof_cpu_mask_proc_show, NULL);
}
static ssize_t prof_cpu_mask_proc_write(struct file *file,
	const char __user *buffer, size_t count, loff_t *pos)
{
	cpumask_var_t new_value;
	int err;

	if (!zalloc_cpumask_var(&new_value, GFP_KERNEL))
		return -ENOMEM;

	err = cpumask_parse_user(buffer, count, new_value);
	if (!err) {
		cpumask_copy(prof_cpu_mask, new_value);
		err = count;
	}
	free_cpumask_var(new_value);
	return err;
}
static const struct proc_ops prof_cpu_mask_proc_ops = {
	.proc_open	= prof_cpu_mask_proc_open,
	.proc_read	= seq_read,
	.proc_lseek	= seq_lseek,
	.proc_release	= single_release,
	.proc_write	= prof_cpu_mask_proc_write,
};
void create_prof_cpu_mask(void)
{
	/* create /proc/irq/prof_cpu_mask */
	proc_create("irq/prof_cpu_mask", 0600, NULL, &prof_cpu_mask_proc_ops);
}
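
/*
 * The file accepts a standard cpumask string; for example (illustrative),
 * "echo 3 > /proc/irq/prof_cpu_mask" restricts profiling ticks to
 * CPUs 0 and 1.
 */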
/*
 * This function accesses profiling information. The returned data is
 * binary: the sampling step and the actual contents of the profile
 * buffer. Use of the program readprofile is recommended in order to
 * get meaningful info out of these data.
 */
static ssize_t
read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
	unsigned long p = *ppos;
	ssize_t read;
	char *pnt;
	unsigned long sample_step = 1UL << prof_shift;

	profile_flip_buffers();
	if (p >= (prof_len+1)*sizeof(unsigned int))
		return 0;
	if (count > (prof_len+1)*sizeof(unsigned int) - p)
		count = (prof_len+1)*sizeof(unsigned int) - p;
	read = 0;

	while (p < sizeof(unsigned int) && count > 0) {
		if (put_user(*((char *)(&sample_step)+p), buf))
			return -EFAULT;
		buf++; p++; count--; read++;
	}
	pnt = (char *)prof_buffer + p - sizeof(atomic_t);
	if (copy_to_user(buf, (void *)pnt, count))
		return -EFAULT;
	read += count;
	*ppos += read;
	return read;
}
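
/*
 * The resulting byte stream is sizeof(unsigned int) bytes holding the
 * sample step (1 << prof_shift) followed by the raw counters, which
 * readprofile(8) from util-linux decodes, e.g. (illustrative):
 * "readprofile -m /boot/System.map".
 */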
/*
 * Writing to /proc/profile resets the counters
 *
 * Writing a 'profiling multiplier' value into it also re-sets the profiling
 * interrupt frequency, on architectures that support this.
 */
static ssize_t write_profile(struct file *file, const char __user *buf,
			     size_t count, loff_t *ppos)
{
#ifdef CONFIG_SMP
	extern int setup_profiling_timer(unsigned int multiplier);

	if (count == sizeof(int)) {
		unsigned int multiplier;

		if (copy_from_user(&multiplier, buf, sizeof(int)))
			return -EFAULT;

		if (setup_profiling_timer(multiplier))
			return -EINVAL;
	}
#endif
	profile_discard_flip_buffers();
	memset(prof_buffer, 0, prof_len * sizeof(atomic_t));
	return count;
}
static const struct proc_ops profile_proc_ops = {
	.proc_read	= read_profile,
	.proc_write	= write_profile,
	.proc_lseek	= default_llseek,
};
int __ref create_proc_profile(void)
{
	struct proc_dir_entry *entry;
#ifdef CONFIG_SMP
	enum cpuhp_state online_state;
#endif

	int err = 0;

	if (!prof_on)
		return 0;
#ifdef CONFIG_SMP
	err = cpuhp_setup_state(CPUHP_PROFILE_PREPARE, "PROFILE_PREPARE",
				profile_prepare_cpu, profile_dead_cpu);
	if (err)
		return err;

	err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "AP_PROFILE_ONLINE",
				profile_online_cpu, NULL);
	if (err < 0)
		goto err_state_prep;
	online_state = err;
	err = 0;
#endif
	entry = proc_create("profile", S_IWUSR | S_IRUGO,
			    NULL, &profile_proc_ops);
	if (!entry)
		goto err_state_onl;
	proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t));

	return err;
err_state_onl:
#ifdef CONFIG_SMP
	cpuhp_remove_state(online_state);
err_state_prep:
	cpuhp_remove_state(CPUHP_PROFILE_PREPARE);
#endif
	return err;
}
subsys_initcall(create_proc_profile);
#endif /* CONFIG_PROC_FS */