#include <linux/debugfs.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/bootmem.h>
#include <linux/stacktrace.h>
#include <linux/page_owner.h>
#include <linux/jump_label.h>
#include <linux/migrate.h>
#include <linux/stackdepot.h>
#include <linux/seq_file.h>

#include "internal.h"
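/*
 * page_owner records, for every allocated page, the allocation order,
 * gfp mask and a stack depot handle for the allocating call stack in
 * the page's struct page_ext. Build with CONFIG_PAGE_OWNER and boot
 * with "page_owner=on" to activate it; the recorded data can then be
 * read back through debugfs, e.g.:
 *
 *	cat /sys/kernel/debug/page_owner > page_owner_full.txt
 */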
/*
 * TODO: teach PAGE_OWNER_STACK_DEPTH (__dump_page_owner and save_stack)
 * to use off-stack temporary storage
 */
#define PAGE_OWNER_STACK_DEPTH (16)
static bool page_owner_disabled = true;
DEFINE_STATIC_KEY_FALSE(page_owner_inited);

static depot_stack_handle_t dummy_handle;
static depot_stack_handle_t failure_handle;

static void init_early_allocated_pages(void);
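/* Parse the "page_owner=on" early boot parameter. */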
static int early_page_owner_param(char *buf)
{
	if (!buf)
		return -EINVAL;

	if (strcmp(buf, "on") == 0)
		page_owner_disabled = false;

	return 0;
}
early_param("page_owner", early_page_owner_param);
static bool need_page_owner(void)
{
	if (page_owner_disabled)
		return false;

	return true;
}
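/*
 * Pre-register two well-known stacks in the stack depot: a "dummy"
 * stack returned when saving a stack would recurse into the allocator,
 * and a "failure" stack returned when depot_save_stack() itself fails.
 */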
static noinline void register_dummy_stack(void)
{
	unsigned long entries[4];
	struct stack_trace dummy;

	dummy.nr_entries = 0;
	dummy.max_entries = ARRAY_SIZE(entries);
	dummy.entries = &entries[0];
	dummy.skip = 0;

	save_stack_trace(&dummy);
	dummy_handle = depot_save_stack(&dummy, GFP_KERNEL);
}
static noinline void register_failure_stack(void)
{
	unsigned long entries[4];
	struct stack_trace failure;

	failure.nr_entries = 0;
	failure.max_entries = ARRAY_SIZE(entries);
	failure.entries = &entries[0];
	failure.skip = 0;

	save_stack_trace(&failure);
	failure_handle = depot_save_stack(&failure, GFP_KERNEL);
}
static void init_page_owner(void)
{
	if (page_owner_disabled)
		return;

	register_dummy_stack();
	register_failure_stack();
	static_branch_enable(&page_owner_inited);
	init_early_allocated_pages();
}
struct page_ext_operations page_owner_ops = {
	.need = need_page_owner,
	.init = init_page_owner,
};
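/* Clear the owner information from every sub-page of a freed page. */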
void __reset_page_owner(struct page *page, unsigned int order)
{
	int i;
	struct page_ext *page_ext;

	for (i = 0; i < (1 << order); i++) {
		page_ext = lookup_page_ext(page + i);
		if (unlikely(!page_ext))
			continue;
		__clear_bit(PAGE_EXT_OWNER, &page_ext->flags);
	}
}
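/*
 * Return true if @ip appears at least twice in the trace, i.e. the
 * current allocation re-entered the caller and saving the stack would
 * recurse.
 */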
static inline bool check_recursive_alloc(struct stack_trace *trace,
					unsigned long ip)
{
	int i, count;

	if (!trace->nr_entries)
		return false;

	for (i = 0, count = 0; i < trace->nr_entries; i++) {
		if (trace->entries[i] == ip && ++count == 2)
			return true;
	}

	return false;
}
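/*
 * Save the current stack into the stack depot, falling back to the
 * pre-registered dummy handle on recursion and to the failure handle
 * when the depot cannot store the trace.
 */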
static noinline depot_stack_handle_t save_stack(gfp_t flags)
{
	unsigned long entries[PAGE_OWNER_STACK_DEPTH];
	struct stack_trace trace = {
		.nr_entries = 0,
		.entries = entries,
		.max_entries = PAGE_OWNER_STACK_DEPTH,
		.skip = 0
	};
	depot_stack_handle_t handle;

	save_stack_trace(&trace);
	if (trace.nr_entries != 0 &&
	    trace.entries[trace.nr_entries-1] == ULONG_MAX)
		trace.nr_entries--;

	/*
	 * We need to check recursion here because our request to stackdepot
	 * could trigger memory allocation to save a new entry. That new
	 * allocation would reach here and call depot_save_stack() again
	 * if we don't catch it. There would still not be enough memory in
	 * stackdepot, so it would try to allocate memory again and loop
	 * forever.
	 */
	if (check_recursive_alloc(&trace, _RET_IP_))
		return dummy_handle;

	handle = depot_save_stack(&trace, flags);
	if (!handle)
		handle = failure_handle;

	return handle;
}
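/* Record the owner information for a freshly allocated page. */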
noinline void __set_page_owner(struct page *page, unsigned int order,
					gfp_t gfp_mask)
{
	struct page_ext *page_ext = lookup_page_ext(page);

	if (unlikely(!page_ext))
		return;

	page_ext->handle = save_stack(gfp_mask);
	page_ext->order = order;
	page_ext->gfp_mask = gfp_mask;
	page_ext->last_migrate_reason = -1;

	__set_bit(PAGE_EXT_OWNER, &page_ext->flags);
}
void __set_page_owner_migrate_reason(struct page *page, int reason)
{
	struct page_ext *page_ext = lookup_page_ext(page);

	if (unlikely(!page_ext))
		return;

	page_ext->last_migrate_reason = reason;
}
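/*
 * When a high-order page is split, give each constituent base page its
 * own copy of the owner information.
 */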
void __split_page_owner(struct page *page, unsigned int order)
{
	int i;
	struct page_ext *page_ext = lookup_page_ext(page);

	if (unlikely(!page_ext))
		return;

	page_ext->order = 0;
	for (i = 1; i < (1 << order); i++)
		__copy_page_owner(page, page + i);
}
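/* Transfer the owner information from @oldpage to @newpage on migration. */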
void __copy_page_owner(struct page *oldpage, struct page *newpage)
{
	struct page_ext *old_ext = lookup_page_ext(oldpage);
	struct page_ext *new_ext = lookup_page_ext(newpage);

	if (unlikely(!old_ext || !new_ext))
		return;

	new_ext->order = old_ext->order;
	new_ext->gfp_mask = old_ext->gfp_mask;
	new_ext->last_migrate_reason = old_ext->last_migrate_reason;
	new_ext->handle = old_ext->handle;

	/*
	 * We don't clear the bit on the oldpage as it's going to be freed
	 * after migration. Until then, the info can be useful in case of
	 * a bug, and the overall stats will be off a bit only temporarily.
	 * Also, migrate_misplaced_transhuge_page() can still fail the
	 * migration and then we want the oldpage to retain the info. But
	 * in that case we also don't need to explicitly clear the info from
	 * the new page, which will be freed.
	 */
	__set_bit(PAGE_EXT_OWNER, &new_ext->flags);
}
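/*
 * Count, per migratetype, the pageblocks that contain a page whose
 * allocation migratetype differs from the pageblock's ("mixed" blocks),
 * and print one line per zone for /proc/pagetypeinfo.
 */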
void pagetypeinfo_showmixedcount_print(struct seq_file *m,
				       pg_data_t *pgdat, struct zone *zone)
{
	struct page *page;
	struct page_ext *page_ext;
	unsigned long pfn = zone->zone_start_pfn, block_end_pfn;
	unsigned long end_pfn = pfn + zone->spanned_pages;
	unsigned long count[MIGRATE_TYPES] = { 0, };
	int pageblock_mt, page_mt;
	int i;

	/* Scan block by block. First and last block may be incomplete */
	pfn = zone->zone_start_pfn;

	/*
	 * Walk the zone in pageblock_nr_pages steps. If a page block spans
	 * a zone boundary, it will be double counted between zones. This does
	 * not matter as the mixed block count will still be correct.
	 */
	for (; pfn < end_pfn; ) {
		if (!pfn_valid(pfn)) {
			pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
			continue;
		}

		block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
		block_end_pfn = min(block_end_pfn, end_pfn);

		page = pfn_to_page(pfn);
		pageblock_mt = get_pageblock_migratetype(page);

		for (; pfn < block_end_pfn; pfn++) {
			if (!pfn_valid_within(pfn))
				continue;

			page = pfn_to_page(pfn);

			if (page_zone(page) != zone)
				continue;

			if (PageBuddy(page)) {
				pfn += (1UL << page_order(page)) - 1;
				continue;
			}

			if (PageReserved(page))
				continue;

			page_ext = lookup_page_ext(page);
			if (unlikely(!page_ext))
				continue;

			if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
				continue;

			page_mt = gfpflags_to_migratetype(page_ext->gfp_mask);
			if (pageblock_mt != page_mt) {
				if (is_migrate_cma(pageblock_mt))
					count[MIGRATE_MOVABLE]++;
				else
					count[pageblock_mt]++;

				pfn = block_end_pfn;
				break;
			}
			pfn += (1UL << page_ext->order) - 1;
		}
	}

	/* Print counts */
	seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
	for (i = 0; i < MIGRATE_TYPES; i++)
		seq_printf(m, "%12lu ", count[i]);
	seq_putc(m, '\n');
}
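/*
 * Format a single page_owner record into a kernel buffer and copy it
 * to userspace. Returns the number of bytes written, -ENOMEM if the
 * record does not fit in @count bytes, or -EFAULT on copy failure.
 */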
static ssize_t
print_page_owner(char __user *buf, size_t count, unsigned long pfn,
		struct page *page, struct page_ext *page_ext,
		depot_stack_handle_t handle)
{
	int ret;
	int pageblock_mt, page_mt;
	char *kbuf;
	unsigned long entries[PAGE_OWNER_STACK_DEPTH];
	struct stack_trace trace = {
		.nr_entries = 0,
		.entries = entries,
		.max_entries = PAGE_OWNER_STACK_DEPTH,
		.skip = 0
	};

	kbuf = kmalloc(count, GFP_KERNEL);
	if (!kbuf)
		return -ENOMEM;

	ret = snprintf(kbuf, count,
			"Page allocated via order %u, mask %#x(%pGg)\n",
			page_ext->order, page_ext->gfp_mask,
			&page_ext->gfp_mask);

	if (ret >= count)
		goto err;

	/* Print information relevant to grouping pages by mobility */
	pageblock_mt = get_pageblock_migratetype(page);
	page_mt = gfpflags_to_migratetype(page_ext->gfp_mask);
	ret += snprintf(kbuf + ret, count - ret,
			"PFN %lu type %s Block %lu type %s Flags %#lx(%pGp)\n",
			pfn,
			migratetype_names[page_mt],
			pfn >> pageblock_order,
			migratetype_names[pageblock_mt],
			page->flags, &page->flags);

	if (ret >= count)
		goto err;

	depot_fetch_stack(handle, &trace);
	ret += snprint_stack_trace(kbuf + ret, count - ret, &trace, 0);
	if (ret >= count)
		goto err;

	if (page_ext->last_migrate_reason != -1) {
		ret += snprintf(kbuf + ret, count - ret,
			"Page has been migrated, last migrate reason: %s\n",
			migrate_reason_names[page_ext->last_migrate_reason]);
		if (ret >= count)
			goto err;
	}

	ret += snprintf(kbuf + ret, count - ret, "\n");
	if (ret >= count)
		goto err;

	if (copy_to_user(buf, kbuf, ret))
		ret = -EFAULT;

	kfree(kbuf);
	return ret;

err:
	kfree(kbuf);
	return -ENOMEM;
}
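/* Dump the owner information for @page to the kernel log. */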
void __dump_page_owner(struct page *page)
{
	struct page_ext *page_ext = lookup_page_ext(page);
	unsigned long entries[PAGE_OWNER_STACK_DEPTH];
	struct stack_trace trace = {
		.nr_entries = 0,
		.entries = entries,
		.max_entries = PAGE_OWNER_STACK_DEPTH,
		.skip = 0
	};
	depot_stack_handle_t handle;
	gfp_t gfp_mask;
	int mt;

	if (unlikely(!page_ext)) {
		pr_alert("There is no page extension available.\n");
		return;
	}

	gfp_mask = page_ext->gfp_mask;
	mt = gfpflags_to_migratetype(gfp_mask);

	if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) {
		pr_alert("page_owner info is not active (free page?)\n");
		return;
	}

	handle = READ_ONCE(page_ext->handle);
	if (!handle) {
		pr_alert("page_owner info is not active (free page?)\n");
		return;
	}

	depot_fetch_stack(handle, &trace);
	pr_alert("page allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n",
		 page_ext->order, migratetype_names[mt], gfp_mask, &gfp_mask);
	print_stack_trace(&trace, 0);

	if (page_ext->last_migrate_reason != -1)
		pr_alert("page has been migrated, last migrate reason: %s\n",
			migrate_reason_names[page_ext->last_migrate_reason]);
}
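/*
 * debugfs read: scan PFNs starting from *ppos, find the next allocated
 * page with valid owner information and print its record. The file
 * offset encodes the next PFN to examine, so repeated reads walk all
 * of memory.
 */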
static ssize_t
read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
	unsigned long pfn;
	struct page *page;
	struct page_ext *page_ext;
	depot_stack_handle_t handle;

	if (!static_branch_unlikely(&page_owner_inited))
		return -EINVAL;

	page = NULL;
	pfn = min_low_pfn + *ppos;

	/* Find a valid PFN or the start of a MAX_ORDER_NR_PAGES area */
	while (!pfn_valid(pfn) && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0)
		pfn++;

	drain_all_pages(NULL);

	/* Find an allocated page */
	for (; pfn < max_pfn; pfn++) {
		/*
		 * If the new page is in a new MAX_ORDER_NR_PAGES area,
		 * validate the area as existing, skip it if not
		 */
		if ((pfn & (MAX_ORDER_NR_PAGES - 1)) == 0 && !pfn_valid(pfn)) {
			pfn += MAX_ORDER_NR_PAGES - 1;
			continue;
		}

		/* Check for holes within a MAX_ORDER area */
		if (!pfn_valid_within(pfn))
			continue;

		page = pfn_to_page(pfn);
		if (PageBuddy(page)) {
			unsigned long freepage_order = page_order_unsafe(page);

			if (freepage_order < MAX_ORDER)
				pfn += (1UL << freepage_order) - 1;
			continue;
		}

		page_ext = lookup_page_ext(page);
		if (unlikely(!page_ext))
			continue;

		/*
		 * Some pages could be missed by concurrent allocation or free,
		 * because we don't hold the zone lock.
		 */
		if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
			continue;

		/*
		 * Access to page_ext->handle isn't synchronized, so we should
		 * be careful to access it.
		 */
		handle = READ_ONCE(page_ext->handle);
		if (!handle)
			continue;

		/* Record the next PFN to read in the file offset */
		*ppos = (pfn - min_low_pfn) + 1;

		return print_page_owner(buf, count, pfn, page,
				page_ext, handle);
	}

	return 0;
}
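/*
 * Pages allocated before page_owner was initialized have no owner
 * information. Walk the zone and mark each such page as owned with
 * order 0 and a zero gfp mask, so they are accounted for rather than
 * silently skipped by later scans.
 */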
static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
{
	struct page *page;
	struct page_ext *page_ext;
	unsigned long pfn = zone->zone_start_pfn, block_end_pfn;
	unsigned long end_pfn = pfn + zone->spanned_pages;
	unsigned long count = 0;

	/* Scan block by block. First and last block may be incomplete */
	pfn = zone->zone_start_pfn;

	/*
	 * Walk the zone in pageblock_nr_pages steps. If a page block spans
	 * a zone boundary, it will be double counted between zones. This does
	 * not matter as the mixed block count will still be correct.
	 */
	for (; pfn < end_pfn; ) {
		if (!pfn_valid(pfn)) {
			pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
			continue;
		}

		block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
		block_end_pfn = min(block_end_pfn, end_pfn);

		page = pfn_to_page(pfn);

		for (; pfn < block_end_pfn; pfn++) {
			if (!pfn_valid_within(pfn))
				continue;

			page = pfn_to_page(pfn);

			if (page_zone(page) != zone)
				continue;

			/*
			 * We are safe to check buddy flag and order, because
			 * this is init stage and only a single thread runs.
			 */
			if (PageBuddy(page)) {
				pfn += (1UL << page_order(page)) - 1;
				continue;
			}

			if (PageReserved(page))
				continue;

			page_ext = lookup_page_ext(page);
			if (unlikely(!page_ext))
				continue;

			/* Maybe overlapping zone */
			if (test_bit(PAGE_EXT_OWNER, &page_ext->flags))
				continue;

			/* Found early allocated page */
			set_page_owner(page, 0, 0);
			count++;
		}
	}

	pr_info("Node %d, zone %8s: page owner found early allocated %lu pages\n",
		pgdat->node_id, zone->name, count);
}
static void init_zones_in_node(pg_data_t *pgdat)
{
	struct zone *zone;
	struct zone *node_zones = pgdat->node_zones;
	unsigned long flags;

	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
		if (!populated_zone(zone))
			continue;

		spin_lock_irqsave(&zone->lock, flags);
		init_pages_in_zone(pgdat, zone);
		spin_unlock_irqrestore(&zone->lock, flags);
	}
}
static void init_early_allocated_pages(void)
{
	pg_data_t *pgdat;

	drain_all_pages(NULL);
	for_each_online_pgdat(pgdat)
		init_zones_in_node(pgdat);
}
static const struct file_operations proc_page_owner_operations = {
	.read		= read_page_owner,
};
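/* Create the /sys/kernel/debug/page_owner file once debugfs is up. */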
static int __init pageowner_init(void)
{
	struct dentry *dentry;

	if (!static_branch_unlikely(&page_owner_inited)) {
		pr_info("page_owner is disabled\n");
		return 0;
	}

	dentry = debugfs_create_file("page_owner", S_IRUSR, NULL,
			NULL, &proc_page_owner_operations);
	if (IS_ERR(dentry))
		return PTR_ERR(dentry);

	return 0;
}
late_initcall(pageowner_init)