mm/page_reporting.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 #include <linux/mm.h>
   3 #include <linux/mmzone.h>
   4 #include <linux/page_reporting.h>
   5 #include <linux/gfp.h>
   6 #include <linux/export.h>
   7 #include <linux/module.h>
   8 #include <linux/delay.h>
   9 #include <linux/scatterlist.h>
  10
  11 #include "page_reporting.h"
  12 #include "internal.h"
  13
  14 unsigned int page_reporting_order = MAX_ORDER;
  15 module_param(page_reporting_order, uint, 0644);
  16 MODULE_PARM_DESC(page_reporting_order, "Set page reporting order");
  17
  18 #define PAGE_REPORTING_DELAY    (2 * HZ)
  19 static struct page_reporting_dev_info __rcu *pr_dev_info __read_mostly;
  20
  21 enum {
  22         PAGE_REPORTING_IDLE = 0,
  23         PAGE_REPORTING_REQUESTED,
  24         PAGE_REPORTING_ACTIVE
  25 };
  26
  27 /* request page reporting */
  28 static void
  29 __page_reporting_request(struct page_reporting_dev_info *prdev)
  30 {
  31         unsigned int state;
  32
  33         /* Check to see if we are in desired state */
  34         state = atomic_read(&prdev->state);
  35         if (state == PAGE_REPORTING_REQUESTED)
  36                 return;
  37
  38         /*
  39          * If reporting is already active there is nothing we need to do.
  40          * Test against 0 as that represents PAGE_REPORTING_IDLE.
  41          */
  42         state = atomic_xchg(&prdev->state, PAGE_REPORTING_REQUESTED);
  43         if (state != PAGE_REPORTING_IDLE)
  44                 return;
  45
  46         /*
  47          * Delay the start of work to allow a sizable queue to build. For
  48          * now we are limiting this to running no more than once every
  49          * couple of seconds.
  50          */
  51         schedule_delayed_work(&prdev->work, PAGE_REPORTING_DELAY);
  52 }
  53
  54 /* notify prdev of free page reporting request */
  55 void __page_reporting_notify(void)
  56 {
  57         struct page_reporting_dev_info *prdev;
  58
  59         /*
  60          * We use RCU to protect the pr_dev_info pointer. In almost all
  61          * cases this should be present, however in the unlikely case of
  62          * a shutdown this will be NULL and we should exit.
  63          */
  64         rcu_read_lock();
  65         prdev = rcu_dereference(pr_dev_info);
  66         if (likely(prdev))
  67                 __page_reporting_request(prdev);
  68
  69         rcu_read_unlock();
  70 }
  71
  72 static void
  73 page_reporting_drain(struct page_reporting_dev_info *prdev,
  74                      struct scatterlist *sgl, unsigned int nents, bool reported)
  75 {
  76         struct scatterlist *sg = sgl;
  77
  78         /*
  79          * Drain the now reported pages back into their respective
  80          * free lists/areas. We assume at least one page is populated.
  81          */
  82         do {
  83                 struct page *page = sg_page(sg);
  84                 int mt = get_pageblock_migratetype(page);
  85                 unsigned int order = get_order(sg->length);
  86
  87                 __putback_isolated_page(page, order, mt);
  88
  89                 /* If the pages were not reported due to error skip flagging */
  90                 if (!reported)
  91                         continue;
  92
  93                 /*
  94                  * If page was not comingled with another page we can
  95                  * consider the result to be "reported" since the page
  96                  * hasn't been modified, otherwise we will need to
  97                  * report on the new larger page when we make our way
  98                  * up to that higher order.
  99                  */
 100                 if (PageBuddy(page) && buddy_order(page) == order)
 101                         __SetPageReported(page);
 102         } while ((sg = sg_next(sg)));
 103
 104         /* reinitialize scatterlist now that it is empty */
 105         sg_init_table(sgl, nents);
 106 }
 107
 108 /*
 109  * The page reporting cycle consists of 4 stages, fill, report, drain, and
 110  * idle. We will cycle through the first 3 stages until we cannot obtain a
 111  * full scatterlist of pages, in that case we will switch to idle.
 112  */
 113 static int
 114 page_reporting_cycle(struct page_reporting_dev_info *prdev, struct zone *zone,
 115                      unsigned int order, unsigned int mt,
 116                      struct scatterlist *sgl, unsigned int *offset)
 117 {
 118         struct free_area *area = &zone->free_area[order];
 119         struct list_head *list = &area->free_list[mt];
 120         unsigned int page_len = PAGE_SIZE << order;
 121         struct page *page, *next;
 122         long budget;
 123         int err = 0;
 124
 125         /*
 126          * Perform early check, if free area is empty there is
 127          * nothing to process so we can skip this free_list.
 128          */
 129         if (list_empty(list))
 130                 return err;
 131
 132         spin_lock_irq(&zone->lock);
 133
 134         /*
 135          * Limit how many calls we will be making to the page reporting
 136          * device for this list. By doing this we avoid processing any
 137          * given list for too long.
 138          *
 139          * The current value used allows us enough calls to process over a
 140          * sixteenth of the current list plus one additional call to handle
 141          * any pages that may have already been present from the previous
 142          * list processed. This should result in us reporting all pages on
 143          * an idle system in about 30 seconds.
 144          *
 145          * The division here should be cheap since PAGE_REPORTING_CAPACITY
 146          * should always be a power of 2.
 147          */
 148         budget = DIV_ROUND_UP(area->nr_free, PAGE_REPORTING_CAPACITY * 16);
 149
 150         /* loop through free list adding unreported pages to sg list */
 151         list_for_each_entry_safe(page, next, list, lru) {
 152                 /* We are going to skip over the reported pages. */
 153                 if (PageReported(page))
 154                         continue;
 155
 156                 /*
 157                  * If we fully consumed our budget then update our
 158                  * state to indicate that we are requesting additional
 159                  * processing and exit this list.
 160                  */
 161                 if (budget < 0) {
 162                         atomic_set(&prdev->state, PAGE_REPORTING_REQUESTED);
 163                         next = page;
 164                         break;
 165                 }
 166
 167                 /* Attempt to pull page from list and place in scatterlist */
 168                 if (*offset) {
 169                         if (!__isolate_free_page(page, order)) {
 170                                 next = page;
 171                                 break;
 172                         }
 173
 174                         /* Add page to scatter list */
 175                         --(*offset);
 176                         sg_set_page(&sgl[*offset], page, page_len, 0);
 177
 178                         continue;
 179                 }
 180
 181                 /*
 182                  * Make the first non-reported page in the free list
 183                  * the new head of the free list before we release the
 184                  * zone lock.
 185                  */
 186                 if (!list_is_first(&page->lru, list))
 187                         list_rotate_to_front(&page->lru, list);
 188
 189                 /* release lock before waiting on report processing */
 190                 spin_unlock_irq(&zone->lock);
 191
 192                 /* begin processing pages in local list */
 193                 err = prdev->report(prdev, sgl, PAGE_REPORTING_CAPACITY);
 194
 195                 /* reset offset since the full list was reported */
 196                 *offset = PAGE_REPORTING_CAPACITY;
 197
 198                 /* update budget to reflect call to report function */
 199                 budget--;
 200
 201                 /* reacquire zone lock and resume processing */
 202                 spin_lock_irq(&zone->lock);
 203
 204                 /* flush reported pages from the sg list */
 205                 page_reporting_drain(prdev, sgl, PAGE_REPORTING_CAPACITY, !err);
 206
 207                 /*
 208                  * Reset next to first entry, the old next isn't valid
 209                  * since we dropped the lock to report the pages
 210                  */
 211                 next = list_first_entry(list, struct page, lru);
 212
 213                 /* exit on error */
 214                 if (err)
 215                         break;
 216         }
 217
 218         /* Rotate any leftover pages to the head of the freelist */
 219         if (!list_entry_is_head(next, list, lru) && !list_is_first(&next->lru, list))
 220                 list_rotate_to_front(&next->lru, list);
 221
 222         spin_unlock_irq(&zone->lock);
 223
 224         return err;
 225 }
 226
 227 static int
 228 page_reporting_process_zone(struct page_reporting_dev_info *prdev,
 229                             struct scatterlist *sgl, struct zone *zone)
 230 {
 231         unsigned int order, mt, leftover, offset = PAGE_REPORTING_CAPACITY;
 232         unsigned long watermark;
 233         int err = 0;
 234
 235         /* Generate minimum watermark to be able to guarantee progress */
 236         watermark = low_wmark_pages(zone) +
 237                     (PAGE_REPORTING_CAPACITY << page_reporting_order);
 238
 239         /*
 240          * Cancel request if insufficient free memory or if we failed
 241          * to allocate page reporting statistics for the zone.
 242          */
 243         if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
 244                 return err;
 245
 246         /* Process each free list starting from lowest order/mt */
 247         for (order = page_reporting_order; order < MAX_ORDER; order++) {
 248                 for (mt = 0; mt < MIGRATE_TYPES; mt++) {
 249                         /* We do not pull pages from the isolate free list */
 250                         if (is_migrate_isolate(mt))
 251                                 continue;
 252
 253                         err = page_reporting_cycle(prdev, zone, order, mt,
 254                                                    sgl, &offset);
 255                         if (err)
 256                                 return err;
 257                 }
 258         }
 259
 260         /* report the leftover pages before going idle */
 261         leftover = PAGE_REPORTING_CAPACITY - offset;
 262         if (leftover) {
 263                 sgl = &sgl[offset];
 264                 err = prdev->report(prdev, sgl, leftover);
 265
 266                 /* flush any remaining pages out from the last report */
 267                 spin_lock_irq(&zone->lock);
 268                 page_reporting_drain(prdev, sgl, leftover, !err);
 269                 spin_unlock_irq(&zone->lock);
 270         }
 271
 272         return err;
 273 }
 274
 275 static void page_reporting_process(struct work_struct *work)
 276 {
 277         struct delayed_work *d_work = to_delayed_work(work);
 278         struct page_reporting_dev_info *prdev =
 279                 container_of(d_work, struct page_reporting_dev_info, work);
 280         int err = 0, state = PAGE_REPORTING_ACTIVE;
 281         struct scatterlist *sgl;
 282         struct zone *zone;
 283
 284         /*
 285          * Change the state to "Active" so that we can track if there is
 286          * anyone requests page reporting after we complete our pass. If
 287          * the state is not altered by the end of the pass we will switch
 288          * to idle and quit scheduling reporting runs.
 289          */
 290         atomic_set(&prdev->state, state);
 291
 292         /* allocate scatterlist to store pages being reported on */
 293         sgl = kmalloc_array(PAGE_REPORTING_CAPACITY, sizeof(*sgl), GFP_KERNEL);
 294         if (!sgl)
 295                 goto err_out;
 296
 297         sg_init_table(sgl, PAGE_REPORTING_CAPACITY);
 298
 299         for_each_zone(zone) {
 300                 err = page_reporting_process_zone(prdev, sgl, zone);
 301                 if (err)
 302                         break;
 303         }
 304
 305         kfree(sgl);
 306 err_out:
 307         /*
 308          * If the state has reverted back to requested then there may be
 309          * additional pages to be processed. We will defer for 2s to allow
 310          * more pages to accumulate.
 311          */
 312         state = atomic_cmpxchg(&prdev->state, state, PAGE_REPORTING_IDLE);
 313         if (state == PAGE_REPORTING_REQUESTED)
 314                 schedule_delayed_work(&prdev->work, PAGE_REPORTING_DELAY);
 315 }
 316
 317 static DEFINE_MUTEX(page_reporting_mutex);
 318 DEFINE_STATIC_KEY_FALSE(page_reporting_enabled);
 319
 320 int page_reporting_register(struct page_reporting_dev_info *prdev)
 321 {
 322         int err = 0;
 323
 324         mutex_lock(&page_reporting_mutex);
 325
 326         /* nothing to do if already in use */
 327         if (rcu_access_pointer(pr_dev_info)) {
 328                 err = -EBUSY;
 329                 goto err_out;
 330         }
 331
 332         /*
 333          * Update the page reporting order if it's specified by driver.
 334          * Otherwise, it falls back to @pageblock_order.
 335          */
 336         page_reporting_order = prdev->order ? : pageblock_order;
 337
 338         /* initialize state and work structures */
 339         atomic_set(&prdev->state, PAGE_REPORTING_IDLE);
 340         INIT_DELAYED_WORK(&prdev->work, &page_reporting_process);
 341
 342         /* Begin initial flush of zones */
 343         __page_reporting_request(prdev);
 344
 345         /* Assign device to allow notifications */
 346         rcu_assign_pointer(pr_dev_info, prdev);
 347
 348         /* enable page reporting notification */
 349         if (!static_key_enabled(&page_reporting_enabled)) {
 350                 static_branch_enable(&page_reporting_enabled);
 351                 pr_info("Free page reporting enabled\n");
 352         }
 353 err_out:
 354         mutex_unlock(&page_reporting_mutex);
 355
 356         return err;
 357 }
 358 EXPORT_SYMBOL_GPL(page_reporting_register);
 359
 360 void page_reporting_unregister(struct page_reporting_dev_info *prdev)
 361 {
 362         mutex_lock(&page_reporting_mutex);
 363
 364         if (rcu_access_pointer(pr_dev_info) == prdev) {
 365                 /* Disable page reporting notification */
 366                 RCU_INIT_POINTER(pr_dev_info, NULL);
 367                 synchronize_rcu();
 368
 369                 /* Flush any existing work, and lock it out */
 370                 cancel_delayed_work_sync(&prdev->work);
 371         }
 372
 373         mutex_unlock(&page_reporting_mutex);
 374 }
 375 EXPORT_SYMBOL_GPL(page_reporting_unregister);