1 /*
2 * linux/mm/memory_hotplug.c
3 *
4 * Copyright (C)
5 */
6
7 #include <linux/stddef.h>
8 #include <linux/mm.h>
9 #include <linux/sched/signal.h>
10 #include <linux/swap.h>
11 #include <linux/interrupt.h>
12 #include <linux/pagemap.h>
13 #include <linux/compiler.h>
14 #include <linux/export.h>
15 #include <linux/pagevec.h>
16 #include <linux/writeback.h>
17 #include <linux/slab.h>
18 #include <linux/sysctl.h>
19 #include <linux/cpu.h>
20 #include <linux/memory.h>
21 #include <linux/memremap.h>
22 #include <linux/memory_hotplug.h>
23 #include <linux/highmem.h>
24 #include <linux/vmalloc.h>
25 #include <linux/ioport.h>
26 #include <linux/delay.h>
27 #include <linux/migrate.h>
28 #include <linux/page-isolation.h>
29 #include <linux/pfn.h>
30 #include <linux/suspend.h>
31 #include <linux/mm_inline.h>
32 #include <linux/firmware-map.h>
33 #include <linux/stop_machine.h>
34 #include <linux/hugetlb.h>
35 #include <linux/memblock.h>
36 #include <linux/bootmem.h>
37 #include <linux/compaction.h>
38
39 #include <asm/tlbflush.h>
40
41 #include "internal.h"
42
43 /*
44 * online_page_callback contains a pointer to the current page onlining
45 * function. Initially it is generic_online_page(). If required, it can be
46 * changed by calling set_online_page_callback() to register a callback and
47 * restore_online_page_callback() to restore the generic callback.
48 */
49
50 static void generic_online_page(struct page *page);
51
52 static online_page_callback_t online_page_callback = generic_online_page;
53 static DEFINE_MUTEX(online_page_callback_lock);
54
55 DEFINE_STATIC_PERCPU_RWSEM(mem_hotplug_lock);
56
57 static int default_kernel_zone = ZONE_NORMAL;
58
59 void get_online_mems(void)
60 {
61 percpu_down_read(&mem_hotplug_lock);
62 }
63
64 void put_online_mems(void)
65 {
66 percpu_up_read(&mem_hotplug_lock);
67 }
68
69 bool movable_node_enabled = false;
70
71 #ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE
72 bool memhp_auto_online;
73 #else
74 bool memhp_auto_online = true;
75 #endif
76 EXPORT_SYMBOL_GPL(memhp_auto_online);
77
78 static int __init setup_memhp_default_state(char *str)
79 {
80 if (!strcmp(str, "online"))
81 memhp_auto_online = true;
82 else if (!strcmp(str, "offline"))
83 memhp_auto_online = false;
84
85 return 1;
86 }
87 __setup("memhp_default_state=", setup_memhp_default_state);
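/*
 * Usage note (descriptive, nothing new): setup_memhp_default_state() above is
 * an early boot parameter handler, so the default onlining policy can be set
 * on the kernel command line, e.g.:
 *
 *	memhp_default_state=online	hot-added blocks are onlined right away
 *	memhp_default_state=offline	hot-added blocks stay offline until
 *					userspace onlines them via sysfs
 *
 * The resulting memhp_auto_online value is consumed by add_memory() below.
 */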
88
89 void mem_hotplug_begin(void)
90 {
91 cpus_read_lock();
92 percpu_down_write(&mem_hotplug_lock);
93 }
94
95 void mem_hotplug_done(void)
96 {
97 percpu_up_write(&mem_hotplug_lock);
98 cpus_read_unlock();
99 }
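/*
 * Locking sketch (illustrative summary of the helpers above): writers wrap a
 * hotplug operation in mem_hotplug_begin()/mem_hotplug_done(), while readers
 * that only need a stable memory layout use get_online_mems()/put_online_mems():
 *
 *	mem_hotplug_begin();
 *	... add or remove memory sections ...
 *	mem_hotplug_done();
 *
 *	get_online_mems();
 *	... walk zones/sections without racing against hotplug ...
 *	put_online_mems();
 */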
100
101 /* add this memory to iomem resource */
102 static struct resource *register_memory_resource(u64 start, u64 size)
103 {
104 struct resource *res, *conflict;
105 res = kzalloc(sizeof(struct resource), GFP_KERNEL);
106 if (!res)
107 return ERR_PTR(-ENOMEM);
108
109 res->name = "System RAM";
110 res->start = start;
111 res->end = start + size - 1;
112 res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
113 conflict = request_resource_conflict(&iomem_resource, res);
114 if (conflict) {
115 if (conflict->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) {
116 pr_debug("Device unaddressable memory block "
117 "memory hotplug at %#010llx !\n",
118 (unsigned long long)start);
119 }
120 pr_debug("System RAM resource %pR cannot be added\n", res);
121 kfree(res);
122 return ERR_PTR(-EEXIST);
123 }
124 return res;
125 }
126
127 static void release_memory_resource(struct resource *res)
128 {
129 if (!res)
130 return;
131 release_resource(res);
132 kfree(res);
133 return;
134 }
135
136 #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
137 void get_page_bootmem(unsigned long info, struct page *page,
138 unsigned long type)
139 {
140 page->freelist = (void *)type;
141 SetPagePrivate(page);
142 set_page_private(page, info);
143 page_ref_inc(page);
144 }
145
146 void put_page_bootmem(struct page *page)
147 {
148 unsigned long type;
149
150 type = (unsigned long) page->freelist;
151 BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
152 type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);
153
154 if (page_ref_dec_return(page) == 1) {
155 page->freelist = NULL;
156 ClearPagePrivate(page);
157 set_page_private(page, 0);
158 INIT_LIST_HEAD(&page->lru);
159 free_reserved_page(page);
160 }
161 }
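/*
 * Summary of the two helpers above (descriptive note, not new behaviour):
 * bootmem info pages reuse page->freelist to hold a SECTION_INFO /
 * MIX_SECTION_INFO / NODE_INFO tag and page->private to hold the section or
 * node number, while the page refcount counts registrations. When the last
 * reference is dropped, put_page_bootmem() hands the page back to the buddy
 * allocator via free_reserved_page().
 */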
162
163 #ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
164 #ifndef CONFIG_SPARSEMEM_VMEMMAP
165 static void register_page_bootmem_info_section(unsigned long start_pfn)
166 {
167 unsigned long *usemap, mapsize, section_nr, i;
168 struct mem_section *ms;
169 struct page *page, *memmap;
170
171 section_nr = pfn_to_section_nr(start_pfn);
172 ms = __nr_to_section(section_nr);
173
174 /* Get section's memmap address */
175 memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
176
177 /*
178 * Get page for the memmap's phys address
179 * XXX: need more consideration for sparse_vmemmap...
180 */
181 page = virt_to_page(memmap);
182 mapsize = sizeof(struct page) * PAGES_PER_SECTION;
183 mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT;
184
185 /* remember memmap's page */
186 for (i = 0; i < mapsize; i++, page++)
187 get_page_bootmem(section_nr, page, SECTION_INFO);
188
189 usemap = __nr_to_section(section_nr)->pageblock_flags;
190 page = virt_to_page(usemap);
191
192 mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
193
194 for (i = 0; i < mapsize; i++, page++)
195 get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
196
197 }
198 #else /* CONFIG_SPARSEMEM_VMEMMAP */
199 static void register_page_bootmem_info_section(unsigned long start_pfn)
200 {
201 unsigned long *usemap, mapsize, section_nr, i;
202 struct mem_section *ms;
203 struct page *page, *memmap;
204
205 if (!pfn_valid(start_pfn))
206 return;
207
208 section_nr = pfn_to_section_nr(start_pfn);
209 ms = __nr_to_section(section_nr);
210
211 memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
212
213 register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);
214
215 usemap = __nr_to_section(section_nr)->pageblock_flags;
216 page = virt_to_page(usemap);
217
218 mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
219
220 for (i = 0; i < mapsize; i++, page++)
221 get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
222 }
223 #endif /* !CONFIG_SPARSEMEM_VMEMMAP */
224
225 void __init register_page_bootmem_info_node(struct pglist_data *pgdat)
226 {
227 unsigned long i, pfn, end_pfn, nr_pages;
228 int node = pgdat->node_id;
229 struct page *page;
230
231 nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT;
232 page = virt_to_page(pgdat);
233
234 for (i = 0; i < nr_pages; i++, page++)
235 get_page_bootmem(node, page, NODE_INFO);
236
237 pfn = pgdat->node_start_pfn;
238 end_pfn = pgdat_end_pfn(pgdat);
239
240 /* register section info */
241 for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
242 /*
243 * Some platforms can assign the same pfn to multiple nodes - on
244 * node0 as well as nodeN. To avoid registering a pfn against
245 * multiple nodes, we check that this pfn does not already
246 * reside on another node.
247 */
248 if (pfn_valid(pfn) && (early_pfn_to_nid(pfn) == node))
249 register_page_bootmem_info_section(pfn);
250 }
251 }
252 #endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */
253
254 static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
255 bool want_memblock)
256 {
257 int ret;
258 int i;
259
260 if (pfn_valid(phys_start_pfn))
261 return -EEXIST;
262
263 ret = sparse_add_one_section(NODE_DATA(nid), phys_start_pfn);
264 if (ret < 0)
265 return ret;
266
267 /*
268 * Make all the pages reserved so that nobody will stumble over a
269 * half-initialized state.
270 * FIXME: We also have to associate it with a node because page_to_nid
271 * relies on having page with the proper node.
272 */
273 for (i = 0; i < PAGES_PER_SECTION; i++) {
274 unsigned long pfn = phys_start_pfn + i;
275 struct page *page;
276 if (!pfn_valid(pfn))
277 continue;
278
279 page = pfn_to_page(pfn);
280 set_page_node(page, nid);
281 SetPageReserved(page);
282 }
283
284 if (!want_memblock)
285 return 0;
286
287 return register_new_memory(nid, __pfn_to_section(phys_start_pfn));
288 }
289
290 /*
291 * Reasonably generic function for adding memory. It is
292 * expected that archs that support memory hotplug will
293 * call this function after deciding the zone to which to
294 * add the new pages.
295 */
296 int __ref __add_pages(int nid, unsigned long phys_start_pfn,
297 unsigned long nr_pages, bool want_memblock)
298 {
299 unsigned long i;
300 int err = 0;
301 int start_sec, end_sec;
302 struct vmem_altmap *altmap;
303
304 /* during mem_map initialization, align the hot-added range to sections */
305 start_sec = pfn_to_section_nr(phys_start_pfn);
306 end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
307
308 altmap = to_vmem_altmap((unsigned long) pfn_to_page(phys_start_pfn));
309 if (altmap) {
310 /*
311 * Validate altmap is within bounds of the total request
312 */
313 if (altmap->base_pfn != phys_start_pfn
314 || vmem_altmap_offset(altmap) > nr_pages) {
315 pr_warn_once("memory add fail, invalid altmap\n");
316 err = -EINVAL;
317 goto out;
318 }
319 altmap->alloc = 0;
320 }
321
322 for (i = start_sec; i <= end_sec; i++) {
323 err = __add_section(nid, section_nr_to_pfn(i), want_memblock);
324
325 /*
326 * EEXIST is finally dealt with by ioresource collision
327 * check. see add_memory() => register_memory_resource()
328 * A warning will be printed if there is a collision.
329 */
330 if (err && (err != -EEXIST))
331 break;
332 err = 0;
333 cond_resched();
334 }
335 vmemmap_populate_print_last();
336 out:
337 return err;
338 }
339 EXPORT_SYMBOL_GPL(__add_pages);
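/*
 * Call-chain note (summary of this file, nothing new): a typical hot-add
 * flows as add_memory() -> add_memory_resource() -> arch_add_memory() ->
 * __add_pages() -> __add_section(). Onlining happens separately, via
 * online_pages(), once the corresponding memory block device is onlined.
 */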
340
341 #ifdef CONFIG_MEMORY_HOTREMOVE
342 /* find the smallest valid pfn in the range [start_pfn, end_pfn) */
343 static unsigned long find_smallest_section_pfn(int nid, struct zone *zone,
344 unsigned long start_pfn,
345 unsigned long end_pfn)
346 {
347 struct mem_section *ms;
348
349 for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) {
350 ms = __pfn_to_section(start_pfn);
351
352 if (unlikely(!valid_section(ms)))
353 continue;
354
355 if (unlikely(pfn_to_nid(start_pfn) != nid))
356 continue;
357
358 if (zone && zone != page_zone(pfn_to_page(start_pfn)))
359 continue;
360
361 return start_pfn;
362 }
363
364 return 0;
365 }
366
367 /* find the biggest valid pfn in the range [start_pfn, end_pfn). */
368 static unsigned long find_biggest_section_pfn(int nid, struct zone *zone,
369 unsigned long start_pfn,
370 unsigned long end_pfn)
371 {
372 struct mem_section *ms;
373 unsigned long pfn;
374
375 /* pfn is the end pfn of a memory section. */
376 pfn = end_pfn - 1;
377 for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) {
378 ms = __pfn_to_section(pfn);
379
380 if (unlikely(!valid_section(ms)))
381 continue;
382
383 if (unlikely(pfn_to_nid(pfn) != nid))
384 continue;
385
386 if (zone && zone != page_zone(pfn_to_page(pfn)))
387 continue;
388
389 return pfn;
390 }
391
392 return 0;
393 }
394
395 static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
396 unsigned long end_pfn)
397 {
398 unsigned long zone_start_pfn = zone->zone_start_pfn;
399 unsigned long z = zone_end_pfn(zone); /* zone_end_pfn namespace clash */
400 unsigned long zone_end_pfn = z;
401 unsigned long pfn;
402 struct mem_section *ms;
403 int nid = zone_to_nid(zone);
404
405 zone_span_writelock(zone);
406 if (zone_start_pfn == start_pfn) {
407 /*
408 * If the section is the smallest section in the zone, we need to
409 * shrink zone->zone_start_pfn and zone->spanned_pages.
410 * In this case, find the second smallest valid mem_section and
411 * shrink the zone to start there.
412 */
413 pfn = find_smallest_section_pfn(nid, zone, end_pfn,
414 zone_end_pfn);
415 if (pfn) {
416 zone->zone_start_pfn = pfn;
417 zone->spanned_pages = zone_end_pfn - pfn;
418 }
419 } else if (zone_end_pfn == end_pfn) {
420 /*
421 * If the section is the biggest section in the zone, we need to
422 * shrink zone->spanned_pages.
423 * In this case, find the second biggest valid mem_section and
424 * shrink the zone to end there.
425 */
426 pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn,
427 start_pfn);
428 if (pfn)
429 zone->spanned_pages = pfn - zone_start_pfn + 1;
430 }
431
432 /*
433 * The section is neither the biggest nor the smallest mem_section in
434 * the zone; it only creates a hole in the zone, so the zone span does
435 * not need to change. The zone may now consist only of holes, though,
436 * so check whether any valid section remains.
437 */
438 pfn = zone_start_pfn;
439 for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) {
440 ms = __pfn_to_section(pfn);
441
442 if (unlikely(!valid_section(ms)))
443 continue;
444
445 if (page_zone(pfn_to_page(pfn)) != zone)
446 continue;
447
448 /* Skip the section that is being removed */
449 if (start_pfn == pfn)
450 continue;
451
452 /* If we find a valid section, we have nothing to do */
453 zone_span_writeunlock(zone);
454 return;
455 }
456
457 /* The zone has no valid section */
458 zone->zone_start_pfn = 0;
459 zone->spanned_pages = 0;
460 zone_span_writeunlock(zone);
461 }
462
463 static void shrink_pgdat_span(struct pglist_data *pgdat,
464 unsigned long start_pfn, unsigned long end_pfn)
465 {
466 unsigned long pgdat_start_pfn = pgdat->node_start_pfn;
467 unsigned long p = pgdat_end_pfn(pgdat); /* pgdat_end_pfn namespace clash */
468 unsigned long pgdat_end_pfn = p;
469 unsigned long pfn;
470 struct mem_section *ms;
471 int nid = pgdat->node_id;
472
473 if (pgdat_start_pfn == start_pfn) {
474 /*
475 * If the section is the smallest section in the pgdat, we need to
476 * shrink pgdat->node_start_pfn and pgdat->node_spanned_pages.
477 * In this case, find the second smallest valid mem_section and
478 * shrink the pgdat to start there.
479 */
480 pfn = find_smallest_section_pfn(nid, NULL, end_pfn,
481 pgdat_end_pfn);
482 if (pfn) {
483 pgdat->node_start_pfn = pfn;
484 pgdat->node_spanned_pages = pgdat_end_pfn - pfn;
485 }
486 } else if (pgdat_end_pfn == end_pfn) {
487 /*
488 * If the section is the biggest section in the pgdat, we need to
489 * shrink pgdat->node_spanned_pages.
490 * In this case, find the second biggest valid mem_section and
491 * shrink the pgdat to end there.
492 */
493 pfn = find_biggest_section_pfn(nid, NULL, pgdat_start_pfn,
494 start_pfn);
495 if (pfn)
496 pgdat->node_spanned_pages = pfn - pgdat_start_pfn + 1;
497 }
498
499 /*
500 * If the section is neither the biggest nor the smallest mem_section
501 * in the pgdat, it only creates a hole in the pgdat, so the pgdat span
502 * does not need to change.
503 * The pgdat may now consist only of holes, though, so check whether
504 * any valid section remains.
505 */
506 pfn = pgdat_start_pfn;
507 for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SECTION) {
508 ms = __pfn_to_section(pfn);
509
510 if (unlikely(!valid_section(ms)))
511 continue;
512
513 if (pfn_to_nid(pfn) != nid)
514 continue;
515
516 /* Skip the section that is being removed */
517 if (start_pfn == pfn)
518 continue;
519
520 /* If we find a valid section, we have nothing to do */
521 return;
522 }
523
524 /* The pgdat has no valid section */
525 pgdat->node_start_pfn = 0;
526 pgdat->node_spanned_pages = 0;
527 }
528
529 static void __remove_zone(struct zone *zone, unsigned long start_pfn)
530 {
531 struct pglist_data *pgdat = zone->zone_pgdat;
532 int nr_pages = PAGES_PER_SECTION;
533 unsigned long flags;
534
535 pgdat_resize_lock(zone->zone_pgdat, &flags);
536 shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
537 shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages);
538 pgdat_resize_unlock(zone->zone_pgdat, &flags);
539 }
540
541 static int __remove_section(struct zone *zone, struct mem_section *ms,
542 unsigned long map_offset)
543 {
544 unsigned long start_pfn;
545 int scn_nr;
546 int ret = -EINVAL;
547
548 if (!valid_section(ms))
549 return ret;
550
551 ret = unregister_memory_section(ms);
552 if (ret)
553 return ret;
554
555 scn_nr = __section_nr(ms);
556 start_pfn = section_nr_to_pfn((unsigned long)scn_nr);
557 __remove_zone(zone, start_pfn);
558
559 sparse_remove_one_section(zone, ms, map_offset);
560 return 0;
561 }
562
563 /**
564 * __remove_pages() - remove sections of pages from a zone
565 * @zone: zone from which pages need to be removed
566 * @phys_start_pfn: starting pageframe (must be aligned to start of a section)
567 * @nr_pages: number of pages to remove (must be multiple of section size)
568 *
569 * Generic helper function to remove section mappings and sysfs entries
570 * for the section of the memory we are removing. Caller needs to make
571 * sure that pages are marked reserved and zones are adjusted properly by
572 * calling offline_pages().
573 */
574 int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
575 unsigned long nr_pages)
576 {
577 unsigned long i;
578 unsigned long map_offset = 0;
579 int sections_to_remove, ret = 0;
580
581 /* In the ZONE_DEVICE case the device driver owns the memory region */
582 if (is_dev_zone(zone)) {
583 struct page *page = pfn_to_page(phys_start_pfn);
584 struct vmem_altmap *altmap;
585
586 altmap = to_vmem_altmap((unsigned long) page);
587 if (altmap)
588 map_offset = vmem_altmap_offset(altmap);
589 } else {
590 resource_size_t start, size;
591
592 start = phys_start_pfn << PAGE_SHIFT;
593 size = nr_pages * PAGE_SIZE;
594
595 ret = release_mem_region_adjustable(&iomem_resource, start,
596 size);
597 if (ret) {
598 resource_size_t endres = start + size - 1;
599
600 pr_warn("Unable to release resource <%pa-%pa> (%d)\n",
601 &start, &endres, ret);
602 }
603 }
604
605 clear_zone_contiguous(zone);
606
607 /*
608 * We can only remove entire sections
609 */
610 BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
611 BUG_ON(nr_pages % PAGES_PER_SECTION);
612
613 sections_to_remove = nr_pages / PAGES_PER_SECTION;
614 for (i = 0; i < sections_to_remove; i++) {
615 unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
616
617 ret = __remove_section(zone, __pfn_to_section(pfn), map_offset);
618 map_offset = 0;
619 if (ret)
620 break;
621 }
622
623 set_zone_contiguous(zone);
624
625 return ret;
626 }
627 #endif /* CONFIG_MEMORY_HOTREMOVE */
628
629 int set_online_page_callback(online_page_callback_t callback)
630 {
631 int rc = -EINVAL;
632
633 get_online_mems();
634 mutex_lock(&online_page_callback_lock);
635
636 if (online_page_callback == generic_online_page) {
637 online_page_callback = callback;
638 rc = 0;
639 }
640
641 mutex_unlock(&online_page_callback_lock);
642 put_online_mems();
643
644 return rc;
645 }
646 EXPORT_SYMBOL_GPL(set_online_page_callback);
647
648 int restore_online_page_callback(online_page_callback_t callback)
649 {
650 int rc = -EINVAL;
651
652 get_online_mems();
653 mutex_lock(&online_page_callback_lock);
654
655 if (online_page_callback == callback) {
656 online_page_callback = generic_online_page;
657 rc = 0;
658 }
659
660 mutex_unlock(&online_page_callback_lock);
661 put_online_mems();
662
663 return rc;
664 }
665 EXPORT_SYMBOL_GPL(restore_online_page_callback);
666
667 void __online_page_set_limits(struct page *page)
668 {
669 }
670 EXPORT_SYMBOL_GPL(__online_page_set_limits);
671
672 void __online_page_increment_counters(struct page *page)
673 {
674 adjust_managed_page_count(page, 1);
675 }
676 EXPORT_SYMBOL_GPL(__online_page_increment_counters);
677
678 void __online_page_free(struct page *page)
679 {
680 __free_reserved_page(page);
681 }
682 EXPORT_SYMBOL_GPL(__online_page_free);
683
684 static void generic_online_page(struct page *page)
685 {
686 __online_page_set_limits(page);
687 __online_page_increment_counters(page);
688 __online_page_free(page);
689 }
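/*
 * Illustrative sketch (hypothetical driver code, not part of this file): a
 * ballooning driver may want to intercept pages as they are onlined instead
 * of handing all of them straight to the page allocator. With the hooks
 * above that could look roughly like:
 *
 *	static void my_online_page(struct page *page)
 *	{
 *		__online_page_set_limits(page);
 *		if (keep_page_in_balloon(page))		// hypothetical helper
 *			return;
 *		__online_page_increment_counters(page);
 *		__online_page_free(page);
 *	}
 *
 *	set_online_page_callback(my_online_page);
 *	...
 *	restore_online_page_callback(my_online_page);
 */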
690
691 static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
692 void *arg)
693 {
694 unsigned long i;
695 unsigned long onlined_pages = *(unsigned long *)arg;
696 struct page *page;
697
698 if (PageReserved(pfn_to_page(start_pfn)))
699 for (i = 0; i < nr_pages; i++) {
700 page = pfn_to_page(start_pfn + i);
701 (*online_page_callback)(page);
702 onlined_pages++;
703 }
704
705 online_mem_sections(start_pfn, start_pfn + nr_pages);
706
707 *(unsigned long *)arg = onlined_pages;
708 return 0;
709 }
710
711 /* check which states of node_states will be changed when onlining memory */
712 static void node_states_check_changes_online(unsigned long nr_pages,
713 struct zone *zone, struct memory_notify *arg)
714 {
715 int nid = zone_to_nid(zone);
716 enum zone_type zone_last = ZONE_NORMAL;
717
718 /*
719 * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
720 * contains nodes which have zones of 0...ZONE_NORMAL,
721 * set zone_last to ZONE_NORMAL.
722 *
723 * If we have neither HIGHMEM nor a movable node,
724 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
725 * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
726 */
727 if (N_MEMORY == N_NORMAL_MEMORY)
728 zone_last = ZONE_MOVABLE;
729
730 /*
731 * If the memory to be onlined is in a zone of 0...zone_last, and
732 * the zones of 0...zone_last don't have memory before onlining, we will
733 * need to set the node in node_states[N_NORMAL_MEMORY] after
734 * the memory is onlined.
735 */
736 if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY))
737 arg->status_change_nid_normal = nid;
738 else
739 arg->status_change_nid_normal = -1;
740
741 #ifdef CONFIG_HIGHMEM
742 /*
743 * If we have movable node, node_states[N_HIGH_MEMORY]
744 * contains nodes which have zones of 0...ZONE_HIGHMEM,
745 * set zone_last to ZONE_HIGHMEM.
746 *
747 * If we don't have movable node, node_states[N_NORMAL_MEMORY]
748 * contains nodes which have zones of 0...ZONE_MOVABLE,
749 * set zone_last to ZONE_MOVABLE.
750 */
751 zone_last = ZONE_HIGHMEM;
752 if (N_MEMORY == N_HIGH_MEMORY)
753 zone_last = ZONE_MOVABLE;
754
755 if (zone_idx(zone) <= zone_last && !node_state(nid, N_HIGH_MEMORY))
756 arg->status_change_nid_high = nid;
757 else
758 arg->status_change_nid_high = -1;
759 #else
760 arg->status_change_nid_high = arg->status_change_nid_normal;
761 #endif
762
763 /*
764 * If the node doesn't have memory before onlining, we will need to
765 * set the node in node_states[N_MEMORY] after the memory
766 * is onlined.
767 */
768 if (!node_state(nid, N_MEMORY))
769 arg->status_change_nid = nid;
770 else
771 arg->status_change_nid = -1;
772 }
773
774 static void node_states_set_node(int node, struct memory_notify *arg)
775 {
776 if (arg->status_change_nid_normal >= 0)
777 node_set_state(node, N_NORMAL_MEMORY);
778
779 if (arg->status_change_nid_high >= 0)
780 node_set_state(node, N_HIGH_MEMORY);
781
782 node_set_state(node, N_MEMORY);
783 }
784
785 static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn,
786 unsigned long nr_pages)
787 {
788 unsigned long old_end_pfn = zone_end_pfn(zone);
789
790 if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn)
791 zone->zone_start_pfn = start_pfn;
792
793 zone->spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - zone->zone_start_pfn;
794 }
795
796 static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned long start_pfn,
797 unsigned long nr_pages)
798 {
799 unsigned long old_end_pfn = pgdat_end_pfn(pgdat);
800
801 if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
802 pgdat->node_start_pfn = start_pfn;
803
804 pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn;
805 }
806
807 void __ref move_pfn_range_to_zone(struct zone *zone,
808 unsigned long start_pfn, unsigned long nr_pages)
809 {
810 struct pglist_data *pgdat = zone->zone_pgdat;
811 int nid = pgdat->node_id;
812 unsigned long flags;
813
814 if (zone_is_empty(zone))
815 init_currently_empty_zone(zone, start_pfn, nr_pages);
816
817 clear_zone_contiguous(zone);
818
819 /* TODO: pgdat locking is irqsave while the zone locking is not. It used to be like that before */
820 pgdat_resize_lock(pgdat, &flags);
821 zone_span_writelock(zone);
822 resize_zone_range(zone, start_pfn, nr_pages);
823 zone_span_writeunlock(zone);
824 resize_pgdat_range(pgdat, start_pfn, nr_pages);
825 pgdat_resize_unlock(pgdat, &flags);
826
827 /*
828 * TODO now we have a visible range of pages which are not associated
829 * with their zone properly. Not nice, but set_pfnblock_flags_mask
830 * expects the zone to span the pfn range. All the pages in the range
831 * are reserved, so nobody should be touching them and we should be safe
832 */
833 memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn, MEMMAP_HOTPLUG);
834
835 set_zone_contiguous(zone);
836 }
837
838 void set_default_mem_hotplug_zone(enum zone_type zone)
839 {
840 default_kernel_zone = zone;
841 }
842
843 #ifdef CONFIG_HIGHMEM
844 #define MAX_KERNEL_ZONE ZONE_HIGHMEM
845 #else
846 #define MAX_KERNEL_ZONE ZONE_NORMAL
847 #endif
848
849 /*
850 * Returns a default kernel memory zone for the given pfn range.
851 * If no kernel zone covers this pfn range it will automatically fall
852 * back to the default kernel zone (see set_default_mem_hotplug_zone()).
853 */
854 static struct zone *default_kernel_zone_for_pfn(int nid, unsigned long start_pfn,
855 unsigned long nr_pages)
856 {
857 struct pglist_data *pgdat = NODE_DATA(nid);
858 int zid;
859
860 for (zid = 0; zid <= MAX_KERNEL_ZONE; zid++) {
861 struct zone *zone = &pgdat->node_zones[zid];
862
863 if (zone_intersects(zone, start_pfn, nr_pages))
864 return zone;
865 }
866
867 return &pgdat->node_zones[default_kernel_zone];
868 }
869
870 static inline struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn,
871 unsigned long nr_pages)
872 {
873 struct zone *kernel_zone = default_kernel_zone_for_pfn(nid, start_pfn,
874 nr_pages);
875 struct zone *movable_zone = &NODE_DATA(nid)->node_zones[ZONE_MOVABLE];
876 bool in_kernel = zone_intersects(kernel_zone, start_pfn, nr_pages);
877 bool in_movable = zone_intersects(movable_zone, start_pfn, nr_pages);
878
879 /*
880 * We inherit the existing zone in a simple case where zones do not
881 * overlap in the given range
882 */
883 if (in_kernel ^ in_movable)
884 return (in_kernel) ? kernel_zone : movable_zone;
885
886 /*
887 * If the range doesn't belong to any zone or two zones overlap in the
888 * given range then we use movable zone only if movable_node is
889 * enabled because we always online to a kernel zone by default.
890 */
891 return movable_node_enabled ? movable_zone : kernel_zone;
892 }
893
894 struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
895 unsigned long nr_pages)
896 {
897 if (online_type == MMOP_ONLINE_KERNEL)
898 return default_kernel_zone_for_pfn(nid, start_pfn, nr_pages);
899
900 if (online_type == MMOP_ONLINE_MOVABLE)
901 return &NODE_DATA(nid)->node_zones[ZONE_MOVABLE];
902
903 return default_zone_for_pfn(nid, start_pfn, nr_pages);
904 }
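/*
 * Note (summary, no new behaviour): the online_type values correspond to the
 * memory block sysfs interface; writing "online_kernel", "online_movable" or
 * plain "online" to /sys/devices/system/memory/memoryN/state ends up here as
 * MMOP_ONLINE_KERNEL, MMOP_ONLINE_MOVABLE or the keep-default case handled by
 * default_zone_for_pfn().
 */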
905
906 /*
907 * Associates the given pfn range with the given node and the zone appropriate
908 * for the given online type.
909 */
910 static struct zone * __meminit move_pfn_range(int online_type, int nid,
911 unsigned long start_pfn, unsigned long nr_pages)
912 {
913 struct zone *zone;
914
915 zone = zone_for_pfn_range(online_type, nid, start_pfn, nr_pages);
916 move_pfn_range_to_zone(zone, start_pfn, nr_pages);
917 return zone;
918 }
919
920 /* Must be protected by mem_hotplug_begin() or a device_lock */
921 int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
922 {
923 unsigned long flags;
924 unsigned long onlined_pages = 0;
925 struct zone *zone;
926 int need_zonelists_rebuild = 0;
927 int nid;
928 int ret;
929 struct memory_notify arg;
930
931 nid = pfn_to_nid(pfn);
932 /* associate pfn range with the zone */
933 zone = move_pfn_range(online_type, nid, pfn, nr_pages);
934
935 arg.start_pfn = pfn;
936 arg.nr_pages = nr_pages;
937 node_states_check_changes_online(nr_pages, zone, &arg);
938
939 ret = memory_notify(MEM_GOING_ONLINE, &arg);
940 ret = notifier_to_errno(ret);
941 if (ret)
942 goto failed_addition;
943
944 /*
945 * If this zone is not populated, then it is not in the zonelist.
946 * This means the page allocator ignores this zone.
947 * So, the zonelist must be updated after onlining.
948 */
949 if (!populated_zone(zone)) {
950 need_zonelists_rebuild = 1;
951 setup_zone_pageset(zone);
952 }
953
954 ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
955 online_pages_range);
956 if (ret) {
957 if (need_zonelists_rebuild)
958 zone_pcp_reset(zone);
959 goto failed_addition;
960 }
961
962 zone->present_pages += onlined_pages;
963
964 pgdat_resize_lock(zone->zone_pgdat, &flags);
965 zone->zone_pgdat->node_present_pages += onlined_pages;
966 pgdat_resize_unlock(zone->zone_pgdat, &flags);
967
968 if (onlined_pages) {
969 node_states_set_node(nid, &arg);
970 if (need_zonelists_rebuild)
971 build_all_zonelists(NULL);
972 else
973 zone_pcp_update(zone);
974 }
975
976 init_per_zone_wmark_min();
977
978 if (onlined_pages) {
979 kswapd_run(nid);
980 kcompactd_run(nid);
981 }
982
983 vm_total_pages = nr_free_pagecache_pages();
984
985 writeback_set_ratelimit();
986
987 if (onlined_pages)
988 memory_notify(MEM_ONLINE, &arg);
989 return 0;
990
991 failed_addition:
992 pr_debug("online_pages [mem %#010llx-%#010llx] failed\n",
993 (unsigned long long) pfn << PAGE_SHIFT,
994 (((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1);
995 memory_notify(MEM_CANCEL_ONLINE, &arg);
996 return ret;
997 }
998 #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
999
1000 static void reset_node_present_pages(pg_data_t *pgdat)
1001 {
1002 struct zone *z;
1003
1004 for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
1005 z->present_pages = 0;
1006
1007 pgdat->node_present_pages = 0;
1008 }
1009
1010 /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
1011 static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
1012 {
1013 struct pglist_data *pgdat;
1014 unsigned long zones_size[MAX_NR_ZONES] = {0};
1015 unsigned long zholes_size[MAX_NR_ZONES] = {0};
1016 unsigned long start_pfn = PFN_DOWN(start);
1017
1018 pgdat = NODE_DATA(nid);
1019 if (!pgdat) {
1020 pgdat = arch_alloc_nodedata(nid);
1021 if (!pgdat)
1022 return NULL;
1023
1024 arch_refresh_nodedata(nid, pgdat);
1025 } else {
1026 /*
1027 * Reset the nr_zones, order and classzone_idx before reuse.
1028 * Note that kswapd will init kswapd_classzone_idx properly
1029 * when it starts in the near future.
1030 */
1031 pgdat->nr_zones = 0;
1032 pgdat->kswapd_order = 0;
1033 pgdat->kswapd_classzone_idx = 0;
1034 }
1035
1036 /* we can use NODE_DATA(nid) from here */
1037
1038 /* init node's zones as empty zones; we don't have any present pages. */
1039 free_area_init_node(nid, zones_size, start_pfn, zholes_size);
1040 pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat);
1041
1042 /*
1043 * The node we allocated has no zone fallback lists. To avoid
1044 * accessing an uninitialized zonelist, build it here.
1045 */
1046 build_all_zonelists(pgdat);
1047
1048 /*
1049 * zone->managed_pages is set to an approximate value in
1050 * free_area_init_core(), which will cause
1051 * /sys/devices/system/node/nodeX/meminfo to report wrong data.
1052 * So reset it to 0 before any memory is onlined.
1053 */
1054 reset_node_managed_pages(pgdat);
1055
1056 /*
1057 * When memory is hot-added, all the memory is in offline state. So
1058 * clear all zones' present_pages because they will be updated in
1059 * online_pages() and offline_pages().
1060 */
1061 reset_node_present_pages(pgdat);
1062
1063 return pgdat;
1064 }
1065
1066 static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
1067 {
1068 arch_refresh_nodedata(nid, NULL);
1069 free_percpu(pgdat->per_cpu_nodestats);
1070 arch_free_nodedata(pgdat);
1071 return;
1072 }
1073
1074
1075 /**
1076 * try_online_node - online a node if offlined
1077 *
1078 * called by cpu_up() to online a node without onlined memory.
1079 */
1080 int try_online_node(int nid)
1081 {
1082 pg_data_t *pgdat;
1083 int ret;
1084
1085 if (node_online(nid))
1086 return 0;
1087
1088 mem_hotplug_begin();
1089 pgdat = hotadd_new_pgdat(nid, 0);
1090 if (!pgdat) {
1091 pr_err("Cannot online node %d due to NULL pgdat\n", nid);
1092 ret = -ENOMEM;
1093 goto out;
1094 }
1095 node_set_online(nid);
1096 ret = register_one_node(nid);
1097 BUG_ON(ret);
1098 out:
1099 mem_hotplug_done();
1100 return ret;
1101 }
1102
1103 static int check_hotplug_memory_range(u64 start, u64 size)
1104 {
1105 u64 start_pfn = PFN_DOWN(start);
1106 u64 nr_pages = size >> PAGE_SHIFT;
1107
1108 /* Memory range must be aligned with section */
1109 if ((start_pfn & ~PAGE_SECTION_MASK) ||
1110 (nr_pages % PAGES_PER_SECTION) || (!nr_pages)) {
1111 pr_err("Section-unaligned hotplug range: start 0x%llx, size 0x%llx\n",
1112 (unsigned long long)start,
1113 (unsigned long long)size);
1114 return -EINVAL;
1115 }
1116
1117 return 0;
1118 }
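/*
 * Worked example (illustrative, assuming the common x86-64 configuration of
 * 128 MiB sections, i.e. PAGES_PER_SECTION == 32768 with 4 KiB pages): a
 * request with start = 0x100000000 and size = 0x10000000 (256 MiB) passes
 * this check, while size = 0x10100000 (257 MiB) is rejected because it is
 * not a multiple of the section size.
 */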
1119
1120 static int online_memory_block(struct memory_block *mem, void *arg)
1121 {
1122 return device_online(&mem->dev);
1123 }
1124
1125 /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
1126 int __ref add_memory_resource(int nid, struct resource *res, bool online)
1127 {
1128 u64 start, size;
1129 pg_data_t *pgdat = NULL;
1130 bool new_pgdat;
1131 bool new_node;
1132 int ret;
1133
1134 start = res->start;
1135 size = resource_size(res);
1136
1137 ret = check_hotplug_memory_range(start, size);
1138 if (ret)
1139 return ret;
1140
1141 { /* Stupid hack to suppress address-never-null warning */
1142 void *p = NODE_DATA(nid);
1143 new_pgdat = !p;
1144 }
1145
1146 mem_hotplug_begin();
1147
1148 /*
1149 * Add new range to memblock so that when hotadd_new_pgdat() is called
1150 * to allocate new pgdat, get_pfn_range_for_nid() will be able to find
1151 * this new range and calculate total pages correctly. The range will
1152 * be removed at hot-remove time.
1153 */
1154 memblock_add_node(start, size, nid);
1155
1156 new_node = !node_online(nid);
1157 if (new_node) {
1158 pgdat = hotadd_new_pgdat(nid, start);
1159 ret = -ENOMEM;
1160 if (!pgdat)
1161 goto error;
1162 }
1163
1164 /* call arch's memory hotadd */
1165 ret = arch_add_memory(nid, start, size, true);
1166
1167 if (ret < 0)
1168 goto error;
1169
1170 /* we online the node here; we can't roll back from here. */
1171 node_set_online(nid);
1172
1173 if (new_node) {
1174 unsigned long start_pfn = start >> PAGE_SHIFT;
1175 unsigned long nr_pages = size >> PAGE_SHIFT;
1176
1177 ret = __register_one_node(nid);
1178 if (ret)
1179 goto register_fail;
1180
1181 /*
1182 * link memory sections under this node. This is already
1183 * done when creating memory sections in register_new_memory,
1184 * but that depends on the node being registered, so offline
1185 * nodes have to go through register_node.
1186 * TODO clean up this mess.
1187 */
1188 ret = link_mem_sections(nid, start_pfn, nr_pages);
1189 register_fail:
1190 /*
1191 * If the sysfs files for the new node can't be created, CPUs on the
1192 * node can't be hot-added. There is no way to roll back now,
1193 * so check with BUG_ON() to catch it, reluctantly.
1194 */
1195 BUG_ON(ret);
1196 }
1197
1198 /* create new memmap entry */
1199 firmware_map_add_hotplug(start, start + size, "System RAM");
1200
1201 /* online pages if requested */
1202 if (online)
1203 walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1),
1204 NULL, online_memory_block);
1205
1206 goto out;
1207
1208 error:
1209 /* rollback pgdat allocation and others */
1210 if (new_pgdat && pgdat)
1211 rollback_node_hotadd(nid, pgdat);
1212 memblock_remove(start, size);
1213
1214 out:
1215 mem_hotplug_done();
1216 return ret;
1217 }
1218 EXPORT_SYMBOL_GPL(add_memory_resource);
1219
1220 int __ref add_memory(int nid, u64 start, u64 size)
1221 {
1222 struct resource *res;
1223 int ret;
1224
1225 res = register_memory_resource(start, size);
1226 if (IS_ERR(res))
1227 return PTR_ERR(res);
1228
1229 ret = add_memory_resource(nid, res, memhp_auto_online);
1230 if (ret < 0)
1231 release_memory_resource(res);
1232 return ret;
1233 }
1234 EXPORT_SYMBOL_GPL(add_memory);
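/*
 * Usage sketch (hypothetical caller, not from this tree): a platform driver
 * that discovers a new memory device would typically do
 *
 *	lock_device_hotplug();
 *	rc = add_memory(nid, start, size);	// section-aligned range
 *	unlock_device_hotplug();
 *
 * following the same lock_device_hotplug() convention documented for
 * remove_memory() and try_offline_node() below. The new blocks are onlined
 * automatically when memhp_auto_online is set.
 */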
1235
1236 #ifdef CONFIG_MEMORY_HOTREMOVE
1237 /*
1238 * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy
1239 * set and the size of the free page is given by page_order(). Using this,
1240 * the function determines if the pageblock contains only free pages.
1241 * Due to buddy constraints, a free page at least the size of a pageblock will
1242 * be located at the start of the pageblock.
1243 */
1244 static inline int pageblock_free(struct page *page)
1245 {
1246 return PageBuddy(page) && page_order(page) >= pageblock_order;
1247 }
1248
1249 /* Return the start of the next active pageblock after a given page */
1250 static struct page *next_active_pageblock(struct page *page)
1251 {
1252 /* Ensure the starting page is pageblock-aligned */
1253 BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1));
1254
1255 /* If the entire pageblock is free, move to the end of free page */
1256 if (pageblock_free(page)) {
1257 int order;
1258 /* be careful. we don't have locks, page_order can be changed.*/
1259 order = page_order(page);
1260 if ((order < MAX_ORDER) && (order >= pageblock_order))
1261 return page + (1 << order);
1262 }
1263
1264 return page + pageblock_nr_pages;
1265 }
1266
1267 /* Checks if this range of memory is likely to be hot-removable. */
1268 bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
1269 {
1270 struct page *page = pfn_to_page(start_pfn);
1271 struct page *end_page = page + nr_pages;
1272
1273 /* Check the starting page of each pageblock within the range */
1274 for (; page < end_page; page = next_active_pageblock(page)) {
1275 if (!is_pageblock_removable_nolock(page))
1276 return false;
1277 cond_resched();
1278 }
1279
1280 /* All pageblocks in the memory block are likely to be hot-removable */
1281 return true;
1282 }
1283
1284 /*
1285 * Confirm all pages in a range [start, end) belong to the same zone.
1286 * When true, return its valid [start, end).
1287 */
1288 int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn,
1289 unsigned long *valid_start, unsigned long *valid_end)
1290 {
1291 unsigned long pfn, sec_end_pfn;
1292 unsigned long start, end;
1293 struct zone *zone = NULL;
1294 struct page *page;
1295 int i;
1296 for (pfn = start_pfn, sec_end_pfn = SECTION_ALIGN_UP(start_pfn + 1);
1297 pfn < end_pfn;
1298 pfn = sec_end_pfn, sec_end_pfn += PAGES_PER_SECTION) {
1299 /* Make sure the memory section is present first */
1300 if (!present_section_nr(pfn_to_section_nr(pfn)))
1301 continue;
1302 for (; pfn < sec_end_pfn && pfn < end_pfn;
1303 pfn += MAX_ORDER_NR_PAGES) {
1304 i = 0;
1305 /* This is just a CONFIG_HOLES_IN_ZONE check. */
1306 while ((i < MAX_ORDER_NR_PAGES) &&
1307 !pfn_valid_within(pfn + i))
1308 i++;
1309 if (i == MAX_ORDER_NR_PAGES || pfn + i >= end_pfn)
1310 continue;
1311 page = pfn_to_page(pfn + i);
1312 if (zone && page_zone(page) != zone)
1313 return 0;
1314 if (!zone)
1315 start = pfn + i;
1316 zone = page_zone(page);
1317 end = pfn + MAX_ORDER_NR_PAGES;
1318 }
1319 }
1320
1321 if (zone) {
1322 *valid_start = start;
1323 *valid_end = min(end, end_pfn);
1324 return 1;
1325 } else {
1326 return 0;
1327 }
1328 }
1329
1330 /*
1331 * Scan pfn range [start,end) to find movable/migratable pages (LRU pages,
1332 * non-lru movable pages and hugepages). We scan pfns because it's much
1333 * easier than scanning over a linked list. This function returns the pfn
1334 * of the first movable page found, otherwise 0.
1335 */
1336 static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
1337 {
1338 unsigned long pfn;
1339 struct page *page;
1340 for (pfn = start; pfn < end; pfn++) {
1341 if (pfn_valid(pfn)) {
1342 page = pfn_to_page(pfn);
1343 if (PageLRU(page))
1344 return pfn;
1345 if (__PageMovable(page))
1346 return pfn;
1347 if (PageHuge(page)) {
1348 if (page_huge_active(page))
1349 return pfn;
1350 else
1351 pfn = round_up(pfn + 1,
1352 1 << compound_order(page)) - 1;
1353 }
1354 }
1355 }
1356 return 0;
1357 }
1358
1359 static struct page *new_node_page(struct page *page, unsigned long private,
1360 int **result)
1361 {
1362 int nid = page_to_nid(page);
1363 nodemask_t nmask = node_states[N_MEMORY];
1364
1365 /*
1366 * try to allocate from a different node but reuse this node if there
1367 * are no other online nodes to be used (e.g. we are offlining a part
1368 * of the only existing node)
1369 */
1370 node_clear(nid, nmask);
1371 if (nodes_empty(nmask))
1372 node_set(nid, nmask);
1373
1374 return new_page_nodemask(page, nid, &nmask);
1375 }
1376
1377 #define NR_OFFLINE_AT_ONCE_PAGES (256)
1378 static int
1379 do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
1380 {
1381 unsigned long pfn;
1382 struct page *page;
1383 int move_pages = NR_OFFLINE_AT_ONCE_PAGES;
1384 int not_managed = 0;
1385 int ret = 0;
1386 LIST_HEAD(source);
1387
1388 for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) {
1389 if (!pfn_valid(pfn))
1390 continue;
1391 page = pfn_to_page(pfn);
1392
1393 if (PageHuge(page)) {
1394 struct page *head = compound_head(page);
1395 pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1;
1396 if (compound_order(head) > PFN_SECTION_SHIFT) {
1397 ret = -EBUSY;
1398 break;
1399 }
1400 if (isolate_huge_page(page, &source))
1401 move_pages -= 1 << compound_order(head);
1402 continue;
1403 } else if (thp_migration_supported() && PageTransHuge(page))
1404 pfn = page_to_pfn(compound_head(page))
1405 + hpage_nr_pages(page) - 1;
1406
1407 if (!get_page_unless_zero(page))
1408 continue;
1409 /*
1410 * We can skip free pages. We can also handle pages on the
1411 * LRU as well as non-lru movable pages.
1412 */
1413 if (PageLRU(page))
1414 ret = isolate_lru_page(page);
1415 else
1416 ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
1417 if (!ret) { /* Success */
1418 put_page(page);
1419 list_add_tail(&page->lru, &source);
1420 move_pages--;
1421 if (!__PageMovable(page))
1422 inc_node_page_state(page, NR_ISOLATED_ANON +
1423 page_is_file_cache(page));
1424
1425 } else {
1426 #ifdef CONFIG_DEBUG_VM
1427 pr_alert("failed to isolate pfn %lx\n", pfn);
1428 dump_page(page, "isolation failed");
1429 #endif
1430 put_page(page);
1431 /* Because we don't hold the zone->lock here, we have to
1432 check this again. */
1433 if (page_count(page)) {
1434 not_managed++;
1435 ret = -EBUSY;
1436 break;
1437 }
1438 }
1439 }
1440 if (!list_empty(&source)) {
1441 if (not_managed) {
1442 putback_movable_pages(&source);
1443 goto out;
1444 }
1445
1446 /* Allocate a new page from the nearest neighbor node */
1447 ret = migrate_pages(&source, new_node_page, NULL, 0,
1448 MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
1449 if (ret)
1450 putback_movable_pages(&source);
1451 }
1452 out:
1453 return ret;
1454 }
1455
1456 /*
1457 * remove from free_area[] and mark all as Reserved.
1458 */
1459 static int
1460 offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
1461 void *data)
1462 {
1463 __offline_isolated_pages(start, start + nr_pages);
1464 return 0;
1465 }
1466
1467 static void
1468 offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
1469 {
1470 walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL,
1471 offline_isolated_pages_cb);
1472 }
1473
1474 /*
1475 * Check that all pages in the range, recorded as a memory resource, are isolated.
1476 */
1477 static int
1478 check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
1479 void *data)
1480 {
1481 int ret;
1482 long offlined = *(long *)data;
1483 ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true);
1484 offlined = nr_pages;
1485 if (!ret)
1486 *(long *)data += offlined;
1487 return ret;
1488 }
1489
1490 static long
1491 check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
1492 {
1493 long offlined = 0;
1494 int ret;
1495
1496 ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &offlined,
1497 check_pages_isolated_cb);
1498 if (ret < 0)
1499 offlined = (long)ret;
1500 return offlined;
1501 }
1502
1503 static int __init cmdline_parse_movable_node(char *p)
1504 {
1505 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
1506 movable_node_enabled = true;
1507 #else
1508 pr_warn("movable_node parameter depends on CONFIG_HAVE_MEMBLOCK_NODE_MAP to work properly\n");
1509 #endif
1510 return 0;
1511 }
1512 early_param("movable_node", cmdline_parse_movable_node);
1513
1514 /* check which states of node_states will be changed when offlining memory */
1515 static void node_states_check_changes_offline(unsigned long nr_pages,
1516 struct zone *zone, struct memory_notify *arg)
1517 {
1518 struct pglist_data *pgdat = zone->zone_pgdat;
1519 unsigned long present_pages = 0;
1520 enum zone_type zt, zone_last = ZONE_NORMAL;
1521
1522 /*
1523 * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
1524 * contains nodes which have zones of 0...ZONE_NORMAL,
1525 * set zone_last to ZONE_NORMAL.
1526 *
1527 * If we have neither HIGHMEM nor a movable node,
1528 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
1529 * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
1530 */
1531 if (N_MEMORY == N_NORMAL_MEMORY)
1532 zone_last = ZONE_MOVABLE;
1533
1534 /*
1535 * check whether node_states[N_NORMAL_MEMORY] will be changed.
1536 * If the memory to be offlined is in a zone of 0...zone_last,
1537 * and it is the last present memory there, 0...zone_last will
1538 * become empty after offlining, thus we can determine that we will
1539 * need to clear the node from node_states[N_NORMAL_MEMORY].
1540 */
1541 for (zt = 0; zt <= zone_last; zt++)
1542 present_pages += pgdat->node_zones[zt].present_pages;
1543 if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
1544 arg->status_change_nid_normal = zone_to_nid(zone);
1545 else
1546 arg->status_change_nid_normal = -1;
1547
1548 #ifdef CONFIG_HIGHMEM
1549 /*
1550 * If we have movable node, node_states[N_HIGH_MEMORY]
1551 * contains nodes which have zones of 0...ZONE_HIGHMEM,
1552 * set zone_last to ZONE_HIGHMEM.
1553 *
1554 * If we don't have movable node, node_states[N_NORMAL_MEMORY]
1555 * contains nodes which have zones of 0...ZONE_MOVABLE,
1556 * set zone_last to ZONE_MOVABLE.
1557 */
1558 zone_last = ZONE_HIGHMEM;
1559 if (N_MEMORY == N_HIGH_MEMORY)
1560 zone_last = ZONE_MOVABLE;
1561
1562 for (; zt <= zone_last; zt++)
1563 present_pages += pgdat->node_zones[zt].present_pages;
1564 if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
1565 arg->status_change_nid_high = zone_to_nid(zone);
1566 else
1567 arg->status_change_nid_high = -1;
1568 #else
1569 arg->status_change_nid_high = arg->status_change_nid_normal;
1570 #endif
1571
1572 /*
1573 * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE
1574 */
1575 zone_last = ZONE_MOVABLE;
1576
1577 /*
1578 * check whether node_states[N_HIGH_MEMORY] will be changed.
1579 * If we try to offline the last present @nr_pages from the node,
1580 * we can determine that we will need to clear the node from
1581 * node_states[N_HIGH_MEMORY].
1582 */
1583 for (; zt <= zone_last; zt++)
1584 present_pages += pgdat->node_zones[zt].present_pages;
1585 if (nr_pages >= present_pages)
1586 arg->status_change_nid = zone_to_nid(zone);
1587 else
1588 arg->status_change_nid = -1;
1589 }
1590
1591 static void node_states_clear_node(int node, struct memory_notify *arg)
1592 {
1593 if (arg->status_change_nid_normal >= 0)
1594 node_clear_state(node, N_NORMAL_MEMORY);
1595
1596 if ((N_MEMORY != N_NORMAL_MEMORY) &&
1597 (arg->status_change_nid_high >= 0))
1598 node_clear_state(node, N_HIGH_MEMORY);
1599
1600 if ((N_MEMORY != N_HIGH_MEMORY) &&
1601 (arg->status_change_nid >= 0))
1602 node_clear_state(node, N_MEMORY);
1603 }
1604
1605 static int __ref __offline_pages(unsigned long start_pfn,
1606 unsigned long end_pfn)
1607 {
1608 unsigned long pfn, nr_pages;
1609 long offlined_pages;
1610 int ret, node;
1611 unsigned long flags;
1612 unsigned long valid_start, valid_end;
1613 struct zone *zone;
1614 struct memory_notify arg;
1615
1616 /* at a minimum, alignment to pageblock boundaries is necessary */
1617 if (!IS_ALIGNED(start_pfn, pageblock_nr_pages))
1618 return -EINVAL;
1619 if (!IS_ALIGNED(end_pfn, pageblock_nr_pages))
1620 return -EINVAL;
1621 /* This makes hotplug much easier... and readable.
1622 We assume this for now. */
1623 if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start, &valid_end))
1624 return -EINVAL;
1625
1626 zone = page_zone(pfn_to_page(valid_start));
1627 node = zone_to_nid(zone);
1628 nr_pages = end_pfn - start_pfn;
1629
1630 /* set above range as isolated */
1631 ret = start_isolate_page_range(start_pfn, end_pfn,
1632 MIGRATE_MOVABLE, true);
1633 if (ret)
1634 return ret;
1635
1636 arg.start_pfn = start_pfn;
1637 arg.nr_pages = nr_pages;
1638 node_states_check_changes_offline(nr_pages, zone, &arg);
1639
1640 ret = memory_notify(MEM_GOING_OFFLINE, &arg);
1641 ret = notifier_to_errno(ret);
1642 if (ret)
1643 goto failed_removal;
1644
1645 pfn = start_pfn;
1646 repeat:
1647 /* start memory hot removal */
1648 ret = -EINTR;
1649 if (signal_pending(current))
1650 goto failed_removal;
1651
1652 cond_resched();
1653 lru_add_drain_all_cpuslocked();
1654 drain_all_pages(zone);
1655
1656 pfn = scan_movable_pages(start_pfn, end_pfn);
1657 if (pfn) { /* We have movable pages */
1658 ret = do_migrate_range(pfn, end_pfn);
1659 goto repeat;
1660 }
1661
1662 /*
1663 * dissolve free hugepages in the memory block before actually doing the
1664 * offlining, in order to keep hugetlbfs's object counting consistent.
1665 */
1666 ret = dissolve_free_huge_pages(start_pfn, end_pfn);
1667 if (ret)
1668 goto failed_removal;
1669 /* check again */
1670 offlined_pages = check_pages_isolated(start_pfn, end_pfn);
1671 if (offlined_pages < 0)
1672 goto repeat;
1673 pr_info("Offlined Pages %ld\n", offlined_pages);
1674 /* OK, all of our target pages are isolated.
1675 We cannot roll back at this point. */
1676 offline_isolated_pages(start_pfn, end_pfn);
1677 /* reset pagetype flags and make the migrate type MOVABLE */
1678 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
1679 /* removal success */
1680 adjust_managed_page_count(pfn_to_page(start_pfn), -offlined_pages);
1681 zone->present_pages -= offlined_pages;
1682
1683 pgdat_resize_lock(zone->zone_pgdat, &flags);
1684 zone->zone_pgdat->node_present_pages -= offlined_pages;
1685 pgdat_resize_unlock(zone->zone_pgdat, &flags);
1686
1687 init_per_zone_wmark_min();
1688
1689 if (!populated_zone(zone)) {
1690 zone_pcp_reset(zone);
1691 build_all_zonelists(NULL);
1692 } else
1693 zone_pcp_update(zone);
1694
1695 node_states_clear_node(node, &arg);
1696 if (arg.status_change_nid >= 0) {
1697 kswapd_stop(node);
1698 kcompactd_stop(node);
1699 }
1700
1701 vm_total_pages = nr_free_pagecache_pages();
1702 writeback_set_ratelimit();
1703
1704 memory_notify(MEM_OFFLINE, &arg);
1705 return 0;
1706
1707 failed_removal:
1708 pr_debug("memory offlining [mem %#010llx-%#010llx] failed\n",
1709 (unsigned long long) start_pfn << PAGE_SHIFT,
1710 ((unsigned long long) end_pfn << PAGE_SHIFT) - 1);
1711 memory_notify(MEM_CANCEL_OFFLINE, &arg);
1712 /* pushback to free area */
1713 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
1714 return ret;
1715 }
1716
1717 /* Must be protected by mem_hotplug_begin() or a device_lock */
1718 int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
1719 {
1720 return __offline_pages(start_pfn, start_pfn + nr_pages);
1721 }
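/*
 * Note (summary): offline_pages() is normally reached from the memory block
 * device, e.g. "echo offline > /sys/devices/system/memory/memoryN/state",
 * and that sysfs path takes the device lock before calling in here, which
 * satisfies the "device_lock" requirement stated above.
 */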
1722 #endif /* CONFIG_MEMORY_HOTREMOVE */
1723
1724 /**
1725 * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn)
1726 * @start_pfn: start pfn of the memory range
1727 * @end_pfn: end pfn of the memory range
1728 * @arg: argument passed to func
1729 * @func: callback for each memory section walked
1730 *
1731 * This function walks through all present mem sections in range
1732 * [start_pfn, end_pfn) and calls func on each mem section.
1733 *
1734 * Returns the return value of func.
1735 */
1736 int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
1737 void *arg, int (*func)(struct memory_block *, void *))
1738 {
1739 struct memory_block *mem = NULL;
1740 struct mem_section *section;
1741 unsigned long pfn, section_nr;
1742 int ret;
1743
1744 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
1745 section_nr = pfn_to_section_nr(pfn);
1746 if (!present_section_nr(section_nr))
1747 continue;
1748
1749 section = __nr_to_section(section_nr);
1750 /* same memblock? */
1751 if (mem)
1752 if ((section_nr >= mem->start_section_nr) &&
1753 (section_nr <= mem->end_section_nr))
1754 continue;
1755
1756 mem = find_memory_block_hinted(section, mem);
1757 if (!mem)
1758 continue;
1759
1760 ret = func(mem, arg);
1761 if (ret) {
1762 kobject_put(&mem->dev.kobj);
1763 return ret;
1764 }
1765 }
1766
1767 if (mem)
1768 kobject_put(&mem->dev.kobj);
1769
1770 return 0;
1771 }
1772
1773 #ifdef CONFIG_MEMORY_HOTREMOVE
1774 static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
1775 {
1776 int ret = !is_memblock_offlined(mem);
1777
1778 if (unlikely(ret)) {
1779 phys_addr_t beginpa, endpa;
1780
1781 beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr));
1782 endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1;
1783 pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n",
1784 &beginpa, &endpa);
1785 }
1786
1787 return ret;
1788 }
1789
1790 static int check_cpu_on_node(pg_data_t *pgdat)
1791 {
1792 int cpu;
1793
1794 for_each_present_cpu(cpu) {
1795 if (cpu_to_node(cpu) == pgdat->node_id)
1796 /*
1797 * a cpu on this node is still present, so we can't
1798 * offline this node.
1799 */
1800 return -EBUSY;
1801 }
1802
1803 return 0;
1804 }
1805
1806 static void unmap_cpu_on_node(pg_data_t *pgdat)
1807 {
1808 #ifdef CONFIG_ACPI_NUMA
1809 int cpu;
1810
1811 for_each_possible_cpu(cpu)
1812 if (cpu_to_node(cpu) == pgdat->node_id)
1813 numa_clear_node(cpu);
1814 #endif
1815 }
1816
1817 static int check_and_unmap_cpu_on_node(pg_data_t *pgdat)
1818 {
1819 int ret;
1820
1821 ret = check_cpu_on_node(pgdat);
1822 if (ret)
1823 return ret;
1824
1825 /*
1826 * the node will be offlined when we come here, so we can clear
1827 * the cpu_to_node() now.
1828 */
1829
1830 unmap_cpu_on_node(pgdat);
1831 return 0;
1832 }
1833
1834 /**
1835 * try_offline_node
1836 *
1837 * Offline a node if all memory sections and cpus of the node are removed.
1838 *
1839 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
1840 * and online/offline operations before this call.
1841 */
1842 void try_offline_node(int nid)
1843 {
1844 pg_data_t *pgdat = NODE_DATA(nid);
1845 unsigned long start_pfn = pgdat->node_start_pfn;
1846 unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages;
1847 unsigned long pfn;
1848
1849 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
1850 unsigned long section_nr = pfn_to_section_nr(pfn);
1851
1852 if (!present_section_nr(section_nr))
1853 continue;
1854
1855 if (pfn_to_nid(pfn) != nid)
1856 continue;
1857
1858 /*
1859 * some memory sections of this node have not been removed, so we
1860 * can't offline the node now.
1861 */
1862 return;
1863 }
1864
1865 if (check_and_unmap_cpu_on_node(pgdat))
1866 return;
1867
1868 /*
1869 * all memory/cpus of this node have been removed, so we can offline
1870 * this node now.
1871 */
1872 node_set_offline(nid);
1873 unregister_one_node(nid);
1874 }
1875 EXPORT_SYMBOL(try_offline_node);
1876
1877 /**
1878 * remove_memory
1879 *
1880 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
1881 * and online/offline operations before this call, as required by
1882 * try_offline_node().
1883 */
1884 void __ref remove_memory(int nid, u64 start, u64 size)
1885 {
1886 int ret;
1887
1888 BUG_ON(check_hotplug_memory_range(start, size));
1889
1890 mem_hotplug_begin();
1891
1892 /*
1893 * All memory blocks must be offlined before removing memory. Check
1894 * whether all memory blocks in question are offline and trigger a BUG()
1895 * if this is not the case.
1896 */
1897 ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL,
1898 check_memblock_offlined_cb);
1899 if (ret)
1900 BUG();
1901
1902 /* remove memmap entry */
1903 firmware_map_remove(start, start + size, "System RAM");
1904 memblock_free(start, size);
1905 memblock_remove(start, size);
1906
1907 arch_remove_memory(start, size);
1908
1909 try_offline_node(nid);
1910
1911 mem_hotplug_done();
1912 }
1913 EXPORT_SYMBOL_GPL(remove_memory);
1914 #endif /* CONFIG_MEMORY_HOTREMOVE */