git.proxmox.com Git - mirror_ubuntu-zesty-kernel.git/commitdiff
Merge branch 'akpm' (patches from Andrew)
author Linus Torvalds <torvalds@linux-foundation.org>
Wed, 9 Sep 2015 00:52:23 +0000 (17:52 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Wed, 9 Sep 2015 00:52:23 +0000 (17:52 -0700)
Merge second patch-bomb from Andrew Morton:
 "Almost all of the rest of MM.  There was an unusually large amount of
  MM material this time"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (141 commits)
  zpool: remove no-op module init/exit
  mm: zbud: constify the zbud_ops
  mm: zpool: constify the zpool_ops
  mm: swap: zswap: maybe_preload & refactoring
  zram: unify error reporting
  zsmalloc: remove null check from destroy_handle_cache()
  zsmalloc: do not take class lock in zs_shrinker_count()
  zsmalloc: use class->pages_per_zspage
  zsmalloc: consider ZS_ALMOST_FULL as migrate source
  zsmalloc: partial page ordering within a fullness_list
  zsmalloc: use shrinker to trigger auto-compaction
  zsmalloc: account the number of compacted pages
  zsmalloc/zram: introduce zs_pool_stats api
  zsmalloc: cosmetic compaction code adjustments
  zsmalloc: introduce zs_can_compact() function
  zsmalloc: always keep per-class stats
  zsmalloc: drop unused variable `nr_to_migrate'
  mm/memblock.c: fix comment in __next_mem_range()
  mm/page_alloc.c: fix type information of memoryless node
  memory-hotplug: fix comments in zone_spanned_pages_in_node() and zone_absent_pages_in_node()
  ...

fs/block_dev.c
fs/dax.c
include/asm-generic/early_ioremap.h
include/linux/mm.h
mm/early_ioremap.c
mm/page_alloc.c
tools/testing/selftests/vm/userfaultfd.c

diff --combined fs/block_dev.c
index f77da0ec0e64d0959f74a1831f3cdd2d2198b068,28cc525b8d59a541c544c4a98dd1168065a4daa8..22ea424ee741ea1a967676298e7877332fc324b2
@@@ -28,6 -28,7 +28,7 @@@
  #include <linux/namei.h>
  #include <linux/log2.h>
  #include <linux/cleancache.h>
+ #include <linux/dax.h>
  #include <asm/uaccess.h>
  #include "internal.h"
  
@@@ -441,7 -442,7 +442,7 @@@ EXPORT_SYMBOL_GPL(bdev_write_page)
   * accessible at this address.
   */
  long bdev_direct_access(struct block_device *bdev, sector_t sector,
 -                      void **addr, unsigned long *pfn, long size)
 +                      void __pmem **addr, unsigned long *pfn, long size)
  {
        long avail;
        const struct block_device_operations *ops = bdev->bd_disk->fops;
        sector += get_start_sect(bdev);
        if (sector % (PAGE_SIZE / 512))
                return -EINVAL;
 -      avail = ops->direct_access(bdev, sector, addr, pfn, size);
 +      avail = ops->direct_access(bdev, sector, addr, pfn);
        if (!avail)
                return -ERANGE;
        return min(avail, size);
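
The hunk above tightens the DAX contract between bdev_direct_access() and the driver: ->direct_access() no longer receives a size, hands back a __pmem-annotated mapping, and reports how many bytes stay addressable from the given sector, which bdev_direct_access() then clamps with min(avail, size). A minimal sketch of a driver callback under the new prototype follows; the mydrv_* names and fields are hypothetical, standing in for a device whose persistent memory is already linearly mapped.

#include <linux/blkdev.h>

/* Hypothetical driver state: a persistent-memory range that is already
 * mapped into the kernel at probe time. */
struct mydrv_device {
	void		*virt_base;
	phys_addr_t	phys_base;
	size_t		size;
};

static long mydrv_direct_access(struct block_device *bdev, sector_t sector,
				void __pmem **kaddr, unsigned long *pfn)
{
	struct mydrv_device *dev = bdev->bd_disk->private_data;
	resource_size_t offset = (resource_size_t)sector << 9;

	*kaddr = (void __pmem *)(dev->virt_base + offset);
	*pfn = (dev->phys_base + offset) >> PAGE_SHIFT;

	/* Report how many bytes remain addressable from this sector;
	 * bdev_direct_access() clamps the answer to the caller's size. */
	return dev->size - offset;
}

The driver no longer bounds-checks against a caller-supplied size; it simply states how far the mapping extends.
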
diff --combined fs/dax.c
index 57bb70b4af70f75f422dff8beff13ce7545e2b61,ed54efedade621c86facd108f91dfa623ed958a5..e43389c74bbcea88956b0c5972f72e94cef58313
+++ b/fs/dax.c
@@@ -23,7 -23,6 +23,7 @@@
  #include <linux/memcontrol.h>
  #include <linux/mm.h>
  #include <linux/mutex.h>
 +#include <linux/pmem.h>
  #include <linux/sched.h>
  #include <linux/uio.h>
  #include <linux/vmstat.h>
@@@ -35,7 -34,7 +35,7 @@@ int dax_clear_blocks(struct inode *inod
  
        might_sleep();
        do {
 -              void *addr;
 +              void __pmem *addr;
                unsigned long pfn;
                long count;
  
                        unsigned pgsz = PAGE_SIZE - offset_in_page(addr);
                        if (pgsz > count)
                                pgsz = count;
 -                      if (pgsz < PAGE_SIZE)
 -                              memset(addr, 0, pgsz);
 -                      else
 -                              clear_page(addr);
 +                      clear_pmem(addr, pgsz);
                        addr += pgsz;
                        size -= pgsz;
                        count -= pgsz;
                }
        } while (size);
  
 +      wmb_pmem();
        return 0;
  }
  EXPORT_SYMBOL_GPL(dax_clear_blocks);
  
 -static long dax_get_addr(struct buffer_head *bh, void **addr, unsigned blkbits)
 +static long dax_get_addr(struct buffer_head *bh, void __pmem **addr,
 +              unsigned blkbits)
  {
        unsigned long pfn;
        sector_t sector = bh->b_blocknr << (blkbits - 9);
        return bdev_direct_access(bh->b_bdev, sector, addr, &pfn, bh->b_size);
  }
  
 -static void dax_new_buf(void *addr, unsigned size, unsigned first, loff_t pos,
 -                      loff_t end)
 +/* the clear_pmem() calls are ordered by a wmb_pmem() in the caller */
 +static void dax_new_buf(void __pmem *addr, unsigned size, unsigned first,
 +              loff_t pos, loff_t end)
  {
        loff_t final = end - pos + first; /* The final byte of the buffer */
  
        if (first > 0)
 -              memset(addr, 0, first);
 +              clear_pmem(addr, first);
        if (final < size)
 -              memset(addr + final, 0, size - final);
 +              clear_pmem(addr + final, size - final);
  }
  
  static bool buffer_written(struct buffer_head *bh)
@@@ -107,15 -106,14 +107,15 @@@ static ssize_t dax_io(struct inode *ino
        loff_t pos = start;
        loff_t max = start;
        loff_t bh_max = start;
 -      void *addr;
 +      void __pmem *addr;
        bool hole = false;
 +      bool need_wmb = false;
  
        if (iov_iter_rw(iter) != WRITE)
                end = min(end, i_size_read(inode));
  
        while (pos < end) {
 -              unsigned len;
 +              size_t len;
                if (pos == max) {
                        unsigned blkbits = inode->i_blkbits;
                        sector_t block = pos >> blkbits;
                                retval = dax_get_addr(bh, &addr, blkbits);
                                if (retval < 0)
                                        break;
 -                              if (buffer_unwritten(bh) || buffer_new(bh))
 +                              if (buffer_unwritten(bh) || buffer_new(bh)) {
                                        dax_new_buf(addr, retval, first, pos,
                                                                        end);
 +                                      need_wmb = true;
 +                              }
                                addr += first;
                                size = retval - first;
                        }
                        max = min(pos + size, end);
                }
  
 -              if (iov_iter_rw(iter) == WRITE)
 -                      len = copy_from_iter_nocache(addr, max - pos, iter);
 -              else if (!hole)
 -                      len = copy_to_iter(addr, max - pos, iter);
 +              if (iov_iter_rw(iter) == WRITE) {
 +                      len = copy_from_iter_pmem(addr, max - pos, iter);
 +                      need_wmb = true;
 +              } else if (!hole)
 +                      len = copy_to_iter((void __force *)addr, max - pos,
 +                                      iter);
                else
                        len = iov_iter_zero(max - pos, iter);
  
                addr += len;
        }
  
 +      if (need_wmb)
 +              wmb_pmem();
 +
        return (pos == start) ? retval : pos - start;
  }
  
@@@ -269,13 -260,11 +269,13 @@@ static int dax_load_hole(struct address
  static int copy_user_bh(struct page *to, struct buffer_head *bh,
                        unsigned blkbits, unsigned long vaddr)
  {
 -      void *vfrom, *vto;
 +      void __pmem *vfrom;
 +      void *vto;
 +
        if (dax_get_addr(bh, &vfrom, blkbits) < 0)
                return -EIO;
        vto = kmap_atomic(to);
 -      copy_user_page(vto, vfrom, vaddr, to);
 +      copy_user_page(vto, (void __force *)vfrom, vaddr, to);
        kunmap_atomic(vto);
        return 0;
  }
  static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
                        struct vm_area_struct *vma, struct vm_fault *vmf)
  {
-       struct address_space *mapping = inode->i_mapping;
        sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
        unsigned long vaddr = (unsigned long)vmf->virtual_address;
 -      void *addr;
 +      void __pmem *addr;
        unsigned long pfn;
        pgoff_t size;
        int error;
  
-       i_mmap_lock_read(mapping);
        /*
         * Check truncate didn't happen while we were allocating a block.
         * If it did, this block may or may not be still allocated to the
                goto out;
        }
  
 -      if (buffer_unwritten(bh) || buffer_new(bh))
 -              clear_page(addr);
 +      if (buffer_unwritten(bh) || buffer_new(bh)) {
 +              clear_pmem(addr, PAGE_SIZE);
 +              wmb_pmem();
 +      }
  
        error = vm_insert_mixed(vma, vaddr, pfn);
  
   out:
-       i_mmap_unlock_read(mapping);
        return error;
  }
  
@@@ -385,15 -367,17 +380,17 @@@ int __dax_fault(struct vm_area_struct *
                         * from a read fault and we've raced with a truncate
                         */
                        error = -EIO;
-                       goto unlock_page;
+                       goto unlock;
                }
+       } else {
+               i_mmap_lock_write(mapping);
        }
  
        error = get_block(inode, block, &bh, 0);
        if (!error && (bh.b_size < PAGE_SIZE))
                error = -EIO;           /* fs corruption? */
        if (error)
-               goto unlock_page;
+               goto unlock;
  
        if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) {
                if (vmf->flags & FAULT_FLAG_WRITE) {
                        if (!error && (bh.b_size < PAGE_SIZE))
                                error = -EIO;
                        if (error)
-                               goto unlock_page;
+                               goto unlock;
                } else {
+                       i_mmap_unlock_write(mapping);
                        return dax_load_hole(mapping, page, vmf);
                }
        }
                else
                        clear_user_highpage(new_page, vaddr);
                if (error)
-                       goto unlock_page;
+                       goto unlock;
                vmf->page = page;
                if (!page) {
-                       i_mmap_lock_read(mapping);
                        /* Check we didn't race with truncate */
                        size = (i_size_read(inode) + PAGE_SIZE - 1) >>
                                                                PAGE_SHIFT;
                        if (vmf->pgoff >= size) {
-                               i_mmap_unlock_read(mapping);
                                error = -EIO;
-                               goto out;
+                               goto unlock;
                        }
                }
                return VM_FAULT_LOCKED;
                        WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE));
        }
  
+       if (!page)
+               i_mmap_unlock_write(mapping);
   out:
        if (error == -ENOMEM)
                return VM_FAULT_OOM | major;
                return VM_FAULT_SIGBUS | major;
        return VM_FAULT_NOPAGE | major;
  
-  unlock_page:
+  unlock:
        if (page) {
                unlock_page(page);
                page_cache_release(page);
+       } else {
+               i_mmap_unlock_write(mapping);
        }
        goto out;
  }
  EXPORT_SYMBOL(__dax_fault);
@@@ -507,6 -495,176 +508,176 @@@ int dax_fault(struct vm_area_struct *vm
  }
  EXPORT_SYMBOL_GPL(dax_fault);
  
+ #ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ /*
+  * The 'colour' (ie low bits) within a PMD of a page offset.  This comes up
+  * more often than one might expect in the below function.
+  */
+ #define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
+ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
+               pmd_t *pmd, unsigned int flags, get_block_t get_block,
+               dax_iodone_t complete_unwritten)
+ {
+       struct file *file = vma->vm_file;
+       struct address_space *mapping = file->f_mapping;
+       struct inode *inode = mapping->host;
+       struct buffer_head bh;
+       unsigned blkbits = inode->i_blkbits;
+       unsigned long pmd_addr = address & PMD_MASK;
+       bool write = flags & FAULT_FLAG_WRITE;
+       long length;
+       void *kaddr;
+       pgoff_t size, pgoff;
+       sector_t block, sector;
+       unsigned long pfn;
+       int result = 0;
+       /* Fall back to PTEs if we're going to COW */
+       if (write && !(vma->vm_flags & VM_SHARED))
+               return VM_FAULT_FALLBACK;
+       /* If the PMD would extend outside the VMA */
+       if (pmd_addr < vma->vm_start)
+               return VM_FAULT_FALLBACK;
+       if ((pmd_addr + PMD_SIZE) > vma->vm_end)
+               return VM_FAULT_FALLBACK;
+       pgoff = linear_page_index(vma, pmd_addr);
+       size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+       if (pgoff >= size)
+               return VM_FAULT_SIGBUS;
+       /* If the PMD would cover blocks out of the file */
+       if ((pgoff | PG_PMD_COLOUR) >= size)
+               return VM_FAULT_FALLBACK;
+       memset(&bh, 0, sizeof(bh));
+       block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);
+       bh.b_size = PMD_SIZE;
+       i_mmap_lock_write(mapping);
+       length = get_block(inode, block, &bh, write);
+       if (length)
+               return VM_FAULT_SIGBUS;
+       /*
+        * If the filesystem isn't willing to tell us the length of a hole,
+        * just fall back to PTEs.  Calling get_block 512 times in a loop
+        * would be silly.
+        */
+       if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE)
+               goto fallback;
+       if (buffer_unwritten(&bh) || buffer_new(&bh)) {
+               int i;
+               for (i = 0; i < PTRS_PER_PMD; i++)
+                       clear_page(kaddr + i * PAGE_SIZE);
+               count_vm_event(PGMAJFAULT);
+               mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
+               result |= VM_FAULT_MAJOR;
+       }
+       /*
+        * If we allocated new storage, make sure no process has any
+        * zero pages covering this hole
+        */
+       if (buffer_new(&bh)) {
+               i_mmap_unlock_write(mapping);
+               unmap_mapping_range(mapping, pgoff << PAGE_SHIFT, PMD_SIZE, 0);
+               i_mmap_lock_write(mapping);
+       }
+       /*
+        * If a truncate happened while we were allocating blocks, we may
+        * leave blocks allocated to the file that are beyond EOF.  We can't
+        * take i_mutex here, so just leave them hanging; they'll be freed
+        * when the file is deleted.
+        */
+       size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+       if (pgoff >= size) {
+               result = VM_FAULT_SIGBUS;
+               goto out;
+       }
+       if ((pgoff | PG_PMD_COLOUR) >= size)
+               goto fallback;
+       if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) {
+               spinlock_t *ptl;
+               pmd_t entry;
+               struct page *zero_page = get_huge_zero_page();
+               if (unlikely(!zero_page))
+                       goto fallback;
+               ptl = pmd_lock(vma->vm_mm, pmd);
+               if (!pmd_none(*pmd)) {
+                       spin_unlock(ptl);
+                       goto fallback;
+               }
+               entry = mk_pmd(zero_page, vma->vm_page_prot);
+               entry = pmd_mkhuge(entry);
+               set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry);
+               result = VM_FAULT_NOPAGE;
+               spin_unlock(ptl);
+       } else {
+               sector = bh.b_blocknr << (blkbits - 9);
+               length = bdev_direct_access(bh.b_bdev, sector, &kaddr, &pfn,
+                                               bh.b_size);
+               if (length < 0) {
+                       result = VM_FAULT_SIGBUS;
+                       goto out;
+               }
+               if ((length < PMD_SIZE) || (pfn & PG_PMD_COLOUR))
+                       goto fallback;
+               result |= vmf_insert_pfn_pmd(vma, address, pmd, pfn, write);
+       }
+  out:
+       if (buffer_unwritten(&bh))
+               complete_unwritten(&bh, !(result & VM_FAULT_ERROR));
+       i_mmap_unlock_write(mapping);
+       return result;
+  fallback:
+       count_vm_event(THP_FAULT_FALLBACK);
+       result = VM_FAULT_FALLBACK;
+       goto out;
+ }
+ EXPORT_SYMBOL_GPL(__dax_pmd_fault);
+ /**
+  * dax_pmd_fault - handle a PMD fault on a DAX file
+  * @vma: The virtual memory area where the fault occurred
+  * @vmf: The description of the fault
+  * @get_block: The filesystem method used to translate file offsets to blocks
+  *
+  * When a page fault occurs, filesystems may call this helper in their
+  * pmd_fault handler for DAX files.
+  */
+ int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
+                       pmd_t *pmd, unsigned int flags, get_block_t get_block,
+                       dax_iodone_t complete_unwritten)
+ {
+       int result;
+       struct super_block *sb = file_inode(vma->vm_file)->i_sb;
+       if (flags & FAULT_FLAG_WRITE) {
+               sb_start_pagefault(sb);
+               file_update_time(vma->vm_file);
+       }
+       result = __dax_pmd_fault(vma, address, pmd, flags, get_block,
+                               complete_unwritten);
+       if (flags & FAULT_FLAG_WRITE)
+               sb_end_pagefault(sb);
+       return result;
+ }
+ EXPORT_SYMBOL_GPL(dax_pmd_fault);
+ #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
  /**
   * dax_pfn_mkwrite - handle first write to DAX page
   * @vma: The virtual memory area where the fault occurred
@@@ -561,12 -719,11 +732,12 @@@ int dax_zero_page_range(struct inode *i
        if (err < 0)
                return err;
        if (buffer_written(&bh)) {
 -              void *addr;
 +              void __pmem *addr;
                err = dax_get_addr(&bh, &addr, inode->i_blkbits);
                if (err < 0)
                        return err;
 -              memset(addr + offset, 0, length);
 +              clear_pmem(addr + offset, length);
 +              wmb_pmem();
        }
  
        return 0;
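
The PMD fault path added above is reached through the new ->pmd_fault vm operation (see the include/linux/mm.h hunk below). A hedged sketch of how a DAX-aware filesystem might hook it up; the myfs_* callbacks are hypothetical stand-ins for the filesystem's own fault handler, get_block routine and unwritten-extent completion.

#include <linux/dax.h>
#include <linux/mm.h>

static int myfs_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
			      pmd_t *pmd, unsigned int flags)
{
	/* dax_pmd_fault() takes care of sb_start_pagefault() and
	 * file_update_time() for write faults, and returns
	 * VM_FAULT_FALLBACK whenever a PMD mapping is not possible. */
	return dax_pmd_fault(vma, addr, pmd, flags, myfs_get_block_dax,
			     myfs_end_io_unwritten);
}

static const struct vm_operations_struct myfs_dax_vm_ops = {
	.fault		= myfs_dax_fault,
	.pmd_fault	= myfs_dax_pmd_fault,
	.pfn_mkwrite	= dax_pfn_mkwrite,
};
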
diff --combined include/asm-generic/early_ioremap.h
index 316bd043319ebaf36dc1a7d33e1cb18ce0490b0e,e539f27ec51ba614fa72e3701acfc67606b20312..734ad4db388c6d922fb812391f913cbdda710f12

@@@ -11,8 -11,6 +11,8 @@@ extern void __iomem *early_ioremap(reso
                                   unsigned long size);
  extern void *early_memremap(resource_size_t phys_addr,
                            unsigned long size);
 +extern void *early_memremap_ro(resource_size_t phys_addr,
 +                             unsigned long size);
  extern void early_iounmap(void __iomem *addr, unsigned long size);
  extern void early_memunmap(void *addr, unsigned long size);
  
@@@ -35,6 -33,12 +35,12 @@@ extern void early_ioremap_setup(void)
   */
  extern void early_ioremap_reset(void);
  
+ /*
+  * Early copy from unmapped memory to kernel mapped memory.
+  */
+ extern void copy_from_early_mem(void *dest, phys_addr_t src,
+                               unsigned long size);
  #else
  static inline void early_ioremap_init(void) { }
  static inline void early_ioremap_setup(void) { }
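
The two declarations added above give early-boot code a read-only early mapping and a chunked copy helper; copy_from_early_mem() (implemented in mm/early_ioremap.c below) maps, copies and unmaps the source range one fixmap-sized piece at a time. A usage sketch, assuming a hypothetical physical blob address handed over by firmware:

#include <linux/init.h>
#include <linux/sizes.h>
#include <linux/types.h>
#include <asm/early_ioremap.h>

static u8 __initdata boot_blob[SZ_4K];

static void __init stash_boot_blob(phys_addr_t blob_pa)
{
	/* Works before the normal kernel mappings exist, and copes with a
	 * source that is unaligned or larger than one early-fixmap slot,
	 * because copy_from_early_mem() chunks the copy internally. */
	copy_from_early_mem(boot_blob, blob_pa, sizeof(boot_blob));
}
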
diff --combined include/linux/mm.h
index 1171a292e06e48813e79349d7bc82bba675710f7,11df1a8ea38bd81c0f428fb625e481dddb30482f..f25a957bf0ab68967cc57d5f7d9472831f65cad2
@@@ -249,6 -249,8 +249,8 @@@ struct vm_operations_struct 
        void (*close)(struct vm_area_struct * area);
        int (*mremap)(struct vm_area_struct * area);
        int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf);
+       int (*pmd_fault)(struct vm_area_struct *, unsigned long address,
+                                               pmd_t *, unsigned int flags);
        void (*map_pages)(struct vm_area_struct *vma, struct vm_fault *vmf);
  
        /* notification that a previously read-only page is about to become
@@@ -307,18 -309,6 +309,6 @@@ struct inode
  #define page_private(page)            ((page)->private)
  #define set_page_private(page, v)     ((page)->private = (v))
  
- /* It's valid only if the page is free path or free_list */
- static inline void set_freepage_migratetype(struct page *page, int migratetype)
- {
-       page->index = migratetype;
- }
- /* It's valid only if the page is free path or free_list */
- static inline int get_freepage_migratetype(struct page *page)
- {
-       return page->index;
- }
  /*
   * FIXME: take this include out, include page-flags.h in
   * files which need it (119 of them)
@@@ -359,27 -349,8 +349,15 @@@ static inline int get_page_unless_zero(
        return atomic_inc_not_zero(&page->_count);
  }
  
- /*
-  * Try to drop a ref unless the page has a refcount of one, return false if
-  * that is the case.
-  * This is to make sure that the refcount won't become zero after this drop.
-  * This can be called when MMU is off so it must not access
-  * any of the virtual mappings.
-  */
- static inline int put_page_unless_one(struct page *page)
- {
-       return atomic_add_unless(&page->_count, -1, 1);
- }
  extern int page_is_ram(unsigned long pfn);
 -extern int region_is_ram(resource_size_t phys_addr, unsigned long size);
 +
 +enum {
 +      REGION_INTERSECTS,
 +      REGION_DISJOINT,
 +      REGION_MIXED,
 +};
 +
 +int region_intersects(resource_size_t offset, size_t size, const char *type);
  
  /* Support for virtually mapped pages */
  struct page *vmalloc_to_page(const void *addr);
@@@ -1267,6 -1238,11 +1245,11 @@@ static inline int vma_growsdown(struct 
        return vma && (vma->vm_end == addr) && (vma->vm_flags & VM_GROWSDOWN);
  }
  
+ static inline bool vma_is_anonymous(struct vm_area_struct *vma)
+ {
+       return !vma->vm_ops;
+ }
  static inline int stack_guard_page_start(struct vm_area_struct *vma,
                                             unsigned long addr)
  {
@@@ -2193,6 -2169,7 +2176,7 @@@ extern int memory_failure(unsigned lon
  extern void memory_failure_queue(unsigned long pfn, int trapno, int flags);
  extern int unpoison_memory(unsigned long pfn);
  extern int get_hwpoison_page(struct page *page);
+ extern void put_hwpoison_page(struct page *page);
  extern int sysctl_memory_failure_early_kill;
  extern int sysctl_memory_failure_recovery;
  extern void shake_page(struct page *p, int access);
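
region_is_ram() is dropped above in favour of region_intersects(), which distinguishes a range that is entirely, partially, or not at all backed by a given resource type. A sketch of the intended calling pattern, assuming a hypothetical caller that wants a kernel mapping for an arbitrary physical range (ioremap_cache() availability is architecture dependent):

#include <linux/io.h>
#include <linux/mm.h>

static void *map_phys_range(resource_size_t offset, size_t size)
{
	int ram = region_intersects(offset, size, "System RAM");

	if (ram == REGION_MIXED)
		return NULL;		/* straddles RAM and non-RAM: refuse */
	if (ram == REGION_INTERSECTS)
		return __va(offset);	/* all RAM: the linear map covers it */

	/* REGION_DISJOINT: not RAM at all, set up a mapping ourselves */
	return (__force void *)ioremap_cache(offset, size);
}
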
diff --combined mm/early_ioremap.c
index 0cfadafb3fb00bd9cccb373449857443c443f3a6,a0baeb4be934b3737493d6ad3638a81c8fec879e..23f744d77ce0022dbb46bd0232904851094f75e6
@@@ -217,13 -217,28 +217,35 @@@ early_memremap(resource_size_t phys_add
        return (__force void *)__early_ioremap(phys_addr, size,
                                               FIXMAP_PAGE_NORMAL);
  }
 +#ifdef FIXMAP_PAGE_RO
 +void __init *
 +early_memremap_ro(resource_size_t phys_addr, unsigned long size)
 +{
 +      return (__force void *)__early_ioremap(phys_addr, size, FIXMAP_PAGE_RO);
 +}
 +#endif
+ #define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
+ void __init copy_from_early_mem(void *dest, phys_addr_t src, unsigned long size)
+ {
+       unsigned long slop, clen;
+       char *p;
+       while (size) {
+               slop = src & ~PAGE_MASK;
+               clen = size;
+               if (clen > MAX_MAP_CHUNK - slop)
+                       clen = MAX_MAP_CHUNK - slop;
+               p = early_memremap(src & PAGE_MASK, clen + slop);
+               memcpy(dest, p + slop, clen);
+               early_memunmap(p, clen + slop);
+               dest += clen;
+               src += clen;
+               size -= clen;
+       }
+ }
  #else /* CONFIG_MMU */
  
  void __init __iomem *
@@@ -238,11 -253,6 +260,11 @@@ early_memremap(resource_size_t phys_add
  {
        return (void *)phys_addr;
  }
 +void __init *
 +early_memremap_ro(resource_size_t phys_addr, unsigned long size)
 +{
 +      return (void *)phys_addr;
 +}
  
  void __init early_iounmap(void __iomem *addr, unsigned long size)
  {
diff --combined mm/page_alloc.c
index b401d40cb4fd46e4fd9d2863919dadb15c4ad801,4a4c399bacebe122db03f5fe63d0198fae17df94..48aaf7b9f253e6ea68587caa1e7e3e254905936a
@@@ -125,6 -125,24 +125,24 @@@ unsigned long dirty_balance_reserve __r
  int percpu_pagelist_fraction;
  gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
  
+ /*
+  * A cached value of the page's pageblock's migratetype, used when the page is
+  * put on a pcplist. Used to avoid the pageblock migratetype lookup when
+  * freeing from pcplists in most cases, at the cost of possibly becoming stale.
+  * Also the migratetype set in the page does not necessarily match the pcplist
+  * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any
+  * other index - this ensures that it will be put on the correct CMA freelist.
+  */
+ static inline int get_pcppage_migratetype(struct page *page)
+ {
+       return page->index;
+ }
+ static inline void set_pcppage_migratetype(struct page *page, int migratetype)
+ {
+       page->index = migratetype;
+ }
  #ifdef CONFIG_PM_SLEEP
  /*
   * The following functions are used by the suspend/hibernate code to temporarily
@@@ -206,9 -224,6 +224,9 @@@ static char * const zone_names[MAX_NR_Z
         "HighMem",
  #endif
         "Movable",
 +#ifdef CONFIG_ZONE_DEVICE
 +       "Device",
 +#endif
  };
  
  int min_free_kbytes = 1024;
@@@ -791,7 -806,11 +809,11 @@@ static void free_pcppages_bulk(struct z
                        page = list_entry(list->prev, struct page, lru);
                        /* must delete as __free_one_page list manipulates */
                        list_del(&page->lru);
-                       mt = get_freepage_migratetype(page);
+                       mt = get_pcppage_migratetype(page);
+                       /* MIGRATE_ISOLATE page should not go to pcplists */
+                       VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
+                       /* Pageblock could have been isolated meanwhile */
                        if (unlikely(has_isolate_pageblock(zone)))
                                mt = get_pageblock_migratetype(page);
  
@@@ -955,7 -974,6 +977,6 @@@ static void __free_pages_ok(struct pag
        migratetype = get_pfnblock_migratetype(page, pfn);
        local_irq_save(flags);
        __count_vm_events(PGFREE, 1 << order);
-       set_freepage_migratetype(page, migratetype);
        free_one_page(page_zone(page), page, pfn, order, migratetype);
        local_irq_restore(flags);
  }
@@@ -1383,7 -1401,7 +1404,7 @@@ struct page *__rmqueue_smallest(struct 
                rmv_page_order(page);
                area->nr_free--;
                expand(zone, page, order, current_order, area, migratetype);
-               set_freepage_migratetype(page, migratetype);
+               set_pcppage_migratetype(page, migratetype);
                return page;
        }
  
@@@ -1460,7 -1478,6 +1481,6 @@@ int move_freepages(struct zone *zone
                order = page_order(page);
                list_move(&page->lru,
                          &zone->free_area[order].free_list[migratetype]);
-               set_freepage_migratetype(page, migratetype);
                page += 1 << order;
                pages_moved += 1 << order;
        }
@@@ -1630,14 -1647,13 +1650,13 @@@ __rmqueue_fallback(struct zone *zone, u
                expand(zone, page, order, current_order, area,
                                        start_migratetype);
                /*
-                * The freepage_migratetype may differ from pageblock's
+                * The pcppage_migratetype may differ from pageblock's
                 * migratetype depending on the decisions in
-                * try_to_steal_freepages(). This is OK as long as it
-                * does not differ for MIGRATE_CMA pageblocks. For CMA
-                * we need to make sure unallocated pages flushed from
-                * pcp lists are returned to the correct freelist.
+                * find_suitable_fallback(). This is OK as long as it does not
+                * differ for MIGRATE_CMA pageblocks. Those can be used as
+                * fallback only via special __rmqueue_cma_fallback() function
                 */
-               set_freepage_migratetype(page, start_migratetype);
+               set_pcppage_migratetype(page, start_migratetype);
  
                trace_mm_page_alloc_extfrag(page, order, current_order,
                        start_migratetype, fallback_mt);
@@@ -1713,7 -1729,7 +1732,7 @@@ static int rmqueue_bulk(struct zone *zo
                else
                        list_add_tail(&page->lru, list);
                list = &page->lru;
-               if (is_migrate_cma(get_freepage_migratetype(page)))
+               if (is_migrate_cma(get_pcppage_migratetype(page)))
                        __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
                                              -(1 << order));
        }
@@@ -1910,7 -1926,7 +1929,7 @@@ void free_hot_cold_page(struct page *pa
                return;
  
        migratetype = get_pfnblock_migratetype(page, pfn);
-       set_freepage_migratetype(page, migratetype);
+       set_pcppage_migratetype(page, migratetype);
        local_irq_save(flags);
        __count_vm_event(PGFREE);
  
@@@ -2115,7 -2131,7 +2134,7 @@@ struct page *buffered_rmqueue(struct zo
                if (!page)
                        goto failed;
                __mod_zone_freepage_state(zone, -(1 << order),
-                                         get_freepage_migratetype(page));
+                                         get_pcppage_migratetype(page));
        }
  
        __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
@@@ -2696,6 -2712,12 +2715,12 @@@ static inline struct page 
  __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
        const struct alloc_context *ac, unsigned long *did_some_progress)
  {
+       struct oom_control oc = {
+               .zonelist = ac->zonelist,
+               .nodemask = ac->nodemask,
+               .gfp_mask = gfp_mask,
+               .order = order,
+       };
        struct page *page;
  
        *did_some_progress = 0;
                        goto out;
        }
        /* Exhausted what can be done so it's blamo time */
-       if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false)
-                       || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL))
+       if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL))
                *did_some_progress = 1;
  out:
        mutex_unlock(&oom_lock);
@@@ -3490,8 -3511,6 +3514,6 @@@ EXPORT_SYMBOL(alloc_pages_exact)
   *
   * Like alloc_pages_exact(), but try to allocate on node nid first before falling
   * back.
-  * Note this is not alloc_pages_exact_node() which allocates on a specific node,
-  * but is not exact.
   */
  void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
  {
@@@ -5066,7 -5085,7 +5088,7 @@@ static unsigned long __meminit zone_spa
  {
        unsigned long zone_start_pfn, zone_end_pfn;
  
-       /* When hotadd a new node, the node should be empty */
+       /* When hotadd a new node from cpu_up(), the node should be empty */
        if (!node_start_pfn && !node_end_pfn)
                return 0;
  
@@@ -5133,7 -5152,7 +5155,7 @@@ static unsigned long __meminit zone_abs
        unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
        unsigned long zone_start_pfn, zone_end_pfn;
  
-       /* When hotadd a new node, the node should be empty */
+       /* When hotadd a new node from cpu_up(), the node should be empty */
        if (!node_start_pfn && !node_end_pfn)
                return 0;
  
@@@ -5306,8 -5325,7 +5328,7 @@@ static unsigned long __paginginit calc_
   *
   * NOTE: pgdat should get zeroed by caller.
   */
- static void __paginginit free_area_init_core(struct pglist_data *pgdat,
-               unsigned long node_start_pfn, unsigned long node_end_pfn)
+ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
  {
        enum zone_type j;
        int nid = pgdat->node_id;
@@@ -5458,7 -5476,8 +5479,8 @@@ void __paginginit free_area_init_node(i
  #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
        get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
        pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
-               (u64)start_pfn << PAGE_SHIFT, ((u64)end_pfn << PAGE_SHIFT) - 1);
+               (u64)start_pfn << PAGE_SHIFT,
+               end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
  #endif
        calculate_node_totalpages(pgdat, start_pfn, end_pfn,
                                  zones_size, zholes_size);
                (unsigned long)pgdat->node_mem_map);
  #endif
  
-       free_area_init_core(pgdat, start_pfn, end_pfn);
+       free_area_init_core(pgdat);
  }
  
  #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
   */
  void __init setup_nr_node_ids(void)
  {
-       unsigned int node;
-       unsigned int highest = 0;
+       unsigned int highest;
  
-       for_each_node_mask(node, node_possible_map)
-               highest = node;
+       highest = find_last_bit(node_possible_map.bits, MAX_NUMNODES);
        nr_node_ids = highest + 1;
  }
  #endif
@@@ -6006,7 -6023,7 +6026,7 @@@ void __init mem_init_print_info(const c
   * set_dma_reserve - set the specified number of pages reserved in the first zone
   * @new_dma_reserve: The number of pages to mark reserved
   *
-  * The per-cpu batchsize and zone watermarks are determined by present_pages.
+  * The per-cpu batchsize and zone watermarks are determined by managed_pages.
   * In the DMA zone, a significant percentage may be consumed by kernel image
   * and other unfreeable allocations which can skew the watermarks badly. This
   * function may optionally be used to account for unfreeable pages in the
@@@ -6059,7 -6076,7 +6079,7 @@@ void __init page_alloc_init(void
  }
  
  /*
-  * calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio
+  * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio
   *    or min_free_kbytes changes.
   */
  static void calculate_totalreserve_pages(void)
  
  /*
   * setup_per_zone_lowmem_reserve - called whenever
-  *    sysctl_lower_zone_reserve_ratio changes.  Ensures that each zone
+  *    sysctl_lowmem_reserve_ratio changes.  Ensures that each zone
   *    has a correct pages reserved value, so an adequate number of
   *    pages are left in the zone after a successful __alloc_pages().
   */
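
The freepage-to-pcppage rename above narrows when the migratetype cached in page->index can be trusted: only while the page sits on a per-cpu list, and even then it is re-checked when pageblock isolation may have raced. A compressed sketch of that contract, mirroring the paths touched above; the get/set helpers are static to mm/page_alloc.c, so this only illustrates the intent rather than callable code.

static int pcppage_flush_migratetype(struct zone *zone, struct page *page,
				     unsigned long pfn)
{
	int mt;

	/* Queueing side (free_hot_cold_page): remember the pageblock's type. */
	set_pcppage_migratetype(page, get_pfnblock_migratetype(page, pfn));

	/* Flush side (free_pcppages_bulk): trust the cached value unless the
	 * pageblock may have been isolated while the page sat on the list. */
	mt = get_pcppage_migratetype(page);
	if (unlikely(has_isolate_pageblock(zone)))
		mt = get_pageblock_migratetype(page);

	return mt;
}
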
diff --combined tools/testing/selftests/vm/userfaultfd.c
index 76071b14cb93857a01a3e4e97324702e4227633d,b619f672131ea787f8211c0706bfed4fec5d641f..2c7cca6f26a45c215b43b44a7f5c9dddceee301c
@@@ -69,7 -69,7 +69,7 @@@
  #ifdef __x86_64__
  #define __NR_userfaultfd 323
  #elif defined(__i386__)
 -#define __NR_userfaultfd 359
 +#define __NR_userfaultfd 374
  #elif defined(__powewrpc__)
  #define __NR_userfaultfd 364
  #else
@@@ -147,7 -147,8 +147,8 @@@ static void *locking_thread(void *arg
                        if (sizeof(page_nr) > sizeof(rand_nr)) {
                                if (random_r(&rand, &rand_nr))
                                        fprintf(stderr, "random_r 2 error\n"), exit(1);
-                               page_nr |= ((unsigned long) rand_nr) << 32;
+                               page_nr |= (((unsigned long) rand_nr) << 16) <<
+                                          16;
                        }
                } else
                        page_nr += 1;
@@@ -290,7 -291,8 +291,8 @@@ static void *uffd_poll_thread(void *arg
                                msg.event), exit(1);
                if (msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
                        fprintf(stderr, "unexpected write fault\n"), exit(1);
-               offset = (char *)msg.arg.pagefault.address - area_dst;
+               offset = (char *)(unsigned long)msg.arg.pagefault.address -
+                        area_dst;
                offset &= ~(page_size-1);
                if (copy_page(offset))
                        userfaults++;
@@@ -327,7 -329,8 +329,8 @@@ static void *uffd_read_thread(void *arg
                if (bounces & BOUNCE_VERIFY &&
                    msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
                        fprintf(stderr, "unexpected write fault\n"), exit(1);
-               offset = (char *)msg.arg.pagefault.address - area_dst;
+               offset = (char *)(unsigned long)msg.arg.pagefault.address -
+                        area_dst;
                offset &= ~(page_size-1);
                if (copy_page(offset))
                        (*this_cpu_userfaults)++;
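
The last two selftest hunks are 32-bit portability fixes: the __u64 fault address is narrowed through unsigned long before the pointer arithmetic, and the random page number is widened with two 16-bit shifts, since a single shift by 32 is undefined (and triggers a compiler warning) when unsigned long is only 32 bits wide. A small standalone illustration of the shift idiom with made-up values:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	unsigned long page_nr = 0x1234;
	uint32_t rand_nr = 0xdeadbeef;

	/* Only widen when unsigned long actually has the upper 32 bits.
	 * The double 16-bit shift compiles cleanly on 32-bit targets,
	 * where "<< 32" would be undefined behaviour. */
	if (sizeof(page_nr) > sizeof(rand_nr))
		page_nr |= (((unsigned long) rand_nr) << 16) << 16;

	printf("page_nr = %#lx\n", page_nr);
	return 0;
}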