git.proxmox.com Git - mirror_ubuntu-artful-kernel.git/commitdiff
Merge branch 'akpm' (patches from Andrew)
author Linus Torvalds <torvalds@linux-foundation.org>
Sun, 17 Jan 2016 20:58:52 +0000 (12:58 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sun, 17 Jan 2016 20:58:52 +0000 (12:58 -0800)
Merge second patch-bomb from Andrew Morton:

 - more MM stuff:

    - Kirill's page-flags rework

    - Kirill's now-allegedly-fixed THP rework

    - MADV_FREE implementation (a short usage sketch follows this list)

    - DAX feature work (msync/fsync).  This isn't quite complete but DAX
      is new and it's good enough and the guys have a handle on what
      needs to be done - I expect this to be wrapped in the next week or
      two.

  - some vsprintf maintenance work

  - various other misc bits
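
  For readers unfamiliar with MADV_FREE (mentioned in the MM list above), a
  minimal userspace sketch follows; it is illustrative only and assumes a
  kernel and C library that already expose the new flag:

        #include <sys/mman.h>

        int main(void)
        {
                size_t len = 1 << 20;
                void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
                if (buf == MAP_FAILED)
                        return 1;

                /* ... use buf as scratch memory ... */

                /*
                 * Advise the kernel that the contents are no longer needed;
                 * unlike MADV_DONTNEED, the pages are only reclaimed lazily,
                 * under memory pressure.
                 */
                madvise(buf, len, MADV_FREE);
                return 0;
        }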

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (145 commits)
  printk: change recursion_bug type to bool
  lib/vsprintf: factor out %pN[F] handler as netdev_bits()
  lib/vsprintf: refactor duplicate code to special_hex_number()
  printk-formats.txt: remove unimplemented %pT
  printk: help pr_debug and pr_devel to optimize out arguments
  lib/test_printf.c: test dentry printing
  lib/test_printf.c: add test for large bitmaps
  lib/test_printf.c: account for kvasprintf tests
  lib/test_printf.c: add a few number() tests
  lib/test_printf.c: test precision quirks
  lib/test_printf.c: check for out-of-bound writes
  lib/test_printf.c: don't BUG
  lib/kasprintf.c: add sanity check to kvasprintf
  lib/vsprintf.c: warn about too large precisions and field widths
  lib/vsprintf.c: help gcc make number() smaller
  lib/vsprintf.c: expand field_width to 24 bits
  lib/vsprintf.c: eliminate potential race in string()
  lib/vsprintf.c: move string() below widen_string()
  lib/vsprintf.c: pull out padding code from dentry_name()
  printk: do cond_resched() between lines while outputting to consoles
  ...

189 files changed:
Documentation/features/vm/pmdp_splitting_flush/arch-support.txt [deleted file]
Documentation/printk-formats.txt
Documentation/vm/transhuge.txt
arch/alpha/include/uapi/asm/mman.h
arch/arc/Kconfig
arch/arc/mm/cache.c
arch/arm/Kconfig
arch/arm/include/asm/kvm_mmu.h
arch/arm/include/asm/pgtable-3level.h
arch/arm/kvm/mmu.c
arch/arm/lib/uaccess_with_memcpy.c
arch/arm/mm/flush.c
arch/arm64/include/asm/kvm_mmu.h
arch/arm64/include/asm/pgtable.h
arch/arm64/mm/flush.c
arch/avr32/include/asm/page.h
arch/frv/include/asm/page.h
arch/ia64/include/asm/page.h
arch/metag/Kconfig
arch/microblaze/Kconfig
arch/mips/include/asm/kvm_host.h
arch/mips/include/asm/pgtable-bits.h
arch/mips/include/asm/pgtable.h
arch/mips/include/uapi/asm/mman.h
arch/mips/kvm/emulate.c
arch/mips/kvm/tlb.c
arch/mips/mm/c-r4k.c
arch/mips/mm/cache.c
arch/mips/mm/gup.c
arch/mips/mm/init.c
arch/mips/mm/pgtable-64.c
arch/mips/mm/tlbex.c
arch/mn10300/include/asm/page.h
arch/parisc/Kconfig
arch/parisc/include/uapi/asm/mman.h
arch/powerpc/Kconfig
arch/powerpc/include/asm/book3s/64/hash-64k.h
arch/powerpc/include/asm/book3s/64/hash.h
arch/powerpc/include/asm/book3s/64/pgtable.h
arch/powerpc/include/asm/kvm_book3s.h
arch/powerpc/include/asm/kvm_ppc.h
arch/powerpc/kvm/book3s.c
arch/powerpc/kvm/book3s_32_mmu_host.c
arch/powerpc/kvm/book3s_64_mmu_host.c
arch/powerpc/kvm/e500.h
arch/powerpc/kvm/e500_mmu_host.c
arch/powerpc/kvm/trace_pr.h
arch/powerpc/mm/hugepage-hash64.c
arch/powerpc/mm/hugetlbpage.c
arch/powerpc/mm/pgtable_64.c
arch/powerpc/mm/subpage-prot.c
arch/powerpc/sysdev/axonram.c
arch/s390/Kconfig
arch/s390/include/asm/pgtable.h
arch/s390/mm/gup.c
arch/s390/mm/pgtable.c
arch/sh/Kconfig
arch/sh/mm/cache-sh4.c
arch/sh/mm/cache.c
arch/sparc/Kconfig
arch/sparc/include/asm/pgtable_64.h
arch/sparc/mm/fault_64.c
arch/sparc/mm/gup.c
arch/tile/include/asm/pgtable.h
arch/um/include/asm/page.h
arch/um/include/asm/pgtable-3level.h
arch/um/include/asm/pgtable.h
arch/unicore32/Kconfig
arch/x86/Kconfig
arch/x86/include/asm/pgtable.h
arch/x86/include/asm/pgtable_types.h
arch/x86/include/asm/pmem.h
arch/x86/kernel/vm86_32.c
arch/x86/kvm/iommu.c
arch/x86/kvm/mmu.c
arch/x86/kvm/mmu_audit.c
arch/x86/kvm/paging_tmpl.h
arch/x86/kvm/vmx.c
arch/x86/kvm/x86.c
arch/x86/mm/gup.c
arch/x86/mm/init_64.c
arch/x86/mm/pat.c
arch/x86/mm/pgtable.c
arch/xtensa/include/uapi/asm/mman.h
arch/xtensa/mm/tlb.c
drivers/base/memory.c
drivers/block/brd.c
drivers/block/zram/zram_drv.c
drivers/gpu/drm/exynos/exynos_drm_gem.c
drivers/gpu/drm/gma500/framebuffer.c
drivers/gpu/drm/msm/msm_gem.c
drivers/gpu/drm/omapdrm/omap_gem.c
drivers/gpu/drm/ttm/ttm_bo_vm.c
drivers/iio/industrialio-core.c
drivers/net/wireless/intel/iwlwifi/dvm/calib.c
drivers/nvdimm/pfn_devs.c
drivers/nvdimm/pmem.c
drivers/s390/block/dcssblk.c
fs/Kconfig
fs/block_dev.c
fs/cifs/file.c
fs/dax.c
fs/fs-writeback.c
fs/hugetlbfs/inode.c
fs/proc/page.c
fs/proc/task_mmu.c
fs/stat.c
include/asm-generic/pgtable.h
include/asm-generic/sections.h
include/linux/blkdev.h
include/linux/console.h
include/linux/err.h
include/linux/huge_mm.h
include/linux/hugetlb.h
include/linux/io.h
include/linux/kdev_t.h
include/linux/kernel.h
include/linux/kvm_host.h
include/linux/kvm_types.h
include/linux/list.h
include/linux/memblock.h
include/linux/memcontrol.h
include/linux/memory_hotplug.h
include/linux/memremap.h [new file with mode: 0644]
include/linux/mm.h
include/linux/mm_types.h
include/linux/mmdebug.h
include/linux/page-flags.h
include/linux/pagemap.h
include/linux/pfn.h
include/linux/pfn_t.h [new file with mode: 0644]
include/linux/poison.h
include/linux/printk.h
include/linux/rmap.h
include/linux/swap.h
include/linux/vm_event_item.h
include/trace/events/huge_memory.h
include/uapi/asm-generic/mman-common.h
init/Kconfig
kernel/events/uprobes.c
kernel/futex.c
kernel/memremap.c
kernel/panic.c
kernel/printk/printk.c
kernel/stop_machine.c
lib/Kconfig.debug
lib/kasprintf.c
lib/list_debug.c
lib/test_printf.c
lib/vsprintf.c
mm/debug.c
mm/filemap.c
mm/gup.c
mm/huge_memory.c
mm/hugetlb.c
mm/internal.h
mm/ksm.c
mm/madvise.c
mm/memcontrol.c
mm/memory-failure.c
mm/memory.c
mm/memory_hotplug.c
mm/mempolicy.c
mm/migrate.c
mm/mincore.c
mm/mlock.c
mm/mmap.c
mm/mprotect.c
mm/mremap.c
mm/page_alloc.c
mm/page_idle.c
mm/page_isolation.c
mm/pagewalk.c
mm/pgtable-generic.c
mm/rmap.c
mm/shmem.c
mm/slub.c
mm/sparse-vmemmap.c
mm/sparse.c
mm/swap.c
mm/swap_state.c
mm/swapfile.c
mm/userfaultfd.c
mm/util.c
mm/vmalloc.c
mm/vmscan.c
mm/vmstat.c
scripts/tags.sh
virt/kvm/kvm_main.c

diff --git a/Documentation/features/vm/pmdp_splitting_flush/arch-support.txt b/Documentation/features/vm/pmdp_splitting_flush/arch-support.txt
deleted file mode 100644 (file)
index 26f74b4..0000000
+++ /dev/null
@@ -1,40 +0,0 @@
-#
-# Feature name:          pmdp_splitting_flush
-#         Kconfig:       __HAVE_ARCH_PMDP_SPLITTING_FLUSH
-#         description:   arch supports the pmdp_splitting_flush() VM API
-#
-    -----------------------
-    |         arch |status|
-    -----------------------
-    |       alpha: | TODO |
-    |         arc: | TODO |
-    |         arm: |  ok  |
-    |       arm64: |  ok  |
-    |       avr32: | TODO |
-    |    blackfin: | TODO |
-    |         c6x: | TODO |
-    |        cris: | TODO |
-    |         frv: | TODO |
-    |       h8300: | TODO |
-    |     hexagon: | TODO |
-    |        ia64: | TODO |
-    |        m32r: | TODO |
-    |        m68k: | TODO |
-    |       metag: | TODO |
-    |  microblaze: | TODO |
-    |        mips: |  ok  |
-    |     mn10300: | TODO |
-    |       nios2: | TODO |
-    |    openrisc: | TODO |
-    |      parisc: | TODO |
-    |     powerpc: |  ok  |
-    |        s390: |  ok  |
-    |       score: | TODO |
-    |          sh: | TODO |
-    |       sparc: | TODO |
-    |        tile: | TODO |
-    |          um: | TODO |
-    |   unicore32: | TODO |
-    |         x86: |  ok  |
-    |      xtensa: | TODO |
-    -----------------------
index 6389551bbad6a7971b89e53451428d3b77b4e8d0..5d1128bf02824aaf2e16ef332e7cef014cccf7b6 100644 (file)
@@ -306,15 +306,6 @@ Network device features:
 
        Passed by reference.
 
-Command from struct task_struct
-
-       %pT     ls
-
-       For printing executable name excluding path from struct
-       task_struct.
-
-       Passed by reference.
-
 If you add other %p extensions, please extend lib/test_printf.c with
 one or more test cases, if at all feasible.
 
index 8a282687ee069ffc415e3c717d0991c6baf7f7fc..21cf34f3ddb268c5a64478db776d05333027cc54 100644 (file)
@@ -35,10 +35,10 @@ miss is going to run faster.
 
 == Design ==
 
-- "graceful fallback": mm components which don't have transparent
-  hugepage knowledge fall back to breaking a transparent hugepage and
-  working on the regular pages and their respective regular pmd/pte
-  mappings
+- "graceful fallback": mm components which don't have transparent hugepage
+  knowledge fall back to breaking huge pmd mapping into table of ptes and,
+  if necessary, split a transparent hugepage. Therefore these components
+  can continue working on the regular pages or regular pte mappings.
 
 - if a hugepage allocation fails because of memory fragmentation,
   regular pages should be gracefully allocated instead and mixed in
@@ -221,9 +221,18 @@ thp_collapse_alloc_failed is incremented if khugepaged found a range
        of pages that should be collapsed into one huge page but failed
        the allocation.
 
-thp_split is incremented every time a huge page is split into base
+thp_split_page is incremented every time a huge page is split into base
        pages. This can happen for a variety of reasons but a common
        reason is that a huge page is old and is being reclaimed.
+       This action implies splitting all PMDs the page is mapped with.
+
+thp_split_page_failed is incremented if the kernel fails to split a huge
+       page. This can happen if the page was pinned by somebody.
+
+thp_split_pmd is incremented every time a PMD is split into a table of PTEs.
+       This can happen, for instance, when an application calls mprotect() or
+       munmap() on part of a huge page. It doesn't split the huge page, only
+       the page table entry.
 
 thp_zero_page_alloc is incremented every time a huge zero page is
        successfully allocated. It includes allocations which where
@@ -274,10 +283,8 @@ is complete, so they won't ever notice the fact the page is huge. But
 if any driver is going to mangle over the page structure of the tail
 page (like for checking page->mapping or other bits that are relevant
 for the head page and not the tail page), it should be updated to jump
-to check head page instead (while serializing properly against
-split_huge_page() to avoid the head and tail pages to disappear from
-under it, see the futex code to see an example of that, hugetlbfs also
-needed special handling in futex code for similar reasons).
+to check the head page instead. Taking a reference on any head/tail page
+would prevent the page from being split by anyone.
 
 NOTE: these aren't new constraints to the GUP API, and they match the
 same constrains that applies to hugetlbfs too, so any driver capable
@@ -312,9 +319,9 @@ unaffected. libhugetlbfs will also work fine as usual.
 == Graceful fallback ==
 
 Code walking pagetables but unware about huge pmds can simply call
-split_huge_page_pmd(vma, addr, pmd) where the pmd is the one returned by
+split_huge_pmd(vma, pmd, addr) where the pmd is the one returned by
 pmd_offset. It's trivial to make the code transparent hugepage aware
-by just grepping for "pmd_offset" and adding split_huge_page_pmd where
+by just grepping for "pmd_offset" and adding split_huge_pmd where
 missing after pmd_offset returns the pmd. Thanks to the graceful
 fallback design, with a one liner change, you can avoid to write
 hundred if not thousand of lines of complex code to make your code
@@ -323,7 +330,8 @@ hugepage aware.
 If you're not walking pagetables but you run into a physical hugepage
 but you can't handle it natively in your code, you can split it by
 calling split_huge_page(page). This is what the Linux VM does before
-it tries to swapout the hugepage for example.
+it tries to swapout the hugepage for example. split_huge_page() can fail
+if the page is pinned and you must handle this correctly.
 
 Example to make mremap.c transparent hugepage aware with a one liner
 change:
@@ -335,14 +343,14 @@ diff --git a/mm/mremap.c b/mm/mremap.c
                return NULL;
 
        pmd = pmd_offset(pud, addr);
-+      split_huge_page_pmd(vma, addr, pmd);
++      split_huge_pmd(vma, pmd, addr);
        if (pmd_none_or_clear_bad(pmd))
                return NULL;
 
 == Locking in hugepage aware code ==
 
 We want as much code as possible hugepage aware, as calling
-split_huge_page() or split_huge_page_pmd() has a cost.
+split_huge_page() or split_huge_pmd() has a cost.
 
 To make pagetable walks huge pmd aware, all you need to do is to call
 pmd_trans_huge() on the pmd returned by pmd_offset. You must hold the
@@ -351,47 +359,80 @@ created from under you by khugepaged (khugepaged collapse_huge_page
 takes the mmap_sem in write mode in addition to the anon_vma lock). If
 pmd_trans_huge returns false, you just fallback in the old code
 paths. If instead pmd_trans_huge returns true, you have to take the
-mm->page_table_lock and re-run pmd_trans_huge. Taking the
-page_table_lock will prevent the huge pmd to be converted into a
-regular pmd from under you (split_huge_page can run in parallel to the
+page table lock (pmd_lock()) and re-run pmd_trans_huge. Taking the
+page table lock will prevent the huge pmd to be converted into a
+regular pmd from under you (split_huge_pmd can run in parallel to the
 pagetable walk). If the second pmd_trans_huge returns false, you
-should just drop the page_table_lock and fallback to the old code as
-before. Otherwise you should run pmd_trans_splitting on the pmd. In
-case pmd_trans_splitting returns true, it means split_huge_page is
-already in the middle of splitting the page. So if pmd_trans_splitting
-returns true it's enough to drop the page_table_lock and call
-wait_split_huge_page and then fallback the old code paths. You are
-guaranteed by the time wait_split_huge_page returns, the pmd isn't
-huge anymore. If pmd_trans_splitting returns false, you can proceed to
-process the huge pmd and the hugepage natively. Once finished you can
-drop the page_table_lock.
-
-== compound_lock, get_user_pages and put_page ==
+should just drop the page table lock and fallback to the old code as
+before. Otherwise you can proceed to process the huge pmd and the
+hugepage natively. Once finished you can drop the page table lock.
+
+== Refcounts and transparent huge pages ==
+
+Refcounting on THP is mostly consistent with refcounting on other compound
+pages:
+
+  - get_page()/put_page() and GUP operate on the head page's ->_count.
+
+  - ->_count in tail pages is always zero: get_page_unless_zero() never
+    succeeds on tail pages.
+
+  - map/unmap of a page with a PTE entry increments/decrements ->_mapcount
+    on the relevant sub-page of the compound page.
+
+  - map/unmap of the whole compound page is accounted in compound_mapcount
+    (stored in the first tail page).
+
+PageDoubleMap() indicates that ->_mapcount in all subpages is offset up by one.
+This additional reference is required to get race-free detection of unmap of
+subpages when we have them mapped with both PMDs and PTEs.
+
+This optimization is required to lower the overhead of per-subpage mapcount
+tracking. The alternative is to alter ->_mapcount in all subpages on each
+map/unmap of the whole compound page.
+
+We set PG_double_map when a PMD of the page gets split for the first time,
+but the page still has a PMD mapping. The additional references go away with
+the last compound_mapcount.
 
 split_huge_page internally has to distribute the refcounts in the head
-page to the tail pages before clearing all PG_head/tail bits from the
-page structures. It can do that easily for refcounts taken by huge pmd
-mappings. But the GUI API as created by hugetlbfs (that returns head
-and tail pages if running get_user_pages on an address backed by any
-hugepage), requires the refcount to be accounted on the tail pages and
-not only in the head pages, if we want to be able to run
-split_huge_page while there are gup pins established on any tail
-page. Failure to be able to run split_huge_page if there's any gup pin
-on any tail page, would mean having to split all hugepages upfront in
-get_user_pages which is unacceptable as too many gup users are
-performance critical and they must work natively on hugepages like
-they work natively on hugetlbfs already (hugetlbfs is simpler because
-hugetlbfs pages cannot be split so there wouldn't be requirement of
-accounting the pins on the tail pages for hugetlbfs). If we wouldn't
-account the gup refcounts on the tail pages during gup, we won't know
-anymore which tail page is pinned by gup and which is not while we run
-split_huge_page. But we still have to add the gup pin to the head page
-too, to know when we can free the compound page in case it's never
-split during its lifetime. That requires changing not just
-get_page, but put_page as well so that when put_page runs on a tail
-page (and only on a tail page) it will find its respective head page,
-and then it will decrease the head page refcount in addition to the
-tail page refcount. To obtain a head page reliably and to decrease its
-refcount without race conditions, put_page has to serialize against
-__split_huge_page_refcount using a special per-page lock called
-compound_lock.
+page to the tail pages before clearing all PG_head/tail bits from the page
+structures. It can be done easily for refcounts taken by page table
+entries. But we don't have enough information on how to distribute any
+additional pins (i.e. from get_user_pages). split_huge_page() fails any
+requests to split a pinned huge page: it expects the page count to be equal
+to the sum of the mapcounts of all sub-pages plus one (the split_huge_page
+caller must hold a reference on the head page).
+
+split_huge_page uses migration entries to stabilize page->_count and
+page->_mapcount.
+
+We are safe against physical memory scanners too: the only legitimate way a
+scanner can get a reference to a page is get_page_unless_zero().
+
+All tail pages have zero ->_count until atomic_add(). This prevents a scanner
+from getting a reference to a tail page up to that point. After the
+atomic_add() we don't care about the ->_count value. We already know how many
+references we should uncharge from the head page.
+
+For the head page get_page_unless_zero() will succeed and we don't mind. It's
+clear where the reference should go after the split: it will stay on the head
+page.
+
+Note that split_huge_pmd() doesn't have any limitation on refcounting: a pmd
+can be split at any point and the operation never fails.
+
+== Partial unmap and deferred_split_huge_page() ==
+
+Unmapping part of a THP (with munmap() or another way) is not going to free
+memory immediately. Instead, we detect that a subpage of the THP is not in use
+in page_remove_rmap() and queue the THP for splitting if memory pressure
+comes. Splitting will free up the unused subpages.
+
+Splitting the page right away is not an option due to the locking context in
+the place where we can detect partial unmap. It also might be
+counterproductive since in many cases partial unmap happens during
+exit(2) if a THP crosses a VMA boundary.
+
+The function deferred_split_huge_page() is used to queue a page for splitting.
+The splitting itself will happen when we get memory pressure via the shrinker
+interface.
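
The locking rules in the "Locking in hugepage aware code" section above can be
summarised with a short sketch. This is an illustrative, untested fragment (the
walker function is hypothetical), shown only to make the
pmd_trans_huge()/pmd_lock() pattern concrete:

        /* Caller holds mmap_sem; pmd was returned by pmd_offset(). */
        static int walk_one_pmd(struct vm_area_struct *vma, pmd_t *pmd,
                                unsigned long addr)
        {
                spinlock_t *ptl;

                if (pmd_trans_huge(*pmd)) {
                        ptl = pmd_lock(vma->vm_mm, pmd);
                        if (pmd_trans_huge(*pmd)) {
                                /* Still huge: handle the huge pmd natively. */
                                spin_unlock(ptl);
                                return 0;
                        }
                        /* It was split under us: fall through to pte path. */
                        spin_unlock(ptl);
                }

                /* Regular path: operate on the pte table, e.g. with
                 * pte_offset_map_lock(). */
                return 0;
        }
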
index f2f9496717981766aa0ac53ef5054a9250dd1218..ab336c06153e63c3aba2adc36a622e8a5d4c3423 100644 (file)
 #define MADV_WILLNEED  3               /* will need these pages */
 #define        MADV_SPACEAVAIL 5               /* ensure resources are available */
 #define MADV_DONTNEED  6               /* don't need these pages */
+#define MADV_FREE      7               /* free pages only if memory pressure */
 
 /* common/generic parameters */
 #define MADV_REMOVE    9               /* remove these pages & resources */
 #define MADV_DONTFORK  10              /* don't inherit across fork */
 #define MADV_DOFORK    11              /* do inherit across fork */
index 6312f607932fd2241782827cce2ec7968ea3c8b6..76dde9db79349d0977687623307c934de91e314a 100644 (file)
@@ -73,9 +73,6 @@ config STACKTRACE_SUPPORT
        def_bool y
        select STACKTRACE
 
-config HAVE_LATENCYTOP_SUPPORT
-       def_bool y
-
 config HAVE_ARCH_TRANSPARENT_HUGEPAGE
        def_bool y
        depends on ARC_MMU_V4
index ff7ff6cbb8112408c05a38a2f8e001265d5d3726..b65f797e9ad6723abd7c38bba09e382df52450b4 100644 (file)
@@ -617,7 +617,7 @@ void flush_dcache_page(struct page *page)
         */
        if (!mapping_mapped(mapping)) {
                clear_bit(PG_dc_clean, &page->flags);
-       } else if (page_mapped(page)) {
+       } else if (page_mapcount(page)) {
 
                /* kernel reading from page with U-mapping */
                phys_addr_t paddr = (unsigned long)page_address(page);
@@ -857,7 +857,7 @@ void copy_user_highpage(struct page *to, struct page *from,
         * For !VIPT cache, all of this gets compiled out as
         * addr_not_cache_congruent() is 0
         */
-       if (page_mapped(from) && addr_not_cache_congruent(kfrom, u_vaddr)) {
+       if (page_mapcount(from) && addr_not_cache_congruent(kfrom, u_vaddr)) {
                __flush_dcache_page((unsigned long)kfrom, u_vaddr);
                clean_src_k_mappings = 1;
        }
index 4e489cc5c45e5f4e1f648b5acc79cae8fdb81c5d..6a889afa6a2cfc9055af453b7d9637e5bc7f1c8d 100644 (file)
@@ -168,11 +168,6 @@ config STACKTRACE_SUPPORT
        bool
        default y
 
-config HAVE_LATENCYTOP_SUPPORT
-       bool
-       depends on !SMP
-       default y
-
 config LOCKDEP_SUPPORT
        bool
        default y
index 9203c21b4673fd8a73b5f5797ed5190305b363ae..a520b7987a29c3626c0f58b86d896fefaf69ad06 100644 (file)
@@ -182,7 +182,8 @@ static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu)
        return (vcpu->arch.cp15[c1_SCTLR] & 0b101) == 0b101;
 }
 
-static inline void __coherent_cache_guest_page(struct kvm_vcpu *vcpu, pfn_t pfn,
+static inline void __coherent_cache_guest_page(struct kvm_vcpu *vcpu,
+                                              kvm_pfn_t pfn,
                                               unsigned long size,
                                               bool ipa_uncached)
 {
@@ -246,7 +247,7 @@ static inline void __kvm_flush_dcache_pte(pte_t pte)
 static inline void __kvm_flush_dcache_pmd(pmd_t pmd)
 {
        unsigned long size = PMD_SIZE;
-       pfn_t pfn = pmd_pfn(pmd);
+       kvm_pfn_t pfn = pmd_pfn(pmd);
 
        while (size) {
                void *va = kmap_atomic_pfn(pfn);
index a745a2a53853c384f688ebab2cd6d254b87a6017..dc46398bc3a528ccf51fb01cdabd143036df10de 100644 (file)
@@ -88,7 +88,6 @@
 
 #define L_PMD_SECT_VALID       (_AT(pmdval_t, 1) << 0)
 #define L_PMD_SECT_DIRTY       (_AT(pmdval_t, 1) << 55)
-#define L_PMD_SECT_SPLITTING   (_AT(pmdval_t, 1) << 56)
 #define L_PMD_SECT_NONE                (_AT(pmdval_t, 1) << 57)
 #define L_PMD_SECT_RDONLY      (_AT(pteval_t, 1) << 58)
 
@@ -232,13 +231,6 @@ static inline pte_t pte_mkspecial(pte_t pte)
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 #define pmd_trans_huge(pmd)    (pmd_val(pmd) && !pmd_table(pmd))
-#define pmd_trans_splitting(pmd) (pmd_isset((pmd), L_PMD_SECT_SPLITTING))
-
-#ifdef CONFIG_HAVE_RCU_TABLE_FREE
-#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
-void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
-                         pmd_t *pmdp);
-#endif
 #endif
 
 #define PMD_BIT_FUNC(fn,op) \
@@ -246,9 +238,9 @@ static inline pmd_t pmd_##fn(pmd_t pmd) { pmd_val(pmd) op; return pmd; }
 
 PMD_BIT_FUNC(wrprotect,        |= L_PMD_SECT_RDONLY);
 PMD_BIT_FUNC(mkold,    &= ~PMD_SECT_AF);
-PMD_BIT_FUNC(mksplitting, |= L_PMD_SECT_SPLITTING);
 PMD_BIT_FUNC(mkwrite,   &= ~L_PMD_SECT_RDONLY);
 PMD_BIT_FUNC(mkdirty,   |= L_PMD_SECT_DIRTY);
+PMD_BIT_FUNC(mkclean,   &= ~L_PMD_SECT_DIRTY);
 PMD_BIT_FUNC(mkyoung,   |= PMD_SECT_AF);
 
 #define pmd_mkhuge(pmd)                (__pmd(pmd_val(pmd) & ~PMD_TABLE_BIT))
index 22f7fa0124ec1d80c550fea0ecf55a2c7d603091..aba61fd3697aa6260f6b0b3626434e2859bc3248 100644 (file)
@@ -992,9 +992,9 @@ out:
        return ret;
 }
 
-static bool transparent_hugepage_adjust(pfn_t *pfnp, phys_addr_t *ipap)
+static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, phys_addr_t *ipap)
 {
-       pfn_t pfn = *pfnp;
+       kvm_pfn_t pfn = *pfnp;
        gfn_t gfn = *ipap >> PAGE_SHIFT;
 
        if (PageTransCompound(pfn_to_page(pfn))) {
@@ -1201,7 +1201,7 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
        kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
 }
 
-static void coherent_cache_guest_page(struct kvm_vcpu *vcpu, pfn_t pfn,
+static void coherent_cache_guest_page(struct kvm_vcpu *vcpu, kvm_pfn_t pfn,
                                      unsigned long size, bool uncached)
 {
        __coherent_cache_guest_page(vcpu, pfn, size, uncached);
@@ -1218,7 +1218,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
        struct kvm *kvm = vcpu->kvm;
        struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
        struct vm_area_struct *vma;
-       pfn_t pfn;
+       kvm_pfn_t pfn;
        pgprot_t mem_type = PAGE_S2;
        bool fault_ipa_uncached;
        bool logging_active = memslot_is_logging(memslot);
@@ -1346,7 +1346,7 @@ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
 {
        pmd_t *pmd;
        pte_t *pte;
-       pfn_t pfn;
+       kvm_pfn_t pfn;
        bool pfn_valid = false;
 
        trace_kvm_access_fault(fault_ipa);
index 588bbc288396ae52fe0d801bc537906b3e5dbfce..6bd1089b07e0960830ed6bd6a8345202b7efd8b0 100644 (file)
@@ -52,14 +52,13 @@ pin_page_for_write(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp)
         *
         * Lock the page table for the destination and check
         * to see that it's still huge and whether or not we will
-        * need to fault on write, or if we have a splitting THP.
+        * need to fault on write.
         */
        if (unlikely(pmd_thp_or_huge(*pmd))) {
                ptl = &current->mm->page_table_lock;
                spin_lock(ptl);
                if (unlikely(!pmd_thp_or_huge(*pmd)
-                       || pmd_hugewillfault(*pmd)
-                       || pmd_trans_splitting(*pmd))) {
+                       || pmd_hugewillfault(*pmd))) {
                        spin_unlock(ptl);
                        return 0;
                }
index 1ec8e7590fc6823bf1d1ffe87c1901f645ffcd07..d0ba3551d49a4b05371db7a12c02e151be9d4eae 100644 (file)
@@ -330,7 +330,7 @@ void flush_dcache_page(struct page *page)
        mapping = page_mapping(page);
 
        if (!cache_ops_need_broadcast() &&
-           mapping && !page_mapped(page))
+           mapping && !page_mapcount(page))
                clear_bit(PG_dcache_clean, &page->flags);
        else {
                __flush_dcache_page(mapping, page);
@@ -415,18 +415,3 @@ void __flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned l
         */
        __cpuc_flush_dcache_area(page_address(page), PAGE_SIZE);
 }
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#ifdef CONFIG_HAVE_RCU_TABLE_FREE
-void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
-                         pmd_t *pmdp)
-{
-       pmd_t pmd = pmd_mksplitting(*pmdp);
-       VM_BUG_ON(address & ~PMD_MASK);
-       set_pmd_at(vma->vm_mm, address, pmdp, pmd);
-
-       /* dummy IPI to serialise against fast_gup */
-       kick_all_cpus_sync();
-}
-#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
index 0bf8b4320a9154fda9e6fd3d999ef37a356bee85..736433912a1eb69a0398fa2f01d78cff88533893 100644 (file)
@@ -230,7 +230,8 @@ static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu)
        return (vcpu_sys_reg(vcpu, SCTLR_EL1) & 0b101) == 0b101;
 }
 
-static inline void __coherent_cache_guest_page(struct kvm_vcpu *vcpu, pfn_t pfn,
+static inline void __coherent_cache_guest_page(struct kvm_vcpu *vcpu,
+                                              kvm_pfn_t pfn,
                                               unsigned long size,
                                               bool ipa_uncached)
 {
index 69d2e2f86bce3f65a76637e9ab3cd9251d6f3d16..2d545d7aa80ba715f65cc82b671213fa5505b495 100644 (file)
@@ -353,21 +353,14 @@ static inline pgprot_t mk_sect_prot(pgprot_t prot)
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 #define pmd_trans_huge(pmd)    (pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT))
-#define pmd_trans_splitting(pmd)       pte_special(pmd_pte(pmd))
-#ifdef CONFIG_HAVE_RCU_TABLE_FREE
-#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
-struct vm_area_struct;
-void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
-                         pmd_t *pmdp);
-#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 #define pmd_dirty(pmd)         pte_dirty(pmd_pte(pmd))
 #define pmd_young(pmd)         pte_young(pmd_pte(pmd))
 #define pmd_wrprotect(pmd)     pte_pmd(pte_wrprotect(pmd_pte(pmd)))
-#define pmd_mksplitting(pmd)   pte_pmd(pte_mkspecial(pmd_pte(pmd)))
 #define pmd_mkold(pmd)         pte_pmd(pte_mkold(pmd_pte(pmd)))
 #define pmd_mkwrite(pmd)       pte_pmd(pte_mkwrite(pmd_pte(pmd)))
+#define pmd_mkclean(pmd)       pte_pmd(pte_mkclean(pmd_pte(pmd)))
 #define pmd_mkdirty(pmd)       pte_pmd(pte_mkdirty(pmd_pte(pmd)))
 #define pmd_mkyoung(pmd)       pte_pmd(pte_mkyoung(pmd_pte(pmd)))
 #define pmd_mknotpresent(pmd)  (__pmd(pmd_val(pmd) & ~PMD_TYPE_MASK))
index 46649d6e6c5a5608caa84015d3ce4f09d3d47eee..60585bde1264a172e4e598ce73a550272d0a40f2 100644 (file)
@@ -102,19 +102,3 @@ EXPORT_SYMBOL(flush_dcache_page);
  * Additional functions defined in assembly.
  */
 EXPORT_SYMBOL(flush_icache_range);
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#ifdef CONFIG_HAVE_RCU_TABLE_FREE
-void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
-                         pmd_t *pmdp)
-{
-       pmd_t pmd = pmd_mksplitting(*pmdp);
-
-       VM_BUG_ON(address & ~PMD_MASK);
-       set_pmd_at(vma->vm_mm, address, pmdp, pmd);
-
-       /* dummy IPI to serialise against fast_gup */
-       kick_all_cpus_sync();
-}
-#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
index f805d1cb11bc199b86055552900630371ee9f562..c5d2a3e2c62f3487979f2f738e42d34098066191 100644 (file)
@@ -83,11 +83,9 @@ static inline int get_order(unsigned long size)
 
 #ifndef CONFIG_NEED_MULTIPLE_NODES
 
-#define PHYS_PFN_OFFSET                (CONFIG_PHYS_OFFSET >> PAGE_SHIFT)
+#define ARCH_PFN_OFFSET                (CONFIG_PHYS_OFFSET >> PAGE_SHIFT)
 
-#define pfn_to_page(pfn)       (mem_map + ((pfn) - PHYS_PFN_OFFSET))
-#define page_to_pfn(page)      ((unsigned long)((page) - mem_map) + PHYS_PFN_OFFSET)
-#define pfn_valid(pfn)         ((pfn) >= PHYS_PFN_OFFSET && (pfn) < (PHYS_PFN_OFFSET + max_mapnr))
+#define pfn_valid(pfn)         ((pfn) >= ARCH_PFN_OFFSET && (pfn) < (ARCH_PFN_OFFSET + max_mapnr))
 #endif /* CONFIG_NEED_MULTIPLE_NODES */
 
 #define virt_to_page(kaddr)    pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
@@ -101,4 +99,6 @@ static inline int get_order(unsigned long size)
  */
 #define HIGHMEM_START          0x20000000UL
 
+#include <asm-generic/memory_model.h>
+
 #endif /* __ASM_AVR32_PAGE_H */
index 8c97068ac8fc4543cf288629cf9e85db57887431..688d8076a43a8cbdaaa2bd5e29c816888cee5f6f 100644 (file)
@@ -34,7 +34,7 @@ typedef struct page *pgtable_t;
 #define pgprot_val(x)  ((x).pgprot)
 
 #define __pte(x)       ((pte_t) { (x) } )
-#define __pmd(x)       ((pmd_t) { (x) } )
+#define __pmd(x)       ((pmd_t) { { (x) } } )
 #define __pud(x)       ((pud_t) { (x) } )
 #define __pgd(x)       ((pgd_t) { (x) } )
 #define __pgprot(x)    ((pgprot_t) { (x) } )
index ec48bb9f95e18da086dcf5db6a3e80f63430fff4..e8c486ef0d76b1352b12fab907223139e6479c71 100644 (file)
@@ -105,6 +105,7 @@ extern struct page *vmem_map;
 #ifdef CONFIG_DISCONTIGMEM
 # define page_to_pfn(page)     ((unsigned long) (page - vmem_map))
 # define pfn_to_page(pfn)      (vmem_map + (pfn))
+# define __pfn_to_phys(pfn)    PFN_PHYS(pfn)
 #else
 # include <asm-generic/memory_model.h>
 #endif
index 0b389a81c43a2cbc31ccd430e41df552578646e2..a0fa88da3e31a4c8806309154333bb39a2bd6e94 100644 (file)
@@ -36,9 +36,6 @@ config STACKTRACE_SUPPORT
 config LOCKDEP_SUPPORT
        def_bool y
 
-config HAVE_LATENCYTOP_SUPPORT
-       def_bool y
-
 config RWSEM_GENERIC_SPINLOCK
        def_bool y
 
index 0bce820428fca2e32d5ae5e04714b372fdeb4adc..5ecd0287a87428ff9340a1ed88908d00897b5a24 100644 (file)
@@ -67,9 +67,6 @@ config STACKTRACE_SUPPORT
 config LOCKDEP_SUPPORT
        def_bool y
 
-config HAVE_LATENCYTOP_SUPPORT
-       def_bool y
-
 source "init/Kconfig"
 
 source "kernel/Kconfig.freezer"
index 6ded8d347af966da7693529050d7c95bf739b182..7c191443c7ea199f876a2c495b2b6d3f41bfc1d5 100644 (file)
 #define CAUSEF_DC                      (_ULCAST_(1) << 27)
 
 extern atomic_t kvm_mips_instance;
-extern pfn_t(*kvm_mips_gfn_to_pfn) (struct kvm *kvm, gfn_t gfn);
-extern void (*kvm_mips_release_pfn_clean) (pfn_t pfn);
-extern bool(*kvm_mips_is_error_pfn) (pfn_t pfn);
+extern kvm_pfn_t (*kvm_mips_gfn_to_pfn)(struct kvm *kvm, gfn_t gfn);
+extern void (*kvm_mips_release_pfn_clean)(kvm_pfn_t pfn);
+extern bool (*kvm_mips_is_error_pfn)(kvm_pfn_t pfn);
 
 struct kvm_vm_stat {
        u32 remote_tlb_flush;
index ff7ad91c85db325b27dacb8987c9e1b930088855..97b313882678084857129b811cea812a723c6a3a 100644 (file)
 /* Huge TLB page */
 #define _PAGE_HUGE_SHIFT       (_PAGE_MODIFIED_SHIFT + 1)
 #define _PAGE_HUGE             (1 << _PAGE_HUGE_SHIFT)
-#define _PAGE_SPLITTING_SHIFT  (_PAGE_HUGE_SHIFT + 1)
-#define _PAGE_SPLITTING                (1 << _PAGE_SPLITTING_SHIFT)
 #endif /* CONFIG_64BIT && CONFIG_MIPS_HUGE_TLB_SUPPORT */
 
 #if defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_MIPSR6)
 /* XI - page cannot be executed */
-#ifdef _PAGE_SPLITTING_SHIFT
-#define _PAGE_NO_EXEC_SHIFT    (_PAGE_SPLITTING_SHIFT + 1)
+#ifdef _PAGE_HUGE_SHIFT
+#define _PAGE_NO_EXEC_SHIFT    (_PAGE_HUGE_SHIFT + 1)
 #else
 #define _PAGE_NO_EXEC_SHIFT    (_PAGE_MODIFIED_SHIFT + 1)
 #endif
 
 #if defined(_PAGE_NO_READ_SHIFT)
 #define _PAGE_GLOBAL_SHIFT     (_PAGE_NO_READ_SHIFT + 1)
-#elif defined(_PAGE_SPLITTING_SHIFT)
-#define _PAGE_GLOBAL_SHIFT     (_PAGE_SPLITTING_SHIFT + 1)
+#elif defined(_PAGE_HUGE_SHIFT)
+#define _PAGE_GLOBAL_SHIFT     (_PAGE_HUGE_SHIFT + 1)
 #else
 #define _PAGE_GLOBAL_SHIFT     (_PAGE_MODIFIED_SHIFT + 1)
 #endif
index 8957f15e21ec4c911e8ebe017ea8cb4ea1276ad4..6995b4a02e2359bf6e2e1b8493bcae55443753f1 100644 (file)
@@ -482,27 +482,9 @@ static inline pmd_t pmd_mkhuge(pmd_t pmd)
        return pmd;
 }
 
-static inline int pmd_trans_splitting(pmd_t pmd)
-{
-       return !!(pmd_val(pmd) & _PAGE_SPLITTING);
-}
-
-static inline pmd_t pmd_mksplitting(pmd_t pmd)
-{
-       pmd_val(pmd) |= _PAGE_SPLITTING;
-
-       return pmd;
-}
-
 extern void set_pmd_at(struct mm_struct *mm, unsigned long addr,
                       pmd_t *pmdp, pmd_t pmd);
 
-#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
-/* Extern to avoid header file madness */
-extern void pmdp_splitting_flush(struct vm_area_struct *vma,
-                                       unsigned long address,
-                                       pmd_t *pmdp);
-
 #define __HAVE_ARCH_PMD_WRITE
 static inline int pmd_write(pmd_t pmd)
 {
index 97c03f4689243be4e7b6157df73683fb47480d39..b0ebe59f73fdfa6b69df58010c4a7ca4f672255a 100644 (file)
 #define MADV_SEQUENTIAL 2              /* expect sequential page references */
 #define MADV_WILLNEED  3               /* will need these pages */
 #define MADV_DONTNEED  4               /* don't need these pages */
+#define MADV_FREE      5               /* free pages only if memory pressure */
 
 /* common parameters: try to keep these consistent across architectures */
 #define MADV_REMOVE    9               /* remove these pages & resources */
 #define MADV_DONTFORK  10              /* don't inherit across fork */
 #define MADV_DOFORK    11              /* do inherit across fork */
index 41b1b090f56f6b73afc50240318634a4988ef427..1b675c7ce89f89d25f3457659405419505a8158d 100644 (file)
@@ -1525,7 +1525,7 @@ int kvm_mips_sync_icache(unsigned long va, struct kvm_vcpu *vcpu)
        struct kvm *kvm = vcpu->kvm;
        unsigned long pa;
        gfn_t gfn;
-       pfn_t pfn;
+       kvm_pfn_t pfn;
 
        gfn = va >> PAGE_SHIFT;
 
index aed0ac2a4972cd1daf0f2992db6c100e9912fb70..570479c03bdc35009f48985e70d17f00686b5012 100644 (file)
@@ -38,13 +38,13 @@ atomic_t kvm_mips_instance;
 EXPORT_SYMBOL(kvm_mips_instance);
 
 /* These function pointers are initialized once the KVM module is loaded */
-pfn_t (*kvm_mips_gfn_to_pfn)(struct kvm *kvm, gfn_t gfn);
+kvm_pfn_t (*kvm_mips_gfn_to_pfn)(struct kvm *kvm, gfn_t gfn);
 EXPORT_SYMBOL(kvm_mips_gfn_to_pfn);
 
-void (*kvm_mips_release_pfn_clean)(pfn_t pfn);
+void (*kvm_mips_release_pfn_clean)(kvm_pfn_t pfn);
 EXPORT_SYMBOL(kvm_mips_release_pfn_clean);
 
-bool (*kvm_mips_is_error_pfn)(pfn_t pfn);
+bool (*kvm_mips_is_error_pfn)(kvm_pfn_t pfn);
 EXPORT_SYMBOL(kvm_mips_is_error_pfn);
 
 uint32_t kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu)
@@ -144,7 +144,7 @@ EXPORT_SYMBOL(kvm_mips_dump_guest_tlbs);
 static int kvm_mips_map_page(struct kvm *kvm, gfn_t gfn)
 {
        int srcu_idx, err = 0;
-       pfn_t pfn;
+       kvm_pfn_t pfn;
 
        if (kvm->arch.guest_pmap[gfn] != KVM_INVALID_PAGE)
                return 0;
@@ -262,7 +262,7 @@ int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr,
                                    struct kvm_vcpu *vcpu)
 {
        gfn_t gfn;
-       pfn_t pfn0, pfn1;
+       kvm_pfn_t pfn0, pfn1;
        unsigned long vaddr = 0;
        unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0;
        int even;
@@ -313,7 +313,7 @@ EXPORT_SYMBOL(kvm_mips_handle_kseg0_tlb_fault);
 int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr,
        struct kvm_vcpu *vcpu)
 {
-       pfn_t pfn0, pfn1;
+       kvm_pfn_t pfn0, pfn1;
        unsigned long flags, old_entryhi = 0, vaddr = 0;
        unsigned long entrylo0 = 0, entrylo1 = 0;
 
@@ -360,7 +360,7 @@ int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu,
 {
        unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0;
        struct kvm *kvm = vcpu->kvm;
-       pfn_t pfn0, pfn1;
+       kvm_pfn_t pfn0, pfn1;
 
        if ((tlb->tlb_hi & VPN2_MASK) == 0) {
                pfn0 = 0;
index 5d3a25e1cfaea62cf7859f3408e3d101f9bf4060..caac3d747a909dbd75d9f0935b8a67766795174c 100644 (file)
@@ -587,7 +587,8 @@ static inline void local_r4k_flush_cache_page(void *args)
                 * another ASID than the current one.
                 */
                map_coherent = (cpu_has_dc_aliases &&
-                               page_mapped(page) && !Page_dcache_dirty(page));
+                               page_mapcount(page) &&
+                               !Page_dcache_dirty(page));
                if (map_coherent)
                        vaddr = kmap_coherent(page, addr);
                else
index aab218c36e0d3e2f7669c47343e583e527103169..3f159caf6dbc902d20d2284913aea498c67cd596 100644 (file)
@@ -106,7 +106,7 @@ void __flush_anon_page(struct page *page, unsigned long vmaddr)
        unsigned long addr = (unsigned long) page_address(page);
 
        if (pages_do_alias(addr, vmaddr)) {
-               if (page_mapped(page) && !Page_dcache_dirty(page)) {
+               if (page_mapcount(page) && !Page_dcache_dirty(page)) {
                        void *kaddr;
 
                        kaddr = kmap_coherent(page, vmaddr);
index 349995d19c7f2c85ee1eeb83d7882d9558d96e03..1afd87c999b0c22f1ab08508a233126b5d6d9f2b 100644 (file)
@@ -87,8 +87,6 @@ static int gup_huge_pmd(pmd_t pmd, unsigned long addr, unsigned long end,
        do {
                VM_BUG_ON(compound_head(page) != head);
                pages[*nr] = page;
-               if (PageTail(page))
-                       get_huge_page_tail(page);
                (*nr)++;
                page++;
                refs++;
@@ -109,18 +107,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
                pmd_t pmd = *pmdp;
 
                next = pmd_addr_end(addr, end);
-               /*
-                * The pmd_trans_splitting() check below explains why
-                * pmdp_splitting_flush has to flush the tlb, to stop
-                * this gup-fast code from running while we set the
-                * splitting bit in the pmd. Returning zero will take
-                * the slow path that will call wait_split_huge_page()
-                * if the pmd is still in splitting state. gup-fast
-                * can't because it has irq disabled and
-                * wait_split_huge_page() would never return as the
-                * tlb flush IPI wouldn't run.
-                */
-               if (pmd_none(pmd) || pmd_trans_splitting(pmd))
+               if (pmd_none(pmd))
                        return 0;
                if (unlikely(pmd_huge(pmd))) {
                        if (!gup_huge_pmd(pmd, addr, next, write, pages,nr))
@@ -153,8 +140,6 @@ static int gup_huge_pud(pud_t pud, unsigned long addr, unsigned long end,
        do {
                VM_BUG_ON(compound_head(page) != head);
                pages[*nr] = page;
-               if (PageTail(page))
-                       get_huge_page_tail(page);
                (*nr)++;
                page++;
                refs++;
index 8770e619185eb034b317ce3de837c5185ba05511..7e5fa0938c2174cefe9d5bfb0f04249626b14af1 100644 (file)
@@ -165,7 +165,7 @@ void copy_user_highpage(struct page *to, struct page *from,
 
        vto = kmap_atomic(to);
        if (cpu_has_dc_aliases &&
-           page_mapped(from) && !Page_dcache_dirty(from)) {
+           page_mapcount(from) && !Page_dcache_dirty(from)) {
                vfrom = kmap_coherent(from, vaddr);
                copy_page(vto, vfrom);
                kunmap_coherent();
@@ -187,7 +187,7 @@ void copy_to_user_page(struct vm_area_struct *vma,
        unsigned long len)
 {
        if (cpu_has_dc_aliases &&
-           page_mapped(page) && !Page_dcache_dirty(page)) {
+           page_mapcount(page) && !Page_dcache_dirty(page)) {
                void *vto = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK);
                memcpy(vto, src, len);
                kunmap_coherent();
@@ -205,7 +205,7 @@ void copy_from_user_page(struct vm_area_struct *vma,
        unsigned long len)
 {
        if (cpu_has_dc_aliases &&
-           page_mapped(page) && !Page_dcache_dirty(page)) {
+           page_mapcount(page) && !Page_dcache_dirty(page)) {
                void *vfrom = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK);
                memcpy(dst, vfrom, len);
                kunmap_coherent();
index e8adc0069d66f17fcc6e27915fa1d5eb8e8258a6..ce4473e7c0d261b04d7bf44fcfc8ddc6414435d9 100644 (file)
@@ -62,20 +62,6 @@ void pmd_init(unsigned long addr, unsigned long pagetable)
 }
 #endif
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-
-void pmdp_splitting_flush(struct vm_area_struct *vma,
-                        unsigned long address,
-                        pmd_t *pmdp)
-{
-       if (!pmd_trans_splitting(*pmdp)) {
-               pmd_t pmd = pmd_mksplitting(*pmdp);
-               set_pmd_at(vma->vm_mm, address, pmdp, pmd);
-       }
-}
-
-#endif
-
 pmd_t mk_pmd(struct page *page, pgprot_t prot)
 {
        pmd_t pmd;
index 32e0be27673fefbeca6839929e61a581c8980902..482192cc8f2b88ae89f4cf1495c0dd6055a5bda6 100644 (file)
@@ -240,7 +240,6 @@ static void output_pgtable_bits_defines(void)
        pr_define("_PAGE_MODIFIED_SHIFT %d\n", _PAGE_MODIFIED_SHIFT);
 #ifdef CONFIG_MIPS_HUGE_TLB_SUPPORT
        pr_define("_PAGE_HUGE_SHIFT %d\n", _PAGE_HUGE_SHIFT);
-       pr_define("_PAGE_SPLITTING_SHIFT %d\n", _PAGE_SPLITTING_SHIFT);
 #endif
 #ifdef CONFIG_CPU_MIPSR2
        if (cpu_has_rixi) {
index 8288e124165b50fbd83f73bec88dd391227a9941..3810a6f740fdf67ffa6622fcf4706c1c841aa09a 100644 (file)
@@ -107,6 +107,7 @@ static inline int get_order(unsigned long size)
 #define pfn_to_kaddr(pfn)      __va((pfn) << PAGE_SHIFT)
 #define pfn_to_page(pfn)       (mem_map + ((pfn) - __pfn_disp))
 #define page_to_pfn(page)      ((unsigned long)((page) - mem_map) + __pfn_disp)
+#define __pfn_to_phys(pfn)     PFN_PHYS(pfn)
 
 #define pfn_valid(pfn)                                 \
 ({                                                     \
index 729f89163bc32113dba77e309c8ce767ed3d15e8..7c34cafdf3012e6f9b29d4a912ec064e4cefe068 100644 (file)
@@ -79,9 +79,6 @@ config TIME_LOW_RES
        depends on SMP
        default y
 
-config HAVE_LATENCYTOP_SUPPORT
-        def_bool y
-
 # unless you want to implement ACPI on PA-RISC ... ;-)
 config PM
        bool
index dd4d1876a020c1e0150cc87547e3f75f9940f16b..cf830d465f75fd2cf55afb74c73adf593f3fb7cf 100644 (file)
 #define MADV_SPACEAVAIL 5               /* insure that resources are reserved */
 #define MADV_VPS_PURGE  6               /* Purge pages from VM page cache */
 #define MADV_VPS_INHERIT 7              /* Inherit parents page size */
+#define MADV_FREE      8               /* free pages only if memory pressure */
 
 /* common/generic parameters */
 #define MADV_REMOVE    9               /* remove these pages & resources */
 #define MADV_DONTFORK  10              /* don't inherit across fork */
 #define MADV_DOFORK    11              /* do inherit across fork */
index 7d5a8350f9132b47aee3a7a93bc721b19c135ee4..94f6c5089e0cc8d8c03d1ce658bcbd9953d3899d 100644 (file)
@@ -47,9 +47,6 @@ config STACKTRACE_SUPPORT
        bool
        default y
 
-config HAVE_LATENCYTOP_SUPPORT
-       def_bool y
-
 config TRACE_IRQFLAGS_SUPPORT
        bool
        default y
index 9e55e3b1fef0efac40efd02a183d8b141fb90823..849bbec80f7bb1afd2a8622f75801fe2f7a67cb1 100644 (file)
@@ -256,13 +256,6 @@ static inline int pmd_trans_huge(pmd_t pmd)
                  (_PAGE_PTE | _PAGE_THP_HUGE));
 }
 
-static inline int pmd_trans_splitting(pmd_t pmd)
-{
-       if (pmd_trans_huge(pmd))
-               return pmd_val(pmd) & _PAGE_SPLITTING;
-       return 0;
-}
-
 static inline int pmd_large(pmd_t pmd)
 {
        return !!(pmd_val(pmd) & _PAGE_PTE);
@@ -273,11 +266,6 @@ static inline pmd_t pmd_mknotpresent(pmd_t pmd)
        return __pmd(pmd_val(pmd) & ~_PAGE_PRESENT);
 }
 
-static inline pmd_t pmd_mksplitting(pmd_t pmd)
-{
-       return __pmd(pmd_val(pmd) | _PAGE_SPLITTING);
-}
-
 #define __HAVE_ARCH_PMD_SAME
 static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
 {
index 2ff8b3df553dab8a02141f1c8381fc02386c811e..06f17e778c2750edd0b2380288da820f0c40f130 100644 (file)
 #define _PAGE_SOFT_DIRTY       0x00000
 #endif
 
-/*
- * THP pages can't be special. So use the _PAGE_SPECIAL
- */
-#define _PAGE_SPLITTING _PAGE_SPECIAL
-
 /*
  * We need to differentiate between explicit huge page and THP huge
  * page, since THP huge page also need to track real subpage details
@@ -54,9 +49,8 @@
 /*
  * set of bits not changed in pmd_modify.
  */
-#define _HPAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS |              \
-                        _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_SPLITTING | \
-                        _PAGE_THP_HUGE | _PAGE_PTE | _PAGE_SOFT_DIRTY)
+#define _HPAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS | _PAGE_DIRTY | \
+                        _PAGE_ACCESSED | _PAGE_THP_HUGE)
 
 #ifdef CONFIG_PPC_64K_PAGES
 #include <asm/book3s/64/hash-64k.h>
index b3a5badab69fa264090244210bab4de86c4efd58..8204b0c393aac69285a666d1b70fae0961053614 100644 (file)
@@ -223,9 +223,11 @@ static inline pte_t *pmdp_ptep(pmd_t *pmd)
 #define pmd_pfn(pmd)           pte_pfn(pmd_pte(pmd))
 #define pmd_dirty(pmd)         pte_dirty(pmd_pte(pmd))
 #define pmd_young(pmd)         pte_young(pmd_pte(pmd))
+#define pmd_dirty(pmd)         pte_dirty(pmd_pte(pmd))
 #define pmd_mkold(pmd)         pte_pmd(pte_mkold(pmd_pte(pmd)))
 #define pmd_wrprotect(pmd)     pte_pmd(pte_wrprotect(pmd_pte(pmd)))
 #define pmd_mkdirty(pmd)       pte_pmd(pte_mkdirty(pmd_pte(pmd)))
+#define pmd_mkclean(pmd)       pte_pmd(pte_mkclean(pmd_pte(pmd)))
 #define pmd_mkyoung(pmd)       pte_pmd(pte_mkyoung(pmd_pte(pmd)))
 #define pmd_mkwrite(pmd)       pte_pmd(pte_mkwrite(pmd_pte(pmd)))
 
@@ -266,10 +268,6 @@ extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
 extern pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
                                     unsigned long addr, pmd_t *pmdp);
 
-#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
-extern void pmdp_splitting_flush(struct vm_area_struct *vma,
-                                unsigned long address, pmd_t *pmdp);
-
 extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
                                 unsigned long address, pmd_t *pmdp);
 #define pmdp_collapse_flush pmdp_collapse_flush
index 9fac01cb89c14df3fd4ae8bb51ef51acb56ec54b..8f39796c9da8dffaede2751f607375b1d5a4f24b 100644 (file)
@@ -154,8 +154,8 @@ extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat,
                           bool upper, u32 val);
 extern void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr);
 extern int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu);
-extern pfn_t kvmppc_gpa_to_pfn(struct kvm_vcpu *vcpu, gpa_t gpa, bool writing,
-                       bool *writable);
+extern kvm_pfn_t kvmppc_gpa_to_pfn(struct kvm_vcpu *vcpu, gpa_t gpa,
+                       bool writing, bool *writable);
 extern void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev,
                        unsigned long *rmap, long pte_index, int realmode);
 extern void kvmppc_update_rmap_change(unsigned long *rmap, unsigned long psize);
index c6ef05bd0765dd17fe77af03b1a736a671ffc2c9..2241d53571291a4ba24fcd40ab800d74aaa039ba 100644 (file)
@@ -515,7 +515,7 @@ void kvmppc_claim_lpid(long lpid);
 void kvmppc_free_lpid(long lpid);
 void kvmppc_init_lpid(unsigned long nr_lpids);
 
-static inline void kvmppc_mmu_flush_icache(pfn_t pfn)
+static inline void kvmppc_mmu_flush_icache(kvm_pfn_t pfn)
 {
        struct page *page;
        /*
index 099c79d8c160fd59c2d0e28217c1348f03c08cbf..638c6d9be9e08bec96542312aaf18ce8e21900e6 100644 (file)
@@ -366,7 +366,7 @@ int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvmppc_core_prepare_to_enter);
 
-pfn_t kvmppc_gpa_to_pfn(struct kvm_vcpu *vcpu, gpa_t gpa, bool writing,
+kvm_pfn_t kvmppc_gpa_to_pfn(struct kvm_vcpu *vcpu, gpa_t gpa, bool writing,
                        bool *writable)
 {
        ulong mp_pa = vcpu->arch.magic_page_pa & KVM_PAM;
@@ -379,9 +379,9 @@ pfn_t kvmppc_gpa_to_pfn(struct kvm_vcpu *vcpu, gpa_t gpa, bool writing,
        gpa &= ~0xFFFULL;
        if (unlikely(mp_pa) && unlikely((gpa & KVM_PAM) == mp_pa)) {
                ulong shared_page = ((ulong)vcpu->arch.shared) & PAGE_MASK;
-               pfn_t pfn;
+               kvm_pfn_t pfn;
 
-               pfn = (pfn_t)virt_to_phys((void*)shared_page) >> PAGE_SHIFT;
+               pfn = (kvm_pfn_t)virt_to_phys((void*)shared_page) >> PAGE_SHIFT;
                get_page(pfn_to_page(pfn));
                if (writable)
                        *writable = true;
index d5c9bfeb0c9c7e926aa5a40761959ce5e51f90eb..55c4d51ea3e2bbbad7305b2ab0eca65d4b27bad9 100644 (file)
@@ -142,7 +142,7 @@ extern char etext[];
 int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte,
                        bool iswrite)
 {
-       pfn_t hpaddr;
+       kvm_pfn_t hpaddr;
        u64 vpn;
        u64 vsid;
        struct kvmppc_sid_map *map;
index 79ad35abd1967c0ea68c900f9b5bb83d25a488a6..913cd2198fa6df6f96daceda2f92958123c3a5dd 100644 (file)
@@ -83,7 +83,7 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte,
                        bool iswrite)
 {
        unsigned long vpn;
-       pfn_t hpaddr;
+       kvm_pfn_t hpaddr;
        ulong hash, hpteg;
        u64 vsid;
        int ret;
index 72920bed3ac6c77393ce898b55cfa7c0eb05652f..94f04fcb373e15654e243e073134f9eec8e3f4af 100644 (file)
@@ -41,7 +41,7 @@ enum vcpu_ftr {
 #define E500_TLB_MAS2_ATTR     (0x7f)
 
 struct tlbe_ref {
-       pfn_t pfn;              /* valid only for TLB0, except briefly */
+       kvm_pfn_t pfn;          /* valid only for TLB0, except briefly */
        unsigned int flags;     /* E500_TLB_* */
 };
 
index 34c43fff4adbfc6269bb00f1488f7a76387d049c..b0333cc737dd67dbce01046b32cb23aa568fc5fc 100644 (file)
@@ -163,9 +163,9 @@ void kvmppc_map_magic(struct kvm_vcpu *vcpu)
        struct kvm_book3e_206_tlb_entry magic;
        ulong shared_page = ((ulong)vcpu->arch.shared) & PAGE_MASK;
        unsigned int stid;
-       pfn_t pfn;
+       kvm_pfn_t pfn;
 
-       pfn = (pfn_t)virt_to_phys((void *)shared_page) >> PAGE_SHIFT;
+       pfn = (kvm_pfn_t)virt_to_phys((void *)shared_page) >> PAGE_SHIFT;
        get_page(pfn_to_page(pfn));
 
        preempt_disable();
@@ -246,7 +246,7 @@ static inline int tlbe_is_writable(struct kvm_book3e_206_tlb_entry *tlbe)
 
 static inline void kvmppc_e500_ref_setup(struct tlbe_ref *ref,
                                         struct kvm_book3e_206_tlb_entry *gtlbe,
-                                        pfn_t pfn, unsigned int wimg)
+                                        kvm_pfn_t pfn, unsigned int wimg)
 {
        ref->pfn = pfn;
        ref->flags = E500_TLB_VALID;
@@ -309,7 +309,7 @@ static void kvmppc_e500_setup_stlbe(
        int tsize, struct tlbe_ref *ref, u64 gvaddr,
        struct kvm_book3e_206_tlb_entry *stlbe)
 {
-       pfn_t pfn = ref->pfn;
+       kvm_pfn_t pfn = ref->pfn;
        u32 pr = vcpu->arch.shared->msr & MSR_PR;
 
        BUG_ON(!(ref->flags & E500_TLB_VALID));
index 810507cb688aaa3aa6d3b01c8e0f39e8df82a09f..d44f324184fb77974121258c61f3b658d699af09 100644 (file)
@@ -30,7 +30,7 @@ TRACE_EVENT(kvm_book3s_reenter,
 #ifdef CONFIG_PPC_BOOK3S_64
 
 TRACE_EVENT(kvm_book3s_64_mmu_map,
-       TP_PROTO(int rflags, ulong hpteg, ulong va, pfn_t hpaddr,
+       TP_PROTO(int rflags, ulong hpteg, ulong va, kvm_pfn_t hpaddr,
                 struct kvmppc_pte *orig_pte),
        TP_ARGS(rflags, hpteg, va, hpaddr, orig_pte),
 
index baf1301ded0ccf4ae5c3280923aa65985f42d8fe..49b152b0f926289e1dcaa595988932a036df5325 100644 (file)
@@ -39,9 +39,6 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
                /* If PMD busy, retry the access */
                if (unlikely(old_pmd & _PAGE_BUSY))
                        return 0;
-               /* If PMD is trans splitting retry the access */
-               if (unlikely(old_pmd & _PAGE_SPLITTING))
-                       return 0;
                /* If PMD permissions don't match, take page fault */
                if (unlikely(access & ~old_pmd))
                        return 1;
index 61b8b7ccea4f55882ccf00f000879dda6e2dd493..744e24bcb85c4445dcaf58b8d3334ef0a2702cbb 100644 (file)
@@ -958,10 +958,6 @@ pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
                        /*
                         * A hugepage collapse is captured by pmd_none, because
                         * it mark the pmd none and do a hpte invalidate.
-                        *
-                        * We don't worry about pmd_trans_splitting here, The
-                        * caller if it needs to handle the splitting case
-                        * should check for that.
                         */
                        if (pmd_none(pmd))
                                return NULL;
@@ -999,7 +995,7 @@ int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
 {
        unsigned long mask;
        unsigned long pte_end;
-       struct page *head, *page, *tail;
+       struct page *head, *page;
        pte_t pte;
        int refs;
 
@@ -1022,7 +1018,6 @@ int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
        head = pte_page(pte);
 
        page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
-       tail = page;
        do {
                VM_BUG_ON(compound_head(page) != head);
                pages[*nr] = page;
@@ -1044,15 +1039,5 @@ int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
                return 0;
        }
 
-       /*
-        * Any tail page need their mapcount reference taken before we
-        * return.
-        */
-       while (refs--) {
-               if (PageTail(tail))
-                       get_huge_page_tail(tail);
-               tail++;
-       }
-
        return 1;
 }
index ea6bc31debb05562cf235d0696f5aaadbc698982..3124a20d0fab7a66b3a170356037da0d18c9da85 100644 (file)
@@ -603,55 +603,6 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma,
        return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
 }
 
-/*
- * We mark the pmd splitting and invalidate all the hpte
- * entries for this hugepage.
- */
-void pmdp_splitting_flush(struct vm_area_struct *vma,
-                         unsigned long address, pmd_t *pmdp)
-{
-       unsigned long old, tmp;
-
-       VM_BUG_ON(address & ~HPAGE_PMD_MASK);
-
-#ifdef CONFIG_DEBUG_VM
-       WARN_ON(!pmd_trans_huge(*pmdp));
-       assert_spin_locked(&vma->vm_mm->page_table_lock);
-#endif
-
-#ifdef PTE_ATOMIC_UPDATES
-
-       __asm__ __volatile__(
-       "1:     ldarx   %0,0,%3\n\
-               andi.   %1,%0,%6\n\
-               bne-    1b \n\
-               oris    %1,%0,%4@h \n\
-               stdcx.  %1,0,%3 \n\
-               bne-    1b"
-       : "=&r" (old), "=&r" (tmp), "=m" (*pmdp)
-       : "r" (pmdp), "i" (_PAGE_SPLITTING), "m" (*pmdp), "i" (_PAGE_BUSY)
-       : "cc" );
-#else
-       old = pmd_val(*pmdp);
-       *pmdp = __pmd(old | _PAGE_SPLITTING);
-#endif
-       /*
-        * If we didn't had the splitting flag set, go and flush the
-        * HPTE entries.
-        */
-       trace_hugepage_splitting(address, old);
-       if (!(old & _PAGE_SPLITTING)) {
-               /* We need to flush the hpte */
-               if (old & _PAGE_HASHPTE)
-                       hpte_do_hugepage_flush(vma->vm_mm, address, pmdp, old);
-       }
-       /*
-        * This ensures that generic code that rely on IRQ disabling
-        * to prevent a parallel THP split work as expected.
-        */
-       kick_all_cpus_sync();
-}
-
 /*
  * We want to put the pgtable in pmd and use pgtable for tracking
  * the base page size hptes
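
All of pmdp_splitting_flush() can go because a huge PMD is no longer marked
"splitting" in place. The generic code now invalidates the PMD and repopulates
it with a regular page table while holding the page-table lock, so neither the
_PAGE_SPLITTING bit nor the kick_all_cpus_sync() IPI is needed. A simplified
sketch of the replacement path (not the literal mm/huge_memory.c code; 'haddr'
and 'pgtable' are the usual local names assumed here):

    /* split under the ptl: stop hardware walks and gup-fast first */
    pmdp_invalidate(vma, haddr, pmd);
    /* ... fill the deposited page table with PTEs for each subpage ... */
    pmd_populate(mm, pmd, pgtable);     /* publish the split */
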
index fa9fb5b4c66cf8b29fe550b4795fb4a3ec3aeeb7..d5543514c1dfe0d3f53c9ed19bda7e11896dd4b1 100644 (file)
@@ -135,7 +135,7 @@ static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr,
                                  unsigned long end, struct mm_walk *walk)
 {
        struct vm_area_struct *vma = walk->vma;
-       split_huge_page_pmd(vma, addr, pmd);
+       split_huge_pmd(vma, pmd, addr);
        return 0;
 }
 
index c713b349d967946fe1b95e8ae9ff1ec18f7761bc..0d112b94d91d3ef00d6ac69a3abf61a238ddcb76 100644 (file)
@@ -43,6 +43,7 @@
 #include <linux/types.h>
 #include <linux/of_device.h>
 #include <linux/of_platform.h>
+#include <linux/pfn_t.h>
 
 #include <asm/page.h>
 #include <asm/prom.h>
@@ -142,15 +143,13 @@ axon_ram_make_request(struct request_queue *queue, struct bio *bio)
  */
 static long
 axon_ram_direct_access(struct block_device *device, sector_t sector,
-                      void __pmem **kaddr, unsigned long *pfn)
+                      void __pmem **kaddr, pfn_t *pfn)
 {
        struct axon_ram_bank *bank = device->bd_disk->private_data;
        loff_t offset = (loff_t)sector << AXON_RAM_SECTOR_SHIFT;
-       void *addr = (void *)(bank->ph_addr + offset);
-
-       *kaddr = (void __pmem *)addr;
-       *pfn = virt_to_phys(addr) >> PAGE_SHIFT;
 
+       *kaddr = (void __pmem __force *) bank->io_addr + offset;
+       *pfn = phys_to_pfn_t(bank->ph_addr + offset, PFN_DEV);
        return bank->size - offset;
 }
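
->direct_access() now reports a pfn_t built with phys_to_pfn_t(), letting DAX
distinguish device memory (PFN_DEV) from ordinary RAM. The helpers this leans
on look roughly like this in the new include/linux/pfn_t.h:

    static inline pfn_t __pfn_to_pfn_t(unsigned long pfn, u64 flags)
    {
            pfn_t pfn_t = { .val = pfn | (flags & PFN_FLAGS_MASK), };

            return pfn_t;
    }

    static inline pfn_t phys_to_pfn_t(phys_addr_t addr, u64 flags)
    {
            return __pfn_to_pfn_t(addr >> PAGE_SHIFT, flags);
    }
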
 
index 24490344c30fea87afab3fdc8b98cec3423b267d..dbeeb3a049f21f8f24d08560b1afc9ee6ec3690b 100644 (file)
@@ -10,9 +10,6 @@ config LOCKDEP_SUPPORT
 config STACKTRACE_SUPPORT
        def_bool y
 
-config HAVE_LATENCYTOP_SUPPORT
-       def_bool y
-
 config RWSEM_GENERIC_SPINLOCK
        bool
 
index 024f85f947aec50ea93c881e56a73ba3a5591d3c..64ead80912488b476e19a004eaf01924dbdc6b4c 100644 (file)
@@ -286,7 +286,6 @@ static inline int is_module_addr(void *addr)
 
 #define _SEGMENT_ENTRY_DIRTY   0x2000  /* SW segment dirty bit */
 #define _SEGMENT_ENTRY_YOUNG   0x1000  /* SW segment young bit */
-#define _SEGMENT_ENTRY_SPLIT   0x0800  /* THP splitting bit */
 #define _SEGMENT_ENTRY_LARGE   0x0400  /* STE-format control, large page */
 #define _SEGMENT_ENTRY_READ    0x0002  /* SW segment read bit */
 #define _SEGMENT_ENTRY_WRITE   0x0001  /* SW segment write bit */
@@ -318,8 +317,6 @@ static inline int is_module_addr(void *addr)
  * SW-bits: y young, d dirty, r read, w write
  */
 
-#define _SEGMENT_ENTRY_SPLIT_BIT 11    /* THP splitting bit number */
-
 /* Page status table bits for virtualization */
 #define PGSTE_ACC_BITS 0xf000000000000000UL
 #define PGSTE_FP_BIT   0x0800000000000000UL
@@ -523,10 +520,6 @@ static inline int pmd_bad(pmd_t pmd)
        return (pmd_val(pmd) & ~_SEGMENT_ENTRY_BITS) != 0;
 }
 
-#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
-extern void pmdp_splitting_flush(struct vm_area_struct *vma,
-                                unsigned long addr, pmd_t *pmdp);
-
 #define  __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
 extern int pmdp_set_access_flags(struct vm_area_struct *vma,
                                 unsigned long address, pmd_t *pmdp,
@@ -1424,8 +1417,7 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
        if (pmd_large(pmd)) {
                pmd_val(pmd) &= _SEGMENT_ENTRY_ORIGIN_LARGE |
                        _SEGMENT_ENTRY_DIRTY | _SEGMENT_ENTRY_YOUNG |
-                       _SEGMENT_ENTRY_LARGE | _SEGMENT_ENTRY_SPLIT |
-                       _SEGMENT_ENTRY_SOFT_DIRTY;
+                       _SEGMENT_ENTRY_LARGE | _SEGMENT_ENTRY_SOFT_DIRTY;
                pmd_val(pmd) |= massage_pgprot_pmd(newprot);
                if (!(pmd_val(pmd) & _SEGMENT_ENTRY_DIRTY))
                        pmd_val(pmd) |= _SEGMENT_ENTRY_PROTECT;
@@ -1533,12 +1525,6 @@ extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
 #define __HAVE_ARCH_PGTABLE_WITHDRAW
 extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
 
-static inline int pmd_trans_splitting(pmd_t pmd)
-{
-       return (pmd_val(pmd) & _SEGMENT_ENTRY_LARGE) &&
-               (pmd_val(pmd) & _SEGMENT_ENTRY_SPLIT);
-}
-
 static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
                              pmd_t *pmdp, pmd_t entry)
 {
index 21c74a71e2ab29f7243f013769a7b90d96181364..13dab0c1645c12de6b284b1b948758a524154441 100644 (file)
@@ -55,7 +55,7 @@ static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
                unsigned long end, int write, struct page **pages, int *nr)
 {
        unsigned long mask, result;
-       struct page *head, *page, *tail;
+       struct page *head, *page;
        int refs;
 
        result = write ? 0 : _SEGMENT_ENTRY_PROTECT;
@@ -67,7 +67,6 @@ static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
        refs = 0;
        head = pmd_page(pmd);
        page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
-       tail = page;
        do {
                VM_BUG_ON(compound_head(page) != head);
                pages[*nr] = page;
@@ -88,16 +87,6 @@ static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
                return 0;
        }
 
-       /*
-        * Any tail page need their mapcount reference taken before we
-        * return.
-        */
-       while (refs--) {
-               if (PageTail(tail))
-                       get_huge_page_tail(tail);
-               tail++;
-       }
-
        return 1;
 }
 
@@ -116,16 +105,7 @@ static inline int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr,
                pmd = *pmdp;
                barrier();
                next = pmd_addr_end(addr, end);
-               /*
-                * The pmd_trans_splitting() check below explains why
-                * pmdp_splitting_flush() has to serialize with
-                * smp_call_function() against our disabled IRQs, to stop
-                * this gup-fast code from running while we set the
-                * splitting bit in the pmd. Returning zero will take
-                * the slow path that will call wait_split_huge_page()
-                * if the pmd is still in splitting state.
-                */
-               if (pmd_none(pmd) || pmd_trans_splitting(pmd))
+               if (pmd_none(pmd))
                        return 0;
                if (unlikely(pmd_large(pmd))) {
                        /*
index aa34af0a0b263a13940b5de0f26cb4aef3ef966c..a809fa8e6f8bd01d2c0ad90891060d6aabd36a80 100644 (file)
@@ -578,17 +578,29 @@ int gmap_fault(struct gmap *gmap, unsigned long gaddr,
 {
        unsigned long vmaddr;
        int rc;
+       bool unlocked;
 
        down_read(&gmap->mm->mmap_sem);
+
+retry:
+       unlocked = false;
        vmaddr = __gmap_translate(gmap, gaddr);
        if (IS_ERR_VALUE(vmaddr)) {
                rc = vmaddr;
                goto out_up;
        }
-       if (fixup_user_fault(current, gmap->mm, vmaddr, fault_flags)) {
+       if (fixup_user_fault(current, gmap->mm, vmaddr, fault_flags,
+                            &unlocked)) {
                rc = -EFAULT;
                goto out_up;
        }
+       /*
+        * In the case that fixup_user_fault unlocked the mmap_sem during
+        * faultin redo __gmap_translate to not race with a map/unmap_segment.
+        */
+       if (unlocked)
+               goto retry;
+
        rc = __gmap_link(gmap, gaddr, vmaddr);
 out_up:
        up_read(&gmap->mm->mmap_sem);
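
The retry dance exists because fixup_user_fault() grew an "unlocked" parameter
in this series: it may now drop mmap_sem to wait (for example on userfaultfd)
and take it again, which invalidates the earlier __gmap_translate() result.
The new prototype, for reference:

    int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
                         unsigned long address, unsigned int fault_flags,
                         bool *unlocked);
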
@@ -714,12 +726,14 @@ int gmap_ipte_notify(struct gmap *gmap, unsigned long gaddr, unsigned long len)
        spinlock_t *ptl;
        pte_t *ptep, entry;
        pgste_t pgste;
+       bool unlocked;
        int rc = 0;
 
        if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK))
                return -EINVAL;
        down_read(&gmap->mm->mmap_sem);
        while (len) {
+               unlocked = false;
                /* Convert gmap address and connect the page tables */
                addr = __gmap_translate(gmap, gaddr);
                if (IS_ERR_VALUE(addr)) {
@@ -727,10 +741,14 @@ int gmap_ipte_notify(struct gmap *gmap, unsigned long gaddr, unsigned long len)
                        break;
                }
                /* Get the page mapped */
-               if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE)) {
+               if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE,
+                                    &unlocked)) {
                        rc = -EFAULT;
                        break;
                }
+               /* While trying to map mmap_sem got unlocked. Let us retry */
+               if (unlocked)
+                       continue;
                rc = __gmap_link(gmap, gaddr, addr);
                if (rc)
                        break;
@@ -791,9 +809,11 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
        spinlock_t *ptl;
        pgste_t old, new;
        pte_t *ptep;
+       bool unlocked;
 
        down_read(&mm->mmap_sem);
 retry:
+       unlocked = false;
        ptep = get_locked_pte(mm, addr, &ptl);
        if (unlikely(!ptep)) {
                up_read(&mm->mmap_sem);
@@ -802,7 +822,12 @@ retry:
        if (!(pte_val(*ptep) & _PAGE_INVALID) &&
             (pte_val(*ptep) & _PAGE_PROTECT)) {
                pte_unmap_unlock(ptep, ptl);
-               if (fixup_user_fault(current, mm, addr, FAULT_FLAG_WRITE)) {
+               /*
+                * We do not really care about unlocked. We will retry either
+                * way. But this allows fixup_user_fault to enable userfaultfd.
+                */
+               if (fixup_user_fault(current, mm, addr, FAULT_FLAG_WRITE,
+                                    &unlocked)) {
                        up_read(&mm->mmap_sem);
                        return -EFAULT;
                }
@@ -1305,22 +1330,6 @@ int pmdp_set_access_flags(struct vm_area_struct *vma,
        return 1;
 }
 
-static void pmdp_splitting_flush_sync(void *arg)
-{
-       /* Simply deliver the interrupt */
-}
-
-void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
-                         pmd_t *pmdp)
-{
-       VM_BUG_ON(address & ~HPAGE_PMD_MASK);
-       if (!test_and_set_bit(_SEGMENT_ENTRY_SPLIT_BIT,
-                             (unsigned long *) pmdp)) {
-               /* need to serialize against gup-fast (IRQ disabled) */
-               smp_call_function(pmdp_splitting_flush_sync, NULL, 1);
-       }
-}
-
 void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
                                pgtable_t pgtable)
 {
index d514df7e04dd4c866597bd0772bfdc6a7dc000f9..6c391a5d3e5cf63f90475e5c55a2a5f5c687716d 100644 (file)
@@ -130,9 +130,6 @@ config STACKTRACE_SUPPORT
 config LOCKDEP_SUPPORT
        def_bool y
 
-config HAVE_LATENCYTOP_SUPPORT
-       def_bool y
-
 config ARCH_HAS_ILOG2_U32
        def_bool n
 
index 51d8f7f31d1d797392ab2813f4fb3d4d33480598..58aaa4f33b8129b8dd7e8118ec2691478cf79757 100644 (file)
@@ -241,7 +241,7 @@ static void sh4_flush_cache_page(void *args)
                 */
                map_coherent = (current_cpu_data.dcache.n_aliases &&
                        test_bit(PG_dcache_clean, &page->flags) &&
-                       page_mapped(page));
+                       page_mapcount(page));
                if (map_coherent)
                        vaddr = kmap_coherent(page, address);
                else
index f770e3992620e8a1673ee1a2bd47280be55cccd4..e58cfbf4515008c32f670519b0904bb4aafad706 100644 (file)
@@ -59,7 +59,7 @@ void copy_to_user_page(struct vm_area_struct *vma, struct page *page,
                       unsigned long vaddr, void *dst, const void *src,
                       unsigned long len)
 {
-       if (boot_cpu_data.dcache.n_aliases && page_mapped(page) &&
+       if (boot_cpu_data.dcache.n_aliases && page_mapcount(page) &&
            test_bit(PG_dcache_clean, &page->flags)) {
                void *vto = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK);
                memcpy(vto, src, len);
@@ -78,7 +78,7 @@ void copy_from_user_page(struct vm_area_struct *vma, struct page *page,
                         unsigned long vaddr, void *dst, const void *src,
                         unsigned long len)
 {
-       if (boot_cpu_data.dcache.n_aliases && page_mapped(page) &&
+       if (boot_cpu_data.dcache.n_aliases && page_mapcount(page) &&
            test_bit(PG_dcache_clean, &page->flags)) {
                void *vfrom = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK);
                memcpy(dst, vfrom, len);
@@ -97,7 +97,7 @@ void copy_user_highpage(struct page *to, struct page *from,
 
        vto = kmap_atomic(to);
 
-       if (boot_cpu_data.dcache.n_aliases && page_mapped(from) &&
+       if (boot_cpu_data.dcache.n_aliases && page_mapcount(from) &&
            test_bit(PG_dcache_clean, &from->flags)) {
                vfrom = kmap_coherent(from, vaddr);
                copy_page(vto, vfrom);
@@ -153,7 +153,7 @@ void __flush_anon_page(struct page *page, unsigned long vmaddr)
        unsigned long addr = (unsigned long) page_address(page);
 
        if (pages_do_alias(addr, vmaddr)) {
-               if (boot_cpu_data.dcache.n_aliases && page_mapped(page) &&
+               if (boot_cpu_data.dcache.n_aliases && page_mapcount(page) &&
                    test_bit(PG_dcache_clean, &page->flags)) {
                        void *kaddr;
 
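The page_mapped() -> page_mapcount() swaps in these cache-aliasing paths
follow from the same refcount rework: the checks want "is this particular
subpage mapped anywhere", and for compound pages that now means combining the
per-subpage count with the PMD-level count tracked on the head. A simplified
sketch, ignoring the PageDoubleMap() correction the real helper applies:

    static inline int page_mapcount_sketch(struct page *page)
    {
            /* per-subpage PTE mappings ... */
            int ret = atomic_read(&page->_mapcount) + 1;

            /* ... plus the compound mapping tracked on the head */
            if (PageCompound(page))
                    ret += compound_mapcount(page);
            return ret;
    }
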
index 56442d2d7bbca4181e8897bfa5d9719dbded43d1..3203e42190dd79f513a2670677d373b72672c8f7 100644 (file)
@@ -101,10 +101,6 @@ config LOCKDEP_SUPPORT
        bool
        default y if SPARC64
 
-config HAVE_LATENCYTOP_SUPPORT
-       bool
-       default y if SPARC64
-
 config ARCH_HIBERNATION_POSSIBLE
        def_bool y if SPARC64
 
index 131d36fcd07a60af83ae1b6e8968e48577df54f0..7a38d6a576c5e2ea718deb632e63c0939e60c011 100644 (file)
@@ -681,13 +681,6 @@ static inline unsigned long pmd_trans_huge(pmd_t pmd)
        return pte_val(pte) & _PAGE_PMD_HUGE;
 }
 
-static inline unsigned long pmd_trans_splitting(pmd_t pmd)
-{
-       pte_t pte = __pte(pmd_val(pmd));
-
-       return pmd_trans_huge(pmd) && pte_special(pte);
-}
-
 #define has_transparent_hugepage() 1
 
 static inline pmd_t pmd_mkold(pmd_t pmd)
@@ -717,29 +710,29 @@ static inline pmd_t pmd_mkdirty(pmd_t pmd)
        return __pmd(pte_val(pte));
 }
 
-static inline pmd_t pmd_mkyoung(pmd_t pmd)
+static inline pmd_t pmd_mkclean(pmd_t pmd)
 {
        pte_t pte = __pte(pmd_val(pmd));
 
-       pte = pte_mkyoung(pte);
+       pte = pte_mkclean(pte);
 
        return __pmd(pte_val(pte));
 }
 
-static inline pmd_t pmd_mkwrite(pmd_t pmd)
+static inline pmd_t pmd_mkyoung(pmd_t pmd)
 {
        pte_t pte = __pte(pmd_val(pmd));
 
-       pte = pte_mkwrite(pte);
+       pte = pte_mkyoung(pte);
 
        return __pmd(pte_val(pte));
 }
 
-static inline pmd_t pmd_mksplitting(pmd_t pmd)
+static inline pmd_t pmd_mkwrite(pmd_t pmd)
 {
        pte_t pte = __pte(pmd_val(pmd));
 
-       pte = pte_mkspecial(pte);
+       pte = pte_mkwrite(pte);
 
        return __pmd(pte_val(pte));
 }
index dbabe5713a158eec17eb61de90d1cdfd6e974e80..cb841a33da59061d6f435cb8cb7c5f717284f817 100644 (file)
@@ -113,9 +113,6 @@ static unsigned int get_user_insn(unsigned long tpc)
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
        if (pmd_trans_huge(*pmdp)) {
-               if (pmd_trans_splitting(*pmdp))
-                       goto out_irq_enable;
-
                pa  = pmd_pfn(*pmdp) << PAGE_SHIFT;
                pa += tpc & ~HPAGE_MASK;
 
index 2e5c4fc2daa91efa1dd4325ca001169fc37b4d89..eb3d8e8ebc6b064febae847c92ef5329a02b8ae4 100644 (file)
@@ -56,8 +56,6 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
                        put_page(head);
                        return 0;
                }
-               if (head != page)
-                       get_huge_page_tail(page);
 
                pages[*nr] = page;
                (*nr)++;
@@ -70,7 +68,7 @@ static int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
                        unsigned long end, int write, struct page **pages,
                        int *nr)
 {
-       struct page *head, *page, *tail;
+       struct page *head, *page;
        int refs;
 
        if (!(pmd_val(pmd) & _PAGE_VALID))
@@ -82,7 +80,6 @@ static int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
        refs = 0;
        head = pmd_page(pmd);
        page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
-       tail = page;
        do {
                VM_BUG_ON(compound_head(page) != head);
                pages[*nr] = page;
@@ -103,15 +100,6 @@ static int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
                return 0;
        }
 
-       /* Any tail page need their mapcount reference taken before we
-        * return.
-        */
-       while (refs--) {
-               if (PageTail(tail))
-                       get_huge_page_tail(tail);
-               tail++;
-       }
-
        return 1;
 }
 
@@ -126,7 +114,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
                pmd_t pmd = *pmdp;
 
                next = pmd_addr_end(addr, end);
-               if (pmd_none(pmd) || pmd_trans_splitting(pmd))
+               if (pmd_none(pmd))
                        return 0;
                if (unlikely(pmd_large(pmd))) {
                        if (!gup_huge_pmd(pmdp, pmd, addr, next,
index 2b05ccbebed9b88623871eef69234a24915bcdbb..96cecf55522ef492f1afbdd143ca80536d63d31d 100644 (file)
@@ -489,16 +489,6 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 #define has_transparent_hugepage() 1
 #define pmd_trans_huge pmd_huge_page
-
-static inline pmd_t pmd_mksplitting(pmd_t pmd)
-{
-       return pte_pmd(hv_pte_set_client2(pmd_pte(pmd)));
-}
-
-static inline int pmd_trans_splitting(pmd_t pmd)
-{
-       return hv_pte_get_client2(pmd_pte(pmd));
-}
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 /*
index 71c5d132062aa32074b3bda4a30db2ca08b86c5b..e13d41c392ae4f4940a41e6c8049099c71d28a04 100644 (file)
@@ -18,6 +18,7 @@
 
 struct page;
 
+#include <linux/pfn.h>
 #include <linux/types.h>
 #include <asm/vm-flags.h>
 
@@ -52,7 +53,6 @@ typedef struct { unsigned long pgd; } pgd_t;
 #define pmd_val(x)     ((x).pmd)
 #define __pmd(x) ((pmd_t) { (x) } )
 
-typedef unsigned long long pfn_t;
 typedef unsigned long long phys_t;
 
 #else
@@ -76,7 +76,6 @@ typedef struct { unsigned long pmd; } pmd_t;
 #define pte_is_zero(p) (!((p).pte & ~_PAGE_NEWPAGE))
 #define pte_set_val(p, phys, prot) (p).pte = (phys | pgprot_val(prot))
 
-typedef unsigned long pfn_t;
 typedef unsigned long phys_t;
 
 #endif
@@ -109,8 +108,8 @@ extern unsigned long uml_physmem;
 #define __pa(virt) to_phys((void *) (unsigned long) (virt))
 #define __va(phys) to_virt((unsigned long) (phys))
 
-#define phys_to_pfn(p) ((pfn_t) ((p) >> PAGE_SHIFT))
-#define pfn_to_phys(pfn) ((phys_t) ((pfn) << PAGE_SHIFT))
+#define phys_to_pfn(p) ((p) >> PAGE_SHIFT)
+#define pfn_to_phys(pfn) PFN_PHYS(pfn)
 
 #define pfn_valid(pfn) ((pfn) < max_mapnr)
 #define virt_addr_valid(v) pfn_valid(phys_to_pfn(__pa(v)))
index 2b4274e7c0955f6f96795278c30e5287d7705768..bae8523a162fd3b80067260ddfad400bdf480e5b 100644 (file)
@@ -98,7 +98,7 @@ static inline unsigned long pte_pfn(pte_t pte)
        return phys_to_pfn(pte_val(pte));
 }
 
-static inline pte_t pfn_pte(pfn_t page_nr, pgprot_t pgprot)
+static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
 {
        pte_t pte;
        phys_t phys = pfn_to_phys(page_nr);
@@ -107,7 +107,7 @@ static inline pte_t pfn_pte(pfn_t page_nr, pgprot_t pgprot)
        return pte;
 }
 
-static inline pmd_t pfn_pmd(pfn_t page_nr, pgprot_t pgprot)
+static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
 {
        return __pmd((page_nr << PAGE_SHIFT) | pgprot_val(pgprot));
 }
index 18eb9924dda382f08e8cb3cb5b88e0454d621c1e..7485398d07370034e361ea1fe4173eeb9bd51d1c 100644 (file)
@@ -271,7 +271,7 @@ static inline int pte_same(pte_t pte_a, pte_t pte_b)
 
 #define phys_to_page(phys) pfn_to_page(phys_to_pfn(phys))
 #define __virt_to_page(virt) phys_to_page(__pa(virt))
-#define page_to_phys(page) pfn_to_phys((pfn_t) page_to_pfn(page))
+#define page_to_phys(page) pfn_to_phys(page_to_pfn(page))
 #define virt_to_page(addr) __virt_to_page((const unsigned long) addr)
 
 #define mk_pte(page, pgprot) \
index 5dc4c0a43ccdea7b8f1317a2a6a87d06c2fa9648..877342640b6e6da59db68ee9705ca8e00a62e7e4 100644 (file)
@@ -34,9 +34,6 @@ config NO_IOPORT_MAP
 config STACKTRACE_SUPPORT
        def_bool y
 
-config HAVE_LATENCYTOP_SUPPORT
-       def_bool y
-
 config LOCKDEP_SUPPORT
        def_bool y
 
index 24f362bf3ec632d117429061785b6f5865d441e1..4a10ba9e95daac1842b4fdd93ae814d07b444eeb 100644 (file)
@@ -180,9 +180,6 @@ config LOCKDEP_SUPPORT
 config STACKTRACE_SUPPORT
        def_bool y
 
-config HAVE_LATENCYTOP_SUPPORT
-       def_bool y
-
 config MMU
        def_bool y
 
index d3eee663c41fccdf031e0f72f9847b6a26f493cf..0687c4748b8f87690471c94bade518d2b768242d 100644 (file)
@@ -162,20 +162,22 @@ static inline int pmd_large(pmd_t pte)
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static inline int pmd_trans_splitting(pmd_t pmd)
-{
-       return pmd_val(pmd) & _PAGE_SPLITTING;
-}
-
 static inline int pmd_trans_huge(pmd_t pmd)
 {
-       return pmd_val(pmd) & _PAGE_PSE;
+       return (pmd_val(pmd) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE;
 }
 
 static inline int has_transparent_hugepage(void)
 {
        return cpu_has_pse;
 }
+
+#ifdef __HAVE_ARCH_PTE_DEVMAP
+static inline int pmd_devmap(pmd_t pmd)
+{
+       return !!(pmd_val(pmd) & _PAGE_DEVMAP);
+}
+#endif
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 static inline pte_t pte_set_flags(pte_t pte, pteval_t set)
@@ -252,6 +254,11 @@ static inline pte_t pte_mkspecial(pte_t pte)
        return pte_set_flags(pte, _PAGE_SPECIAL);
 }
 
+static inline pte_t pte_mkdevmap(pte_t pte)
+{
+       return pte_set_flags(pte, _PAGE_SPECIAL|_PAGE_DEVMAP);
+}
+
 static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set)
 {
        pmdval_t v = native_pmd_val(pmd);
@@ -271,6 +278,11 @@ static inline pmd_t pmd_mkold(pmd_t pmd)
        return pmd_clear_flags(pmd, _PAGE_ACCESSED);
 }
 
+static inline pmd_t pmd_mkclean(pmd_t pmd)
+{
+       return pmd_clear_flags(pmd, _PAGE_DIRTY);
+}
+
 static inline pmd_t pmd_wrprotect(pmd_t pmd)
 {
        return pmd_clear_flags(pmd, _PAGE_RW);
@@ -281,6 +293,11 @@ static inline pmd_t pmd_mkdirty(pmd_t pmd)
        return pmd_set_flags(pmd, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
 }
 
+static inline pmd_t pmd_mkdevmap(pmd_t pmd)
+{
+       return pmd_set_flags(pmd, _PAGE_DEVMAP);
+}
+
 static inline pmd_t pmd_mkhuge(pmd_t pmd)
 {
        return pmd_set_flags(pmd, _PAGE_PSE);
@@ -462,6 +479,13 @@ static inline int pte_present(pte_t a)
        return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE);
 }
 
+#ifdef __HAVE_ARCH_PTE_DEVMAP
+static inline int pte_devmap(pte_t a)
+{
+       return (pte_flags(a) & _PAGE_DEVMAP) == _PAGE_DEVMAP;
+}
+#endif
+
 #define pte_accessible pte_accessible
 static inline bool pte_accessible(struct mm_struct *mm, pte_t a)
 {
@@ -808,10 +832,6 @@ extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
                                  unsigned long address, pmd_t *pmdp);
 
 
-#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
-extern void pmdp_splitting_flush(struct vm_area_struct *vma,
-                                unsigned long addr, pmd_t *pmdp);
-
 #define __HAVE_ARCH_PMD_WRITE
 static inline int pmd_write(pmd_t pmd)
 {
index a471cadb9630e7b8f46340139efaf4f467a24aff..04c27a0131656db29e97805432e56d3a1aae5992 100644 (file)
 #define _PAGE_BIT_PAT_LARGE    12      /* On 2MB or 1GB pages */
 #define _PAGE_BIT_SPECIAL      _PAGE_BIT_SOFTW1
 #define _PAGE_BIT_CPA_TEST     _PAGE_BIT_SOFTW1
-#define _PAGE_BIT_SPLITTING    _PAGE_BIT_SOFTW2 /* only valid on a PSE pmd */
 #define _PAGE_BIT_HIDDEN       _PAGE_BIT_SOFTW3 /* hidden by kmemcheck */
 #define _PAGE_BIT_SOFT_DIRTY   _PAGE_BIT_SOFTW3 /* software dirty tracking */
-#define _PAGE_BIT_NX           63       /* No execute: only valid after cpuid check */
+#define _PAGE_BIT_SOFTW4       58      /* available for programmer */
+#define _PAGE_BIT_DEVMAP               _PAGE_BIT_SOFTW4
+#define _PAGE_BIT_NX           63      /* No execute: only valid after cpuid check */
 
 /* If _PAGE_BIT_PRESENT is clear, we use these: */
 /* - if the user mapped it with PROT_NONE; pte_present gives true */
@@ -46,7 +47,6 @@
 #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
 #define _PAGE_SPECIAL  (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
 #define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
-#define _PAGE_SPLITTING        (_AT(pteval_t, 1) << _PAGE_BIT_SPLITTING)
 #define __HAVE_ARCH_PTE_SPECIAL
 
 #ifdef CONFIG_KMEMCHECK
 
 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
 #define _PAGE_NX       (_AT(pteval_t, 1) << _PAGE_BIT_NX)
+#define _PAGE_DEVMAP   (_AT(u64, 1) << _PAGE_BIT_DEVMAP)
+#define __HAVE_ARCH_PTE_DEVMAP
 #else
 #define _PAGE_NX       (_AT(pteval_t, 0))
+#define _PAGE_DEVMAP   (_AT(pteval_t, 0))
 #endif
 
 #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
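
_PAGE_DEVMAP takes software bit 58 (64-bit only) so pte_devmap()/pmd_devmap()
can tag entries that point into ZONE_DEVICE memory, and pmd_trans_huge() above
masks it out so a DAX huge mapping is not mistaken for THP. An illustrative
check (assumes a valid pmd_t 'pmd' in kernel context):

    pmd_t dax_pmd = pmd_mkdevmap(pmd_mkhuge(pmd));

    /* (PSE|DEVMAP) == PSE fails once DEVMAP is set ... */
    WARN_ON(pmd_trans_huge(dax_pmd));
    /* ... while the devmap predicate is what fires for DAX */
    WARN_ON(!pmd_devmap(dax_pmd));
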
index d8ce3ec816ab1a86d2d06aca917dae5aae762e40..1544fabcd7f9b7428a5d3ec7429f00f03b545945 100644 (file)
@@ -132,12 +132,7 @@ static inline void arch_clear_pmem(void __pmem *addr, size_t size)
 {
        void *vaddr = (void __force *)addr;
 
-       /* TODO: implement the zeroing via non-temporal writes */
-       if (size == PAGE_SIZE && ((unsigned long)vaddr & ~PAGE_MASK) == 0)
-               clear_page(vaddr);
-       else
-               memset(vaddr, 0, size);
-
+       memset(vaddr, 0, size);
        __arch_wb_cache_pmem(vaddr, size);
 }
 
index 483231ebbb0b2e254bbd749997e7b3feea62bb44..e574b85465185fe273a839093452c26521ac3d1e 100644 (file)
@@ -175,7 +175,11 @@ static void mark_screen_rdonly(struct mm_struct *mm)
        if (pud_none_or_clear_bad(pud))
                goto out;
        pmd = pmd_offset(pud, 0xA0000);
-       split_huge_page_pmd_mm(mm, 0xA0000, pmd);
+
+       if (pmd_trans_huge(*pmd)) {
+               struct vm_area_struct *vma = find_vma(mm, 0xA0000);
+               split_huge_pmd(vma, pmd, 0xA0000);
+       }
        if (pmd_none_or_clear_bad(pmd))
                goto out;
        pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl);
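
split_huge_page_pmd_mm() is gone: the new split_huge_pmd() only rewrites the
PMD into a table of PTEs and leaves the compound page intact, and it needs the
VMA, hence the find_vma() lookup above. Its worker has roughly this shape in
the reworked include/linux/huge_mm.h:

    void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
                          unsigned long address);
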
index 5c520ebf6343270272679e213b19cd9380b63293..a22a488b46226834dd575e8df827a13df9201ea8 100644 (file)
@@ -43,11 +43,11 @@ static int kvm_iommu_unmap_memslots(struct kvm *kvm);
 static void kvm_iommu_put_pages(struct kvm *kvm,
                                gfn_t base_gfn, unsigned long npages);
 
-static pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn,
+static kvm_pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn,
                           unsigned long npages)
 {
        gfn_t end_gfn;
-       pfn_t pfn;
+       kvm_pfn_t pfn;
 
        pfn     = gfn_to_pfn_memslot(slot, gfn);
        end_gfn = gfn + npages;
@@ -62,7 +62,8 @@ static pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn,
        return pfn;
 }
 
-static void kvm_unpin_pages(struct kvm *kvm, pfn_t pfn, unsigned long npages)
+static void kvm_unpin_pages(struct kvm *kvm, kvm_pfn_t pfn,
+               unsigned long npages)
 {
        unsigned long i;
 
@@ -73,7 +74,7 @@ static void kvm_unpin_pages(struct kvm *kvm, pfn_t pfn, unsigned long npages)
 int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
 {
        gfn_t gfn, end_gfn;
-       pfn_t pfn;
+       kvm_pfn_t pfn;
        int r = 0;
        struct iommu_domain *domain = kvm->arch.iommu_domain;
        int flags;
@@ -275,7 +276,7 @@ static void kvm_iommu_put_pages(struct kvm *kvm,
 {
        struct iommu_domain *domain;
        gfn_t end_gfn, gfn;
-       pfn_t pfn;
+       kvm_pfn_t pfn;
        u64 phys;
 
        domain  = kvm->arch.iommu_domain;
index 420a5ca3c0ee445d83f8b155726806fbe90f0810..95a955de5964bcc3f4aa6791a004e29b28504c13 100644 (file)
@@ -259,7 +259,7 @@ static unsigned get_mmio_spte_access(u64 spte)
 }
 
 static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
-                         pfn_t pfn, unsigned access)
+                         kvm_pfn_t pfn, unsigned access)
 {
        if (unlikely(is_noslot_pfn(pfn))) {
                mark_mmio_spte(vcpu, sptep, gfn, access);
@@ -320,7 +320,7 @@ static int is_last_spte(u64 pte, int level)
        return 0;
 }
 
-static pfn_t spte_to_pfn(u64 pte)
+static kvm_pfn_t spte_to_pfn(u64 pte)
 {
        return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
 }
@@ -582,7 +582,7 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
  */
 static int mmu_spte_clear_track_bits(u64 *sptep)
 {
-       pfn_t pfn;
+       kvm_pfn_t pfn;
        u64 old_spte = *sptep;
 
        if (!spte_has_volatile_bits(old_spte))
@@ -1372,7 +1372,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
        int need_flush = 0;
        u64 new_spte;
        pte_t *ptep = (pte_t *)data;
-       pfn_t new_pfn;
+       kvm_pfn_t new_pfn;
 
        WARN_ON(pte_huge(*ptep));
        new_pfn = pte_pfn(*ptep);
@@ -2450,7 +2450,7 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
        return 0;
 }
 
-static bool kvm_is_mmio_pfn(pfn_t pfn)
+static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
 {
        if (pfn_valid(pfn))
                return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn));
@@ -2460,7 +2460,7 @@ static bool kvm_is_mmio_pfn(pfn_t pfn)
 
 static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                    unsigned pte_access, int level,
-                   gfn_t gfn, pfn_t pfn, bool speculative,
+                   gfn_t gfn, kvm_pfn_t pfn, bool speculative,
                    bool can_unsync, bool host_writable)
 {
        u64 spte;
@@ -2539,7 +2539,7 @@ done:
 }
 
 static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
-                        int write_fault, int level, gfn_t gfn, pfn_t pfn,
+                        int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn,
                         bool speculative, bool host_writable)
 {
        int was_rmapped = 0;
@@ -2602,7 +2602,7 @@ static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
        return emulate;
 }
 
-static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
+static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
                                     bool no_dirty_log)
 {
        struct kvm_memory_slot *slot;
@@ -2684,7 +2684,7 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
 }
 
 static int __direct_map(struct kvm_vcpu *vcpu, int write, int map_writable,
-                       int level, gfn_t gfn, pfn_t pfn, bool prefault)
+                       int level, gfn_t gfn, kvm_pfn_t pfn, bool prefault)
 {
        struct kvm_shadow_walk_iterator iterator;
        struct kvm_mmu_page *sp;
@@ -2732,7 +2732,7 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *
        send_sig_info(SIGBUS, &info, tsk);
 }
 
-static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn)
+static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
 {
        /*
         * Do not cache the mmio info caused by writing the readonly gfn
@@ -2752,9 +2752,10 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn)
 }
 
 static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
-                                       gfn_t *gfnp, pfn_t *pfnp, int *levelp)
+                                       gfn_t *gfnp, kvm_pfn_t *pfnp,
+                                       int *levelp)
 {
-       pfn_t pfn = *pfnp;
+       kvm_pfn_t pfn = *pfnp;
        gfn_t gfn = *gfnp;
        int level = *levelp;
 
@@ -2793,7 +2794,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
 }
 
 static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
-                               pfn_t pfn, unsigned access, int *ret_val)
+                               kvm_pfn_t pfn, unsigned access, int *ret_val)
 {
        bool ret = true;
 
@@ -2947,7 +2948,7 @@ exit:
 }
 
 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
-                        gva_t gva, pfn_t *pfn, bool write, bool *writable);
+                        gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable);
 static void make_mmu_pages_available(struct kvm_vcpu *vcpu);
 
 static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
@@ -2956,7 +2957,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
        int r;
        int level;
        bool force_pt_level = false;
-       pfn_t pfn;
+       kvm_pfn_t pfn;
        unsigned long mmu_seq;
        bool map_writable, write = error_code & PFERR_WRITE_MASK;
 
@@ -3410,7 +3411,7 @@ static bool can_do_async_pf(struct kvm_vcpu *vcpu)
 }
 
 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
-                        gva_t gva, pfn_t *pfn, bool write, bool *writable)
+                        gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable)
 {
        struct kvm_memory_slot *slot;
        bool async;
@@ -3448,7 +3449,7 @@ check_hugepage_cache_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int level)
 static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
                          bool prefault)
 {
-       pfn_t pfn;
+       kvm_pfn_t pfn;
        int r;
        int level;
        bool force_pt_level;
@@ -4601,7 +4602,7 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
        u64 *sptep;
        struct rmap_iterator iter;
        int need_tlb_flush = 0;
-       pfn_t pfn;
+       kvm_pfn_t pfn;
        struct kvm_mmu_page *sp;
 
 restart:
index 1cee3ec20dd2be5cf92dff8d34a4e4857477f617..dcce533d420c384f2081ab9197f3c4fa50cb8352 100644 (file)
@@ -97,7 +97,7 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
 {
        struct kvm_mmu_page *sp;
        gfn_t gfn;
-       pfn_t pfn;
+       kvm_pfn_t pfn;
        hpa_t hpa;
 
        sp = page_header(__pa(sptep));
index 91e939b486d178bc9f51a1668034add0e38bd096..6c9fed957cce1c897f0a490ec74c3f4e4f76d4dc 100644 (file)
@@ -456,7 +456,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 {
        unsigned pte_access;
        gfn_t gfn;
-       pfn_t pfn;
+       kvm_pfn_t pfn;
 
        if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
                return false;
@@ -551,7 +551,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
 static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
                         struct guest_walker *gw,
                         int write_fault, int hlevel,
-                        pfn_t pfn, bool map_writable, bool prefault)
+                        kvm_pfn_t pfn, bool map_writable, bool prefault)
 {
        struct kvm_mmu_page *sp = NULL;
        struct kvm_shadow_walk_iterator it;
@@ -694,7 +694,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
        int user_fault = error_code & PFERR_USER_MASK;
        struct guest_walker walker;
        int r;
-       pfn_t pfn;
+       kvm_pfn_t pfn;
        int level = PT_PAGE_TABLE_LEVEL;
        bool force_pt_level = false;
        unsigned long mmu_seq;
index 04d61d496b14e36cef0c8818cce52fe36b6597db..e2951b6edbbce4deb5bad4fb38b8858c8093f8d5 100644 (file)
@@ -4251,7 +4251,7 @@ out:
 static int init_rmode_identity_map(struct kvm *kvm)
 {
        int i, idx, r = 0;
-       pfn_t identity_map_pfn;
+       kvm_pfn_t identity_map_pfn;
        u32 tmp;
 
        if (!enable_ept)
index f53f5b13c677c8bdde6c35e33bdb665bf4c2a732..4244c2baf57da55aa5dd266b63781642dcc8af12 100644 (file)
@@ -5148,7 +5148,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2,
                                  int emulation_type)
 {
        gpa_t gpa = cr2;
-       pfn_t pfn;
+       kvm_pfn_t pfn;
 
        if (emulation_type & EMULTYPE_NO_REEXECUTE)
                return false;
index ae9a37bf13711460892584e67d02880291168f86..6d5eb59003721b79213a7661994be1a339bc5590 100644 (file)
@@ -9,6 +9,7 @@
 #include <linux/vmstat.h>
 #include <linux/highmem.h>
 #include <linux/swap.h>
+#include <linux/memremap.h>
 
 #include <asm/pgtable.h>
 
@@ -63,6 +64,16 @@ retry:
 #endif
 }
 
+static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
+{
+       while ((*nr) - nr_start) {
+               struct page *page = pages[--(*nr)];
+
+               ClearPageReferenced(page);
+               put_page(page);
+       }
+}
+
 /*
  * The performance critical leaf functions are made noinline otherwise gcc
  * inlines everything into a single function which results in too much
@@ -71,7 +82,9 @@ retry:
 static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
                unsigned long end, int write, struct page **pages, int *nr)
 {
+       struct dev_pagemap *pgmap = NULL;
        unsigned long mask;
+       int nr_start = *nr;
        pte_t *ptep;
 
        mask = _PAGE_PRESENT|_PAGE_USER;
@@ -89,13 +102,21 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
                        return 0;
                }
 
-               if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) {
+               page = pte_page(pte);
+               if (pte_devmap(pte)) {
+                       pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
+                       if (unlikely(!pgmap)) {
+                               undo_dev_pagemap(nr, nr_start, pages);
+                               pte_unmap(ptep);
+                               return 0;
+                       }
+               } else if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) {
                        pte_unmap(ptep);
                        return 0;
                }
                VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
-               page = pte_page(pte);
                get_page(page);
+               put_dev_pagemap(pgmap);
                SetPageReferenced(page);
                pages[*nr] = page;
                (*nr)++;
@@ -114,6 +135,32 @@ static inline void get_head_page_multiple(struct page *page, int nr)
        SetPageReferenced(page);
 }
 
+static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr,
+               unsigned long end, struct page **pages, int *nr)
+{
+       int nr_start = *nr;
+       unsigned long pfn = pmd_pfn(pmd);
+       struct dev_pagemap *pgmap = NULL;
+
+       pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
+       do {
+               struct page *page = pfn_to_page(pfn);
+
+               pgmap = get_dev_pagemap(pfn, pgmap);
+               if (unlikely(!pgmap)) {
+                       undo_dev_pagemap(nr, nr_start, pages);
+                       return 0;
+               }
+               SetPageReferenced(page);
+               pages[*nr] = page;
+               get_page(page);
+               put_dev_pagemap(pgmap);
+               (*nr)++;
+               pfn++;
+       } while (addr += PAGE_SIZE, addr != end);
+       return 1;
+}
+
 static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
                unsigned long end, int write, struct page **pages, int *nr)
 {
@@ -126,9 +173,13 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
                mask |= _PAGE_RW;
        if ((pmd_flags(pmd) & mask) != mask)
                return 0;
+
+       VM_BUG_ON(!pfn_valid(pmd_pfn(pmd)));
+       if (pmd_devmap(pmd))
+               return __gup_device_huge_pmd(pmd, addr, end, pages, nr);
+
        /* hugepages are never "special" */
        VM_BUG_ON(pmd_flags(pmd) & _PAGE_SPECIAL);
-       VM_BUG_ON(!pfn_valid(pmd_pfn(pmd)));
 
        refs = 0;
        head = pmd_page(pmd);
@@ -136,8 +187,6 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
        do {
                VM_BUG_ON_PAGE(compound_head(page) != head, page);
                pages[*nr] = page;
-               if (PageTail(page))
-                       get_huge_page_tail(page);
                (*nr)++;
                page++;
                refs++;
@@ -158,18 +207,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
                pmd_t pmd = *pmdp;
 
                next = pmd_addr_end(addr, end);
-               /*
-                * The pmd_trans_splitting() check below explains why
-                * pmdp_splitting_flush has to flush the tlb, to stop
-                * this gup-fast code from running while we set the
-                * splitting bit in the pmd. Returning zero will take
-                * the slow path that will call wait_split_huge_page()
-                * if the pmd is still in splitting state. gup-fast
-                * can't because it has irq disabled and
-                * wait_split_huge_page() would never return as the
-                * tlb flush IPI wouldn't run.
-                */
-               if (pmd_none(pmd) || pmd_trans_splitting(pmd))
+               if (pmd_none(pmd))
                        return 0;
                if (unlikely(pmd_large(pmd) || !pmd_present(pmd))) {
                        /*
@@ -212,8 +250,6 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
        do {
                VM_BUG_ON_PAGE(compound_head(page) != head, page);
                pages[*nr] = page;
-               if (PageTail(page))
-                       get_huge_page_tail(page);
                (*nr)++;
                page++;
                refs++;
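
The new gup-fast plumbing pins the hosting dev_pagemap around each get_page()
so a ZONE_DEVICE range cannot be torn down mid-walk; on failure,
undo_dev_pagemap() rolls the partial pin count back. The two helpers come from
the new include/linux/memremap.h and look roughly like:

    /* takes a reference on the pagemap covering @pfn; @pgmap is reused
     * when it still covers the pfn, saving a lookup */
    struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
                    struct dev_pagemap *pgmap);

    static inline void put_dev_pagemap(struct dev_pagemap *pgmap)
    {
            if (pgmap)
                    percpu_ref_put(pgmap->ref);
    }
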
index 8829482d69ec2872adaeaeb9bd68539c1b6dda38..5488d21123bd2edb3f71ded119495474f3e8b728 100644 (file)
@@ -30,6 +30,7 @@
 #include <linux/module.h>
 #include <linux/memory.h>
 #include <linux/memory_hotplug.h>
+#include <linux/memremap.h>
 #include <linux/nmi.h>
 #include <linux/gfp.h>
 #include <linux/kcore.h>
@@ -714,6 +715,12 @@ static void __meminit free_pagetable(struct page *page, int order)
 {
        unsigned long magic;
        unsigned int nr_pages = 1 << order;
+       struct vmem_altmap *altmap = to_vmem_altmap((unsigned long) page);
+
+       if (altmap) {
+               vmem_altmap_free(altmap, nr_pages);
+               return;
+       }
 
        /* bootmem page has reserved flag */
        if (PageReserved(page)) {
@@ -1017,13 +1024,19 @@ int __ref arch_remove_memory(u64 start, u64 size)
 {
        unsigned long start_pfn = start >> PAGE_SHIFT;
        unsigned long nr_pages = size >> PAGE_SHIFT;
+       struct page *page = pfn_to_page(start_pfn);
+       struct vmem_altmap *altmap;
        struct zone *zone;
        int ret;
 
-       zone = page_zone(pfn_to_page(start_pfn));
-       kernel_physical_mapping_remove(start, start + size);
+       /* With altmap the first mapped page is offset from @start */
+       altmap = to_vmem_altmap((unsigned long) page);
+       if (altmap)
+               page += vmem_altmap_offset(altmap);
+       zone = page_zone(page);
        ret = __remove_pages(zone, start_pfn, nr_pages);
        WARN_ON_ONCE(ret);
+       kernel_physical_mapping_remove(start, start + size);
 
        return ret;
 }
@@ -1235,7 +1248,7 @@ static void __meminitdata *p_start, *p_end;
 static int __meminitdata node_start;
 
 static int __meminit vmemmap_populate_hugepages(unsigned long start,
-                                               unsigned long end, int node)
+               unsigned long end, int node, struct vmem_altmap *altmap)
 {
        unsigned long addr;
        unsigned long next;
@@ -1258,7 +1271,7 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start,
                if (pmd_none(*pmd)) {
                        void *p;
 
-                       p = vmemmap_alloc_block_buf(PMD_SIZE, node);
+                       p = __vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
                        if (p) {
                                pte_t entry;
 
@@ -1279,7 +1292,8 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start,
                                addr_end = addr + PMD_SIZE;
                                p_end = p + PMD_SIZE;
                                continue;
-                       }
+                       } else if (altmap)
+                               return -ENOMEM; /* no fallback */
                } else if (pmd_large(*pmd)) {
                        vmemmap_verify((pte_t *)pmd, node, addr, next);
                        continue;
@@ -1293,11 +1307,16 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start,
 
 int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
 {
+       struct vmem_altmap *altmap = to_vmem_altmap(start);
        int err;
 
        if (cpu_has_pse)
-               err = vmemmap_populate_hugepages(start, end, node);
-       else
+               err = vmemmap_populate_hugepages(start, end, node, altmap);
+       else if (altmap) {
+               pr_err_once("%s: no cpu support for altmap allocations\n",
+                               __func__);
+               err = -ENOMEM;
+       } else
                err = vmemmap_populate_basepages(start, end, node);
        if (!err)
                sync_global_pgds(start, end - 1, 0);
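
The altmap hooks let the struct page array for device memory be carved out of
the device memory itself rather than main RAM, which matters when terabytes of
pmem would otherwise consume gigabytes of DRAM for page metadata. The
descriptor, as introduced in include/linux/memremap.h by this series:

    struct vmem_altmap {
            const unsigned long base_pfn;   /* first pfn of the range */
            const unsigned long reserve;    /* pages never handed out */
            unsigned long free;             /* pages usable for page maps */
            unsigned long align;
            unsigned long alloc;            /* pages handed out so far */
    };
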
index 031782e7423197ab4dbadb857eeed3349bcde396..f4ae536b0914db1db521feabe09ae0ab681a4b87 100644 (file)
@@ -12,6 +12,7 @@
 #include <linux/debugfs.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/pfn_t.h>
 #include <linux/slab.h>
 #include <linux/mm.h>
 #include <linux/fs.h>
@@ -949,7 +950,7 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
 }
 
 int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
-                    unsigned long pfn)
+                    pfn_t pfn)
 {
        enum page_cache_mode pcm;
 
@@ -957,7 +958,7 @@ int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
                return 0;
 
        /* Set prot based on lookup */
-       pcm = lookup_memtype((resource_size_t)pfn << PAGE_SHIFT);
+       pcm = lookup_memtype(pfn_t_to_phys(pfn));
        *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) |
                         cachemode2protval(pcm));
 
index ee9c2e3a71999e547996e94a209954d8cc58c445..4eb287e25043ed17639832a1d6b3f71b528ab69e 100644 (file)
@@ -505,19 +505,6 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma,
 
        return young;
 }
-
-void pmdp_splitting_flush(struct vm_area_struct *vma,
-                         unsigned long address, pmd_t *pmdp)
-{
-       int set;
-       VM_BUG_ON(address & ~HPAGE_PMD_MASK);
-       set = !test_and_set_bit(_PAGE_BIT_SPLITTING,
-                               (unsigned long *)pmdp);
-       if (set) {
-               /* need tlb flush only to serialize against gup-fast */
-               flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
-       }
-}
 #endif
 
 /**
index 360944e1da52a0f31572a423a72a16a33fc86537..d030594ed22b25390aee7eb0159b7e9063ab9558 100644 (file)
 #define MADV_SEQUENTIAL        2               /* expect sequential page references */
 #define MADV_WILLNEED  3               /* will need these pages */
 #define MADV_DONTNEED  4               /* don't need these pages */
 
 /* common parameters: try to keep these consistent across architectures */
+#define MADV_FREE      8               /* free pages only if memory pressure */
 #define MADV_REMOVE    9               /* remove these pages & resources */
 #define MADV_DONTFORK  10              /* don't inherit across fork */
 #define MADV_DOFORK    11              /* do inherit across fork */
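
MADV_FREE is the cheap cousin of MADV_DONTNEED: the pages stay mapped and keep
their contents until reclaim actually needs them, and a write in the meantime
simply cancels the free. A minimal userspace sketch (the helper name is
illustrative only):

    #include <stddef.h>
    #include <sys/mman.h>

    static void release_scratch(void *buf, size_t len)
    {
            /* lazily freeable: reclaimed only under memory pressure */
            madvise(buf, len, MADV_FREE);
    }
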
index 5ece856c5725c7cc72d0a0175bf9229330fabec5..35c822286bbe8b194e654cceef50972c260922c6 100644 (file)
@@ -245,7 +245,7 @@ static int check_tlb_entry(unsigned w, unsigned e, bool dtlb)
                                                page_mapcount(p));
                                if (!page_count(p))
                                        rc |= TLB_INSANE;
-                               else if (page_mapped(p))
+                               else if (page_mapcount(p))
                                        rc |= TLB_SUSPICIOUS;
                        } else {
                                rc |= TLB_INSANE;
index 619fe584a44ccbc294395f66448973b19595ed5e..213456c2b1236a6eb4949155f087a8d139c5aa4e 100644 (file)
@@ -647,6 +647,13 @@ static int add_memory_block(int base_section_nr)
        return 0;
 }
 
+static bool is_zone_device_section(struct mem_section *ms)
+{
+       struct page *page;
+
+       page = sparse_decode_mem_map(ms->section_mem_map, __section_nr(ms));
+       return is_zone_device_page(page);
+}
 
 /*
  * need an interface for the VM to add new memory regions,
@@ -657,6 +664,9 @@ int register_new_memory(int nid, struct mem_section *section)
        int ret = 0;
        struct memory_block *mem;
 
+       if (is_zone_device_section(section))
+               return 0;
+
        mutex_lock(&mem_sysfs_mutex);
 
        mem = find_memory_block(section);
@@ -693,6 +703,9 @@ static int remove_memory_section(unsigned long node_id,
 {
        struct memory_block *mem;
 
+       if (is_zone_device_section(section))
+               return 0;
+
        mutex_lock(&mem_sysfs_mutex);
        mem = find_memory_block(section);
        unregister_mem_sect_under_nodes(mem, __section_nr(section));
index a5880f4ab40eb069bda60d6403b223d61ff1db80..cb27190e9f395f94a4cac646e3a4beb12b7fa6fc 100644 (file)
@@ -19,6 +19,9 @@
 #include <linux/radix-tree.h>
 #include <linux/fs.h>
 #include <linux/slab.h>
+#ifdef CONFIG_BLK_DEV_RAM_DAX
+#include <linux/pfn_t.h>
+#endif
 
 #include <asm/uaccess.h>
 
@@ -378,7 +381,7 @@ static int brd_rw_page(struct block_device *bdev, sector_t sector,
 
 #ifdef CONFIG_BLK_DEV_RAM_DAX
 static long brd_direct_access(struct block_device *bdev, sector_t sector,
-                       void __pmem **kaddr, unsigned long *pfn)
+                       void __pmem **kaddr, pfn_t *pfn)
 {
        struct brd_device *brd = bdev->bd_disk->private_data;
        struct page *page;
@@ -389,7 +392,7 @@ static long brd_direct_access(struct block_device *bdev, sector_t sector,
        if (!page)
                return -ENOSPC;
        *kaddr = (void __pmem *)page_address(page);
-       *pfn = page_to_pfn(page);
+       *pfn = page_to_pfn_t(page);
 
        return PAGE_SIZE;
 }
index 47915d736f8d4fd2f145dca24f97c015fa76a245..370c2f76016d685820689f543ea96c1006230131 100644 (file)
@@ -1325,7 +1325,6 @@ static int zram_remove(struct zram *zram)
 
        pr_info("Removed device: %s\n", zram->disk->disk_name);
 
-       idr_remove(&zram_index_idr, zram->disk->first_minor);
        blk_cleanup_queue(zram->disk->queue);
        del_gendisk(zram->disk);
        put_disk(zram->disk);
@@ -1367,10 +1366,12 @@ static ssize_t hot_remove_store(struct class *class,
        mutex_lock(&zram_index_mutex);
 
        zram = idr_find(&zram_index_idr, dev_id);
-       if (zram)
+       if (zram) {
                ret = zram_remove(zram);
-       else
+               idr_remove(&zram_index_idr, dev_id);
+       } else {
                ret = -ENODEV;
+       }
 
        mutex_unlock(&zram_index_mutex);
        return ret ? ret : count;
index 252eb301470ce576df2daf0cd01b7942172da51b..32358c5e3db4be25e7127225fa86e343b97757c7 100644 (file)
@@ -14,6 +14,7 @@
 
 #include <linux/shmem_fs.h>
 #include <linux/dma-buf.h>
+#include <linux/pfn_t.h>
 #include <drm/exynos_drm.h>
 
 #include "exynos_drm_drv.h"
@@ -490,7 +491,8 @@ int exynos_drm_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
        }
 
        pfn = page_to_pfn(exynos_gem->pages[page_offset]);
-       ret = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address, pfn);
+       ret = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address,
+                       __pfn_to_pfn_t(pfn, PFN_DEV));
 
 out:
        switch (ret) {
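
These GPU fault handlers change only because vm_insert_mixed() now takes a
pfn_t; wrapping the raw pfn with __pfn_to_pfn_t(pfn, PFN_DEV) preserves the
old behaviour. The new prototype in include/linux/mm.h:

    int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
                        pfn_t pfn);
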
index ee95c03a8c54fcb7948bcb761aa84c38d40c8b00..cb95765050cc0aade141941dfde9e77993373179 100644 (file)
@@ -21,6 +21,7 @@
 #include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/string.h>
+#include <linux/pfn_t.h>
 #include <linux/mm.h>
 #include <linux/tty.h>
 #include <linux/slab.h>
@@ -132,7 +133,8 @@ static int psbfb_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
        for (i = 0; i < page_num; i++) {
                pfn = (phys_addr >> PAGE_SHIFT);
 
-               ret = vm_insert_mixed(vma, address, pfn);
+               ret = vm_insert_mixed(vma, address,
+                               __pfn_to_pfn_t(pfn, PFN_DEV));
                if (unlikely((ret == -EBUSY) || (ret != 0 && i > 0)))
                        break;
                else if (unlikely(ret != 0)) {
index c76cc853b08a57effec626b8c6f537b270ca61ac..3cedb8d5c855ac26c3e775db521ab48ddf42fb39 100644 (file)
@@ -18,6 +18,7 @@
 #include <linux/spinlock.h>
 #include <linux/shmem_fs.h>
 #include <linux/dma-buf.h>
+#include <linux/pfn_t.h>
 
 #include "msm_drv.h"
 #include "msm_gem.h"
@@ -222,7 +223,8 @@ int msm_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
        VERB("Inserting %p pfn %lx, pa %lx", vmf->virtual_address,
                        pfn, pfn << PAGE_SHIFT);
 
-       ret = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address, pfn);
+       ret = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address,
+                       __pfn_to_pfn_t(pfn, PFN_DEV));
 
 out_unlock:
        mutex_unlock(&dev->struct_mutex);
index 7ed08fdc4c4285eff29f109ce67de4b79e970de2..ceba5459ceb75b33bded69b6b56ffb9c5919a03c 100644 (file)
@@ -19,6 +19,7 @@
 
 #include <linux/shmem_fs.h>
 #include <linux/spinlock.h>
+#include <linux/pfn_t.h>
 
 #include <drm/drm_vma_manager.h>
 
@@ -385,7 +386,8 @@ static int fault_1d(struct drm_gem_object *obj,
        VERB("Inserting %p pfn %lx, pa %lx", vmf->virtual_address,
                        pfn, pfn << PAGE_SHIFT);
 
-       return vm_insert_mixed(vma, (unsigned long)vmf->virtual_address, pfn);
+       return vm_insert_mixed(vma, (unsigned long)vmf->virtual_address,
+                       __pfn_to_pfn_t(pfn, PFN_DEV));
 }
 
 /* Special handling for the case of faulting in 2d tiled buffers */
@@ -478,7 +480,8 @@ static int fault_2d(struct drm_gem_object *obj,
                        pfn, pfn << PAGE_SHIFT);
 
        for (i = n; i > 0; i--) {
-               vm_insert_mixed(vma, (unsigned long)vaddr, pfn);
+               vm_insert_mixed(vma, (unsigned long)vaddr,
+                               __pfn_to_pfn_t(pfn, PFN_DEV));
                pfn += usergart[fmt].stride_pfn;
                vaddr += PAGE_SIZE * m;
        }
index 8fb7213277cc9029d101f2006e7bc50289ab51df..06d26dc438b264dd76e7cbba28fd6b3d7c92fea3 100644 (file)
@@ -35,6 +35,7 @@
 #include <ttm/ttm_placement.h>
 #include <drm/drm_vma_manager.h>
 #include <linux/mm.h>
+#include <linux/pfn_t.h>
 #include <linux/rbtree.h>
 #include <linux/module.h>
 #include <linux/uaccess.h>
@@ -229,7 +230,8 @@ static int ttm_bo_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
                }
 
                if (vma->vm_flags & VM_MIXEDMAP)
-                       ret = vm_insert_mixed(&cvma, address, pfn);
+                       ret = vm_insert_mixed(&cvma, address,
+                                       __pfn_to_pfn_t(pfn, PFN_DEV));
                else
                        ret = vm_insert_pfn(&cvma, address, pfn);
 
index fd01f3493fc7792792d015c604903197ba506073..af7cc1e65656e1bf0ac83f561945bbbcf45d9de9 100644 (file)
@@ -433,16 +433,15 @@ ssize_t iio_format_value(char *buf, unsigned int type, int size, int *vals)
                scale_db = true;
        case IIO_VAL_INT_PLUS_MICRO:
                if (vals[1] < 0)
-                       return sprintf(buf, "-%ld.%06u%s\n", abs(vals[0]),
-                                       -vals[1],
-                               scale_db ? " dB" : "");
+                       return sprintf(buf, "-%d.%06u%s\n", abs(vals[0]),
+                                      -vals[1], scale_db ? " dB" : "");
                else
                        return sprintf(buf, "%d.%06u%s\n", vals[0], vals[1],
                                scale_db ? " dB" : "");
        case IIO_VAL_INT_PLUS_NANO:
                if (vals[1] < 0)
-                       return sprintf(buf, "-%ld.%09u\n", abs(vals[0]),
-                                       -vals[1]);
+                       return sprintf(buf, "-%d.%09u\n", abs(vals[0]),
+                                      -vals[1]);
                else
                        return sprintf(buf, "%d.%09u\n", vals[0], vals[1]);
        case IIO_VAL_FRACTIONAL:
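The format-string fix follows from the abs() rework elsewhere in this series: abs() now returns its argument's own type, so abs(vals[0]) on an int is an int and the old "%ld" specifiers no longer match (this rationale is inferred from the series, not stated in the hunk). A standalone userspace illustration of the fixed format:

	#include <stdio.h>
	#include <stdlib.h>

	int main(void)
	{
		int whole = -1, frac = -500000;	/* stand-ins for vals[0]/vals[1] */

		/* integer part via abs(), fractional part printed negated */
		printf("-%d.%06u\n", abs(whole), (unsigned)-frac);	/* -1.500000 */
		return 0;
	}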
index 07a4c644fb9be4b0aa3eef7f2ec889853eecd19c..e9cef9de9ed85fffbf02fd71219ffd57323e4a9a 100644 (file)
@@ -901,7 +901,7 @@ static void iwlagn_gain_computation(struct iwl_priv *priv,
                /* bound gain by 2 bits value max, 3rd bit is sign */
                data->delta_gain_code[i] =
                        min(abs(delta_g),
-                       (long) CHAIN_NOISE_MAX_DELTA_GAIN_CODE);
+                       (s32) CHAIN_NOISE_MAX_DELTA_GAIN_CODE);
 
                if (delta_g < 0)
                        /*
index f9b674bc49db291f27b8504e803d2f0a47aa0740..0cc9048b86e23103900d36f75a0b64746521ee18 100644 (file)
@@ -83,8 +83,7 @@ static ssize_t mode_store(struct device *dev,
 
                if (strncmp(buf, "pmem\n", n) == 0
                                || strncmp(buf, "pmem", n) == 0) {
-                       /* TODO: allocate from PMEM support */
-                       rc = -ENOTTY;
+                       nd_pfn->mode = PFN_MODE_PMEM;
                } else if (strncmp(buf, "ram\n", n) == 0
                                || strncmp(buf, "ram", n) == 0)
                        nd_pfn->mode = PFN_MODE_RAM;
index b493ff3fccb2dcc85643711a5b02bc3bfc954836..7edf31671dabed8f8d193806f54f3d1b30401e7c 100644 (file)
 #include <linux/init.h>
 #include <linux/platform_device.h>
 #include <linux/module.h>
-#include <linux/memory_hotplug.h>
 #include <linux/moduleparam.h>
 #include <linux/badblocks.h>
+#include <linux/memremap.h>
 #include <linux/vmalloc.h>
+#include <linux/pfn_t.h>
 #include <linux/slab.h>
 #include <linux/pmem.h>
 #include <linux/nd.h>
@@ -40,6 +41,7 @@ struct pmem_device {
        phys_addr_t             phys_addr;
        /* when non-zero this device is hosting a 'pfn' instance */
        phys_addr_t             data_offset;
+       unsigned long           pfn_flags;
        void __pmem             *virt_addr;
        size_t                  size;
        struct badblocks        bb;
@@ -135,13 +137,13 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector,
 }
 
 static long pmem_direct_access(struct block_device *bdev, sector_t sector,
-                     void __pmem **kaddr, unsigned long *pfn)
+                     void __pmem **kaddr, pfn_t *pfn)
 {
        struct pmem_device *pmem = bdev->bd_disk->private_data;
        resource_size_t offset = sector * 512 + pmem->data_offset;
 
        *kaddr = pmem->virt_addr + offset;
-       *pfn = (pmem->phys_addr + offset) >> PAGE_SHIFT;
+       *pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags);
 
        return pmem->size - offset;
 }
@@ -157,6 +159,7 @@ static struct pmem_device *pmem_alloc(struct device *dev,
                struct resource *res, int id)
 {
        struct pmem_device *pmem;
+       struct request_queue *q;
 
        pmem = devm_kzalloc(dev, sizeof(*pmem), GFP_KERNEL);
        if (!pmem)
@@ -174,16 +177,26 @@ static struct pmem_device *pmem_alloc(struct device *dev,
                return ERR_PTR(-EBUSY);
        }
 
-       if (pmem_should_map_pages(dev))
-               pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, res);
-       else
+       q = blk_alloc_queue_node(GFP_KERNEL, dev_to_node(dev));
+       if (!q)
+               return ERR_PTR(-ENOMEM);
+
+       pmem->pfn_flags = PFN_DEV;
+       if (pmem_should_map_pages(dev)) {
+               pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, res,
+                               &q->q_usage_counter, NULL);
+               pmem->pfn_flags |= PFN_MAP;
+       } else
                pmem->virt_addr = (void __pmem *) devm_memremap(dev,
                                pmem->phys_addr, pmem->size,
                                ARCH_MEMREMAP_PMEM);
 
-       if (IS_ERR(pmem->virt_addr))
+       if (IS_ERR(pmem->virt_addr)) {
+               blk_cleanup_queue(q);
                return (void __force *) pmem->virt_addr;
+       }
 
+       pmem->pmem_queue = q;
        return pmem;
 }
 
@@ -203,10 +216,6 @@ static int pmem_attach_disk(struct device *dev,
        int nid = dev_to_node(dev);
        struct gendisk *disk;
 
-       pmem->pmem_queue = blk_alloc_queue_node(GFP_KERNEL, nid);
-       if (!pmem->pmem_queue)
-               return -ENOMEM;
-
        blk_queue_make_request(pmem->pmem_queue, pmem_make_request);
        blk_queue_physical_block_size(pmem->pmem_queue, PAGE_SIZE);
        blk_queue_max_hw_sectors(pmem->pmem_queue, UINT_MAX);
@@ -352,12 +361,17 @@ static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns)
        struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
        struct nd_pfn *nd_pfn = to_nd_pfn(ndns->claim);
        struct device *dev = &nd_pfn->dev;
-       struct vmem_altmap *altmap;
        struct nd_region *nd_region;
+       struct vmem_altmap *altmap;
        struct nd_pfn_sb *pfn_sb;
        struct pmem_device *pmem;
+       struct request_queue *q;
        phys_addr_t offset;
        int rc;
+       struct vmem_altmap __altmap = {
+               .base_pfn = __phys_to_pfn(nsio->res.start),
+               .reserve = __phys_to_pfn(SZ_8K),
+       };
 
        if (!nd_pfn->uuid || !nd_pfn->ndns)
                return -ENODEV;
@@ -375,6 +389,17 @@ static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns)
                        return -EINVAL;
                nd_pfn->npfns = le64_to_cpu(pfn_sb->npfns);
                altmap = NULL;
+       } else if (nd_pfn->mode == PFN_MODE_PMEM) {
+               nd_pfn->npfns = (resource_size(&nsio->res) - offset)
+                       / PAGE_SIZE;
+               if (le64_to_cpu(nd_pfn->pfn_sb->npfns) > nd_pfn->npfns)
+                       dev_info(&nd_pfn->dev,
+                                       "number of pfns truncated from %lld to %ld\n",
+                                       le64_to_cpu(nd_pfn->pfn_sb->npfns),
+                                       nd_pfn->npfns);
+               altmap = & __altmap;
+               altmap->free = __phys_to_pfn(offset - SZ_8K);
+               altmap->alloc = 0;
        } else {
                rc = -ENXIO;
                goto err;
@@ -382,8 +407,11 @@ static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns)
 
        /* establish pfn range for lookup, and switch to direct map */
        pmem = dev_get_drvdata(dev);
+       q = pmem->pmem_queue;
        devm_memunmap(dev, (void __force *) pmem->virt_addr);
-       pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, &nsio->res);
+       pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, &nsio->res,
+                       &q->q_usage_counter, altmap);
+       pmem->pfn_flags |= PFN_MAP;
        if (IS_ERR(pmem->virt_addr)) {
                rc = PTR_ERR(pmem->virt_addr);
                goto err;
@@ -424,19 +452,22 @@ static int nd_pmem_probe(struct device *dev)
                return -ENOMEM;
        nvdimm_namespace_add_poison(ndns, &pmem->bb, 0);
 
-       if (is_nd_btt(dev))
+       if (is_nd_btt(dev)) {
+               /* btt allocates its own request_queue */
+               blk_cleanup_queue(pmem->pmem_queue);
+               pmem->pmem_queue = NULL;
                return nvdimm_namespace_attach_btt(ndns);
+       }
 
        if (is_nd_pfn(dev))
                return nvdimm_namespace_attach_pfn(ndns);
 
-       if (nd_btt_probe(ndns, pmem) == 0) {
-               /* we'll come back as btt-pmem */
-               return -ENXIO;
-       }
-
-       if (nd_pfn_probe(ndns, pmem) == 0) {
-               /* we'll come back as pfn-pmem */
+       if (nd_btt_probe(ndns, pmem) == 0 || nd_pfn_probe(ndns, pmem) == 0) {
+               /*
+                * We'll come back as either btt-pmem, or pfn-pmem, so
+                * drop the queue allocation for now.
+                */
+               blk_cleanup_queue(pmem->pmem_queue);
                return -ENXIO;
        }
 
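The net effect of the pmem changes: pfn_flags is fixed at allocation time (PFN_DEV always, PFN_MAP only when devm_memremap_pages() established a struct page mapping), and pmem_direct_access() just stamps those stored flags onto the physical address. A sketch of the helper it calls, assuming the pfn_t API from this series:

	/* illustrative restatement of phys_to_pfn_t() from <linux/pfn_t.h> */
	static inline pfn_t phys_to_pfn_t(phys_addr_t addr, u64 flags)
	{
		return __pfn_to_pfn_t(addr >> PAGE_SHIFT, flags);
	}

The PFN_MAP bit is what __dax_pmd_fault() in fs/dax.c below ends up requiring (together with PFN_DEV) via pfn_t_devmap() before it will install a huge mapping.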
index 94a8f4ab57bc4c5fa6ccf3fca0002daa25e6a010..ce7b7018174051998663377b0168aa221cc33185 100644 (file)
@@ -17,6 +17,7 @@
 #include <linux/completion.h>
 #include <linux/interrupt.h>
 #include <linux/platform_device.h>
+#include <linux/pfn_t.h>
 #include <asm/extmem.h>
 #include <asm/io.h>
 
@@ -30,7 +31,7 @@ static void dcssblk_release(struct gendisk *disk, fmode_t mode);
 static blk_qc_t dcssblk_make_request(struct request_queue *q,
                                                struct bio *bio);
 static long dcssblk_direct_access(struct block_device *bdev, sector_t secnum,
-                        void __pmem **kaddr, unsigned long *pfn);
+                        void __pmem **kaddr, pfn_t *pfn);
 
 static char dcssblk_segments[DCSSBLK_PARM_LEN] = "\0";
 
@@ -883,20 +884,18 @@ fail:
 
 static long
 dcssblk_direct_access (struct block_device *bdev, sector_t secnum,
-                       void __pmem **kaddr, unsigned long *pfn)
+                       void __pmem **kaddr, pfn_t *pfn)
 {
        struct dcssblk_dev_info *dev_info;
        unsigned long offset, dev_sz;
-       void *addr;
 
        dev_info = bdev->bd_disk->private_data;
        if (!dev_info)
                return -ENODEV;
        dev_sz = dev_info->end - dev_info->start;
        offset = secnum * 512;
-       addr = (void *) (dev_info->start + offset);
-       *pfn = virt_to_phys(addr) >> PAGE_SHIFT;
-       *kaddr = (void __pmem *) addr;
+       *kaddr = (void __pmem *) (dev_info->start + offset);
+       *pfn = __pfn_to_pfn_t(PFN_DOWN(dev_info->start + offset), PFN_DEV);
 
        return dev_sz - offset;
 }
index 2bb1ef86c411a5483087ba106778532dc661a081..9adee0d7536e11343c9050b84174be6d2b90bdfc 100644 (file)
@@ -50,7 +50,8 @@ config FS_DAX_PMD
        bool
        default FS_DAX
        depends on FS_DAX
-       depends on BROKEN
+       depends on ZONE_DEVICE
+       depends on TRANSPARENT_HUGEPAGE
 
 endif # BLOCK
 
index 81c0705558beb4cce129d673fb97c6a53e02c955..530145b607c4a9c8ec99515ef68d4788f36ea245 100644 (file)
@@ -455,10 +455,7 @@ EXPORT_SYMBOL_GPL(bdev_write_page);
 /**
 * bdev_direct_access() - Get the address for directly-accessible memory
  * @bdev: The device containing the memory
- * @sector: The offset within the device
- * @addr: Where to put the address of the memory
- * @pfn: The Page Frame Number for the memory
- * @size: The number of bytes requested
+ * @dax: control and output parameters for ->direct_access
  *
  * If a block device is made up of directly addressable memory, this function
  * will tell the caller the PFN and the address of the memory.  The address
@@ -469,10 +466,10 @@ EXPORT_SYMBOL_GPL(bdev_write_page);
  * Return: negative errno if an error occurs, otherwise the number of bytes
  * accessible at this address.
  */
-long bdev_direct_access(struct block_device *bdev, sector_t sector,
-                       void __pmem **addr, unsigned long *pfn, long size)
+long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax)
 {
-       long avail;
+       sector_t sector = dax->sector;
+       long avail, size = dax->size;
        const struct block_device_operations *ops = bdev->bd_disk->fops;
 
        /*
@@ -491,9 +488,11 @@ long bdev_direct_access(struct block_device *bdev, sector_t sector,
        sector += get_start_sect(bdev);
        if (sector % (PAGE_SIZE / 512))
                return -EINVAL;
-       avail = ops->direct_access(bdev, sector, addr, pfn);
+       avail = ops->direct_access(bdev, sector, &dax->addr, &dax->pfn);
        if (!avail)
                return -ERANGE;
+       if (avail > 0 && avail & ~PAGE_MASK)
+               return -ENXIO;
        return min(avail, size);
 }
 EXPORT_SYMBOL_GPL(bdev_direct_access);
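With the signature change, callers pass inputs and receive outputs through one struct. A minimal caller sketch, mirroring the pattern fs/dax.c adopts below ('bdev' and 'sector' are assumed locals):

	struct blk_dax_ctl dax = {
		.sector = sector,	/* 512-byte units, page-aligned */
		.size = PAGE_SIZE,	/* bytes requested */
	};
	long avail = bdev_direct_access(bdev, &dax);

	if (avail < 0)
		return avail;		/* negative errno */
	/* dax.addr and dax.pfn are now valid for 'avail' bytes */

The new "avail & ~PAGE_MASK" check also tightens the contract: drivers must return whole pages, so callers no longer need to handle sub-page tail lengths.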
index 0068e82217c3f98b1fa5fbc4e95926ae52182a6e..0a2752b79e72cc2b7a083894843a8b3ae1dea23d 100644 (file)
@@ -3391,13 +3391,13 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
         * should have access to this page, we're safe to simply set
         * PG_locked without checking it first.
         */
-       __set_page_locked(page);
+       __SetPageLocked(page);
        rc = add_to_page_cache_locked(page, mapping,
                                      page->index, gfp);
 
        /* give up if we can't stick it in the cache */
        if (rc) {
-               __clear_page_locked(page);
+               __ClearPageLocked(page);
                return rc;
        }
 
@@ -3418,9 +3418,9 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
                if (*bytes + PAGE_CACHE_SIZE > rsize)
                        break;
 
-               __set_page_locked(page);
+               __SetPageLocked(page);
                if (add_to_page_cache_locked(page, mapping, page->index, gfp)) {
-                       __clear_page_locked(page);
+                       __ClearPageLocked(page);
                        break;
                }
                list_move_tail(&page->lru, tmplist);
index 43671b68220ed968386f5c1ad9067f236fbab67e..7af8797590640e032c74f619b9fd1931fed77e15 100644 (file)
--- a/fs/dax.c
+++ b/fs/dax.c
 #include <linux/sched.h>
 #include <linux/uio.h>
 #include <linux/vmstat.h>
+#include <linux/pfn_t.h>
+#include <linux/sizes.h>
+
+static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
+{
+       struct request_queue *q = bdev->bd_queue;
+       long rc = -EIO;
+
+       dax->addr = (void __pmem *) ERR_PTR(-EIO);
+       if (blk_queue_enter(q, true) != 0)
+               return rc;
+
+       rc = bdev_direct_access(bdev, dax);
+       if (rc < 0) {
+               dax->addr = (void __pmem *) ERR_PTR(rc);
+               blk_queue_exit(q);
+               return rc;
+       }
+       return rc;
+}
+
+static void dax_unmap_atomic(struct block_device *bdev,
+               const struct blk_dax_ctl *dax)
+{
+       if (IS_ERR(dax->addr))
+               return;
+       blk_queue_exit(bdev->bd_queue);
+}
 
 /*
  * dax_clear_blocks() is called from within transaction context from XFS,
  * and hence this means the stack from this point must follow GFP_NOFS
  * semantics for all operations.
  */
-int dax_clear_blocks(struct inode *inode, sector_t block, long size)
+int dax_clear_blocks(struct inode *inode, sector_t block, long _size)
 {
        struct block_device *bdev = inode->i_sb->s_bdev;
-       sector_t sector = block << (inode->i_blkbits - 9);
+       struct blk_dax_ctl dax = {
+               .sector = block << (inode->i_blkbits - 9),
+               .size = _size,
+       };
 
        might_sleep();
        do {
-               void __pmem *addr;
-               unsigned long pfn;
-               long count;
+               long count, sz;
 
-               count = bdev_direct_access(bdev, sector, &addr, &pfn, size);
+               count = dax_map_atomic(bdev, &dax);
                if (count < 0)
                        return count;
-               BUG_ON(size < count);
-               while (count > 0) {
-                       unsigned pgsz = PAGE_SIZE - offset_in_page(addr);
-                       if (pgsz > count)
-                               pgsz = count;
-                       clear_pmem(addr, pgsz);
-                       addr += pgsz;
-                       size -= pgsz;
-                       count -= pgsz;
-                       BUG_ON(pgsz & 511);
-                       sector += pgsz / 512;
-                       cond_resched();
-               }
-       } while (size);
+               sz = min_t(long, count, SZ_128K);
+               clear_pmem(dax.addr, sz);
+               dax.size -= sz;
+               dax.sector += sz / 512;
+               dax_unmap_atomic(bdev, &dax);
+               cond_resched();
+       } while (dax.size);
 
        wmb_pmem();
        return 0;
 }
 EXPORT_SYMBOL_GPL(dax_clear_blocks);
 
-static long dax_get_addr(struct buffer_head *bh, void __pmem **addr,
-               unsigned blkbits)
-{
-       unsigned long pfn;
-       sector_t sector = bh->b_blocknr << (blkbits - 9);
-       return bdev_direct_access(bh->b_bdev, sector, addr, &pfn, bh->b_size);
-}
-
 /* the clear_pmem() calls are ordered by a wmb_pmem() in the caller */
 static void dax_new_buf(void __pmem *addr, unsigned size, unsigned first,
                loff_t pos, loff_t end)
@@ -105,19 +119,29 @@ static bool buffer_size_valid(struct buffer_head *bh)
        return bh->b_state != 0;
 }
 
+
+static sector_t to_sector(const struct buffer_head *bh,
+               const struct inode *inode)
+{
+       sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
+
+       return sector;
+}
+
 static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
                      loff_t start, loff_t end, get_block_t get_block,
                      struct buffer_head *bh)
 {
-       ssize_t retval = 0;
-       loff_t pos = start;
-       loff_t max = start;
-       loff_t bh_max = start;
-       void __pmem *addr;
-       bool hole = false;
-       bool need_wmb = false;
-
-       if (iov_iter_rw(iter) != WRITE)
+       loff_t pos = start, max = start, bh_max = start;
+       bool hole = false, need_wmb = false;
+       struct block_device *bdev = NULL;
+       int rw = iov_iter_rw(iter), rc;
+       long map_len = 0;
+       struct blk_dax_ctl dax = {
+               .addr = (void __pmem *) ERR_PTR(-EIO),
+       };
+
+       if (rw == READ)
                end = min(end, i_size_read(inode));
 
        while (pos < end) {
@@ -132,13 +156,13 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
                        if (pos == bh_max) {
                                bh->b_size = PAGE_ALIGN(end - pos);
                                bh->b_state = 0;
-                               retval = get_block(inode, block, bh,
-                                                  iov_iter_rw(iter) == WRITE);
-                               if (retval)
+                               rc = get_block(inode, block, bh, rw == WRITE);
+                               if (rc)
                                        break;
                                if (!buffer_size_valid(bh))
                                        bh->b_size = 1 << blkbits;
                                bh_max = pos - first + bh->b_size;
+                               bdev = bh->b_bdev;
                        } else {
                                unsigned done = bh->b_size -
                                                (bh_max - (pos - first));
@@ -146,47 +170,53 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
                                bh->b_size -= done;
                        }
 
-                       hole = iov_iter_rw(iter) != WRITE && !buffer_written(bh);
+                       hole = rw == READ && !buffer_written(bh);
                        if (hole) {
-                               addr = NULL;
                                size = bh->b_size - first;
                        } else {
-                               retval = dax_get_addr(bh, &addr, blkbits);
-                               if (retval < 0)
+                               dax_unmap_atomic(bdev, &dax);
+                               dax.sector = to_sector(bh, inode);
+                               dax.size = bh->b_size;
+                               map_len = dax_map_atomic(bdev, &dax);
+                               if (map_len < 0) {
+                                       rc = map_len;
                                        break;
+                               }
                                if (buffer_unwritten(bh) || buffer_new(bh)) {
-                                       dax_new_buf(addr, retval, first, pos,
-                                                                       end);
+                                       dax_new_buf(dax.addr, map_len, first,
+                                                       pos, end);
                                        need_wmb = true;
                                }
-                               addr += first;
-                               size = retval - first;
+                               dax.addr += first;
+                               size = map_len - first;
                        }
                        max = min(pos + size, end);
                }
 
                if (iov_iter_rw(iter) == WRITE) {
-                       len = copy_from_iter_pmem(addr, max - pos, iter);
+                       len = copy_from_iter_pmem(dax.addr, max - pos, iter);
                        need_wmb = true;
                } else if (!hole)
-                       len = copy_to_iter((void __force *)addr, max - pos,
+                       len = copy_to_iter((void __force *) dax.addr, max - pos,
                                        iter);
                else
                        len = iov_iter_zero(max - pos, iter);
 
                if (!len) {
-                       retval = -EFAULT;
+                       rc = -EFAULT;
                        break;
                }
 
                pos += len;
-               addr += len;
+               if (!IS_ERR(dax.addr))
+                       dax.addr += len;
        }
 
        if (need_wmb)
                wmb_pmem();
+       dax_unmap_atomic(bdev, &dax);
 
-       return (pos == start) ? retval : pos - start;
+       return (pos == start) ? rc : pos - start;
 }
 
 /**
@@ -275,28 +305,35 @@ static int dax_load_hole(struct address_space *mapping, struct page *page,
        return VM_FAULT_LOCKED;
 }
 
-static int copy_user_bh(struct page *to, struct buffer_head *bh,
-                       unsigned blkbits, unsigned long vaddr)
+static int copy_user_bh(struct page *to, struct inode *inode,
+               struct buffer_head *bh, unsigned long vaddr)
 {
-       void __pmem *vfrom;
+       struct blk_dax_ctl dax = {
+               .sector = to_sector(bh, inode),
+               .size = bh->b_size,
+       };
+       struct block_device *bdev = bh->b_bdev;
        void *vto;
 
-       if (dax_get_addr(bh, &vfrom, blkbits) < 0)
-               return -EIO;
+       if (dax_map_atomic(bdev, &dax) < 0)
+               return PTR_ERR(dax.addr);
        vto = kmap_atomic(to);
-       copy_user_page(vto, (void __force *)vfrom, vaddr, to);
+       copy_user_page(vto, (void __force *)dax.addr, vaddr, to);
        kunmap_atomic(vto);
+       dax_unmap_atomic(bdev, &dax);
        return 0;
 }
 
 static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
                        struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-       struct address_space *mapping = inode->i_mapping;
-       sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
        unsigned long vaddr = (unsigned long)vmf->virtual_address;
-       void __pmem *addr;
-       unsigned long pfn;
+       struct address_space *mapping = inode->i_mapping;
+       struct block_device *bdev = bh->b_bdev;
+       struct blk_dax_ctl dax = {
+               .sector = to_sector(bh, inode),
+               .size = bh->b_size,
+       };
        pgoff_t size;
        int error;
 
@@ -315,20 +352,18 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
                goto out;
        }
 
-       error = bdev_direct_access(bh->b_bdev, sector, &addr, &pfn, bh->b_size);
-       if (error < 0)
-               goto out;
-       if (error < PAGE_SIZE) {
-               error = -EIO;
+       if (dax_map_atomic(bdev, &dax) < 0) {
+               error = PTR_ERR(dax.addr);
                goto out;
        }
 
        if (buffer_unwritten(bh) || buffer_new(bh)) {
-               clear_pmem(addr, PAGE_SIZE);
+               clear_pmem(dax.addr, PAGE_SIZE);
                wmb_pmem();
        }
+       dax_unmap_atomic(bdev, &dax);
 
-       error = vm_insert_mixed(vma, vaddr, pfn);
+       error = vm_insert_mixed(vma, vaddr, dax.pfn);
 
  out:
        i_mmap_unlock_read(mapping);
@@ -422,7 +457,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
        if (vmf->cow_page) {
                struct page *new_page = vmf->cow_page;
                if (buffer_written(&bh))
-                       error = copy_user_bh(new_page, &bh, blkbits, vaddr);
+                       error = copy_user_bh(new_page, inode, &bh, vaddr);
                else
                        clear_user_highpage(new_page, vaddr);
                if (error)
@@ -523,6 +558,24 @@ EXPORT_SYMBOL_GPL(dax_fault);
  */
 #define PG_PMD_COLOUR  ((PMD_SIZE >> PAGE_SHIFT) - 1)
 
+static void __dax_dbg(struct buffer_head *bh, unsigned long address,
+               const char *reason, const char *fn)
+{
+       if (bh) {
+               char bname[BDEVNAME_SIZE];
+               bdevname(bh->b_bdev, bname);
+               pr_debug("%s: %s addr: %lx dev %s state %lx start %lld "
+                       "length %zd fallback: %s\n", fn, current->comm,
+                       address, bname, bh->b_state, (u64)bh->b_blocknr,
+                       bh->b_size, reason);
+       } else {
+               pr_debug("%s: %s addr: %lx fallback: %s\n", fn,
+                       current->comm, address, reason);
+       }
+}
+
+#define dax_pmd_dbg(bh, address, reason)       __dax_dbg(bh, address, reason, "dax_pmd")
+
 int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
                pmd_t *pmd, unsigned int flags, get_block_t get_block,
                dax_iodone_t complete_unwritten)
@@ -534,41 +587,49 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
        unsigned blkbits = inode->i_blkbits;
        unsigned long pmd_addr = address & PMD_MASK;
        bool write = flags & FAULT_FLAG_WRITE;
-       long length;
-       void __pmem *kaddr;
+       struct block_device *bdev;
        pgoff_t size, pgoff;
-       sector_t block, sector;
-       unsigned long pfn;
+       sector_t block;
        int result = 0;
 
-       /* dax pmd mappings are broken wrt gup and fork */
+       /* dax pmd mappings require pfn_t_devmap() */
        if (!IS_ENABLED(CONFIG_FS_DAX_PMD))
                return VM_FAULT_FALLBACK;
 
        /* Fall back to PTEs if we're going to COW */
-       if (write && !(vma->vm_flags & VM_SHARED))
+       if (write && !(vma->vm_flags & VM_SHARED)) {
+               split_huge_pmd(vma, pmd, address);
+               dax_pmd_dbg(NULL, address, "cow write");
                return VM_FAULT_FALLBACK;
+       }
        /* If the PMD would extend outside the VMA */
-       if (pmd_addr < vma->vm_start)
+       if (pmd_addr < vma->vm_start) {
+               dax_pmd_dbg(NULL, address, "vma start unaligned");
                return VM_FAULT_FALLBACK;
-       if ((pmd_addr + PMD_SIZE) > vma->vm_end)
+       }
+       if ((pmd_addr + PMD_SIZE) > vma->vm_end) {
+               dax_pmd_dbg(NULL, address, "vma end unaligned");
                return VM_FAULT_FALLBACK;
+       }
 
        pgoff = linear_page_index(vma, pmd_addr);
        size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
        if (pgoff >= size)
                return VM_FAULT_SIGBUS;
        /* If the PMD would cover blocks out of the file */
-       if ((pgoff | PG_PMD_COLOUR) >= size)
+       if ((pgoff | PG_PMD_COLOUR) >= size) {
+               dax_pmd_dbg(NULL, address,
+                               "offset + huge page size > file size");
                return VM_FAULT_FALLBACK;
+       }
 
        memset(&bh, 0, sizeof(bh));
        block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);
 
        bh.b_size = PMD_SIZE;
-       length = get_block(inode, block, &bh, write);
-       if (length)
+       if (get_block(inode, block, &bh, write) != 0)
                return VM_FAULT_SIGBUS;
+       bdev = bh.b_bdev;
        i_mmap_lock_read(mapping);
 
        /*
@@ -576,8 +637,10 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
         * just fall back to PTEs.  Calling get_block 512 times in a loop
         * would be silly.
         */
-       if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE)
+       if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) {
+               dax_pmd_dbg(&bh, address, "allocated block too small");
                goto fallback;
+       }
 
        /*
         * If we allocated new storage, make sure no process has any
@@ -600,57 +663,82 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
                result = VM_FAULT_SIGBUS;
                goto out;
        }
-       if ((pgoff | PG_PMD_COLOUR) >= size)
+       if ((pgoff | PG_PMD_COLOUR) >= size) {
+               dax_pmd_dbg(&bh, address, "pgoff unaligned");
                goto fallback;
+       }
 
        if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) {
                spinlock_t *ptl;
                pmd_t entry;
                struct page *zero_page = get_huge_zero_page();
 
-               if (unlikely(!zero_page))
+               if (unlikely(!zero_page)) {
+                       dax_pmd_dbg(&bh, address, "no zero page");
                        goto fallback;
+               }
 
                ptl = pmd_lock(vma->vm_mm, pmd);
                if (!pmd_none(*pmd)) {
                        spin_unlock(ptl);
+                       dax_pmd_dbg(&bh, address, "pmd already present");
                        goto fallback;
                }
 
+               dev_dbg(part_to_dev(bdev->bd_part),
+                               "%s: %s addr: %lx pfn: <zero> sect: %llx\n",
+                               __func__, current->comm, address,
+                               (unsigned long long) to_sector(&bh, inode));
+
                entry = mk_pmd(zero_page, vma->vm_page_prot);
                entry = pmd_mkhuge(entry);
                set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry);
                result = VM_FAULT_NOPAGE;
                spin_unlock(ptl);
        } else {
-               sector = bh.b_blocknr << (blkbits - 9);
-               length = bdev_direct_access(bh.b_bdev, sector, &kaddr, &pfn,
-                                               bh.b_size);
+               struct blk_dax_ctl dax = {
+                       .sector = to_sector(&bh, inode),
+                       .size = PMD_SIZE,
+               };
+               long length = dax_map_atomic(bdev, &dax);
+
                if (length < 0) {
                        result = VM_FAULT_SIGBUS;
                        goto out;
                }
-               if ((length < PMD_SIZE) || (pfn & PG_PMD_COLOUR))
+               if (length < PMD_SIZE) {
+                       dax_pmd_dbg(&bh, address, "dax-length too small");
+                       dax_unmap_atomic(bdev, &dax);
                        goto fallback;
+               }
+               if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) {
+                       dax_pmd_dbg(&bh, address, "pfn unaligned");
+                       dax_unmap_atomic(bdev, &dax);
+                       goto fallback;
+               }
 
-               /*
-                * TODO: teach vmf_insert_pfn_pmd() to support
-                * 'pte_special' for pmds
-                */
-               if (pfn_valid(pfn))
+               if (!pfn_t_devmap(dax.pfn)) {
+                       dax_unmap_atomic(bdev, &dax);
+                       dax_pmd_dbg(&bh, address, "pfn not in memmap");
                        goto fallback;
+               }
 
                if (buffer_unwritten(&bh) || buffer_new(&bh)) {
-                       int i;
-                       for (i = 0; i < PTRS_PER_PMD; i++)
-                               clear_pmem(kaddr + i * PAGE_SIZE, PAGE_SIZE);
+                       clear_pmem(dax.addr, PMD_SIZE);
                        wmb_pmem();
                        count_vm_event(PGMAJFAULT);
                        mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
                        result |= VM_FAULT_MAJOR;
                }
-
-               result |= vmf_insert_pfn_pmd(vma, address, pmd, pfn, write);
+               dax_unmap_atomic(bdev, &dax);
+
+               dev_dbg(part_to_dev(bdev->bd_part),
+                               "%s: %s addr: %lx pfn: %lx sect: %llx\n",
+                               __func__, current->comm, address,
+                               pfn_t_to_pfn(dax.pfn),
+                               (unsigned long long) dax.sector);
+               result |= vmf_insert_pfn_pmd(vma, address, pmd,
+                               dax.pfn, write);
        }
 
  out:
@@ -752,12 +840,17 @@ int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
        if (err < 0)
                return err;
        if (buffer_written(&bh)) {
-               void __pmem *addr;
-               err = dax_get_addr(&bh, &addr, inode->i_blkbits);
-               if (err < 0)
-                       return err;
-               clear_pmem(addr + offset, length);
+               struct block_device *bdev = bh.b_bdev;
+               struct blk_dax_ctl dax = {
+                       .sector = to_sector(&bh, inode),
+                       .size = PAGE_CACHE_SIZE,
+               };
+
+               if (dax_map_atomic(bdev, &dax) < 0)
+                       return PTR_ERR(dax.addr);
+               clear_pmem(dax.addr + offset, length);
                wmb_pmem();
+               dax_unmap_atomic(bdev, &dax);
        }
 
        return 0;
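Every successful dax_map_atomic() above must be paired with dax_unmap_atomic(): the map side takes a q_usage_counter reference via blk_queue_enter(), which is what keeps a potentially hot-removable pmem device's memory live while dax touches it, and the ERR_PTR sentinel in .addr makes unmap a safe no-op on paths that never mapped. The canonical shape, taken from dax_zero_page_range() above:

	struct blk_dax_ctl dax = {
		.sector = to_sector(&bh, inode),
		.size = PAGE_CACHE_SIZE,
	};

	if (dax_map_atomic(bdev, &dax) < 0)
		return PTR_ERR(dax.addr);	/* .addr holds the errno */
	/* ... access dax.addr ... */
	dax_unmap_atomic(bdev, &dax);		/* no-op if .addr is an ERR_PTR */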
index 023f6a1f23cd034810aa26acde809cde118841f2..6915c950e6e8aeefb7c6162e7e7410de98f4e309 100644 (file)
@@ -677,9 +677,7 @@ void wbc_account_io(struct writeback_control *wbc, struct page *page,
        if (!wbc->wb)
                return;
 
-       rcu_read_lock();
        id = mem_cgroup_css_from_page(page)->id;
-       rcu_read_unlock();
 
        if (id == wbc->wb_id) {
                wbc->wb_bytes += bytes;
index 47789292a582f84ef7dc98be8fb29285bcd466af..8bbf7f3e2a27e0669e7f7fd5624a2cb5b50ce0fe 100644 (file)
@@ -324,11 +324,48 @@ static void remove_huge_page(struct page *page)
        delete_from_page_cache(page);
 }
 
+static void
+hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
+{
+       struct vm_area_struct *vma;
+
+       /*
+        * end == 0 indicates that the entire range after
+        * start should be unmapped.
+        */
+       vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
+               unsigned long v_offset;
+               unsigned long v_end;
+
+               /*
+                * Can the expression below overflow on 32-bit arches?
+                * No, because the interval tree returns us only those vmas
+                * which overlap the truncated area starting at pgoff,
+                * and no vma on a 32-bit arch can span beyond the 4GB.
+                */
+               if (vma->vm_pgoff < start)
+                       v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
+               else
+                       v_offset = 0;
+
+               if (!end)
+                       v_end = vma->vm_end;
+               else {
+                       v_end = ((end - vma->vm_pgoff) << PAGE_SHIFT)
+                                                       + vma->vm_start;
+                       if (v_end > vma->vm_end)
+                               v_end = vma->vm_end;
+               }
+
+               unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end,
+                                                                       NULL);
+       }
+}
 
 /*
  * remove_inode_hugepages handles two distinct cases: truncation and hole
  * punch.  There are subtle differences in operation for each case.
-
+ *
  * truncation is indicated by end of range being LLONG_MAX
  *     In this case, we first scan the range and release found pages.
  *     After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
@@ -379,6 +416,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
 
                for (i = 0; i < pagevec_count(&pvec); ++i) {
                        struct page *page = pvec.pages[i];
+                       bool rsv_on_error;
                        u32 hash;
 
                        /*
@@ -395,37 +433,43 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
                                                        mapping, next, 0);
                        mutex_lock(&hugetlb_fault_mutex_table[hash]);
 
-                       lock_page(page);
-                       if (likely(!page_mapped(page))) {
-                               bool rsv_on_error = !PagePrivate(page);
-                               /*
-                                * We must free the huge page and remove
-                                * from page cache (remove_huge_page) BEFORE
-                                * removing the region/reserve map
-                                * (hugetlb_unreserve_pages).  In rare out
-                                * of memory conditions, removal of the
-                                * region/reserve map could fail.  Before
-                                * free'ing the page, note PagePrivate which
-                                * is used in case of error.
-                                */
-                               remove_huge_page(page);
-                               freed++;
-                               if (!truncate_op) {
-                                       if (unlikely(hugetlb_unreserve_pages(
-                                                       inode, next,
-                                                       next + 1, 1)))
-                                               hugetlb_fix_reserve_counts(
-                                                       inode, rsv_on_error);
-                               }
-                       } else {
-                               /*
-                                * If page is mapped, it was faulted in after
-                                * being unmapped.  It indicates a race between
-                                * hole punch and page fault.  Do nothing in
-                                * this case.  Getting here in a truncate
-                                * operation is a bug.
-                                */
+                       /*
+                        * If page is mapped, it was faulted in after being
+                        * unmapped in caller.  Unmap (again) now after taking
+                        * the fault mutex.  The mutex will prevent faults
+                        * until we finish removing the page.
+                        *
+                        * This race can only happen in the hole punch case.
+                        * Getting here in a truncate operation is a bug.
+                        */
+                       if (unlikely(page_mapped(page))) {
                                BUG_ON(truncate_op);
+
+                               i_mmap_lock_write(mapping);
+                               hugetlb_vmdelete_list(&mapping->i_mmap,
+                                       next * pages_per_huge_page(h),
+                                       (next + 1) * pages_per_huge_page(h));
+                               i_mmap_unlock_write(mapping);
+                       }
+
+                       lock_page(page);
+                       /*
+                        * We must free the huge page and remove from page
+                        * cache (remove_huge_page) BEFORE removing the
+                        * region/reserve map (hugetlb_unreserve_pages).  In
+                        * rare out of memory conditions, removal of the
+                        * region/reserve map could fail.  Before free'ing
+                        * the page, note PagePrivate which is used in case
+                        * of error.
+                        */
+                       rsv_on_error = !PagePrivate(page);
+                       remove_huge_page(page);
+                       freed++;
+                       if (!truncate_op) {
+                               if (unlikely(hugetlb_unreserve_pages(inode,
+                                                       next, next + 1, 1)))
+                                       hugetlb_fix_reserve_counts(inode,
+                                                               rsv_on_error);
                        }
 
                        unlock_page(page);
@@ -452,41 +496,6 @@ static void hugetlbfs_evict_inode(struct inode *inode)
        clear_inode(inode);
 }
 
-static inline void
-hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
-{
-       struct vm_area_struct *vma;
-
-       /*
-        * end == 0 indicates that the entire range after
-        * start should be unmapped.
-        */
-       vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
-               unsigned long v_offset;
-
-               /*
-                * Can the expression below overflow on 32-bit arches?
-                * No, because the interval tree returns us only those vmas
-                * which overlap the truncated area starting at pgoff,
-                * and no vma on a 32-bit arch can span beyond the 4GB.
-                */
-               if (vma->vm_pgoff < start)
-                       v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
-               else
-                       v_offset = 0;
-
-               if (end) {
-                       end = ((end - start) << PAGE_SHIFT) +
-                              vma->vm_start + v_offset;
-                       if (end > vma->vm_end)
-                               end = vma->vm_end;
-               } else
-                       end = vma->vm_end;
-
-               unmap_hugepage_range(vma, vma->vm_start + v_offset, end, NULL);
-       }
-}
-
 static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
 {
        pgoff_t pgoff;
@@ -708,7 +717,7 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb,
 /*
  * Hugetlbfs is not reclaimable; therefore its i_mmap_rwsem will never
  * be taken from reclaim -- unlike regular filesystems. This needs an
- * annotation because huge_pmd_share() does an allocation under
+ * annotation because huge_pmd_share() does an allocation under hugetlb's
  * i_mmap_rwsem.
  */
 static struct lock_class_key hugetlbfs_i_mmap_rwsem_key;
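A worked example of the v_offset/v_end arithmetic in hugetlb_vmdelete_list() above, with illustrative numbers: for 2MB huge pages (pages_per_huge_page = 512, PAGE_SHIFT = 12) and a vma with vm_pgoff = 0, punching huge page index 1 calls the function with start = 512 and end = 1024 (base-page units, per the remove_inode_hugepages() call site), so:

	v_offset = (512 - 0) << 12;			/* 2MB into the vma */
	v_end    = ((1024 - 0) << 12) + vma->vm_start;	/* vm_start + 4MB, clamped to vm_end */

	unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end, NULL);

i.e. exactly the second huge page of the mapping is unmapped, which is what lets remove_inode_hugepages() close the race with page faults by unmapping again under the fault mutex.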
index 93484034a03d04c38cc5ff7779fb95e7611fbd09..b2855eea54050655818a424b850bc4d94d5d8f47 100644 (file)
@@ -103,9 +103,9 @@ u64 stable_page_flags(struct page *page)
         * pseudo flags for the well known (anonymous) memory mapped pages
         *
         * Note that page->_mapcount is overloaded in SLOB/SLUB/SLQB, so the
-        * simple test in page_mapped() is not enough.
+        * simple test in page_mapcount() is not enough.
         */
-       if (!PageSlab(page) && page_mapped(page))
+       if (!PageSlab(page) && page_mapcount(page))
                u |= 1 << KPF_MMAP;
        if (PageAnon(page))
                u |= 1 << KPF_ANON;
index a353b4c6e86e5d24007ac4f9ead1d80d577037d6..65a1b6c69c111e26bae44cccd7961a303ecdc993 100644 (file)
@@ -466,9 +466,10 @@ struct mem_size_stats {
 };
 
 static void smaps_account(struct mem_size_stats *mss, struct page *page,
-               unsigned long size, bool young, bool dirty)
+               bool compound, bool young, bool dirty)
 {
-       int mapcount;
+       int i, nr = compound ? HPAGE_PMD_NR : 1;
+       unsigned long size = nr * PAGE_SIZE;
 
        if (PageAnon(page))
                mss->anonymous += size;
@@ -477,23 +478,37 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
        /* Accumulate the size in pages that have been accessed. */
        if (young || page_is_young(page) || PageReferenced(page))
                mss->referenced += size;
-       mapcount = page_mapcount(page);
-       if (mapcount >= 2) {
-               u64 pss_delta;
 
-               if (dirty || PageDirty(page))
-                       mss->shared_dirty += size;
-               else
-                       mss->shared_clean += size;
-               pss_delta = (u64)size << PSS_SHIFT;
-               do_div(pss_delta, mapcount);
-               mss->pss += pss_delta;
-       } else {
+       /*
+        * page_count(page) == 1 guarantees the page is mapped exactly once.
+        * If any subpage of the compound page is mapped with a PTE it would
+        * elevate page_count().
+        */
+       if (page_count(page) == 1) {
                if (dirty || PageDirty(page))
                        mss->private_dirty += size;
                else
                        mss->private_clean += size;
                mss->pss += (u64)size << PSS_SHIFT;
+               return;
+       }
+
+       for (i = 0; i < nr; i++, page++) {
+               int mapcount = page_mapcount(page);
+
+               if (mapcount >= 2) {
+                       if (dirty || PageDirty(page))
+                               mss->shared_dirty += PAGE_SIZE;
+                       else
+                               mss->shared_clean += PAGE_SIZE;
+                       mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount;
+               } else {
+                       if (dirty || PageDirty(page))
+                               mss->private_dirty += PAGE_SIZE;
+                       else
+                               mss->private_clean += PAGE_SIZE;
+                       mss->pss += PAGE_SIZE << PSS_SHIFT;
+               }
        }
 }
 
@@ -554,7 +569,8 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
 
        if (!page)
                return;
-       smaps_account(mss, page, PAGE_SIZE, pte_young(*pte), pte_dirty(*pte));
+
+       smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte));
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -570,8 +586,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
        if (IS_ERR_OR_NULL(page))
                return;
        mss->anonymous_thp += HPAGE_PMD_SIZE;
-       smaps_account(mss, page, HPAGE_PMD_SIZE,
-                       pmd_young(*pmd), pmd_dirty(*pmd));
+       smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd));
 }
 #else
 static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
@@ -587,7 +602,7 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
        pte_t *pte;
        spinlock_t *ptl;
 
-       if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+       if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
                smaps_pmd_entry(pmd, addr, walk);
                spin_unlock(ptl);
                return 0;
@@ -898,7 +913,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
        spinlock_t *ptl;
        struct page *page;
 
-       if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+       if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
                if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
                        clear_soft_dirty_pmd(vma, addr, pmd);
                        goto out;
@@ -1172,7 +1187,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
        int err = 0;
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-       if (pmd_trans_huge_lock(pmdp, vma, &ptl) == 1) {
+       if (pmd_trans_huge_lock(pmdp, vma, &ptl)) {
                u64 flags = 0, frame = 0;
                pmd_t pmd = *pmdp;
 
@@ -1504,7 +1519,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
        pte_t *orig_pte;
        pte_t *pte;
 
-       if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+       if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
                pte_t huge_pte = *(pte_t *)pmd;
                struct page *page;
 
index d4a61d8dc021e6e70a3a1317ef793753de8165ed..bc045c7994e1bf6fe98af3a475afaeb4ecd8f16a 100644 (file)
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -219,7 +219,7 @@ SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, stat
 #  define choose_32_64(a,b) b
 #endif
 
-#define valid_dev(x)  choose_32_64(old_valid_dev,new_valid_dev)(x)
+#define valid_dev(x)  choose_32_64(old_valid_dev(x),true)
 #define encode_dev(x) choose_32_64(old_encode_dev,new_encode_dev)(x)
 
 #ifndef INIT_STRUCT_STAT_PADDING
index 3a6803cb0ec9848c31a5a200be24f3e928be520a..0b3c0d39ef753053bb26c1b9fb4979e706240a58 100644 (file)
@@ -1,6 +1,8 @@
 #ifndef _ASM_GENERIC_PGTABLE_H
 #define _ASM_GENERIC_PGTABLE_H
 
+#include <linux/pfn.h>
+
 #ifndef __ASSEMBLY__
 #ifdef CONFIG_MMU
 
@@ -207,11 +209,6 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 #endif
 
-#ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH
-extern void pmdp_splitting_flush(struct vm_area_struct *vma,
-                                unsigned long address, pmd_t *pmdp);
-#endif
-
 #ifndef pmdp_collapse_flush
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
@@ -554,7 +551,7 @@ static inline int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
  * by vm_insert_pfn().
  */
 static inline int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
-                                  unsigned long pfn)
+                                  pfn_t pfn)
 {
        return 0;
 }
@@ -589,7 +586,7 @@ extern int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
                           unsigned long pfn, unsigned long addr,
                           unsigned long size);
 extern int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
-                           unsigned long pfn);
+                           pfn_t pfn);
 extern int track_pfn_copy(struct vm_area_struct *vma);
 extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
                        unsigned long size);
@@ -627,10 +624,6 @@ static inline int pmd_trans_huge(pmd_t pmd)
 {
        return 0;
 }
-static inline int pmd_trans_splitting(pmd_t pmd)
-{
-       return 0;
-}
 #ifndef __HAVE_ARCH_PMD_WRITE
 static inline int pmd_write(pmd_t pmd)
 {
index b58fd667f87bc7c7b093c3cac1d293d78d89ec4d..af0254c0942476f67e92c08f8e75918e529b4994 100644 (file)
@@ -4,6 +4,7 @@
 /* References to section boundaries */
 
 #include <linux/compiler.h>
+#include <linux/types.h>
 
 /*
  * Usage guidelines:
@@ -63,4 +64,68 @@ static inline int arch_is_kernel_data(unsigned long addr)
 }
 #endif
 
+/**
+ * memory_contains - checks if an object is contained within a memory region
+ * @begin: virtual address of the beginning of the memory region
+ * @end: virtual address of the end of the memory region
+ * @virt: virtual address of the memory object
+ * @size: size of the memory object
+ *
+ * Returns: true if the object specified by @virt and @size is entirely
+ * contained within the memory region defined by @begin and @end, false
+ * otherwise.
+ */
+static inline bool memory_contains(void *begin, void *end, void *virt,
+                                  size_t size)
+{
+       return virt >= begin && virt + size <= end;
+}
+
+/**
+ * memory_intersects - checks if the region occupied by an object intersects
+ *                     with another memory region
+ * @begin: virtual address of the beginning of the memory region
+ * @end: virtual address of the end of the memory region
+ * @virt: virtual address of the memory object
+ * @size: size of the memory object
+ *
+ * Returns: true if an object's memory region, specified by @virt and @size,
+ * intersects with the region specified by @begin and @end, false otherwise.
+ */
+static inline bool memory_intersects(void *begin, void *end, void *virt,
+                                    size_t size)
+{
+       void *vend = virt + size;
+
+       return (virt >= begin && virt < end) || (vend >= begin && vend < end);
+}
+
+/**
+ * init_section_contains - checks if an object is contained within the init
+ *                         section
+ * @virt: virtual address of the memory object
+ * @size: size of the memory object
+ *
+ * Returns: true if the object specified by @virt and @size is entirely
+ * contained within the init section, false otherwise.
+ */
+static inline bool init_section_contains(void *virt, size_t size)
+{
+       return memory_contains(__init_begin, __init_end, virt, size);
+}
+
+/**
+ * init_section_intersects - checks if the region occupied by an object
+ *                           intersects with the init section
+ * @virt: virtual address of the memory object
+ * @size: size of the memory object
+ *
+ * Returns: true if an object's memory region, specified by @virt and @size,
+ * intersects with the init section, false otherwise.
+ */
+static inline bool init_section_intersects(void *virt, size_t size)
+{
+       return memory_intersects(__init_begin, __init_end, virt, size);
+}
+
 #endif /* _ASM_GENERIC_SECTIONS_H_ */
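The two predicates differ only in whether partial overlap counts. A standalone illustration of the semantics, using the helpers defined above (plain C, not kernel code):

	char region[64];

	memory_contains(region, region + 64, region + 8, 16);	/* true: fully inside */
	memory_contains(region, region + 64, region + 60, 16);	/* false: runs past end */
	memory_intersects(region, region + 64, region + 60, 16);	/* true: partial overlap */

init_section_contains()/init_section_intersects() are the same checks bound to the __init_begin/__init_end linker symbols, e.g. for deciding whether an address may still be dereferenced after init memory is freed.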
index c70e3588a48c723f4b7cd8536c0dabfc02871d5c..bfb64d672e1976142aec9171da9bcf388505bd53 100644 (file)
@@ -15,6 +15,7 @@
 #include <linux/backing-dev-defs.h>
 #include <linux/wait.h>
 #include <linux/mempool.h>
+#include <linux/pfn.h>
 #include <linux/bio.h>
 #include <linux/stringify.h>
 #include <linux/gfp.h>
@@ -1617,6 +1618,20 @@ static inline bool integrity_req_gap_front_merge(struct request *req,
 
 #endif /* CONFIG_BLK_DEV_INTEGRITY */
 
+/**
+ * struct blk_dax_ctl - control and output parameters for ->direct_access
+ * @sector: (input) offset relative to a block_device
+ * @addr: (output) kernel virtual address for @sector populated by driver
+ * @pfn: (output) page frame number for @addr populated by driver
+ * @size: (input) number of bytes requested
+ */
+struct blk_dax_ctl {
+       sector_t sector;
+       void __pmem *addr;
+       long size;
+       pfn_t pfn;
+};
+
 struct block_device_operations {
        int (*open) (struct block_device *, fmode_t);
        void (*release) (struct gendisk *, fmode_t);
@@ -1624,7 +1639,7 @@ struct block_device_operations {
        int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
        int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
        long (*direct_access)(struct block_device *, sector_t, void __pmem **,
-                       unsigned long *pfn);
+                       pfn_t *);
        unsigned int (*check_events) (struct gendisk *disk,
                                      unsigned int clearing);
        /* ->media_changed() is DEPRECATED, use ->check_events() instead */
@@ -1643,8 +1658,7 @@ extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int,
 extern int bdev_read_page(struct block_device *, sector_t, struct page *);
 extern int bdev_write_page(struct block_device *, sector_t, struct page *,
                                                struct writeback_control *);
-extern long bdev_direct_access(struct block_device *, sector_t,
-               void __pmem **addr, unsigned long *pfn, long size);
+extern long bdev_direct_access(struct block_device *, struct blk_dax_ctl *);
 #else /* CONFIG_BLOCK */
 
 struct block_device;
index bd194343c3460d85730e53ef340ca8337e5827a1..ea731af2451ee3607c16fda14d50fa684a54f03b 100644 (file)
@@ -150,6 +150,7 @@ extern int console_trylock(void);
 extern void console_unlock(void);
 extern void console_conditional_schedule(void);
 extern void console_unblank(void);
+extern void console_flush_on_panic(void);
 extern struct tty_driver *console_device(int *);
 extern void console_stop(struct console *);
 extern void console_start(struct console *);
index a729120644d59d639ff00f4ad5f095e6b024df28..56762ab4171331f87f6ca9930a32d816b7aba581 100644 (file)
@@ -37,7 +37,7 @@ static inline bool __must_check IS_ERR(__force const void *ptr)
 
 static inline bool __must_check IS_ERR_OR_NULL(__force const void *ptr)
 {
-       return !ptr || IS_ERR_VALUE((unsigned long)ptr);
+       return unlikely(!ptr) || IS_ERR_VALUE((unsigned long)ptr);
 }
 
 /**
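
The added unlikely() only shifts branch prediction; the calling idiom is unchanged. A short sketch of that idiom, where foo_create() is an illustrative constructor that may return NULL (feature absent) or an ERR_PTR() (hard failure):

    struct foo *f = foo_create();

    if (IS_ERR_OR_NULL(f))
            return f ? PTR_ERR(f) : -ENODEV;
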
index ecb080d6ff42077513f03b95537dc108bded9e07..cfe81e10bd5429baf5ed0ccf50bd4613ff6ee3c6 100644 (file)
@@ -19,13 +19,16 @@ extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
                                          unsigned long addr,
                                          pmd_t *pmd,
                                          unsigned int flags);
+extern int madvise_free_huge_pmd(struct mmu_gather *tlb,
+                       struct vm_area_struct *vma,
+                       pmd_t *pmd, unsigned long addr, unsigned long next);
 extern int zap_huge_pmd(struct mmu_gather *tlb,
                        struct vm_area_struct *vma,
                        pmd_t *pmd, unsigned long addr);
 extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
                        unsigned long addr, unsigned long end,
                        unsigned char *vec);
-extern int move_huge_pmd(struct vm_area_struct *vma,
+extern bool move_huge_pmd(struct vm_area_struct *vma,
                         struct vm_area_struct *new_vma,
                         unsigned long old_addr,
                         unsigned long new_addr, unsigned long old_end,
@@ -34,8 +37,7 @@ extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
                        unsigned long addr, pgprot_t newprot,
                        int prot_numa);
 int vmf_insert_pfn_pmd(struct vm_area_struct *, unsigned long addr, pmd_t *,
-                       unsigned long pfn, bool write);
-
+                       pfn_t pfn, bool write);
 enum transparent_hugepage_flag {
        TRANSPARENT_HUGEPAGE_FLAG,
        TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
@@ -48,21 +50,13 @@ enum transparent_hugepage_flag {
 #endif
 };
 
-enum page_check_address_pmd_flag {
-       PAGE_CHECK_ADDRESS_PMD_FLAG,
-       PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG,
-       PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG,
-};
-extern pmd_t *page_check_address_pmd(struct page *page,
-                                    struct mm_struct *mm,
-                                    unsigned long address,
-                                    enum page_check_address_pmd_flag flag,
-                                    spinlock_t **ptl);
-
 #define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
 #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
+struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
+               pmd_t *pmd, int flags);
+
 #define HPAGE_PMD_SHIFT PMD_SHIFT
 #define HPAGE_PMD_SIZE ((1UL) << HPAGE_PMD_SHIFT)
 #define HPAGE_PMD_MASK (~(HPAGE_PMD_SIZE - 1))
@@ -95,30 +89,28 @@ extern bool is_vma_temporary_stack(struct vm_area_struct *vma);
 #endif /* CONFIG_DEBUG_VM */
 
 extern unsigned long transparent_hugepage_flags;
-extern int split_huge_page_to_list(struct page *page, struct list_head *list);
+
+extern void prep_transhuge_page(struct page *page);
+extern void free_transhuge_page(struct page *page);
+
+int split_huge_page_to_list(struct page *page, struct list_head *list);
 static inline int split_huge_page(struct page *page)
 {
        return split_huge_page_to_list(page, NULL);
 }
-extern void __split_huge_page_pmd(struct vm_area_struct *vma,
-               unsigned long address, pmd_t *pmd);
-#define split_huge_page_pmd(__vma, __address, __pmd)                   \
+void deferred_split_huge_page(struct page *page);
+
+void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+               unsigned long address);
+
+#define split_huge_pmd(__vma, __pmd, __address)                                \
        do {                                                            \
                pmd_t *____pmd = (__pmd);                               \
-               if (unlikely(pmd_trans_huge(*____pmd)))                 \
-                       __split_huge_page_pmd(__vma, __address,         \
-                                       ____pmd);                       \
+               if (pmd_trans_huge(*____pmd)                            \
+                                       || pmd_devmap(*____pmd))        \
+                       __split_huge_pmd(__vma, __pmd, __address);      \
        }  while (0)
-#define wait_split_huge_page(__anon_vma, __pmd)                                \
-       do {                                                            \
-               pmd_t *____pmd = (__pmd);                               \
-               anon_vma_lock_write(__anon_vma);                        \
-               anon_vma_unlock_write(__anon_vma);                      \
-               BUG_ON(pmd_trans_splitting(*____pmd) ||                 \
-                      pmd_trans_huge(*____pmd));                       \
-       } while (0)
-extern void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
-               pmd_t *pmd);
+
 #if HPAGE_PMD_ORDER >= MAX_ORDER
 #error "hugepages can't be allocated by the buddy allocator"
 #endif
@@ -128,17 +120,17 @@ extern void vma_adjust_trans_huge(struct vm_area_struct *vma,
                                    unsigned long start,
                                    unsigned long end,
                                    long adjust_next);
-extern int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
+extern bool __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
                spinlock_t **ptl);
 /* mmap_sem must be held on entry */
-static inline int pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
+static inline bool pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
                spinlock_t **ptl)
 {
        VM_BUG_ON_VMA(!rwsem_is_locked(&vma->vm_mm->mmap_sem), vma);
-       if (pmd_trans_huge(*pmd))
+       if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd))
                return __pmd_trans_huge_lock(pmd, vma, ptl);
        else
-               return 0;
+               return false;
 }
 static inline int hpage_nr_pages(struct page *page)
 {
@@ -183,11 +175,8 @@ static inline int split_huge_page(struct page *page)
 {
        return 0;
 }
-#define split_huge_page_pmd(__vma, __address, __pmd)   \
-       do { } while (0)
-#define wait_split_huge_page(__anon_vma, __pmd)        \
-       do { } while (0)
-#define split_huge_page_pmd_mm(__mm, __address, __pmd) \
+static inline void deferred_split_huge_page(struct page *page) {}
+#define split_huge_pmd(__vma, __pmd, __address)        \
        do { } while (0)
 static inline int hugepage_madvise(struct vm_area_struct *vma,
                                   unsigned long *vm_flags, int advice)
@@ -201,10 +190,10 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
                                         long adjust_next)
 {
 }
-static inline int pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
+static inline bool pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
                spinlock_t **ptl)
 {
-       return 0;
+       return false;
 }
 
 static inline int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -218,6 +207,12 @@ static inline bool is_huge_zero_page(struct page *page)
        return false;
 }
 
+
+static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma,
+               unsigned long addr, pmd_t *pmd, int flags)
+{
+       return NULL;
+}
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 #endif /* _LINUX_HUGE_MM_H */
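
A minimal sketch of the reworked pmd-walk pattern, assuming the caller holds mmap_sem as the VM_BUG_ON above requires; walk_one_pmd() is an illustrative name:

    static void walk_one_pmd(struct vm_area_struct *vma, pmd_t *pmd,
                             unsigned long addr)
    {
            spinlock_t *ptl;

            if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
                    /* a huge (or devmap) pmd, with ptl held */
                    spin_unlock(ptl);
                    return;
            }
            /*
             * Not (or no longer) huge: split_huge_pmd() would be a no-op
             * here, so process the ptes under this pmd instead.
             */
    }
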
index e76574d8f9b586127f8534d93a7c778bc89e8c0d..7d953c2542a8f296fafbcb59b2bac323c7a64407 100644 (file)
@@ -8,6 +8,7 @@
 #include <linux/cgroup.h>
 #include <linux/list.h>
 #include <linux/kref.h>
+#include <asm/pgtable.h>
 
 struct ctl_table;
 struct user_struct;
index de64c1e536125fc30296fba6678e028f9bf7d529..fffd88d7f4269852277e7d9bed29e86c1a9f9490 100644 (file)
@@ -89,21 +89,6 @@ void devm_memunmap(struct device *dev, void *addr);
 
 void *__devm_memremap_pages(struct device *dev, struct resource *res);
 
-#ifdef CONFIG_ZONE_DEVICE
-void *devm_memremap_pages(struct device *dev, struct resource *res);
-#else
-static inline void *devm_memremap_pages(struct device *dev, struct resource *res)
-{
-       /*
-        * Fail attempts to call devm_memremap_pages() without
-        * ZONE_DEVICE support enabled, this requires callers to fall
-        * back to plain devm_memremap() based on config
-        */
-       WARN_ON_ONCE(1);
-       return ERR_PTR(-ENXIO);
-}
-#endif
-
 /*
  * Some systems do not have legacy ISA devices.
  * /dev/port is not a valid interface on these systems.
index 052c7b32cc91d3ebd051d3018739227e303967b1..8e9e288b08c13f6cddc63cf33724248b2e1150e8 100644 (file)
@@ -35,11 +35,6 @@ static inline dev_t old_decode_dev(u16 val)
        return MKDEV((val >> 8) & 255, val & 255);
 }
 
-static inline bool new_valid_dev(dev_t dev)
-{
-       return 1;
-}
-
 static inline u32 new_encode_dev(dev_t dev)
 {
        unsigned major = MAJOR(dev);
index 7311c3294e25f22a610209a63115c3d0778ecd0b..f31638c6e8738230edebe2606ff27d8782fc1a19 100644 (file)
@@ -202,26 +202,26 @@ extern int _cond_resched(void);
 
 /**
  * abs - return absolute value of an argument
- * @x: the value.  If it is unsigned type, it is converted to signed type first
- *   (s64, long or int depending on its size).
+ * @x: the value.  If it is of an unsigned type, it is converted to a signed type first.
+ *     char is treated as if it were signed (regardless of whether it really is),
+ *     but the macro's return type is preserved as char.
  *
- * Return: an absolute value of x.  If x is 64-bit, macro's return type is s64,
- *   otherwise it is signed long.
+ * Return: the absolute value of @x.
  */
-#define abs(x) __builtin_choose_expr(sizeof(x) == sizeof(s64), ({      \
-               s64 __x = (x);                                          \
-               (__x < 0) ? -__x : __x;                                 \
-       }), ({                                                          \
-               long ret;                                               \
-               if (sizeof(x) == sizeof(long)) {                        \
-                       long __x = (x);                                 \
-                       ret = (__x < 0) ? -__x : __x;                   \
-               } else {                                                \
-                       int __x = (x);                                  \
-                       ret = (__x < 0) ? -__x : __x;                   \
-               }                                                       \
-               ret;                                                    \
-       }))
+#define abs(x) __abs_choose_expr(x, long long,                         \
+               __abs_choose_expr(x, long,                              \
+               __abs_choose_expr(x, int,                               \
+               __abs_choose_expr(x, short,                             \
+               __abs_choose_expr(x, char,                              \
+               __builtin_choose_expr(                                  \
+                       __builtin_types_compatible_p(typeof(x), char),  \
+                       (char)({ signed char __x = (x); __x<0?-__x:__x; }), \
+                       ((void)0)))))))
+
+#define __abs_choose_expr(x, type, other) __builtin_choose_expr(       \
+       __builtin_types_compatible_p(typeof(x),   signed type) ||       \
+       __builtin_types_compatible_p(typeof(x), unsigned type),         \
+       ({ signed type __x = (x); __x < 0 ? -__x : __x; }), other)
 
 /**
  * reciprocal_scale - "scale" a value into range [0, ep_ro)
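
The rewrite dispatches on the type of @x through nested __builtin_choose_expr()s, so the result keeps the caller's width instead of always promoting to long/s64. A user-space check of that dispatch (GCC or clang required for the builtins; the macro bodies are copied from the hunk above, the test program itself is illustrative):

    #include <stdio.h>

    #define __abs_choose_expr(x, type, other) __builtin_choose_expr(       \
            __builtin_types_compatible_p(typeof(x),   signed type) ||      \
            __builtin_types_compatible_p(typeof(x), unsigned type),        \
            ({ signed type __x = (x); __x < 0 ? -__x : __x; }), other)

    #define abs(x) __abs_choose_expr(x, long long,                         \
                    __abs_choose_expr(x, long,                             \
                    __abs_choose_expr(x, int,                              \
                    __abs_choose_expr(x, short,                            \
                    __abs_choose_expr(x, char,                             \
                    __builtin_choose_expr(                                 \
                            __builtin_types_compatible_p(typeof(x), char), \
                            (char)({ signed char __x = (x); __x<0?-__x:__x; }), \
                            ((void)0)))))))

    int main(void)
    {
            short s = -3;
            unsigned int u = 7;             /* converted to signed int */
            long long ll = -(1LL << 33);

            printf("%d %d %lld\n", abs(s), abs(u), abs(ll));
            /* prints: 3 7 8589934592 */
            return 0;
    }
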
index f707f74055c3bde9766b8e801a59e9ee5242e4fb..861f690aa79118a0203d90f896bf860e3374820e 100644 (file)
@@ -66,7 +66,7 @@
  * error pfns indicate that the gfn is in a slot but we failed to
  * translate it to a pfn on the host.
  */
-static inline bool is_error_pfn(pfn_t pfn)
+static inline bool is_error_pfn(kvm_pfn_t pfn)
 {
        return !!(pfn & KVM_PFN_ERR_MASK);
 }
@@ -76,13 +76,13 @@ static inline bool is_error_pfn(pfn_t pfn)
  * translated to pfn - it is not in slot or failed to
  * translate it to pfn.
  */
-static inline bool is_error_noslot_pfn(pfn_t pfn)
+static inline bool is_error_noslot_pfn(kvm_pfn_t pfn)
 {
        return !!(pfn & KVM_PFN_ERR_NOSLOT_MASK);
 }
 
 /* noslot pfn indicates that the gfn is not in slot. */
-static inline bool is_noslot_pfn(pfn_t pfn)
+static inline bool is_noslot_pfn(kvm_pfn_t pfn)
 {
        return pfn == KVM_PFN_NOSLOT;
 }
@@ -591,19 +591,20 @@ void kvm_release_page_clean(struct page *page);
 void kvm_release_page_dirty(struct page *page);
 void kvm_set_page_accessed(struct page *page);
 
-pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn);
-pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
-pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
+kvm_pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn);
+kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
+kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
                      bool *writable);
-pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn);
-pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn);
-pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic,
-                          bool *async, bool write_fault, bool *writable);
+kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn);
+kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn);
+kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
+                              bool atomic, bool *async, bool write_fault,
+                              bool *writable);
 
-void kvm_release_pfn_clean(pfn_t pfn);
-void kvm_set_pfn_dirty(pfn_t pfn);
-void kvm_set_pfn_accessed(pfn_t pfn);
-void kvm_get_pfn(pfn_t pfn);
+void kvm_release_pfn_clean(kvm_pfn_t pfn);
+void kvm_set_pfn_dirty(kvm_pfn_t pfn);
+void kvm_set_pfn_accessed(kvm_pfn_t pfn);
+void kvm_get_pfn(kvm_pfn_t pfn);
 
 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
                        int len);
@@ -629,8 +630,8 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn);
 
 struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu);
 struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn);
-pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn);
-pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn);
+kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn);
+kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn);
 struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn);
 unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn);
 unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable);
@@ -811,7 +812,7 @@ void kvm_arch_sync_events(struct kvm *kvm);
 int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu);
 void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
 
-bool kvm_is_reserved_pfn(pfn_t pfn);
+bool kvm_is_reserved_pfn(kvm_pfn_t pfn);
 
 struct kvm_irq_ack_notifier {
        struct hlist_node link;
@@ -965,7 +966,7 @@ static inline gfn_t gpa_to_gfn(gpa_t gpa)
        return (gfn_t)(gpa >> PAGE_SHIFT);
 }
 
-static inline hpa_t pfn_to_hpa(pfn_t pfn)
+static inline hpa_t pfn_to_hpa(kvm_pfn_t pfn)
 {
        return (hpa_t)pfn << PAGE_SHIFT;
 }
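
Beyond the s/pfn_t/kvm_pfn_t/ rename (freeing the old name for the new mm-level type in pfn.h further down) the calling convention is unchanged. A minimal sketch of the usual error-checking idiom with the new type; map_guest_frame() is illustrative:

    static int map_guest_frame(struct kvm *kvm, gfn_t gfn)
    {
            kvm_pfn_t pfn = gfn_to_pfn(kvm, gfn);

            if (is_error_noslot_pfn(pfn))
                    return -EFAULT; /* no slot, or translation failed */
            /* ... use pfn ... */
            kvm_release_pfn_clean(pfn);
            return 0;
    }
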
index 1b47a185c2f0b5a567c6cbb97f96d2d050cf79cb..8bf259dae9f6cc278bb2c8623507c452f8d61694 100644 (file)
@@ -53,7 +53,7 @@ typedef unsigned long  hva_t;
 typedef u64            hpa_t;
 typedef u64            hfn_t;
 
-typedef hfn_t pfn_t;
+typedef hfn_t kvm_pfn_t;
 
 struct gfn_to_hva_cache {
        u64 generation;
index 5356f4d661a721ba0446b1183e2a834f3bf3b56f..30cf4200ab40ee40fdae694291e36fe869b508bc 100644 (file)
@@ -113,6 +113,17 @@ extern void __list_del_entry(struct list_head *entry);
 extern void list_del(struct list_head *entry);
 #endif
 
+#ifdef CONFIG_DEBUG_LIST
+/*
+ * See devm_memremap_pages(), which wants DEBUG_LIST=y to assert if one
+ * of the pages it allocates is ever passed to list_add()
+ */
+extern void list_force_poison(struct list_head *entry);
+#else
+/* fallback to the less strict LIST_POISON* definitions */
+#define list_force_poison list_del
+#endif
+
 /**
  * list_replace - replace old entry by new one
  * @old : the element to be replaced
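
A one-line sketch of the intended use, mirroring the devm_memremap_pages() case the comment describes: poison the entry so that any later list_add() trips the DEBUG_LIST assertion (with DEBUG_LIST=n it simply degrades to list_del()).

    list_force_poison(&page->lru);  /* this page must never join a list again */
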
index 173fb44e22f1393d63a3daff70d3f8dfee3c7a00..3106ac1c895e0853c3be61b4edee37160f25ca03 100644 (file)
@@ -61,6 +61,14 @@ extern int memblock_debug;
 extern bool movable_node_enabled;
 #endif /* CONFIG_MOVABLE_NODE */
 
+#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
+#define __init_memblock __meminit
+#define __initdata_memblock __meminitdata
+#else
+#define __init_memblock
+#define __initdata_memblock
+#endif
+
 #define memblock_dbg(fmt, ...) \
        if (memblock_debug) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
 
@@ -166,7 +174,7 @@ static inline bool memblock_is_hotpluggable(struct memblock_region *m)
        return m->flags & MEMBLOCK_HOTPLUG;
 }
 
-static inline bool movable_node_is_enabled(void)
+static inline bool __init_memblock movable_node_is_enabled(void)
 {
        return movable_node_enabled;
 }
@@ -405,14 +413,6 @@ static inline unsigned long memblock_region_reserved_end_pfn(const struct memblo
        for (idx = 0; idx < memblock_type->cnt;                         \
             idx++,rgn = &memblock_type->regions[idx])
 
-#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
-#define __init_memblock __meminit
-#define __initdata_memblock __meminitdata
-#else
-#define __init_memblock
-#define __initdata_memblock
-#endif
-
 #ifdef CONFIG_MEMTEST
 extern void early_memtest(phys_addr_t start, phys_addr_t end);
 #else
index 2292468f2a305a030480be2349b7769934005e93..189f04d4d2ecc36c706299dd11ecbd58bd64e315 100644 (file)
@@ -280,10 +280,12 @@ static inline void mem_cgroup_events(struct mem_cgroup *memcg,
 bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg);
 
 int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
-                         gfp_t gfp_mask, struct mem_cgroup **memcgp);
+                         gfp_t gfp_mask, struct mem_cgroup **memcgp,
+                         bool compound);
 void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
-                             bool lrucare);
-void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg);
+                             bool lrucare, bool compound);
+void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
+               bool compound);
 void mem_cgroup_uncharge(struct page *page);
 void mem_cgroup_uncharge_list(struct list_head *page_list);
 
@@ -515,7 +517,8 @@ static inline bool mem_cgroup_low(struct mem_cgroup *root,
 
 static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
                                        gfp_t gfp_mask,
-                                       struct mem_cgroup **memcgp)
+                                       struct mem_cgroup **memcgp,
+                                       bool compound)
 {
        *memcgp = NULL;
        return 0;
@@ -523,12 +526,13 @@ static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
 
 static inline void mem_cgroup_commit_charge(struct page *page,
                                            struct mem_cgroup *memcg,
-                                           bool lrucare)
+                                           bool lrucare, bool compound)
 {
 }
 
 static inline void mem_cgroup_cancel_charge(struct page *page,
-                                           struct mem_cgroup *memcg)
+                                           struct mem_cgroup *memcg,
+                                           bool compound)
 {
 }
 
index 2ea574ff97146446729ab6fbe66333e3df3117a0..43405992d0277367eb55afeb847745acc09ed968 100644 (file)
@@ -275,7 +275,8 @@ extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages);
 extern bool is_memblock_offlined(struct memory_block *mem);
 extern void remove_memory(int nid, u64 start, u64 size);
 extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn);
-extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms);
+extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms,
+               unsigned long map_offset);
 extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map,
                                          unsigned long pnum);
 
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
new file mode 100644 (file)
index 0000000..bcaa634
--- /dev/null
@@ -0,0 +1,114 @@
+#ifndef _LINUX_MEMREMAP_H_
+#define _LINUX_MEMREMAP_H_
+#include <linux/mm.h>
+#include <linux/ioport.h>
+#include <linux/percpu-refcount.h>
+
+struct resource;
+struct device;
+
+/**
+ * struct vmem_altmap - pre-allocated storage for vmemmap_populate
+ * @base_pfn: base of the entire dev_pagemap mapping
+ * @reserve: pages mapped, but reserved for driver use (relative to @base_pfn)
+ * @free: free pages set aside in the mapping for memmap storage
+ * @align: pages reserved to meet allocation alignments
+ * @alloc: track pages consumed, private to vmemmap_populate()
+ */
+struct vmem_altmap {
+       const unsigned long base_pfn;
+       const unsigned long reserve;
+       unsigned long free;
+       unsigned long align;
+       unsigned long alloc;
+};
+
+unsigned long vmem_altmap_offset(struct vmem_altmap *altmap);
+void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns);
+
+#if defined(CONFIG_SPARSEMEM_VMEMMAP) && defined(CONFIG_ZONE_DEVICE)
+struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start);
+#else
+static inline struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
+{
+       return NULL;
+}
+#endif
+
+/**
+ * struct dev_pagemap - metadata for ZONE_DEVICE mappings
+ * @altmap: pre-allocated/reserved memory for vmemmap allocations
+ * @res: physical address range covered by @ref
+ * @ref: reference count that pins the devm_memremap_pages() mapping
+ * @dev: host device of the mapping for debug
+ */
+struct dev_pagemap {
+       struct vmem_altmap *altmap;
+       const struct resource *res;
+       struct percpu_ref *ref;
+       struct device *dev;
+};
+
+#ifdef CONFIG_ZONE_DEVICE
+void *devm_memremap_pages(struct device *dev, struct resource *res,
+               struct percpu_ref *ref, struct vmem_altmap *altmap);
+struct dev_pagemap *find_dev_pagemap(resource_size_t phys);
+#else
+static inline void *devm_memremap_pages(struct device *dev,
+               struct resource *res, struct percpu_ref *ref,
+               struct vmem_altmap *altmap)
+{
+       /*
+        * Fail attempts to call devm_memremap_pages() without
+        * ZONE_DEVICE support enabled; this requires callers to fall
+        * back to plain devm_memremap() based on config
+        */
+       WARN_ON_ONCE(1);
+       return ERR_PTR(-ENXIO);
+}
+
+static inline struct dev_pagemap *find_dev_pagemap(resource_size_t phys)
+{
+       return NULL;
+}
+#endif
+
+/**
+ * get_dev_pagemap() - take a new live reference on the dev_pagemap for @pfn
+ * @pfn: page frame number for which to look up the pgmap
+ * @pgmap: optional known pgmap that already has a reference
+ *
+ * @pgmap allows the overhead of a lookup to be bypassed when @pfn lands in the
+ * same mapping.
+ */
+static inline struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
+               struct dev_pagemap *pgmap)
+{
+       const struct resource *res = pgmap ? pgmap->res : NULL;
+       resource_size_t phys = PFN_PHYS(pfn);
+
+       /*
+        * In the cached case we're already holding a live reference so
+        * we can simply do a blind increment
+        */
+       if (res && phys >= res->start && phys <= res->end) {
+               percpu_ref_get(pgmap->ref);
+               return pgmap;
+       }
+
+       /* fall back to slow path lookup */
+       rcu_read_lock();
+       pgmap = find_dev_pagemap(phys);
+       if (pgmap && !percpu_ref_tryget_live(pgmap->ref))
+               pgmap = NULL;
+       rcu_read_unlock();
+
+       return pgmap;
+}
+
+static inline void put_dev_pagemap(struct dev_pagemap *pgmap)
+{
+       if (pgmap)
+               percpu_ref_put(pgmap->ref);
+}
+#endif /* _LINUX_MEMREMAP_H_ */
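
A hedged sketch of how a ZONE_DEVICE consumer might use this new header; the surrounding driver, its percpu_ref lifecycle and its resource setup are assumed, not shown:

    /* map a device range and get struct pages for it */
    static void *sketch_map_pmem(struct device *dev, struct resource *res,
                                 struct percpu_ref *ref)
    {
            /* NULL altmap: the memmap is allocated from regular memory */
            return devm_memremap_pages(dev, res, ref, NULL);
    }

    /* pin the hosting mapping before touching a ZONE_DEVICE pfn */
    static bool sketch_pin_pfn(unsigned long pfn)
    {
            struct dev_pagemap *pgmap = get_dev_pagemap(pfn, NULL);

            if (!pgmap)
                    return false;   /* not a ZONE_DEVICE pfn */
            /* ... the pfn's struct page is safe to use here ... */
            put_dev_pagemap(pgmap);
            return true;
    }
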
index 839d9e9a1c38618c9a8a2c5f18682c75d22f1348..f1cd22f2df1ac50438e7d70bb0df85c44580b8d3 100644 (file)
@@ -16,6 +16,7 @@
 #include <linux/mm_types.h>
 #include <linux/range.h>
 #include <linux/pfn.h>
+#include <linux/percpu-refcount.h>
 #include <linux/bit_spinlock.h>
 #include <linux/shrinker.h>
 #include <linux/resource.h>
@@ -329,6 +330,13 @@ struct inode;
 #define page_private(page)             ((page)->private)
 #define set_page_private(page, v)      ((page)->private = (v))
 
+#if !defined(__HAVE_ARCH_PTE_DEVMAP) || !defined(CONFIG_TRANSPARENT_HUGEPAGE)
+static inline int pmd_devmap(pmd_t pmd)
+{
+       return 0;
+}
+#endif
+
 /*
  * FIXME: take this include out, include page-flags.h in
  * files which need it (119 of them)
@@ -410,39 +418,17 @@ static inline int is_vmalloc_or_module_addr(const void *x)
 
 extern void kvfree(const void *addr);
 
-static inline void compound_lock(struct page *page)
+static inline atomic_t *compound_mapcount_ptr(struct page *page)
 {
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-       VM_BUG_ON_PAGE(PageSlab(page), page);
-       bit_spin_lock(PG_compound_lock, &page->flags);
-#endif
+       return &page[1].compound_mapcount;
 }
 
-static inline void compound_unlock(struct page *page)
+static inline int compound_mapcount(struct page *page)
 {
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-       VM_BUG_ON_PAGE(PageSlab(page), page);
-       bit_spin_unlock(PG_compound_lock, &page->flags);
-#endif
-}
-
-static inline unsigned long compound_lock_irqsave(struct page *page)
-{
-       unsigned long uninitialized_var(flags);
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-       local_irq_save(flags);
-       compound_lock(page);
-#endif
-       return flags;
-}
-
-static inline void compound_unlock_irqrestore(struct page *page,
-                                             unsigned long flags)
-{
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-       compound_unlock(page);
-       local_irq_restore(flags);
-#endif
+       if (!PageCompound(page))
+               return 0;
+       page = compound_head(page);
+       return atomic_read(compound_mapcount_ptr(page)) + 1;
 }
 
 /*
@@ -455,61 +441,29 @@ static inline void page_mapcount_reset(struct page *page)
        atomic_set(&(page)->_mapcount, -1);
 }
 
+int __page_mapcount(struct page *page);
+
 static inline int page_mapcount(struct page *page)
 {
        VM_BUG_ON_PAGE(PageSlab(page), page);
-       return atomic_read(&page->_mapcount) + 1;
-}
 
-static inline int page_count(struct page *page)
-{
-       return atomic_read(&compound_head(page)->_count);
-}
-
-static inline bool __compound_tail_refcounted(struct page *page)
-{
-       return PageAnon(page) && !PageSlab(page) && !PageHeadHuge(page);
-}
-
-/*
- * This takes a head page as parameter and tells if the
- * tail page reference counting can be skipped.
- *
- * For this to be safe, PageSlab and PageHeadHuge must remain true on
- * any given page where they return true here, until all tail pins
- * have been released.
- */
-static inline bool compound_tail_refcounted(struct page *page)
-{
-       VM_BUG_ON_PAGE(!PageHead(page), page);
-       return __compound_tail_refcounted(page);
+       if (unlikely(PageCompound(page)))
+               return __page_mapcount(page);
+       return atomic_read(&page->_mapcount) + 1;
 }
 
-static inline void get_huge_page_tail(struct page *page)
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+int total_mapcount(struct page *page);
+#else
+static inline int total_mapcount(struct page *page)
 {
-       /*
-        * __split_huge_page_refcount() cannot run from under us.
-        */
-       VM_BUG_ON_PAGE(!PageTail(page), page);
-       VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
-       VM_BUG_ON_PAGE(atomic_read(&page->_count) != 0, page);
-       if (compound_tail_refcounted(compound_head(page)))
-               atomic_inc(&page->_mapcount);
+       return page_mapcount(page);
 }
+#endif
 
-extern bool __get_page_tail(struct page *page);
-
-static inline void get_page(struct page *page)
+static inline int page_count(struct page *page)
 {
-       if (unlikely(PageTail(page)))
-               if (likely(__get_page_tail(page)))
-                       return;
-       /*
-        * Getting a normal page or the head of a compound page
-        * requires to already have an elevated page->_count.
-        */
-       VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page);
-       atomic_inc(&page->_count);
+       return atomic_read(&compound_head(page)->_count);
 }
 
 static inline struct page *virt_to_head_page(const void *x)
@@ -528,7 +482,8 @@ static inline void init_page_count(struct page *page)
        atomic_set(&page->_count, 1);
 }
 
-void put_page(struct page *page);
+void __put_page(struct page *page);
+
 void put_pages_list(struct list_head *pages);
 
 void split_page(struct page *page, unsigned int order);
@@ -547,6 +502,9 @@ enum compound_dtor_id {
        COMPOUND_PAGE_DTOR,
 #ifdef CONFIG_HUGETLB_PAGE
        HUGETLB_PAGE_DTOR,
+#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       TRANSHUGE_PAGE_DTOR,
 #endif
        NR_COMPOUND_DTORS,
 };
@@ -577,6 +535,8 @@ static inline void set_compound_order(struct page *page, unsigned int order)
        page[1].compound_order = order;
 }
 
+void free_compound_page(struct page *page);
+
 #ifdef CONFIG_MMU
 /*
  * Do pte_mkwrite, but only if the vma says VM_WRITE.  We do this when
@@ -704,6 +664,51 @@ static inline enum zone_type page_zonenum(const struct page *page)
        return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
 }
 
+#ifdef CONFIG_ZONE_DEVICE
+void get_zone_device_page(struct page *page);
+void put_zone_device_page(struct page *page);
+static inline bool is_zone_device_page(const struct page *page)
+{
+       return page_zonenum(page) == ZONE_DEVICE;
+}
+#else
+static inline void get_zone_device_page(struct page *page)
+{
+}
+static inline void put_zone_device_page(struct page *page)
+{
+}
+static inline bool is_zone_device_page(const struct page *page)
+{
+       return false;
+}
+#endif
+
+static inline void get_page(struct page *page)
+{
+       page = compound_head(page);
+       /*
+        * Getting a normal page or the head of a compound page
+        * requires the caller to already hold an elevated page->_count.
+        */
+       VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page);
+       atomic_inc(&page->_count);
+
+       if (unlikely(is_zone_device_page(page)))
+               get_zone_device_page(page);
+}
+
+static inline void put_page(struct page *page)
+{
+       page = compound_head(page);
+
+       if (put_page_testzero(page))
+               __put_page(page);
+
+       if (unlikely(is_zone_device_page(page)))
+               put_zone_device_page(page);
+}
+
 #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
 #define SECTION_IN_PAGE_FLAGS
 #endif
@@ -993,10 +998,21 @@ static inline pgoff_t page_file_index(struct page *page)
 
 /*
  * Return true if this page is mapped into pagetables.
+ * For a compound page it returns true if any subpage of it is mapped.
  */
-static inline int page_mapped(struct page *page)
-{
-       return atomic_read(&(page)->_mapcount) >= 0;
+static inline bool page_mapped(struct page *page)
+{
+       int i;
+       if (likely(!PageCompound(page)))
+               return atomic_read(&page->_mapcount) >= 0;
+       page = compound_head(page);
+       if (atomic_read(compound_mapcount_ptr(page)) >= 0)
+               return true;
+       for (i = 0; i < hpage_nr_pages(page); i++) {
+               if (atomic_read(&page[i]._mapcount) >= 0)
+                       return true;
+       }
+       return false;
 }
 
 /*
@@ -1084,7 +1100,7 @@ static inline bool shmem_mapping(struct address_space *mapping)
 }
 #endif
 
-extern int can_do_mlock(void);
+extern bool can_do_mlock(void);
 extern int user_shm_lock(size_t, struct user_struct *);
 extern void user_shm_unlock(size_t, struct user_struct *);
 
@@ -1178,7 +1194,8 @@ int invalidate_inode_page(struct page *page);
 extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long address, unsigned int flags);
 extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
-                           unsigned long address, unsigned int fault_flags);
+                           unsigned long address, unsigned int fault_flags,
+                           bool *unlocked);
 #else
 static inline int handle_mm_fault(struct mm_struct *mm,
                        struct vm_area_struct *vma, unsigned long address,
@@ -1190,7 +1207,7 @@ static inline int handle_mm_fault(struct mm_struct *mm,
 }
 static inline int fixup_user_fault(struct task_struct *tsk,
                struct mm_struct *mm, unsigned long address,
-               unsigned int fault_flags)
+               unsigned int fault_flags, bool *unlocked)
 {
        /* should never happen if there's no MMU */
        BUG();
@@ -1444,6 +1461,13 @@ static inline void sync_mm_rss(struct mm_struct *mm)
 }
 #endif
 
+#ifndef __HAVE_ARCH_PTE_DEVMAP
+static inline int pte_devmap(pte_t pte)
+{
+       return 0;
+}
+#endif
+
 int vma_wants_writenotify(struct vm_area_struct *vma);
 
 extern pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
@@ -2114,7 +2138,7 @@ int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *);
 int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
                        unsigned long pfn);
 int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
-                       unsigned long pfn);
+                       pfn_t pfn);
 int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len);
 
 
@@ -2224,7 +2248,14 @@ pud_t *vmemmap_pud_populate(pgd_t *pgd, unsigned long addr, int node);
 pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node);
 pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node);
 void *vmemmap_alloc_block(unsigned long size, int node);
-void *vmemmap_alloc_block_buf(unsigned long size, int node);
+struct vmem_altmap;
+void *__vmemmap_alloc_block_buf(unsigned long size, int node,
+               struct vmem_altmap *altmap);
+static inline void *vmemmap_alloc_block_buf(unsigned long size, int node)
+{
+       return __vmemmap_alloc_block_buf(size, node, NULL);
+}
+
 void vmemmap_verify(pte_t *, int, unsigned long, unsigned long);
 int vmemmap_populate_basepages(unsigned long start, unsigned long end,
                               int node);
@@ -2246,7 +2277,7 @@ extern int memory_failure(unsigned long pfn, int trapno, int flags);
 extern void memory_failure_queue(unsigned long pfn, int trapno, int flags);
 extern int unpoison_memory(unsigned long pfn);
 extern int get_hwpoison_page(struct page *page);
-extern void put_hwpoison_page(struct page *page);
+#define put_hwpoison_page(page)        put_page(page)
 extern int sysctl_memory_failure_early_kill;
 extern int sysctl_memory_failure_recovery;
 extern void shake_page(struct page *p, int access);
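
With the rework above, get_page()/put_page() always act on the compound head, so pinning any tail page pins the whole compound page. A minimal sketch:

    static void pin_and_release(struct page *page)
    {
            get_page(page);         /* refcounts compound_head(page) */
            /* ... access the page ... */
            put_page(page);         /* __put_page() runs when _count drops to zero */
    }
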
index 6bc9a0ce22530cd260c5d50736875aedabd70f76..d3ebb9d21a5334d26e85bc865d318535f5864569 100644 (file)
@@ -54,6 +54,8 @@ struct page {
                                                 * see PAGE_MAPPING_ANON below.
                                                 */
                void *s_mem;                    /* slab first object */
+               atomic_t compound_mapcount;     /* first tail page */
+               /* page_deferred_list().next     -- second tail page */
        };
 
        /* Second double word */
@@ -61,6 +63,7 @@ struct page {
                union {
                        pgoff_t index;          /* Our offset within mapping. */
                        void *freelist;         /* sl[aou]b first free object */
+                       /* page_deferred_list().prev    -- second tail page */
                };
 
                union {
@@ -81,20 +84,9 @@ struct page {
 
                                union {
                                        /*
-                                        * Count of ptes mapped in
-                                        * mms, to show when page is
-                                        * mapped & limit reverse map
-                                        * searches.
-                                        *
-                                        * Used also for tail pages
-                                        * refcounting instead of
-                                        * _count. Tail pages cannot
-                                        * be mapped and keeping the
-                                        * tail page _count zero at
-                                        * all times guarantees
-                                        * get_page_unless_zero() will
-                                        * never succeed on tail
-                                        * pages.
+                                        * Count of ptes mapped in mms, to show
+                                        * when page is mapped & limit reverse
+                                        * map searches.
                                         */
                                        atomic_t _mapcount;
 
@@ -124,6 +116,11 @@ struct page {
                                         * Can be used as a generic list
                                         * by the page owner.
                                         */
+               struct dev_pagemap *pgmap; /* ZONE_DEVICE pages are never on an
+                                           * lru or handled by a slab
+                                           * allocator, this points to the
+                                           * hosting device page map.
+                                           */
                struct {                /* slub per cpu partial pages */
                        struct page *next;      /* Next partial slab */
 #ifdef CONFIG_64BIT
index 772362adf4713b77be041b6232097c97a7f5aad7..053824b0a412e0c2ca371e3844c1534b913fb6d0 100644 (file)
@@ -56,4 +56,10 @@ void dump_mm(const struct mm_struct *mm);
 #define VIRTUAL_BUG_ON(cond) do { } while (0)
 #endif
 
+#ifdef CONFIG_DEBUG_VM_PGFLAGS
+#define VM_BUG_ON_PGFLAGS(cond, page) VM_BUG_ON_PAGE(cond, page)
+#else
+#define VM_BUG_ON_PGFLAGS(cond, page) BUILD_BUG_ON_INVALID(cond)
+#endif
+
 #endif
index bb53c7b863152468196a390eb269449a03949e4d..19724e6ebd2651e514216033db6acf808d3a440d 100644 (file)
@@ -101,9 +101,6 @@ enum pageflags {
 #ifdef CONFIG_MEMORY_FAILURE
        PG_hwpoison,            /* hardware poisoned page. Don't touch */
 #endif
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-       PG_compound_lock,
-#endif
 #if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
        PG_young,
        PG_idle,
@@ -129,53 +126,104 @@ enum pageflags {
 
        /* SLOB */
        PG_slob_free = PG_private,
+
+       /* Compound pages. Stored in first tail page's flags */
+       PG_double_map = PG_private_2,
 };
 
 #ifndef __GENERATING_BOUNDS_H
 
+struct page;   /* forward declaration */
+
+static inline struct page *compound_head(struct page *page)
+{
+       unsigned long head = READ_ONCE(page->compound_head);
+
+       if (unlikely(head & 1))
+               return (struct page *) (head - 1);
+       return page;
+}
+
+static inline int PageTail(struct page *page)
+{
+       return READ_ONCE(page->compound_head) & 1;
+}
+
+static inline int PageCompound(struct page *page)
+{
+       return test_bit(PG_head, &page->flags) || PageTail(page);
+}
+
+/*
+ * Page flags policies wrt compound pages
+ *
+ * PF_ANY:
+ *     the page flag is relevant for small, head and tail pages.
+ *
+ * PF_HEAD:
+ *     for compound pages, all operations related to the page flag are
+ *     applied to the head page.
+ *
+ * PF_NO_TAIL:
+ *     modifications of the page flag must be done on small or head pages;
+ *     checks can be done on tail pages too.
+ *
+ * PF_NO_COMPOUND:
+ *     the page flag is not relevant for compound pages.
+ */
+#define PF_ANY(page, enforce)  page
+#define PF_HEAD(page, enforce) compound_head(page)
+#define PF_NO_TAIL(page, enforce) ({                                   \
+               VM_BUG_ON_PGFLAGS(enforce && PageTail(page), page);     \
+               compound_head(page);})
+#define PF_NO_COMPOUND(page, enforce) ({                               \
+               VM_BUG_ON_PGFLAGS(enforce && PageCompound(page), page); \
+               page;})
+
 /*
  * Macros to create function definitions for page flags
  */
-#define TESTPAGEFLAG(uname, lname)                                     \
-static inline int Page##uname(const struct page *page)                 \
-                       { return test_bit(PG_##lname, &page->flags); }
+#define TESTPAGEFLAG(uname, lname, policy)                             \
+static inline int Page##uname(struct page *page)                       \
+       { return test_bit(PG_##lname, &policy(page, 0)->flags); }
 
-#define SETPAGEFLAG(uname, lname)                                      \
+#define SETPAGEFLAG(uname, lname, policy)                              \
 static inline void SetPage##uname(struct page *page)                   \
-                       { set_bit(PG_##lname, &page->flags); }
+       { set_bit(PG_##lname, &policy(page, 1)->flags); }
 
-#define CLEARPAGEFLAG(uname, lname)                                    \
+#define CLEARPAGEFLAG(uname, lname, policy)                            \
 static inline void ClearPage##uname(struct page *page)                 \
-                       { clear_bit(PG_##lname, &page->flags); }
+       { clear_bit(PG_##lname, &policy(page, 1)->flags); }
 
-#define __SETPAGEFLAG(uname, lname)                                    \
+#define __SETPAGEFLAG(uname, lname, policy)                            \
 static inline void __SetPage##uname(struct page *page)                 \
-                       { __set_bit(PG_##lname, &page->flags); }
+       { __set_bit(PG_##lname, &policy(page, 1)->flags); }
 
-#define __CLEARPAGEFLAG(uname, lname)                                  \
+#define __CLEARPAGEFLAG(uname, lname, policy)                          \
 static inline void __ClearPage##uname(struct page *page)               \
-                       { __clear_bit(PG_##lname, &page->flags); }
+       { __clear_bit(PG_##lname, &policy(page, 1)->flags); }
 
-#define TESTSETFLAG(uname, lname)                                      \
+#define TESTSETFLAG(uname, lname, policy)                              \
 static inline int TestSetPage##uname(struct page *page)                        \
-               { return test_and_set_bit(PG_##lname, &page->flags); }
+       { return test_and_set_bit(PG_##lname, &policy(page, 1)->flags); }
 
-#define TESTCLEARFLAG(uname, lname)                                    \
+#define TESTCLEARFLAG(uname, lname, policy)                            \
 static inline int TestClearPage##uname(struct page *page)              \
-               { return test_and_clear_bit(PG_##lname, &page->flags); }
-
-#define __TESTCLEARFLAG(uname, lname)                                  \
-static inline int __TestClearPage##uname(struct page *page)            \
-               { return __test_and_clear_bit(PG_##lname, &page->flags); }
+       { return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); }
 
-#define PAGEFLAG(uname, lname) TESTPAGEFLAG(uname, lname)              \
-       SETPAGEFLAG(uname, lname) CLEARPAGEFLAG(uname, lname)
+#define PAGEFLAG(uname, lname, policy)                                 \
+       TESTPAGEFLAG(uname, lname, policy)                              \
+       SETPAGEFLAG(uname, lname, policy)                               \
+       CLEARPAGEFLAG(uname, lname, policy)
 
-#define __PAGEFLAG(uname, lname) TESTPAGEFLAG(uname, lname)            \
-       __SETPAGEFLAG(uname, lname)  __CLEARPAGEFLAG(uname, lname)
+#define __PAGEFLAG(uname, lname, policy)                               \
+       TESTPAGEFLAG(uname, lname, policy)                              \
+       __SETPAGEFLAG(uname, lname, policy)                             \
+       __CLEARPAGEFLAG(uname, lname, policy)
 
-#define TESTSCFLAG(uname, lname)                                       \
-       TESTSETFLAG(uname, lname) TESTCLEARFLAG(uname, lname)
+#define TESTSCFLAG(uname, lname, policy)                               \
+       TESTSETFLAG(uname, lname, policy)                               \
+       TESTCLEARFLAG(uname, lname, policy)
 
 #define TESTPAGEFLAG_FALSE(uname)                                      \
 static inline int Page##uname(const struct page *page) { return 0; }
@@ -195,56 +243,62 @@ static inline int TestSetPage##uname(struct page *page) { return 0; }
 #define TESTCLEARFLAG_FALSE(uname)                                     \
 static inline int TestClearPage##uname(struct page *page) { return 0; }
 
-#define __TESTCLEARFLAG_FALSE(uname)                                   \
-static inline int __TestClearPage##uname(struct page *page) { return 0; }
-
 #define PAGEFLAG_FALSE(uname) TESTPAGEFLAG_FALSE(uname)                        \
        SETPAGEFLAG_NOOP(uname) CLEARPAGEFLAG_NOOP(uname)
 
 #define TESTSCFLAG_FALSE(uname)                                                \
        TESTSETFLAG_FALSE(uname) TESTCLEARFLAG_FALSE(uname)
 
-struct page;   /* forward declaration */
-
-TESTPAGEFLAG(Locked, locked)
-PAGEFLAG(Error, error) TESTCLEARFLAG(Error, error)
-PAGEFLAG(Referenced, referenced) TESTCLEARFLAG(Referenced, referenced)
-       __SETPAGEFLAG(Referenced, referenced)
-PAGEFLAG(Dirty, dirty) TESTSCFLAG(Dirty, dirty) __CLEARPAGEFLAG(Dirty, dirty)
-PAGEFLAG(LRU, lru) __CLEARPAGEFLAG(LRU, lru)
-PAGEFLAG(Active, active) __CLEARPAGEFLAG(Active, active)
-       TESTCLEARFLAG(Active, active)
-__PAGEFLAG(Slab, slab)
-PAGEFLAG(Checked, checked)             /* Used by some filesystems */
-PAGEFLAG(Pinned, pinned) TESTSCFLAG(Pinned, pinned)    /* Xen */
-PAGEFLAG(SavePinned, savepinned);                      /* Xen */
-PAGEFLAG(Foreign, foreign);                            /* Xen */
-PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved)
-PAGEFLAG(SwapBacked, swapbacked) __CLEARPAGEFLAG(SwapBacked, swapbacked)
-       __SETPAGEFLAG(SwapBacked, swapbacked)
-
-__PAGEFLAG(SlobFree, slob_free)
+__PAGEFLAG(Locked, locked, PF_NO_TAIL)
+PAGEFLAG(Error, error, PF_NO_COMPOUND) TESTCLEARFLAG(Error, error, PF_NO_COMPOUND)
+PAGEFLAG(Referenced, referenced, PF_HEAD)
+       TESTCLEARFLAG(Referenced, referenced, PF_HEAD)
+       __SETPAGEFLAG(Referenced, referenced, PF_HEAD)
+PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD)
+       __CLEARPAGEFLAG(Dirty, dirty, PF_HEAD)
+PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD)
+PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD)
+       TESTCLEARFLAG(Active, active, PF_HEAD)
+__PAGEFLAG(Slab, slab, PF_NO_TAIL)
+__PAGEFLAG(SlobFree, slob_free, PF_NO_TAIL)
+PAGEFLAG(Checked, checked, PF_NO_COMPOUND)        /* Used by some filesystems */
+
+/* Xen */
+PAGEFLAG(Pinned, pinned, PF_NO_COMPOUND)
+       TESTSCFLAG(Pinned, pinned, PF_NO_COMPOUND)
+PAGEFLAG(SavePinned, savepinned, PF_NO_COMPOUND);
+PAGEFLAG(Foreign, foreign, PF_NO_COMPOUND);
+
+PAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
+       __CLEARPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
+PAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
+       __CLEARPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
+       __SETPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
 
 /*
  * Private page markings that may be used by the filesystem that owns the page
  * for its own purposes.
  * - PG_private and PG_private_2 cause releasepage() and co to be invoked
  */
-PAGEFLAG(Private, private) __SETPAGEFLAG(Private, private)
-       __CLEARPAGEFLAG(Private, private)
-PAGEFLAG(Private2, private_2) TESTSCFLAG(Private2, private_2)
-PAGEFLAG(OwnerPriv1, owner_priv_1) TESTCLEARFLAG(OwnerPriv1, owner_priv_1)
+PAGEFLAG(Private, private, PF_ANY) __SETPAGEFLAG(Private, private, PF_ANY)
+       __CLEARPAGEFLAG(Private, private, PF_ANY)
+PAGEFLAG(Private2, private_2, PF_ANY) TESTSCFLAG(Private2, private_2, PF_ANY)
+PAGEFLAG(OwnerPriv1, owner_priv_1, PF_ANY)
+       TESTCLEARFLAG(OwnerPriv1, owner_priv_1, PF_ANY)
 
 /*
  * Only test-and-set exists for PG_writeback.  The unconditional operators are
  * risky: they bypass page accounting.
  */
-TESTPAGEFLAG(Writeback, writeback) TESTSCFLAG(Writeback, writeback)
-PAGEFLAG(MappedToDisk, mappedtodisk)
+TESTPAGEFLAG(Writeback, writeback, PF_NO_COMPOUND)
+       TESTSCFLAG(Writeback, writeback, PF_NO_COMPOUND)
+PAGEFLAG(MappedToDisk, mappedtodisk, PF_NO_COMPOUND)
 
 /* PG_readahead is only used for reads; PG_reclaim is only for writes */
-PAGEFLAG(Reclaim, reclaim) TESTCLEARFLAG(Reclaim, reclaim)
-PAGEFLAG(Readahead, reclaim) TESTCLEARFLAG(Readahead, reclaim)
+PAGEFLAG(Reclaim, reclaim, PF_NO_COMPOUND)
+       TESTCLEARFLAG(Reclaim, reclaim, PF_NO_COMPOUND)
+PAGEFLAG(Readahead, reclaim, PF_NO_COMPOUND)
+       TESTCLEARFLAG(Readahead, reclaim, PF_NO_COMPOUND)
 
 #ifdef CONFIG_HIGHMEM
 /*
@@ -257,31 +311,33 @@ PAGEFLAG_FALSE(HighMem)
 #endif
 
 #ifdef CONFIG_SWAP
-PAGEFLAG(SwapCache, swapcache)
+PAGEFLAG(SwapCache, swapcache, PF_NO_COMPOUND)
 #else
 PAGEFLAG_FALSE(SwapCache)
 #endif
 
-PAGEFLAG(Unevictable, unevictable) __CLEARPAGEFLAG(Unevictable, unevictable)
-       TESTCLEARFLAG(Unevictable, unevictable)
+PAGEFLAG(Unevictable, unevictable, PF_HEAD)
+       __CLEARPAGEFLAG(Unevictable, unevictable, PF_HEAD)
+       TESTCLEARFLAG(Unevictable, unevictable, PF_HEAD)
 
 #ifdef CONFIG_MMU
-PAGEFLAG(Mlocked, mlocked) __CLEARPAGEFLAG(Mlocked, mlocked)
-       TESTSCFLAG(Mlocked, mlocked) __TESTCLEARFLAG(Mlocked, mlocked)
+PAGEFLAG(Mlocked, mlocked, PF_NO_TAIL)
+       __CLEARPAGEFLAG(Mlocked, mlocked, PF_NO_TAIL)
+       TESTSCFLAG(Mlocked, mlocked, PF_NO_TAIL)
 #else
 PAGEFLAG_FALSE(Mlocked) __CLEARPAGEFLAG_NOOP(Mlocked)
-       TESTSCFLAG_FALSE(Mlocked) __TESTCLEARFLAG_FALSE(Mlocked)
+       TESTSCFLAG_FALSE(Mlocked)
 #endif
 
 #ifdef CONFIG_ARCH_USES_PG_UNCACHED
-PAGEFLAG(Uncached, uncached)
+PAGEFLAG(Uncached, uncached, PF_NO_COMPOUND)
 #else
 PAGEFLAG_FALSE(Uncached)
 #endif
 
 #ifdef CONFIG_MEMORY_FAILURE
-PAGEFLAG(HWPoison, hwpoison)
-TESTSCFLAG(HWPoison, hwpoison)
+PAGEFLAG(HWPoison, hwpoison, PF_ANY)
+TESTSCFLAG(HWPoison, hwpoison, PF_ANY)
 #define __PG_HWPOISON (1UL << PG_hwpoison)
 #else
 PAGEFLAG_FALSE(HWPoison)
@@ -289,10 +345,10 @@ PAGEFLAG_FALSE(HWPoison)
 #endif
 
 #if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
-TESTPAGEFLAG(Young, young)
-SETPAGEFLAG(Young, young)
-TESTCLEARFLAG(Young, young)
-PAGEFLAG(Idle, idle)
+TESTPAGEFLAG(Young, young, PF_ANY)
+SETPAGEFLAG(Young, young, PF_ANY)
+TESTCLEARFLAG(Young, young, PF_ANY)
+PAGEFLAG(Idle, idle, PF_ANY)
 #endif
 
 /*
@@ -317,6 +373,7 @@ PAGEFLAG(Idle, idle)
 
 static inline int PageAnon(struct page *page)
 {
+       page = compound_head(page);
        return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
 }
 
@@ -329,6 +386,7 @@ static inline int PageAnon(struct page *page)
  */
 static inline int PageKsm(struct page *page)
 {
+       page = compound_head(page);
        return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) ==
                                (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
 }
@@ -340,8 +398,9 @@ u64 stable_page_flags(struct page *page);
 
 static inline int PageUptodate(struct page *page)
 {
-       int ret = test_bit(PG_uptodate, &(page)->flags);
-
+       int ret;
+       page = compound_head(page);
+       ret = test_bit(PG_uptodate, &(page)->flags);
        /*
         * Must ensure that the data we read out of the page is loaded
         * _after_ we've loaded page->flags to check for PageUptodate.
@@ -358,22 +417,24 @@ static inline int PageUptodate(struct page *page)
 
 static inline void __SetPageUptodate(struct page *page)
 {
+       VM_BUG_ON_PAGE(PageTail(page), page);
        smp_wmb();
-       __set_bit(PG_uptodate, &(page)->flags);
+       __set_bit(PG_uptodate, &page->flags);
 }
 
 static inline void SetPageUptodate(struct page *page)
 {
+       VM_BUG_ON_PAGE(PageTail(page), page);
        /*
         * Memory barrier must be issued before setting the PG_uptodate bit,
         * so that all previous stores issued in order to bring the page
         * uptodate are actually visible before PageUptodate becomes true.
         */
        smp_wmb();
-       set_bit(PG_uptodate, &(page)->flags);
+       set_bit(PG_uptodate, &page->flags);
 }
 
-CLEARPAGEFLAG(Uptodate, uptodate)
+CLEARPAGEFLAG(Uptodate, uptodate, PF_NO_TAIL)
 
 int test_clear_page_writeback(struct page *page);
 int __test_set_page_writeback(struct page *page, bool keep_write);
@@ -393,12 +454,7 @@ static inline void set_page_writeback_keepwrite(struct page *page)
        test_set_page_writeback_keepwrite(page);
 }
 
-__PAGEFLAG(Head, head) CLEARPAGEFLAG(Head, head)
-
-static inline int PageTail(struct page *page)
-{
-       return READ_ONCE(page->compound_head) & 1;
-}
+__PAGEFLAG(Head, head, PF_ANY) CLEARPAGEFLAG(Head, head, PF_ANY)
 
 static inline void set_compound_head(struct page *page, struct page *head)
 {
@@ -410,20 +466,6 @@ static inline void clear_compound_head(struct page *page)
        WRITE_ONCE(page->compound_head, 0);
 }
 
-static inline struct page *compound_head(struct page *page)
-{
-       unsigned long head = READ_ONCE(page->compound_head);
-
-       if (unlikely(head & 1))
-               return (struct page *) (head - 1);
-       return page;
-}
-
-static inline int PageCompound(struct page *page)
-{
-       return PageHead(page) || PageTail(page);
-
-}
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 static inline void ClearPageCompound(struct page *page)
 {
@@ -484,22 +526,43 @@ static inline int PageTransTail(struct page *page)
        return PageTail(page);
 }
 
-#else
-
-static inline int PageTransHuge(struct page *page)
+/*
+ * PageDoubleMap indicates that the compound page is mapped with PTEs as well
+ * as PMDs.
+ *
+ * This is required for optimization of rmap operations for THP: we can postpone
+ * per small page mapcount accounting (and its overhead from atomic operations)
+ * until the first PMD split.
+ *
+ * For such pages, PageDoubleMap means ->_mapcount in all sub-pages is offset
+ * up by one. This additional reference goes away with the last compound_mapcount.
+ *
+ * See also __split_huge_pmd_locked() and page_remove_anon_compound_rmap().
+ */
+static inline int PageDoubleMap(struct page *page)
 {
-       return 0;
+       return PageHead(page) && test_bit(PG_double_map, &page[1].flags);
 }
 
-static inline int PageTransCompound(struct page *page)
+static inline int TestSetPageDoubleMap(struct page *page)
 {
-       return 0;
+       VM_BUG_ON_PAGE(!PageHead(page), page);
+       return test_and_set_bit(PG_double_map, &page[1].flags);
 }
 
-static inline int PageTransTail(struct page *page)
+static inline int TestClearPageDoubleMap(struct page *page)
 {
-       return 0;
+       VM_BUG_ON_PAGE(!PageHead(page), page);
+       return test_and_clear_bit(PG_double_map, &page[1].flags);
 }
+
+#else
+TESTPAGEFLAG_FALSE(TransHuge)
+TESTPAGEFLAG_FALSE(TransCompound)
+TESTPAGEFLAG_FALSE(TransTail)
+TESTPAGEFLAG_FALSE(DoubleMap)
+       TESTSETFLAG_FALSE(DoubleMap)
+       TESTCLEARFLAG_FALSE(DoubleMap)
 #endif
 
 /*
@@ -583,12 +646,6 @@ static inline void ClearPageSlabPfmemalloc(struct page *page)
 #define __PG_MLOCKED           0
 #endif
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define __PG_COMPOUND_LOCK             (1 << PG_compound_lock)
-#else
-#define __PG_COMPOUND_LOCK             0
-#endif
-
 /*
  * Flags checked when a page is freed.  Pages being freed should not have
  * these flags set.  It they are, there is a problem.
@@ -598,8 +655,7 @@ static inline void ClearPageSlabPfmemalloc(struct page *page)
         1 << PG_private | 1 << PG_private_2 | \
         1 << PG_writeback | 1 << PG_reserved | \
         1 << PG_slab    | 1 << PG_swapcache | 1 << PG_active | \
-        1 << PG_unevictable | __PG_MLOCKED | \
-        __PG_COMPOUND_LOCK)
+        1 << PG_unevictable | __PG_MLOCKED)
 
 /*
  * Flags checked when a page is prepped for return by the page allocator.
@@ -626,6 +682,10 @@ static inline int page_has_private(struct page *page)
        return !!(page->flags & PAGE_FLAGS_PRIVATE);
 }
 
+#undef PF_ANY
+#undef PF_HEAD
+#undef PF_NO_TAIL
+#undef PF_NO_COMPOUND
 #endif /* !__GENERATING_BOUNDS_H */
 
 #endif /* PAGE_FLAGS_H */
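
For reference, this is roughly what PAGEFLAG(Dirty, dirty, PF_HEAD) generates once the policy macro is applied: every accessor redirects to the head page.

    static inline int PageDirty(struct page *page)
    {
            return test_bit(PG_dirty, &compound_head(page)->flags);
    }
    static inline void SetPageDirty(struct page *page)
    {
            set_bit(PG_dirty, &compound_head(page)->flags);
    }
    static inline void ClearPageDirty(struct page *page)
    {
            clear_bit(PG_dirty, &compound_head(page)->flags);
    }
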
index 26eabf5ec718a457eb9cef5635f5c475cca0c48d..4d08b6c33557250edda8e949ff64f5f2fd1ffdc1 100644 (file)
@@ -394,10 +394,21 @@ static inline struct page *read_mapping_page(struct address_space *mapping,
  */
 static inline pgoff_t page_to_pgoff(struct page *page)
 {
+       pgoff_t pgoff;
+
        if (unlikely(PageHeadHuge(page)))
                return page->index << compound_order(page);
-       else
+
+       if (likely(!PageTransTail(page)))
                return page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+
+       /*
+        *  We don't initialize ->index for tail pages: calculate based on
+        *  head page
+        */
+       pgoff = compound_head(page)->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+       pgoff += page - compound_head(page);
+       return pgoff;
 }
 
 /*
@@ -433,18 +444,9 @@ extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
                                unsigned int flags);
 extern void unlock_page(struct page *page);
 
-static inline void __set_page_locked(struct page *page)
-{
-       __set_bit(PG_locked, &page->flags);
-}
-
-static inline void __clear_page_locked(struct page *page)
-{
-       __clear_bit(PG_locked, &page->flags);
-}
-
 static inline int trylock_page(struct page *page)
 {
+       page = compound_head(page);
        return (likely(!test_and_set_bit_lock(PG_locked, &page->flags)));
 }
 
@@ -497,9 +499,9 @@ extern int wait_on_page_bit_killable_timeout(struct page *page,
 
 static inline int wait_on_page_locked_killable(struct page *page)
 {
-       if (PageLocked(page))
-               return wait_on_page_bit_killable(page, PG_locked);
-       return 0;
+       if (!PageLocked(page))
+               return 0;
+       return wait_on_page_bit_killable(compound_head(page), PG_locked);
 }
 
 extern wait_queue_head_t *page_waitqueue(struct page *page);
@@ -518,7 +520,7 @@ static inline void wake_up_page(struct page *page, int bit)
 static inline void wait_on_page_locked(struct page *page)
 {
        if (PageLocked(page))
-               wait_on_page_bit(page, PG_locked);
+               wait_on_page_bit(compound_head(page), PG_locked);
 }
 
 /* 
@@ -664,17 +666,17 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask);
 
 /*
  * Like add_to_page_cache_locked, but used to add newly allocated pages:
- * the page is new, so we can just run __set_page_locked() against it.
+ * the page is new, so we can just run __SetPageLocked() against it.
  */
 static inline int add_to_page_cache(struct page *page,
                struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask)
 {
        int error;
 
-       __set_page_locked(page);
+       __SetPageLocked(page);
        error = add_to_page_cache_locked(page, mapping, offset, gfp_mask);
        if (unlikely(error))
-               __clear_page_locked(page);
+               __ClearPageLocked(page);
        return error;
 }
 
index 97f3e88aead4acbb7dcb5e3ac4cf2c9e03885e6a..2d8e49711b6392a591ee352b12c162096d4c89f6 100644 (file)
@@ -3,6 +3,15 @@
 
 #ifndef __ASSEMBLY__
 #include <linux/types.h>
+
+/*
+ * pfn_t: encapsulates a page-frame number that is optionally backed
+ * by memmap (struct page).  Whether a pfn_t has a 'struct page'
+ * backing is indicated by flags in the high bits of the value.
+ */
+typedef struct {
+       unsigned long val;
+} pfn_t;
 #endif
 
 #define PFN_ALIGN(x)   (((unsigned long)(x) + (PAGE_SIZE - 1)) & PAGE_MASK)
diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h
new file mode 100644 (file)
index 0000000..0703b53
--- /dev/null
@@ -0,0 +1,102 @@
+#ifndef _LINUX_PFN_T_H_
+#define _LINUX_PFN_T_H_
+#include <linux/mm.h>
+
+/*
+ * PFN_FLAGS_MASK - mask of all the possible valid pfn_t flags
+ * PFN_SG_CHAIN - pfn is a pointer to the next scatterlist entry
+ * PFN_SG_LAST - pfn references a page and is the last scatterlist entry
+ * PFN_DEV - pfn is not covered by system memmap by default
+ * PFN_MAP - pfn has a dynamic page mapping established by a device driver
+ */
+#define PFN_FLAGS_MASK (((unsigned long) ~PAGE_MASK) \
+               << (BITS_PER_LONG - PAGE_SHIFT))
+#define PFN_SG_CHAIN (1UL << (BITS_PER_LONG - 1))
+#define PFN_SG_LAST (1UL << (BITS_PER_LONG - 2))
+#define PFN_DEV (1UL << (BITS_PER_LONG - 3))
+#define PFN_MAP (1UL << (BITS_PER_LONG - 4))
+
+static inline pfn_t __pfn_to_pfn_t(unsigned long pfn, unsigned long flags)
+{
+       pfn_t pfn_t = { .val = pfn | (flags & PFN_FLAGS_MASK), };
+
+       return pfn_t;
+}
+
+/* a default pfn to pfn_t conversion assumes that @pfn is pfn_valid() */
+static inline pfn_t pfn_to_pfn_t(unsigned long pfn)
+{
+       return __pfn_to_pfn_t(pfn, 0);
+}
+
+extern pfn_t phys_to_pfn_t(dma_addr_t addr, unsigned long flags);
+
+static inline bool pfn_t_has_page(pfn_t pfn)
+{
+       return (pfn.val & PFN_MAP) == PFN_MAP || (pfn.val & PFN_DEV) == 0;
+}
+
+static inline unsigned long pfn_t_to_pfn(pfn_t pfn)
+{
+       return pfn.val & ~PFN_FLAGS_MASK;
+}
+
+static inline struct page *pfn_t_to_page(pfn_t pfn)
+{
+       if (pfn_t_has_page(pfn))
+               return pfn_to_page(pfn_t_to_pfn(pfn));
+       return NULL;
+}
+
+static inline dma_addr_t pfn_t_to_phys(pfn_t pfn)
+{
+       return PFN_PHYS(pfn_t_to_pfn(pfn));
+}
+
+static inline void *pfn_t_to_virt(pfn_t pfn)
+{
+       if (pfn_t_has_page(pfn))
+               return __va(pfn_t_to_phys(pfn));
+       return NULL;
+}
+
+static inline pfn_t page_to_pfn_t(struct page *page)
+{
+       return pfn_to_pfn_t(page_to_pfn(page));
+}
+
+static inline int pfn_t_valid(pfn_t pfn)
+{
+       return pfn_valid(pfn_t_to_pfn(pfn));
+}
+
+#ifdef CONFIG_MMU
+static inline pte_t pfn_t_pte(pfn_t pfn, pgprot_t pgprot)
+{
+       return pfn_pte(pfn_t_to_pfn(pfn), pgprot);
+}
+#endif
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static inline pmd_t pfn_t_pmd(pfn_t pfn, pgprot_t pgprot)
+{
+       return pfn_pmd(pfn_t_to_pfn(pfn), pgprot);
+}
+#endif
+
+#ifdef __HAVE_ARCH_PTE_DEVMAP
+static inline bool pfn_t_devmap(pfn_t pfn)
+{
+       const unsigned long flags = PFN_DEV|PFN_MAP;
+
+       return (pfn.val & flags) == flags;
+}
+#else
+static inline bool pfn_t_devmap(pfn_t pfn)
+{
+       return false;
+}
+pte_t pte_mkdevmap(pte_t pte);
+pmd_t pmd_mkdevmap(pmd_t pmd);
+#endif
+#endif /* _LINUX_PFN_T_H_ */
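
A short usage sketch for the new type (the address is hypothetical): the flag bits sit in the top bits of ->val, above any pfn the machine will produce, so the pfn itself round-trips unchanged:

    dma_addr_t phys = 0x100000;
    pfn_t pfn = phys_to_pfn_t(phys, PFN_DEV | PFN_MAP);
    struct page *page = pfn_t_to_page(pfn); /* non-NULL, since PFN_MAP is set */

    BUG_ON(pfn_t_to_pfn(pfn) != phys >> PAGE_SHIFT); /* flags are masked off */
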
index 317e16de09e508ed64b87dae6c5006b1efcb129d..4a27153574e2839546fb40ee8e8afea483e33cef 100644 (file)
  * Magic number "tsta" to indicate a static timer initializer
  * for the object debugging code.
  */
-#define TIMER_ENTRY_STATIC     ((void *) 0x74737461)
+#define TIMER_ENTRY_STATIC     ((void *) 0x300 + POISON_POINTER_DELTA)
 
 /********** mm/debug-pagealloc.c **********/
 #define PAGE_POISON 0xaa
 
+/********** mm/page_alloc.c ************/
+
+#define TAIL_MAPPING   ((void *) 0x400 + POISON_POINTER_DELTA)
+
 /********** mm/slab.c **********/
 /*
  * Magic nums for obj red zoning.
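
Each poison value is a small, distinct offset added to POISON_POINTER_DELTA (that is, CONFIG_ILLEGAL_POINTER_VALUE, or 0 on architectures that leave it unset), so a stray dereference faults and the faulting address identifies which poison was hit. On x86_64, for example:

    /* x86_64 sets CONFIG_ILLEGAL_POINTER_VALUE to 0xdead000000000000, so:
     *   TIMER_ENTRY_STATIC == (void *)0xdead000000000300
     *   TAIL_MAPPING       == (void *)0xdead000000000400
     */
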
index 9729565c25ff19accc05ca6419bbc6f50f8cdb53..9ccbdf2c1453f4897ea884d4a89ea5a8bb04ede8 100644 (file)
@@ -106,13 +106,13 @@ struct va_format {
 
 /*
  * Dummy printk for disabled debugging statements to use whilst maintaining
- * gcc's format and side-effect checking.
+ * gcc's format checking.
  */
-static inline __printf(1, 2)
-int no_printk(const char *fmt, ...)
-{
-       return 0;
-}
+#define no_printk(fmt, ...)                    \
+do {                                           \
+       if (0)                                  \
+               printk(fmt, ##__VA_ARGS__);     \
+} while (0)
 
 #ifdef CONFIG_EARLY_PRINTK
 extern asmlinkage __printf(1, 2)
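
The comment change above is the point of the rework: the old static inline took its arguments as ordinary function parameters, so side effects in them were always evaluated even with debugging disabled, while inside the new if (0) the entire call is dead code, arguments included, yet gcc still type-checks the format string. A sketch of the difference (expensive_debug_state() is hypothetical):

    /* old inline:  expensive_debug_state() ran even when nothing was printed;
     * new macro:   the whole statement is eliminated at compile time, but a
     *              format/argument type mismatch here would still warn */
    no_printk("state: %d\n", expensive_debug_state());
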
index 29446aeef36e553aa361774d39c0852517c87405..bdf597c4f0be82965911266ed1668d06510492cb 100644 (file)
@@ -85,6 +85,7 @@ enum ttu_flags {
        TTU_UNMAP = 1,                  /* unmap mode */
        TTU_MIGRATION = 2,              /* migration mode */
        TTU_MUNLOCK = 4,                /* munlock mode */
+       TTU_LZFREE = 8,                 /* lazy free mode */
 
        TTU_IGNORE_MLOCK = (1 << 8),    /* ignore mlock */
        TTU_IGNORE_ACCESS = (1 << 9),   /* don't age */
@@ -161,25 +162,31 @@ static inline void anon_vma_merge(struct vm_area_struct *vma,
 
 struct anon_vma *page_get_anon_vma(struct page *page);
 
+/* bitflags for do_page_add_anon_rmap() */
+#define RMAP_EXCLUSIVE 0x01
+#define RMAP_COMPOUND 0x02
+
 /*
  * rmap interfaces called when adding or removing pte of page
  */
 void page_move_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
-void page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
+void page_add_anon_rmap(struct page *, struct vm_area_struct *,
+               unsigned long, bool);
 void do_page_add_anon_rmap(struct page *, struct vm_area_struct *,
                           unsigned long, int);
-void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
+void page_add_new_anon_rmap(struct page *, struct vm_area_struct *,
+               unsigned long, bool);
 void page_add_file_rmap(struct page *);
-void page_remove_rmap(struct page *);
+void page_remove_rmap(struct page *, bool);
 
 void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *,
                            unsigned long);
 void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *,
                                unsigned long);
 
-static inline void page_dup_rmap(struct page *page)
+static inline void page_dup_rmap(struct page *page, bool compound)
 {
-       atomic_inc(&page->_mapcount);
+       atomic_inc(compound ? compound_mapcount_ptr(page) : &page->_mapcount);
 }
 
 /*
@@ -209,6 +216,25 @@ static inline pte_t *page_check_address(struct page *page, struct mm_struct *mm,
        return ptep;
 }
 
+/*
+ * Used by idle page tracking to check if a page was referenced via page
+ * tables.
+ */
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+bool page_check_address_transhuge(struct page *page, struct mm_struct *mm,
+                                 unsigned long address, pmd_t **pmdp,
+                                 pte_t **ptep, spinlock_t **ptlp);
+#else
+static inline bool page_check_address_transhuge(struct page *page,
+                               struct mm_struct *mm, unsigned long address,
+                               pmd_t **pmdp, pte_t **ptep, spinlock_t **ptlp)
+{
+       *ptep = page_check_address(page, mm, address, ptlp, 0);
+       *pmdp = NULL;
+       return !!*ptep;
+}
+#endif
+
 /*
  * Used by swapoff to help locate where page is expected in vma.
  */
@@ -286,5 +312,6 @@ static inline int page_mkclean(struct page *page)
 #define SWAP_AGAIN     1
 #define SWAP_FAIL      2
 #define SWAP_MLOCK     3
+#define SWAP_LZFREE    4
 
 #endif /* _LINUX_RMAP_H */
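
The new bool selects which counter gets charged: true for a PMD-level mapping of a compound page (one hit on compound_mapcount), false for a small page or a PTE-mapped subpage (the per-page _mapcount). A hedged caller sketch:

    static void map_new_anon(struct page *page, struct vm_area_struct *vma,
                             unsigned long addr, bool pmd_mapped)
    {
            /* true: charge compound_mapcount once for the whole THP;
             * false: charge this 4K page's own _mapcount */
            page_add_new_anon_rmap(page, vma, addr, pmd_mapped);
    }
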
index 066bd21765ad728c64bd6b96f5987486bb5f810d..414e101cd06195fe60339e71a8dfd2487001df7e 100644 (file)
@@ -307,6 +307,7 @@ extern void lru_add_drain_cpu(int cpu);
 extern void lru_add_drain_all(void);
 extern void rotate_reclaimable_page(struct page *page);
 extern void deactivate_file_page(struct page *page);
+extern void deactivate_page(struct page *page);
 extern void swap_setup(void);
 
 extern void add_page_to_unevictable_list(struct page *page);
@@ -538,7 +539,8 @@ static inline int swp_swapcount(swp_entry_t entry)
        return 0;
 }
 
-#define reuse_swap_page(page)  (page_mapcount(page) == 1)
+#define reuse_swap_page(page) \
+       (!PageTransCompound(page) && page_mapcount(page) == 1)
 
 static inline int try_to_free_swap(struct page *page)
 {
index e623d392db0c16c4cfc21102bfb3d63158d14326..67c1dbd19c6df356012227cb7d3c0e283c49f833 100644 (file)
@@ -25,6 +25,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
                FOR_ALL_ZONES(PGALLOC),
                PGFREE, PGACTIVATE, PGDEACTIVATE,
                PGFAULT, PGMAJFAULT,
+               PGLAZYFREED,
                FOR_ALL_ZONES(PGREFILL),
                FOR_ALL_ZONES(PGSTEAL_KSWAPD),
                FOR_ALL_ZONES(PGSTEAL_DIRECT),
@@ -68,7 +69,9 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
                THP_FAULT_FALLBACK,
                THP_COLLAPSE_ALLOC,
                THP_COLLAPSE_ALLOC_FAILED,
-               THP_SPLIT,
+               THP_SPLIT_PAGE,
+               THP_SPLIT_PAGE_FAILED,
+               THP_SPLIT_PMD,
                THP_ZERO_PAGE_ALLOC,
                THP_ZERO_PAGE_ALLOC_FAILED,
 #endif
index 97d635cabac8d5e389cfaa331c39a92e1f23f0c6..0f803d2783e3834194cb3f5cd3dd4da5c017304c 100644 (file)
@@ -22,6 +22,7 @@
        EM( SCAN_PAGE_LRU,              "page_not_in_lru")              \
        EM( SCAN_PAGE_LOCK,             "page_locked")                  \
        EM( SCAN_PAGE_ANON,             "page_not_anon")                \
+       EM( SCAN_PAGE_COMPOUND,         "page_compound")                \
        EM( SCAN_ANY_PROCESS,           "no_process_for_page")          \
        EM( SCAN_VMA_NULL,              "vma_null")                     \
        EM( SCAN_VMA_CHECK,             "vma_check_failed")             \
index a74dd84bbb6d04777fe76e043dbd6246b5914d1a..58274382a6164de8e9d03ec85da0cf07996869d6 100644 (file)
@@ -41,6 +41,7 @@
 #define MADV_DONTNEED  4               /* don't need these pages */
 
 /* common parameters: try to keep these consistent across architectures */
+#define MADV_FREE      8               /* free pages only if memory pressure */
 #define MADV_REMOVE    9               /* remove these pages & resources */
 #define MADV_DONTFORK  10              /* don't inherit across fork */
 #define MADV_DOFORK    11              /* do inherit across fork */
index 5481b49e8c3f2a1e732fbe751e97b2d7b7d0bc0a..4644217b237341513e67c861d03d8df063702c62 100644 (file)
@@ -285,7 +285,7 @@ config FHANDLE
 
 config USELIB
        bool "uselib syscall"
-       default y
+       def_bool ALPHA || M68K || SPARC || X86_32 || IA32_EMULATION
        help
          This option enables the uselib syscall, a system call used in the
          dynamic linker from libc5 and earlier.  glibc does not use this
index bb0669169716e4847d67f651896531d7bca7bc40..0167679182c08dae79cf54e0b3830fc626866ee2 100644 (file)
@@ -161,7 +161,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
        const unsigned long mmun_end   = addr + PAGE_SIZE;
        struct mem_cgroup *memcg;
 
-       err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg);
+       err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg,
+                       false);
        if (err)
                return err;
 
@@ -175,8 +176,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
                goto unlock;
 
        get_page(kpage);
-       page_add_new_anon_rmap(kpage, vma, addr);
-       mem_cgroup_commit_charge(kpage, memcg, false);
+       page_add_new_anon_rmap(kpage, vma, addr, false);
+       mem_cgroup_commit_charge(kpage, memcg, false, false);
        lru_cache_add_active_or_unevictable(kpage, vma);
 
        if (!PageAnon(page)) {
@@ -188,7 +189,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
        ptep_clear_flush_notify(vma, addr, ptep);
        set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
 
-       page_remove_rmap(page);
+       page_remove_rmap(page, false);
        if (!page_mapped(page))
                try_to_free_swap(page);
        pte_unmap_unlock(ptep, ptl);
@@ -199,7 +200,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 
        err = 0;
  unlock:
-       mem_cgroup_cancel_charge(kpage, memcg);
+       mem_cgroup_cancel_charge(kpage, memcg, false);
        mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
        unlock_page(page);
        return err;
index 8a310e240cdaf60b27aaff432c5837bbd3e61d90..c6f514573b28a425f648569473bab692d0ee1940 100644 (file)
@@ -469,7 +469,8 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
 {
        unsigned long address = (unsigned long)uaddr;
        struct mm_struct *mm = current->mm;
-       struct page *page, *page_head;
+       struct page *page;
+       struct address_space *mapping;
        int err, ro = 0;
 
        /*
@@ -519,46 +520,9 @@ again:
        else
                err = 0;
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-       page_head = page;
-       if (unlikely(PageTail(page))) {
-               put_page(page);
-               /* serialize against __split_huge_page_splitting() */
-               local_irq_disable();
-               if (likely(__get_user_pages_fast(address, 1, !ro, &page) == 1)) {
-                       page_head = compound_head(page);
-                       /*
-                        * page_head is valid pointer but we must pin
-                        * it before taking the PG_lock and/or
-                        * PG_compound_lock. The moment we re-enable
-                        * irqs __split_huge_page_splitting() can
-                        * return and the head page can be freed from
-                        * under us. We can't take the PG_lock and/or
-                        * PG_compound_lock on a page that could be
-                        * freed from under us.
-                        */
-                       if (page != page_head) {
-                               get_page(page_head);
-                               put_page(page);
-                       }
-                       local_irq_enable();
-               } else {
-                       local_irq_enable();
-                       goto again;
-               }
-       }
-#else
-       page_head = compound_head(page);
-       if (page != page_head) {
-               get_page(page_head);
-               put_page(page);
-       }
-#endif
-
-       lock_page(page_head);
-
+       lock_page(page);
        /*
-        * If page_head->mapping is NULL, then it cannot be a PageAnon
+        * If page->mapping is NULL, then it cannot be a PageAnon
         * page; but it might be the ZERO_PAGE or in the gate area or
         * in a special mapping (all cases which we are happy to fail);
         * or it may have been a good file page when get_user_pages_fast
@@ -570,12 +534,13 @@ again:
         *
         * The case we do have to guard against is when memory pressure made
         * shmem_writepage move it from filecache to swapcache beneath us:
-        * an unlikely race, but we do need to retry for page_head->mapping.
+        * an unlikely race, but we do need to retry for page->mapping.
         */
-       if (!page_head->mapping) {
-               int shmem_swizzled = PageSwapCache(page_head);
-               unlock_page(page_head);
-               put_page(page_head);
+       mapping = compound_head(page)->mapping;
+       if (!mapping) {
+               int shmem_swizzled = PageSwapCache(page);
+               unlock_page(page);
+               put_page(page);
                if (shmem_swizzled)
                        goto again;
                return -EFAULT;
@@ -588,7 +553,7 @@ again:
         * it's a read-only handle, it's expected that futexes attach to
         * the object not the particular process.
         */
-       if (PageAnon(page_head)) {
+       if (PageAnon(page)) {
                /*
                 * A RO anonymous page will never change and thus doesn't make
                 * sense for futex operations.
@@ -603,15 +568,15 @@ again:
                key->private.address = address;
        } else {
                key->both.offset |= FUT_OFF_INODE; /* inode-based key */
-               key->shared.inode = page_head->mapping->host;
+               key->shared.inode = mapping->host;
                key->shared.pgoff = basepage_index(page);
        }
 
        get_futex_key_refs(key); /* implies MB (B) */
 
 out:
-       unlock_page(page_head);
-       put_page(page_head);
+       unlock_page(page);
+       put_page(page);
        return err;
 }
 
@@ -639,7 +604,7 @@ static int fault_in_user_writeable(u32 __user *uaddr)
 
        down_read(&mm->mmap_sem);
        ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
-                              FAULT_FLAG_WRITE);
+                              FAULT_FLAG_WRITE, NULL);
        up_read(&mm->mmap_sem);
 
        return ret < 0 ? ret : 0;
index 7658d32c5c78aa6343ac8cd669070084d1cd3805..e517a16cb426cc30b0251a43d22c5d2cb01bead9 100644 (file)
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * General Public License for more details.
  */
+#include <linux/radix-tree.h>
+#include <linux/memremap.h>
 #include <linux/device.h>
 #include <linux/types.h>
+#include <linux/pfn_t.h>
 #include <linux/io.h>
 #include <linux/mm.h>
 #include <linux/memory_hotplug.h>
@@ -147,24 +150,127 @@ void devm_memunmap(struct device *dev, void *addr)
 }
 EXPORT_SYMBOL(devm_memunmap);
 
+pfn_t phys_to_pfn_t(dma_addr_t addr, unsigned long flags)
+{
+       return __pfn_to_pfn_t(addr >> PAGE_SHIFT, flags);
+}
+EXPORT_SYMBOL(phys_to_pfn_t);
+
 #ifdef CONFIG_ZONE_DEVICE
+static DEFINE_MUTEX(pgmap_lock);
+static RADIX_TREE(pgmap_radix, GFP_KERNEL);
+#define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1)
+#define SECTION_SIZE (1UL << PA_SECTION_SHIFT)
+
 struct page_map {
        struct resource res;
+       struct percpu_ref *ref;
+       struct dev_pagemap pgmap;
+       struct vmem_altmap altmap;
 };
 
-static void devm_memremap_pages_release(struct device *dev, void *res)
+void get_zone_device_page(struct page *page)
+{
+       percpu_ref_get(page->pgmap->ref);
+}
+EXPORT_SYMBOL(get_zone_device_page);
+
+void put_zone_device_page(struct page *page)
+{
+       put_dev_pagemap(page->pgmap);
+}
+EXPORT_SYMBOL(put_zone_device_page);
+
+static void pgmap_radix_release(struct resource *res)
+{
+       resource_size_t key;
+
+       mutex_lock(&pgmap_lock);
+       for (key = res->start; key <= res->end; key += SECTION_SIZE)
+               radix_tree_delete(&pgmap_radix, key >> PA_SECTION_SHIFT);
+       mutex_unlock(&pgmap_lock);
+}
+
+static unsigned long pfn_first(struct page_map *page_map)
+{
+       struct dev_pagemap *pgmap = &page_map->pgmap;
+       const struct resource *res = &page_map->res;
+       struct vmem_altmap *altmap = pgmap->altmap;
+       unsigned long pfn;
+
+       pfn = res->start >> PAGE_SHIFT;
+       if (altmap)
+               pfn += vmem_altmap_offset(altmap);
+       return pfn;
+}
+
+static unsigned long pfn_end(struct page_map *page_map)
+{
+       const struct resource *res = &page_map->res;
+
+       return (res->start + resource_size(res)) >> PAGE_SHIFT;
+}
+
+#define for_each_device_pfn(pfn, map) \
+       for (pfn = pfn_first(map); pfn < pfn_end(map); pfn++)
+
+static void devm_memremap_pages_release(struct device *dev, void *data)
 {
-       struct page_map *page_map = res;
+       struct page_map *page_map = data;
+       struct resource *res = &page_map->res;
+       resource_size_t align_start, align_size;
+       struct dev_pagemap *pgmap = &page_map->pgmap;
+
+       if (percpu_ref_tryget_live(pgmap->ref)) {
+               dev_WARN(dev, "%s: page mapping is still live!\n", __func__);
+               percpu_ref_put(pgmap->ref);
+       }
+
+       pgmap_radix_release(res);
 
        /* pages are dead and unused, undo the arch mapping */
-       arch_remove_memory(page_map->res.start, resource_size(&page_map->res));
+       align_start = res->start & ~(SECTION_SIZE - 1);
+       align_size = ALIGN(resource_size(res), SECTION_SIZE);
+       arch_remove_memory(align_start, align_size);
+       dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc,
+                       "%s: failed to free all reserved pages\n", __func__);
+}
+
+/* assumes rcu_read_lock() held at entry */
+struct dev_pagemap *find_dev_pagemap(resource_size_t phys)
+{
+       struct page_map *page_map;
+
+       WARN_ON_ONCE(!rcu_read_lock_held());
+
+       page_map = radix_tree_lookup(&pgmap_radix, phys >> PA_SECTION_SHIFT);
+       return page_map ? &page_map->pgmap : NULL;
 }
 
-void *devm_memremap_pages(struct device *dev, struct resource *res)
+/**
+ * devm_memremap_pages - remap and provide memmap backing for the given resource
+ * @dev: hosting device for @res
+ * @res: "host memory" address range
+ * @ref: a live per-cpu reference count
+ * @altmap: optional descriptor for allocating the memmap from @res
+ *
+ * Notes:
+ * 1/ @ref must be 'live' on entry and 'dead' before devm_memunmap_pages() time
+ *    (or devm release event).
+ *
+ * 2/ @res is expected to be a host memory range that could feasibly be
+ *    treated as a "System RAM" range, i.e. not a device mmio range, but
+ *    this is not enforced.
+ */
+void *devm_memremap_pages(struct device *dev, struct resource *res,
+               struct percpu_ref *ref, struct vmem_altmap *altmap)
 {
        int is_ram = region_intersects(res->start, resource_size(res),
                        "System RAM");
+       resource_size_t key, align_start, align_size;
+       struct dev_pagemap *pgmap;
        struct page_map *page_map;
+       unsigned long pfn;
        int error, nid;
 
        if (is_ram == REGION_MIXED) {
@@ -176,25 +282,120 @@ void *devm_memremap_pages(struct device *dev, struct resource *res)
        if (is_ram == REGION_INTERSECTS)
                return __va(res->start);
 
+       if (altmap && !IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP)) {
+               dev_err(dev, "%s: altmap requires CONFIG_SPARSEMEM_VMEMMAP=y\n",
+                               __func__);
+               return ERR_PTR(-ENXIO);
+       }
+
+       if (!ref)
+               return ERR_PTR(-EINVAL);
+
        page_map = devres_alloc_node(devm_memremap_pages_release,
                        sizeof(*page_map), GFP_KERNEL, dev_to_node(dev));
        if (!page_map)
                return ERR_PTR(-ENOMEM);
+       pgmap = &page_map->pgmap;
 
        memcpy(&page_map->res, res, sizeof(*res));
 
+       pgmap->dev = dev;
+       if (altmap) {
+               memcpy(&page_map->altmap, altmap, sizeof(*altmap));
+               pgmap->altmap = &page_map->altmap;
+       }
+       pgmap->ref = ref;
+       pgmap->res = &page_map->res;
+
+       mutex_lock(&pgmap_lock);
+       error = 0;
+       for (key = res->start; key <= res->end; key += SECTION_SIZE) {
+               struct dev_pagemap *dup;
+
+               rcu_read_lock();
+               dup = find_dev_pagemap(key);
+               rcu_read_unlock();
+               if (dup) {
+                       dev_err(dev, "%s: %pr collides with mapping for %s\n",
+                                       __func__, res, dev_name(dup->dev));
+                       error = -EBUSY;
+                       break;
+               }
+               error = radix_tree_insert(&pgmap_radix, key >> PA_SECTION_SHIFT,
+                               page_map);
+               if (error) {
+                       dev_err(dev, "%s: failed: %d\n", __func__, error);
+                       break;
+               }
+       }
+       mutex_unlock(&pgmap_lock);
+       if (error)
+               goto err_radix;
+
        nid = dev_to_node(dev);
        if (nid < 0)
                nid = numa_mem_id();
 
-       error = arch_add_memory(nid, res->start, resource_size(res), true);
-       if (error) {
-               devres_free(page_map);
-               return ERR_PTR(error);
-       }
+       align_start = res->start & ~(SECTION_SIZE - 1);
+       align_size = ALIGN(resource_size(res), SECTION_SIZE);
+       error = arch_add_memory(nid, align_start, align_size, true);
+       if (error)
+               goto err_add_memory;
 
+       for_each_device_pfn(pfn, page_map) {
+               struct page *page = pfn_to_page(pfn);
+
+               /* ZONE_DEVICE pages must never appear on a slab lru */
+               list_force_poison(&page->lru);
+               page->pgmap = pgmap;
+       }
        devres_add(dev, page_map);
        return __va(res->start);
+
+ err_add_memory:
+ err_radix:
+       pgmap_radix_release(res);
+       devres_free(page_map);
+       return ERR_PTR(error);
 }
 EXPORT_SYMBOL(devm_memremap_pages);
+
+unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
+{
+       /* number of pfns from base where pfn_to_page() is valid */
+       return altmap->reserve + altmap->free;
+}
+
+void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns)
+{
+       altmap->alloc -= nr_pfns;
+}
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
+{
+       /*
+        * 'memmap_start' is the virtual address for the first "struct
+        * page" in this range of the vmemmap array.  In the case of
+        * CONFIG_SPARSEMEM_VMEMMAP a page_to_pfn conversion is simple
+        * pointer arithmetic, so we can perform this to_vmem_altmap()
+        * conversion without concern for the initialization state of
+        * the struct page fields.
+        */
+       struct page *page = (struct page *) memmap_start;
+       struct dev_pagemap *pgmap;
+
+       /*
+        * Unconditionally retrieve a dev_pagemap associated with the
+        * given physical address, this is only for use in the
+        * arch_{add|remove}_memory() for setting up and tearing down
+        * the memmap.
+        */
+       rcu_read_lock();
+       pgmap = find_dev_pagemap(__pfn_to_phys(page_to_pfn(page)));
+       rcu_read_unlock();
+
+       return pgmap ? pgmap->altmap : NULL;
+}
+#endif /* CONFIG_SPARSEMEM_VMEMMAP */
 #endif /* CONFIG_ZONE_DEVICE */
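
A hedged caller sketch for the new signature (names hypothetical): @ref must already be live, and the driver is expected to kill it before the devres release runs, otherwise the dev_WARN above fires:

    static void *pmem_map(struct device *dev, struct resource *res,
                          struct percpu_ref *ref)    /* live on entry */
    {
            /* NULL altmap: the memmap itself comes from regular memory */
            return devm_memremap_pages(dev, res, ref, NULL);
    }
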
index b333380c6bb239f1ee870430774613aaab4dfb2d..d96469de72dc555434818411625ff8a3f855f748 100644 (file)
@@ -180,8 +180,7 @@ void panic(const char *fmt, ...)
         * panic() is not being called from OOPS.
         */
        debug_locks_off();
-       console_trylock();
-       console_unlock();
+       console_flush_on_panic();
 
        if (!panic_blink)
                panic_blink = no_blink;
index 2ce8826f1053d801f0b003f5a3e67de0fdfe8e30..e79439134978cf5f572fa1e981a57c3a5c9a8f69 100644 (file)
@@ -48,6 +48,7 @@
 #include <linux/uio.h>
 
 #include <asm/uaccess.h>
+#include <asm-generic/sections.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/printk.h>
@@ -1660,7 +1661,7 @@ asmlinkage int vprintk_emit(int facility, int level,
                            const char *dict, size_t dictlen,
                            const char *fmt, va_list args)
 {
-       static int recursion_bug;
+       static bool recursion_bug;
        static char textbuf[LOG_LINE_MAX];
        char *text = textbuf;
        size_t text_len = 0;
@@ -1696,7 +1697,7 @@ asmlinkage int vprintk_emit(int facility, int level,
                 * it can be printed at the next appropriate moment:
                 */
                if (!oops_in_progress && !lockdep_recursing(current)) {
-                       recursion_bug = 1;
+                       recursion_bug = true;
                        local_irq_restore(flags);
                        return 0;
                }
@@ -1711,7 +1712,7 @@ asmlinkage int vprintk_emit(int facility, int level,
                static const char recursion_msg[] =
                        "BUG: recent printk recursion!";
 
-               recursion_bug = 0;
+               recursion_bug = false;
                /* emit KERN_CRIT message */
                printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0,
                                         NULL, 0, recursion_msg,
@@ -2233,13 +2234,24 @@ void console_unlock(void)
        static u64 seen_seq;
        unsigned long flags;
        bool wake_klogd = false;
-       bool retry;
+       bool do_cond_resched, retry;
 
        if (console_suspended) {
                up_console_sem();
                return;
        }
 
+       /*
+        * Console drivers are called under logbuf_lock, so
+        * @console_may_schedule should be cleared before; however, we may
+        * end up dumping a lot of lines, for example, if called from
+        * console registration path, and should invoke cond_resched()
+        * between lines if allowable.  Not doing so can cause a very long
+        * scheduling stall on a slow console leading to RCU stall and
+        * softlockup warnings, which exacerbate the issue with more
+        * messages, practically incapacitating the system.
+        */
+       do_cond_resched = console_may_schedule;
        console_may_schedule = 0;
 
        /* flush buffered message fragment immediately to console */
@@ -2311,6 +2323,9 @@ skip:
                call_console_drivers(level, ext_text, ext_len, text, len);
                start_critical_timings();
                local_irq_restore(flags);
+
+               if (do_cond_resched)
+                       cond_resched();
        }
        console_locked = 0;
 
@@ -2378,6 +2393,25 @@ void console_unblank(void)
        console_unlock();
 }
 
+/**
+ * console_flush_on_panic - flush console content on panic
+ *
+ * Immediately output all pending messages no matter what.
+ */
+void console_flush_on_panic(void)
+{
+       /*
+        * If someone else is holding the console lock, trylock will fail
+        * and may_schedule may be set.  Ignore and proceed to unlock so
+        * that messages are flushed out.  As this can be called from any
+        * context and we don't want to get preempted while flushing,
+        * ensure may_schedule is cleared.
+        */
+       console_trylock();
+       console_may_schedule = 0;
+       console_unlock();
+}
+
 /*
  * Return the console tty driver structure and its associated index
  */
@@ -2658,13 +2692,36 @@ int unregister_console(struct console *console)
 }
 EXPORT_SYMBOL(unregister_console);
 
+/*
+ * Some boot consoles access data that is in the init section and which will
+ * be discarded after the initcalls have been run. To make sure that no code
+ * will access this data, unregister the boot consoles in a late initcall.
+ *
+ * If for some reason, such as deferred probe or the driver being a loadable
+ * module, the real console hasn't registered yet at this point, there will
+ * be a brief interval in which no messages are logged to the console, which
+ * makes it difficult to diagnose problems that occur during this time.
+ *
+ * To mitigate this problem somewhat, only unregister consoles whose memory
+ * intersects with the init section. Note that code exists elsewhere to get
+ * rid of the boot console as soon as the proper console shows up, so there
+ * won't be side-effects from postponing the removal.
+ */
 static int __init printk_late_init(void)
 {
        struct console *con;
 
        for_each_console(con) {
                if (!keep_bootcon && con->flags & CON_BOOT) {
-                       unregister_console(con);
+                       /*
+                        * Make sure to unregister boot consoles whose data
+                        * resides in the init section before the init section
+                        * is discarded. Boot consoles whose data will stick
+                        * around will automatically be unregistered when the
+                        * proper console replaces them.
+                        */
+                       if (init_section_intersects(con, sizeof(*con)))
+                               unregister_console(con);
                }
        }
        hotcpu_notifier(console_cpu_notify, 0);
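
console_flush_on_panic() is deliberately willing to violate locking: it proceeds even when the trylock fails, on the theory that in a panic the lock holder is never coming back. A sketch of how other built-in crash-path code might use it (hypothetical caller; the only caller added in this series is panic() itself):

    static void report_fatal(const char *why)
    {
            pr_emerg("fatal: %s\n", why);
            console_flush_on_panic();   /* push out everything still queued */
    }
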
index edb6de4f5908cefc7e9a792adfea5998c4ae9a87..a467e6c28a3b2d4be1ecfa5c14d3a35d1ef304f8 100644 (file)
@@ -529,8 +529,6 @@ static int __init cpu_stop_init(void)
 }
 early_initcall(cpu_stop_init);
 
-#if defined(CONFIG_SMP) || defined(CONFIG_HOTPLUG_CPU)
-
 static int __stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus)
 {
        struct multi_stop_data msdata = {
@@ -628,5 +626,3 @@ int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data,
        mutex_unlock(&stop_cpus_mutex);
        return ret ?: done.ret;
 }
-
-#endif /* CONFIG_SMP || CONFIG_HOTPLUG_CPU */
index ee1ac1cc082c10318c2ae6f88890124e9c9c934e..f75a33f29f6e9b55c1e4b5ab9e1b355565fbaec6 100644 (file)
@@ -580,6 +580,14 @@ config DEBUG_VM_RB
 
          If unsure, say N.
 
+config DEBUG_VM_PGFLAGS
+       bool "Debug page-flags operations"
+       depends on DEBUG_VM
+       help
+         Enables extra validation on page flags operations.
+
+         If unsure, say N.
+
 config DEBUG_VIRTUAL
        bool "Debug VM translations"
        depends on DEBUG_KERNEL && X86
@@ -1589,7 +1597,6 @@ config FAULT_INJECTION_STACKTRACE_FILTER
 
 config LATENCYTOP
        bool "Latency measuring infrastructure"
-       depends on HAVE_LATENCYTOP_SUPPORT
        depends on DEBUG_KERNEL
        depends on STACKTRACE_SUPPORT
        depends on PROC_FS
index f194e6e593e19db22ad6b7e50aeee0e7c46eafa6..7f6c506a494226479af1152f13f99f1870f554d1 100644 (file)
 /* Simplified asprintf. */
 char *kvasprintf(gfp_t gfp, const char *fmt, va_list ap)
 {
-       unsigned int len;
+       unsigned int first, second;
        char *p;
        va_list aq;
 
        va_copy(aq, ap);
-       len = vsnprintf(NULL, 0, fmt, aq);
+       first = vsnprintf(NULL, 0, fmt, aq);
        va_end(aq);
 
-       p = kmalloc_track_caller(len+1, gfp);
+       p = kmalloc_track_caller(first+1, gfp);
        if (!p)
                return NULL;
 
-       vsnprintf(p, len+1, fmt, ap);
+       second = vsnprintf(p, first+1, fmt, ap);
+       WARN(first != second, "different return values (%u and %u) from vsnprintf(\"%s\", ...)",
+            first, second, fmt);
 
        return p;
 }
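
The WARN guards the classic two-pass race: kvasprintf() sizes the output with one vsnprintf() call and fills it with a second, so if an argument changes in between, the two return values disagree and the result was silently truncated or padded. A hypothetical racy pattern it would flag:

    char shared_name[32] = "eth0";

    /* thread A */
    char *p = kvasprintf(GFP_KERNEL, "name=%s", shared_name);

    /* thread B, running between A's sizing pass and its fill pass */
    strscpy(shared_name, "a-much-longer-name", sizeof(shared_name));
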
index 3859bf63561c63936947b007fe3ee20e822509a1..3345a089ef7b954d475d67965b8ce891c1b16ab0 100644 (file)
 #include <linux/kernel.h>
 #include <linux/rculist.h>
 
+static struct list_head force_poison;
+void list_force_poison(struct list_head *entry)
+{
+       entry->next = &force_poison;
+       entry->prev = &force_poison;
+}
+
 /*
  * Insert a new entry between two known consecutive entries.
  *
@@ -23,6 +30,8 @@ void __list_add(struct list_head *new,
                              struct list_head *prev,
                              struct list_head *next)
 {
+       WARN(new->next == &force_poison || new->prev == &force_poison,
+               "list_add attempted on force-poisoned entry\n");
        WARN(next->prev != prev,
                "list_add corruption. next->prev should be "
                "prev (%p), but was %p. (next=%p).\n",
index c5a666af9ba5a4536edd58b626f2b0b339b2138d..4f6ae60433bc5d1e1d6df9b7c8d5d359512ad82a 100644 (file)
 #include <linux/slab.h>
 #include <linux/string.h>
 
+#include <linux/bitmap.h>
+#include <linux/dcache.h>
 #include <linux/socket.h>
 #include <linux/in.h>
 
 #define BUF_SIZE 256
+#define PAD_SIZE 16
 #define FILL_CHAR '$'
 
 #define PTR1 ((void*)0x01234567)
@@ -39,6 +42,7 @@
 static unsigned total_tests __initdata;
 static unsigned failed_tests __initdata;
 static char *test_buffer __initdata;
+static char *alloced_buffer __initdata;
 
 static int __printf(4, 0) __init
 do_test(int bufsize, const char *expect, int elen,
@@ -49,7 +53,7 @@ do_test(int bufsize, const char *expect, int elen,
 
        total_tests++;
 
-       memset(test_buffer, FILL_CHAR, BUF_SIZE);
+       memset(alloced_buffer, FILL_CHAR, BUF_SIZE + 2*PAD_SIZE);
        va_copy(aq, ap);
        ret = vsnprintf(test_buffer, bufsize, fmt, aq);
        va_end(aq);
@@ -60,8 +64,13 @@ do_test(int bufsize, const char *expect, int elen,
                return 1;
        }
 
+       if (memchr_inv(alloced_buffer, FILL_CHAR, PAD_SIZE)) {
+               pr_warn("vsnprintf(buf, %d, \"%s\", ...) wrote before buffer\n", bufsize, fmt);
+               return 1;
+       }
+
        if (!bufsize) {
-               if (memchr_inv(test_buffer, FILL_CHAR, BUF_SIZE)) {
+               if (memchr_inv(test_buffer, FILL_CHAR, BUF_SIZE + PAD_SIZE)) {
                        pr_warn("vsnprintf(buf, 0, \"%s\", ...) wrote to buffer\n",
                                fmt);
                        return 1;
@@ -76,6 +85,12 @@ do_test(int bufsize, const char *expect, int elen,
                return 1;
        }
 
+       if (memchr_inv(test_buffer + written + 1, FILL_CHAR, BUF_SIZE + PAD_SIZE - (written + 1))) {
+               pr_warn("vsnprintf(buf, %d, \"%s\", ...) wrote beyond the nul-terminator\n",
+                       bufsize, fmt);
+               return 1;
+       }
+
        if (memcmp(test_buffer, expect, written)) {
                pr_warn("vsnprintf(buf, %d, \"%s\", ...) wrote '%s', expected '%.*s'\n",
                        bufsize, fmt, test_buffer, written, expect);
@@ -91,7 +106,12 @@ __test(const char *expect, int elen, const char *fmt, ...)
        int rand;
        char *p;
 
-       BUG_ON(elen >= BUF_SIZE);
+       if (elen >= BUF_SIZE) {
+               pr_err("error in test suite: expected output length %d too long. Format was '%s'.\n",
+                      elen, fmt);
+               failed_tests++;
+               return;
+       }
 
        va_start(ap, fmt);
 
@@ -109,6 +129,7 @@ __test(const char *expect, int elen, const char *fmt, ...)
 
        p = kvasprintf(GFP_KERNEL, fmt, ap);
        if (p) {
+               total_tests++;
                if (memcmp(p, expect, elen+1)) {
                        pr_warn("kvasprintf(..., \"%s\", ...) returned '%s', expected '%s'\n",
                                fmt, p, expect);
@@ -140,6 +161,30 @@ test_number(void)
        test("0x1234abcd  ", "%#-12x", 0x1234abcd);
        test("  0x1234abcd", "%#12x", 0x1234abcd);
        test("0|001| 12|+123| 1234|-123|-1234", "%d|%03d|%3d|%+d|% d|%+d|% d", 0, 1, 12, 123, 1234, -123, -1234);
+       test("0|1|1|128|255", "%hhu|%hhu|%hhu|%hhu|%hhu", 0, 1, 257, 128, -1);
+       test("0|1|1|-128|-1", "%hhd|%hhd|%hhd|%hhd|%hhd", 0, 1, 257, 128, -1);
+       test("2015122420151225", "%ho%ho%#ho", 1037, 5282, -11627);
+       /*
+        * POSIX/C99: »The result of converting zero with an explicit
+        * precision of zero shall be no characters.« Hence the output
+        * from the below test should really be "00|0||| ". However,
+        * the kernel's printf also produces a single 0 in that
+        * case. This test case simply documents the current
+        * behaviour.
+        */
+       test("00|0|0|0|0", "%.2d|%.1d|%.0d|%.*d|%1.0d", 0, 0, 0, 0, 0, 0);
+#ifndef __CHAR_UNSIGNED__
+       {
+               /*
+                * Passing a 'char' to a %02x specifier doesn't do
+                * what was presumably the intention when char is
+                * signed and the value is negative. One must either &
+                * with 0xff or cast to u8.
+                */
+               char val = -16;
+               test("0xfffffff0|0xf0|0xf0", "%#02x|%#02x|%#02x", val, val & 0xff, (u8)val);
+       }
+#endif
 }
 
 static void __init
@@ -148,14 +193,23 @@ test_string(void)
        test("", "%s%.0s", "", "123");
        test("ABCD|abc|123", "%s|%.3s|%.*s", "ABCD", "abcdef", 3, "123456");
        test("1  |  2|3  |  4|5  ", "%-3s|%3s|%-*s|%*s|%*s", "1", "2", 3, "3", 3, "4", -3, "5");
+       test("1234      ", "%-10.4s", "123456");
+       test("      1234", "%10.4s", "123456");
        /*
-        * POSIX and C99 say that a missing precision should be
-        * treated as a precision of 0. However, the kernel's printf
-        * implementation treats this case as if the . wasn't
-        * present. Let's add a test case documenting the current
-        * behaviour; should anyone ever feel the need to follow the
-        * standards more closely, this can be revisited.
+        * POSIX and C99 say that a negative precision (which is only
+        * possible to pass via a * argument) should be treated as if
+        * the precision wasn't present, and that if the precision is
+        * omitted (as in %.s), the precision should be taken to be
+        * 0. However, the kernel's printf behaves exactly the opposite,
+        * treating a negative precision as 0 and treating an omitted
+        * precision specifier as if no precision was given.
+        *
+        * These test cases document the current behaviour; should
+        * anyone ever feel the need to follow the standards more
+        * closely, this can be revisited.
         */
+       test("    ", "%4.*s", -5, "123456");
+       test("123456", "%.s", "123456");
        test("a||", "%.s|%.0s|%.*s", "a", "b", 0, "c");
        test("a  |   |   ", "%-3.s|%-3.0s|%-3.*s", "a", "b", 0, "c");
 }
@@ -273,9 +327,35 @@ uuid(void)
        test("03020100-0504-0706-0809-0A0B0C0D0E0F", "%pUL", uuid);
 }
 
+static struct dentry test_dentry[4] __initdata = {
+       { .d_parent = &test_dentry[0],
+         .d_name = QSTR_INIT(test_dentry[0].d_iname, 3),
+         .d_iname = "foo" },
+       { .d_parent = &test_dentry[0],
+         .d_name = QSTR_INIT(test_dentry[1].d_iname, 5),
+         .d_iname = "bravo" },
+       { .d_parent = &test_dentry[1],
+         .d_name = QSTR_INIT(test_dentry[2].d_iname, 4),
+         .d_iname = "alfa" },
+       { .d_parent = &test_dentry[2],
+         .d_name = QSTR_INIT(test_dentry[3].d_iname, 5),
+         .d_iname = "romeo" },
+};
+
 static void __init
 dentry(void)
 {
+       test("foo", "%pd", &test_dentry[0]);
+       test("foo", "%pd2", &test_dentry[0]);
+
+       test("romeo", "%pd", &test_dentry[3]);
+       test("alfa/romeo", "%pd2", &test_dentry[3]);
+       test("bravo/alfa/romeo", "%pd3", &test_dentry[3]);
+       test("/bravo/alfa/romeo", "%pd4", &test_dentry[3]);
+       test("/bravo/alfa", "%pd4", &test_dentry[2]);
+
+       test("bravo/alfa  |bravo/alfa  ", "%-12pd2|%*pd2", &test_dentry[2], -12, &test_dentry[2]);
+       test("  bravo/alfa|  bravo/alfa", "%12pd2|%*pd2", &test_dentry[2], 12, &test_dentry[2]);
 }
 
 static void __init
@@ -288,6 +368,20 @@ struct_clk(void)
 {
 }
 
+static void __init
+large_bitmap(void)
+{
+       const int nbits = 1 << 16;
+       unsigned long *bits = kcalloc(BITS_TO_LONGS(nbits), sizeof(long), GFP_KERNEL);
+       if (!bits)
+               return;
+
+       bitmap_set(bits, 1, 20);
+       bitmap_set(bits, 60000, 15);
+       test("1-20,60000-60014", "%*pbl", nbits, bits);
+       kfree(bits);
+}
+
 static void __init
 bitmap(void)
 {
@@ -307,6 +401,8 @@ bitmap(void)
        bitmap_fill(bits, 20);
        test("fffff|fffff", "%20pb|%*pb", bits, 20, bits);
        test("0-19|0-19", "%20pbl|%*pbl", bits, 20, bits);
+
+       large_bitmap();
 }
 
 static void __init
@@ -337,16 +433,17 @@ test_pointer(void)
 static int __init
 test_printf_init(void)
 {
-       test_buffer = kmalloc(BUF_SIZE, GFP_KERNEL);
-       if (!test_buffer)
+       alloced_buffer = kmalloc(BUF_SIZE + 2*PAD_SIZE, GFP_KERNEL);
+       if (!alloced_buffer)
                return -ENOMEM;
+       test_buffer = alloced_buffer + PAD_SIZE;
 
        test_basic();
        test_number();
        test_string();
        test_pointer();
 
-       kfree(test_buffer);
+       kfree(alloced_buffer);
 
        if (failed_tests == 0)
                pr_info("all %u tests passed\n", total_tests);
index ac3f9476b7765bc23ddcaf3db2166d11cd85d24c..48ff9c36644d64c324c5c243212f282e406dce2f 100644 (file)
@@ -383,13 +383,14 @@ enum format_type {
 };
 
 struct printf_spec {
-       u8      type;           /* format_type enum */
-       u8      flags;          /* flags to number() */
-       u8      base;           /* number base, 8, 10 or 16 only */
-       u8      qualifier;      /* number qualifier, one of 'hHlLtzZ' */
-       s16     field_width;    /* width of output field */
-       s16     precision;      /* # of digits/chars */
-};
+       unsigned int    type:8;         /* format_type enum */
+       signed int      field_width:24; /* width of output field */
+       unsigned int    flags:8;        /* flags to number() */
+       unsigned int    base:8;         /* number base, 8, 10 or 16 only */
+       signed int      precision:16;   /* # of digits/chars */
+} __packed;
+#define FIELD_WIDTH_MAX ((1 << 23) - 1)
+#define PRECISION_MAX ((1 << 15) - 1)
 
 static noinline_for_stack
 char *number(char *buf, char *end, unsigned long long num,
@@ -402,6 +403,10 @@ char *number(char *buf, char *end, unsigned long long num,
        int need_pfx = ((spec.flags & SPECIAL) && spec.base != 10);
        int i;
        bool is_zero = num == 0LL;
+       int field_width = spec.field_width;
+       int precision = spec.precision;
+
+       BUILD_BUG_ON(sizeof(struct printf_spec) != 8);
 
        /* locase = 0 or 0x20. ORing digits or letters with 'locase'
         * produces same digits or (maybe lowercased) letters */
@@ -413,20 +418,20 @@ char *number(char *buf, char *end, unsigned long long num,
                if ((signed long long)num < 0) {
                        sign = '-';
                        num = -(signed long long)num;
-                       spec.field_width--;
+                       field_width--;
                } else if (spec.flags & PLUS) {
                        sign = '+';
-                       spec.field_width--;
+                       field_width--;
                } else if (spec.flags & SPACE) {
                        sign = ' ';
-                       spec.field_width--;
+                       field_width--;
                }
        }
        if (need_pfx) {
                if (spec.base == 16)
-                       spec.field_width -= 2;
+                       field_width -= 2;
                else if (!is_zero)
-                       spec.field_width--;
+                       field_width--;
        }
 
        /* generate full string in tmp[], in reverse order */
@@ -448,12 +453,12 @@ char *number(char *buf, char *end, unsigned long long num,
        }
 
        /* printing 100 using %2d gives "100", not "00" */
-       if (i > spec.precision)
-               spec.precision = i;
+       if (i > precision)
+               precision = i;
        /* leading space padding */
-       spec.field_width -= spec.precision;
+       field_width -= precision;
        if (!(spec.flags & (ZEROPAD | LEFT))) {
-               while (--spec.field_width >= 0) {
+               while (--field_width >= 0) {
                        if (buf < end)
                                *buf = ' ';
                        ++buf;
@@ -482,14 +487,14 @@ char *number(char *buf, char *end, unsigned long long num,
        if (!(spec.flags & LEFT)) {
                char c = ' ' + (spec.flags & ZEROPAD);
                BUILD_BUG_ON(' ' + ZEROPAD != '0');
-               while (--spec.field_width >= 0) {
+               while (--field_width >= 0) {
                        if (buf < end)
                                *buf = c;
                        ++buf;
                }
        }
        /* hmm even more zero padding? */
-       while (i <= --spec.precision) {
+       while (i <= --precision) {
                if (buf < end)
                        *buf = '0';
                ++buf;
@@ -501,7 +506,7 @@ char *number(char *buf, char *end, unsigned long long num,
                ++buf;
        }
        /* trailing space padding */
-       while (--spec.field_width >= 0) {
+       while (--field_width >= 0) {
                if (buf < end)
                        *buf = ' ';
                ++buf;
@@ -511,37 +516,20 @@ char *number(char *buf, char *end, unsigned long long num,
 }
 
 static noinline_for_stack
-char *string(char *buf, char *end, const char *s, struct printf_spec spec)
+char *special_hex_number(char *buf, char *end, unsigned long long num, int size)
 {
-       int len, i;
-
-       if ((unsigned long)s < PAGE_SIZE)
-               s = "(null)";
+       struct printf_spec spec;
 
-       len = strnlen(s, spec.precision);
-
-       if (!(spec.flags & LEFT)) {
-               while (len < spec.field_width--) {
-                       if (buf < end)
-                               *buf = ' ';
-                       ++buf;
-               }
-       }
-       for (i = 0; i < len; ++i) {
-               if (buf < end)
-                       *buf = *s;
-               ++buf; ++s;
-       }
-       while (len < spec.field_width--) {
-               if (buf < end)
-                       *buf = ' ';
-               ++buf;
-       }
+       spec.type = FORMAT_TYPE_PTR;
+       spec.field_width = 2 + 2 * size;        /* 0x + hex */
+       spec.flags = SPECIAL | SMALL | ZEROPAD;
+       spec.base = 16;
+       spec.precision = -1;
 
-       return buf;
+       return number(buf, end, num, spec);
 }
 
-static void widen(char *buf, char *end, unsigned len, unsigned spaces)
+static void move_right(char *buf, char *end, unsigned len, unsigned spaces)
 {
        size_t size;
        if (buf >= end) /* nowhere to put anything */
@@ -559,6 +547,56 @@ static void widen(char *buf, char *end, unsigned len, unsigned spaces)
        memset(buf, ' ', spaces);
 }
 
+/*
+ * Handle field width padding for a string.
+ * @buf: current buffer position
+ * @n: length of string
+ * @end: end of output buffer
+ * @spec: for field width and flags
+ * Returns: new buffer position after padding.
+ */
+static noinline_for_stack
+char *widen_string(char *buf, int n, char *end, struct printf_spec spec)
+{
+       unsigned spaces;
+
+       if (likely(n >= spec.field_width))
+               return buf;
+       /* we want to pad the sucker */
+       spaces = spec.field_width - n;
+       if (!(spec.flags & LEFT)) {
+               move_right(buf - n, end, n, spaces);
+               return buf + spaces;
+       }
+       while (spaces--) {
+               if (buf < end)
+                       *buf = ' ';
+               ++buf;
+       }
+       return buf;
+}
+
+static noinline_for_stack
+char *string(char *buf, char *end, const char *s, struct printf_spec spec)
+{
+       int len = 0;
+       size_t lim = spec.precision;
+
+       if ((unsigned long)s < PAGE_SIZE)
+               s = "(null)";
+
+       while (lim--) {
+               char c = *s++;
+               if (!c)
+                       break;
+               if (buf < end)
+                       *buf = c;
+               ++buf;
+               ++len;
+       }
+       return widen_string(buf, len, end, spec);
+}
+
 static noinline_for_stack
 char *dentry_name(char *buf, char *end, const struct dentry *d, struct printf_spec spec,
                  const char *fmt)
@@ -600,20 +638,7 @@ char *dentry_name(char *buf, char *end, const struct dentry *d, struct printf_sp
                        *buf = c;
        }
        rcu_read_unlock();
-       if (n < spec.field_width) {
-               /* we want to pad the sucker */
-               unsigned spaces = spec.field_width - n;
-               if (!(spec.flags & LEFT)) {
-                       widen(buf - n, end, n, spaces);
-                       return buf + spaces;
-               }
-               while (spaces--) {
-                       if (buf < end)
-                               *buf = ' ';
-                       ++buf;
-               }
-       }
-       return buf;
+       return widen_string(buf, n, end, spec);
 }
 
 #ifdef CONFIG_BLOCK
@@ -659,11 +684,7 @@ char *symbol_string(char *buf, char *end, void *ptr,
 
        return string(buf, end, sym, spec);
 #else
-       spec.field_width = 2 * sizeof(void *);
-       spec.flags |= SPECIAL | SMALL | ZEROPAD;
-       spec.base = 16;
-
-       return number(buf, end, value, spec);
+       return special_hex_number(buf, end, value, sizeof(void *));
 #endif
 }
 
@@ -1324,40 +1345,45 @@ char *uuid_string(char *buf, char *end, const u8 *addr,
        return string(buf, end, uuid, spec);
 }
 
-static
-char *netdev_feature_string(char *buf, char *end, const u8 *addr,
-                     struct printf_spec spec)
+static noinline_for_stack
+char *netdev_bits(char *buf, char *end, const void *addr, const char *fmt)
 {
-       spec.flags |= SPECIAL | SMALL | ZEROPAD;
-       if (spec.field_width == -1)
-               spec.field_width = 2 + 2 * sizeof(netdev_features_t);
-       spec.base = 16;
+       unsigned long long num;
+       int size;
 
-       return number(buf, end, *(const netdev_features_t *)addr, spec);
+       switch (fmt[1]) {
+       case 'F':
+               num = *(const netdev_features_t *)addr;
+               size = sizeof(netdev_features_t);
+               break;
+       default:
+               num = (unsigned long)addr;
+               size = sizeof(unsigned long);
+               break;
+       }
+
+       return special_hex_number(buf, end, num, size);
 }
 
 static noinline_for_stack
-char *address_val(char *buf, char *end, const void *addr,
-                 struct printf_spec spec, const char *fmt)
+char *address_val(char *buf, char *end, const void *addr, const char *fmt)
 {
        unsigned long long num;
-
-       spec.flags |= SPECIAL | SMALL | ZEROPAD;
-       spec.base = 16;
+       int size;
 
        switch (fmt[1]) {
        case 'd':
                num = *(const dma_addr_t *)addr;
-               spec.field_width = sizeof(dma_addr_t) * 2 + 2;
+               size = sizeof(dma_addr_t);
                break;
        case 'p':
        default:
                num = *(const phys_addr_t *)addr;
-               spec.field_width = sizeof(phys_addr_t) * 2 + 2;
+               size = sizeof(phys_addr_t);
                break;
        }
 
-       return number(buf, end, num, spec);
+       return special_hex_number(buf, end, num, size);
 }
 
 static noinline_for_stack
@@ -1376,10 +1402,7 @@ char *clock(char *buf, char *end, struct clk *clk, struct printf_spec spec,
 #ifdef CONFIG_COMMON_CLK
                return string(buf, end, __clk_get_name(clk), spec);
 #else
-               spec.base = 16;
-               spec.field_width = sizeof(unsigned long) * 2 + 2;
-               spec.flags |= SPECIAL | SMALL | ZEROPAD;
-               return number(buf, end, (unsigned long)clk, spec);
+               return special_hex_number(buf, end, (unsigned long)clk, sizeof(unsigned long));
 #endif
        }
 }
@@ -1609,13 +1632,9 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr,
                break;
 
        case 'N':
-               switch (fmt[1]) {
-               case 'F':
-                       return netdev_feature_string(buf, end, ptr, spec);
-               }
-               break;
+               return netdev_bits(buf, end, ptr, fmt);
        case 'a':
-               return address_val(buf, end, ptr, spec, fmt);
+               return address_val(buf, end, ptr, fmt);
        case 'd':
                return dentry_name(buf, end, ptr, spec, fmt);
        case 'C':
@@ -1664,6 +1683,7 @@ static noinline_for_stack
 int format_decode(const char *fmt, struct printf_spec *spec)
 {
        const char *start = fmt;
+       char qualifier;
 
        /* we finished early by reading the field width */
        if (spec->type == FORMAT_TYPE_WIDTH) {
@@ -1746,16 +1766,16 @@ precision:
 
 qualifier:
        /* get the conversion qualifier */
-       spec->qualifier = -1;
+       qualifier = 0;
        if (*fmt == 'h' || _tolower(*fmt) == 'l' ||
            _tolower(*fmt) == 'z' || *fmt == 't') {
-               spec->qualifier = *fmt++;
-               if (unlikely(spec->qualifier == *fmt)) {
-                       if (spec->qualifier == 'l') {
-                               spec->qualifier = 'L';
+               qualifier = *fmt++;
+               if (unlikely(qualifier == *fmt)) {
+                       if (qualifier == 'l') {
+                               qualifier = 'L';
                                ++fmt;
-                       } else if (spec->qualifier == 'h') {
-                               spec->qualifier = 'H';
+                       } else if (qualifier == 'h') {
+                               qualifier = 'H';
                                ++fmt;
                        }
                }
@@ -1812,19 +1832,19 @@ qualifier:
                return fmt - start;
        }
 
-       if (spec->qualifier == 'L')
+       if (qualifier == 'L')
                spec->type = FORMAT_TYPE_LONG_LONG;
-       else if (spec->qualifier == 'l') {
+       else if (qualifier == 'l') {
                BUILD_BUG_ON(FORMAT_TYPE_ULONG + SIGN != FORMAT_TYPE_LONG);
                spec->type = FORMAT_TYPE_ULONG + (spec->flags & SIGN);
-       } else if (_tolower(spec->qualifier) == 'z') {
+       } else if (_tolower(qualifier) == 'z') {
                spec->type = FORMAT_TYPE_SIZE_T;
-       } else if (spec->qualifier == 't') {
+       } else if (qualifier == 't') {
                spec->type = FORMAT_TYPE_PTRDIFF;
-       } else if (spec->qualifier == 'H') {
+       } else if (qualifier == 'H') {
                BUILD_BUG_ON(FORMAT_TYPE_UBYTE + SIGN != FORMAT_TYPE_BYTE);
                spec->type = FORMAT_TYPE_UBYTE + (spec->flags & SIGN);
-       } else if (spec->qualifier == 'h') {
+       } else if (qualifier == 'h') {
                BUILD_BUG_ON(FORMAT_TYPE_USHORT + SIGN != FORMAT_TYPE_SHORT);
                spec->type = FORMAT_TYPE_USHORT + (spec->flags & SIGN);
        } else {
@@ -1835,6 +1855,24 @@ qualifier:
        return ++fmt - start;
 }
 
+static void
+set_field_width(struct printf_spec *spec, int width)
+{
+       spec->field_width = width;
+       if (WARN_ONCE(spec->field_width != width, "field width %d too large", width)) {
+               spec->field_width = clamp(width, -FIELD_WIDTH_MAX, FIELD_WIDTH_MAX);
+       }
+}
+
+static void
+set_precision(struct printf_spec *spec, int prec)
+{
+       spec->precision = prec;
+       if (WARN_ONCE(spec->precision != prec, "precision %d too large", prec)) {
+               spec->precision = clamp(prec, 0, PRECISION_MAX);
+       }
+}
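
set_field_width() and set_precision() rely on printf_spec keeping these values in narrow bitfields (field_width is widened to 24 bits elsewhere in this series): writing the argument and reading it back detects truncation, so WARN_ONCE fires before the value is clamped. A standalone model of that idiom, using a hypothetical struct and assuming GCC-style bitfield truncation:

#include <stdio.h>

struct spec_model {
	int field_width : 24;	/* mirrors the narrowed printf_spec field */
};

static void set_width_model(struct spec_model *s, int width)
{
	s->field_width = width;
	if (s->field_width != width) {	/* value didn't survive the bitfield */
		fprintf(stderr, "field width %d too large\n", width);
		s->field_width = width < 0 ? -(1 << 23) + 1 : (1 << 23) - 1;
	}
}

int main(void)
{
	struct spec_model s;

	set_width_model(&s, 1 << 24);	/* triggers the clamp path */
	printf("clamped to %d\n", s.field_width);
	return 0;
}
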
+
 /**
  * vsnprintf - Format a string and place it in a buffer
  * @buf: The buffer to place the result into
@@ -1902,11 +1940,11 @@ int vsnprintf(char *buf, size_t size, const char *fmt, va_list args)
                }
 
                case FORMAT_TYPE_WIDTH:
-                       spec.field_width = va_arg(args, int);
+                       set_field_width(&spec, va_arg(args, int));
                        break;
 
                case FORMAT_TYPE_PRECISION:
-                       spec.precision = va_arg(args, int);
+                       set_precision(&spec, va_arg(args, int));
                        break;
 
                case FORMAT_TYPE_CHAR: {
@@ -2346,11 +2384,11 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf)
                }
 
                case FORMAT_TYPE_WIDTH:
-                       spec.field_width = get_arg(int);
+                       set_field_width(&spec, get_arg(int));
                        break;
 
                case FORMAT_TYPE_PRECISION:
-                       spec.precision = get_arg(int);
+                       set_precision(&spec, get_arg(int));
                        break;
 
                case FORMAT_TYPE_CHAR: {
index 5d2072ed8d5e7c925877b0759163035a9a2a0fe0..f05b2d5d6481f64913500c1ad01857d53a5de0de 100644 (file)
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -40,9 +40,6 @@ static const struct trace_print_flags pageflag_names[] = {
 #ifdef CONFIG_MEMORY_FAILURE
        {1UL << PG_hwpoison,            "hwpoison"      },
 #endif
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-       {1UL << PG_compound_lock,       "compound_lock" },
-#endif
 #if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
        {1UL << PG_young,               "young"         },
        {1UL << PG_idle,                "idle"          },
@@ -82,9 +79,12 @@ static void dump_flags(unsigned long flags,
 void dump_page_badflags(struct page *page, const char *reason,
                unsigned long badflags)
 {
-       pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
+       pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx",
                  page, atomic_read(&page->_count), page_mapcount(page),
                  page->mapping, page->index);
+       if (PageCompound(page))
+               pr_cont(" compound_mapcount: %d", compound_mapcount(page));
+       pr_cont("\n");
        BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
        dump_flags(page->flags, pageflag_names, ARRAY_SIZE(pageflag_names));
        if (reason)
index ff42d31c891a1f9fcce038da09c5229fdc0029ed..847ee43c28068a0fb744fe124c1d8afa429d0719 100644 (file)
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -204,7 +204,7 @@ void __delete_from_page_cache(struct page *page, void *shadow,
                __dec_zone_page_state(page, NR_FILE_PAGES);
        if (PageSwapBacked(page))
                __dec_zone_page_state(page, NR_SHMEM);
-       BUG_ON(page_mapped(page));
+       VM_BUG_ON_PAGE(page_mapped(page), page);
 
        /*
         * At this point page must be either written or cleaned by truncate.
@@ -618,7 +618,7 @@ static int __add_to_page_cache_locked(struct page *page,
 
        if (!huge) {
                error = mem_cgroup_try_charge(page, current->mm,
-                                             gfp_mask, &memcg);
+                                             gfp_mask, &memcg, false);
                if (error)
                        return error;
        }
@@ -626,7 +626,7 @@ static int __add_to_page_cache_locked(struct page *page,
        error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM);
        if (error) {
                if (!huge)
-                       mem_cgroup_cancel_charge(page, memcg);
+                       mem_cgroup_cancel_charge(page, memcg, false);
                return error;
        }
 
@@ -645,7 +645,7 @@ static int __add_to_page_cache_locked(struct page *page,
                __inc_zone_page_state(page, NR_FILE_PAGES);
        spin_unlock_irq(&mapping->tree_lock);
        if (!huge)
-               mem_cgroup_commit_charge(page, memcg, false);
+               mem_cgroup_commit_charge(page, memcg, false, false);
        trace_mm_filemap_add_to_page_cache(page);
        return 0;
 err_insert:
@@ -653,7 +653,7 @@ err_insert:
        /* Leave page->index set: truncation relies upon it */
        spin_unlock_irq(&mapping->tree_lock);
        if (!huge)
-               mem_cgroup_cancel_charge(page, memcg);
+               mem_cgroup_cancel_charge(page, memcg, false);
        page_cache_release(page);
        return error;
 }
@@ -682,11 +682,11 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
        void *shadow = NULL;
        int ret;
 
-       __set_page_locked(page);
+       __SetPageLocked(page);
        ret = __add_to_page_cache_locked(page, mapping, offset,
                                         gfp_mask, &shadow);
        if (unlikely(ret))
-               __clear_page_locked(page);
+               __ClearPageLocked(page);
        else {
                /*
                 * The page might have been evicted from cache only
@@ -809,6 +809,7 @@ EXPORT_SYMBOL_GPL(add_page_wait_queue);
  */
 void unlock_page(struct page *page)
 {
+       page = compound_head(page);
        VM_BUG_ON_PAGE(!PageLocked(page), page);
        clear_bit_unlock(PG_locked, &page->flags);
        smp_mb__after_atomic();
@@ -873,18 +874,20 @@ EXPORT_SYMBOL_GPL(page_endio);
  */
 void __lock_page(struct page *page)
 {
-       DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
+       struct page *page_head = compound_head(page);
+       DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked);
 
-       __wait_on_bit_lock(page_waitqueue(page), &wait, bit_wait_io,
+       __wait_on_bit_lock(page_waitqueue(page_head), &wait, bit_wait_io,
                                                        TASK_UNINTERRUPTIBLE);
 }
 EXPORT_SYMBOL(__lock_page);
 
 int __lock_page_killable(struct page *page)
 {
-       DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
+       struct page *page_head = compound_head(page);
+       DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked);
 
-       return __wait_on_bit_lock(page_waitqueue(page), &wait,
+       return __wait_on_bit_lock(page_waitqueue(page_head), &wait,
                                        bit_wait_io, TASK_KILLABLE);
 }
 EXPORT_SYMBOL_GPL(__lock_page_killable);
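
The locking helpers above resolve the compound head first because, after the page-flags rework merged here, PG_locked is only tracked on the head page while tail pages carry an encoded pointer to their head. Simplified from that rework (not from this hunk), the lookup is essentially:

/* tail pages store their head pointer with bit 0 set in
 * page->compound_head; head and non-compound pages store 0 there */
static inline struct page *compound_head(struct page *page)
{
	unsigned long head = READ_ONCE(page->compound_head);

	if (unlikely(head & 1))
		return (struct page *)(head - 1);
	return page;
}
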
index deafa2c91b362206b7ef56aec918509a9cc24dd2..b64a36175884e07604b0e216bc2d545a2892dcb7 100644 (file)
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -4,6 +4,7 @@
 #include <linux/spinlock.h>
 
 #include <linux/mm.h>
+#include <linux/memremap.h>
 #include <linux/pagemap.h>
 #include <linux/rmap.h>
 #include <linux/swap.h>
@@ -62,6 +63,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
                unsigned long address, pmd_t *pmd, unsigned int flags)
 {
        struct mm_struct *mm = vma->vm_mm;
+       struct dev_pagemap *pgmap = NULL;
        struct page *page;
        spinlock_t *ptl;
        pte_t *ptep, pte;
@@ -98,7 +100,17 @@ retry:
        }
 
        page = vm_normal_page(vma, address, pte);
-       if (unlikely(!page)) {
+       if (!page && pte_devmap(pte) && (flags & FOLL_GET)) {
+               /*
+                * Only return device mapping pages in the FOLL_GET case since
+                * they are only valid while holding the pgmap reference.
+                */
+               pgmap = get_dev_pagemap(pte_pfn(pte), NULL);
+               if (pgmap)
+                       page = pte_page(pte);
+               else
+                       goto no_page;
+       } else if (unlikely(!page)) {
                if (flags & FOLL_DUMP) {
                        /* Avoid special (like zero) pages in core dumps */
                        page = ERR_PTR(-EFAULT);
@@ -116,8 +128,28 @@ retry:
                }
        }
 
-       if (flags & FOLL_GET)
-               get_page_foll(page);
+       if (flags & FOLL_SPLIT && PageTransCompound(page)) {
+               int ret;
+               get_page(page);
+               pte_unmap_unlock(ptep, ptl);
+               lock_page(page);
+               ret = split_huge_page(page);
+               unlock_page(page);
+               put_page(page);
+               if (ret)
+                       return ERR_PTR(ret);
+               goto retry;
+       }
+
+       if (flags & FOLL_GET) {
+               get_page(page);
+
+               /* drop the pgmap reference now that we hold the page */
+               if (pgmap) {
+                       put_dev_pagemap(pgmap);
+                       pgmap = NULL;
+               }
+       }
        if (flags & FOLL_TOUCH) {
                if ((flags & FOLL_WRITE) &&
                    !pte_dirty(pte) && !PageDirty(page))
@@ -130,6 +162,10 @@ retry:
                mark_page_accessed(page);
        }
        if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
+               /* Do not mlock pte-mapped THP */
+               if (PageTransCompound(page))
+                       goto out;
+
                /*
                 * The preliminary mapping check is mainly to avoid the
                 * pointless overhead of lock_page on the ZERO_PAGE
@@ -220,27 +256,45 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
        }
        if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
                return no_page_table(vma, flags);
-       if (pmd_trans_huge(*pmd)) {
-               if (flags & FOLL_SPLIT) {
-                       split_huge_page_pmd(vma, address, pmd);
-                       return follow_page_pte(vma, address, pmd, flags);
-               }
+       if (pmd_devmap(*pmd)) {
                ptl = pmd_lock(mm, pmd);
-               if (likely(pmd_trans_huge(*pmd))) {
-                       if (unlikely(pmd_trans_splitting(*pmd))) {
-                               spin_unlock(ptl);
-                               wait_split_huge_page(vma->anon_vma, pmd);
-                       } else {
-                               page = follow_trans_huge_pmd(vma, address,
-                                                            pmd, flags);
-                               spin_unlock(ptl);
-                               *page_mask = HPAGE_PMD_NR - 1;
-                               return page;
-                       }
-               } else
+               page = follow_devmap_pmd(vma, address, pmd, flags);
+               spin_unlock(ptl);
+               if (page)
+                       return page;
+       }
+       if (likely(!pmd_trans_huge(*pmd)))
+               return follow_page_pte(vma, address, pmd, flags);
+
+       ptl = pmd_lock(mm, pmd);
+       if (unlikely(!pmd_trans_huge(*pmd))) {
+               spin_unlock(ptl);
+               return follow_page_pte(vma, address, pmd, flags);
+       }
+       if (flags & FOLL_SPLIT) {
+               int ret;
+               page = pmd_page(*pmd);
+               if (is_huge_zero_page(page)) {
+                       spin_unlock(ptl);
+                       ret = 0;
+                       split_huge_pmd(vma, pmd, address);
+               } else {
+                       get_page(page);
                        spin_unlock(ptl);
+                       lock_page(page);
+                       ret = split_huge_page(page);
+                       unlock_page(page);
+                       put_page(page);
+               }
+
+               return ret ? ERR_PTR(ret) :
+                       follow_page_pte(vma, address, pmd, flags);
        }
-       return follow_page_pte(vma, address, pmd, flags);
+
+       page = follow_trans_huge_pmd(vma, address, pmd, flags);
+       spin_unlock(ptl);
+       *page_mask = HPAGE_PMD_NR - 1;
+       return page;
 }
 
 static int get_gate_page(struct mm_struct *mm, unsigned long address,
@@ -564,6 +618,8 @@ EXPORT_SYMBOL(__get_user_pages);
  * @mm:                mm_struct of target mm
  * @address:   user address
  * @fault_flags:flags to pass down to handle_mm_fault()
+ * @unlocked:  set to true if the mmap_sem was unlocked while retrying; may be
+ *             NULL if the caller does not allow retry
  *
  * This is meant to be called in the specific scenario where for locking reasons
  * we try to access user memory in atomic context (within a pagefault_disable()
@@ -575,22 +631,28 @@ EXPORT_SYMBOL(__get_user_pages);
  * The main difference with get_user_pages() is that this function will
  * unconditionally call handle_mm_fault() which will in turn perform all the
  * necessary SW fixup of the dirty and young bits in the PTE, while
- * handle_mm_fault() only guarantees to update these in the struct page.
+ * get_user_pages() only guarantees to update these in the struct page.
  *
  * This is important for some architectures where those bits also gate the
  * access permission to the page because they are maintained in software.  On
  * such architectures, gup() will not be enough to make a subsequent access
  * succeed.
  *
- * This has the same semantics wrt the @mm->mmap_sem as does filemap_fault().
+ * This function will not return with an unlocked mmap_sem. So it does not
+ * have the same semantics wrt the @mm->mmap_sem as filemap_fault() does.
  */
 int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
-                    unsigned long address, unsigned int fault_flags)
+                    unsigned long address, unsigned int fault_flags,
+                    bool *unlocked)
 {
        struct vm_area_struct *vma;
        vm_flags_t vm_flags;
-       int ret;
+       int ret, major = 0;
 
+       if (unlocked)
+               fault_flags |= FAULT_FLAG_ALLOW_RETRY;
+
+retry:
        vma = find_extend_vma(mm, address);
        if (!vma || address < vma->vm_start)
                return -EFAULT;
@@ -600,6 +662,7 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
                return -EFAULT;
 
        ret = handle_mm_fault(mm, vma, address, fault_flags);
+       major |= ret & VM_FAULT_MAJOR;
        if (ret & VM_FAULT_ERROR) {
                if (ret & VM_FAULT_OOM)
                        return -ENOMEM;
@@ -609,8 +672,19 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
                        return -EFAULT;
                BUG();
        }
+
+       if (ret & VM_FAULT_RETRY) {
+               down_read(&mm->mmap_sem);
+               if (!(fault_flags & FAULT_FLAG_TRIED)) {
+                       *unlocked = true;
+                       fault_flags &= ~FAULT_FLAG_ALLOW_RETRY;
+                       fault_flags |= FAULT_FLAG_TRIED;
+                       goto retry;
+               }
+       }
+
        if (tsk) {
-               if (ret & VM_FAULT_MAJOR)
+               if (major)
                        tsk->maj_flt++;
                else
                        tsk->min_flt++;
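
Callers that pass a non-NULL @unlocked must enter with mmap_sem held for read, and must treat *unlocked == true as a signal that the lock was dropped and retaken, so any state derived from the old mapping needs revalidation. A hedged caller sketch, not lifted from this patch:

	bool unlocked = false;
	int ret;

	down_read(&mm->mmap_sem);
	ret = fixup_user_fault(current, mm, address, FAULT_FLAG_WRITE,
			       &unlocked);
	if (unlocked) {
		/* mmap_sem was released while the fault retried:
		 * re-derive anything cached from the old mapping */
	}
	up_read(&mm->mmap_sem);
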
@@ -896,7 +970,6 @@ long populate_vma_page_range(struct vm_area_struct *vma,
        gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK;
        if (vma->vm_flags & VM_LOCKONFAULT)
                gup_flags &= ~FOLL_POPULATE;
-
        /*
         * We want to touch writable mappings with a write fault in order
         * to break COW, except for shared mappings because these don't COW
@@ -1036,9 +1109,6 @@ struct page *get_dump_page(unsigned long addr)
  *  *) HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table is used to free
  *      pages containing page tables.
  *
- *  *) THP splits will broadcast an IPI, this can be achieved by overriding
- *      pmdp_splitting_flush.
- *
  *  *) ptes can be read atomically by the architecture.
  *
  *  *) access_ok is sufficient to validate userspace address ranges.
@@ -1066,7 +1136,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
                 * for an example see gup_get_pte in arch/x86/mm/gup.c
                 */
                pte_t pte = READ_ONCE(*ptep);
-               struct page *page;
+               struct page *head, *page;
 
                /*
                 * Similar to the PMD case below, NUMA hinting must take slow
@@ -1078,15 +1148,17 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
 
                VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
                page = pte_page(pte);
+               head = compound_head(page);
 
-               if (!page_cache_get_speculative(page))
+               if (!page_cache_get_speculative(head))
                        goto pte_unmap;
 
                if (unlikely(pte_val(pte) != pte_val(*ptep))) {
-                       put_page(page);
+                       put_page(head);
                        goto pte_unmap;
                }
 
+               VM_BUG_ON_PAGE(compound_head(page) != head, page);
                pages[*nr] = page;
                (*nr)++;
 
@@ -1119,7 +1191,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
 static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
                unsigned long end, int write, struct page **pages, int *nr)
 {
-       struct page *head, *page, *tail;
+       struct page *head, *page;
        int refs;
 
        if (write && !pmd_write(orig))
@@ -1128,7 +1200,6 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
        refs = 0;
        head = pmd_page(orig);
        page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
-       tail = page;
        do {
                VM_BUG_ON_PAGE(compound_head(page) != head, page);
                pages[*nr] = page;
@@ -1149,24 +1220,13 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
                return 0;
        }
 
-       /*
-        * Any tail pages need their mapcount reference taken before we
-        * return. (This allows the THP code to bump their ref count when
-        * they are split into base pages).
-        */
-       while (refs--) {
-               if (PageTail(tail))
-                       get_huge_page_tail(tail);
-               tail++;
-       }
-
        return 1;
 }
 
 static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
                unsigned long end, int write, struct page **pages, int *nr)
 {
-       struct page *head, *page, *tail;
+       struct page *head, *page;
        int refs;
 
        if (write && !pud_write(orig))
@@ -1175,7 +1235,6 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
        refs = 0;
        head = pud_page(orig);
        page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
-       tail = page;
        do {
                VM_BUG_ON_PAGE(compound_head(page) != head, page);
                pages[*nr] = page;
@@ -1196,12 +1255,6 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
                return 0;
        }
 
-       while (refs--) {
-               if (PageTail(tail))
-                       get_huge_page_tail(tail);
-               tail++;
-       }
-
        return 1;
 }
 
@@ -1210,7 +1263,7 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
                        struct page **pages, int *nr)
 {
        int refs;
-       struct page *head, *page, *tail;
+       struct page *head, *page;
 
        if (write && !pgd_write(orig))
                return 0;
@@ -1218,7 +1271,6 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
        refs = 0;
        head = pgd_page(orig);
        page = head + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
-       tail = page;
        do {
                VM_BUG_ON_PAGE(compound_head(page) != head, page);
                pages[*nr] = page;
@@ -1239,12 +1291,6 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
                return 0;
        }
 
-       while (refs--) {
-               if (PageTail(tail))
-                       get_huge_page_tail(tail);
-               tail++;
-       }
-
        return 1;
 }
 
@@ -1259,7 +1305,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
                pmd_t pmd = READ_ONCE(*pmdp);
 
                next = pmd_addr_end(addr, end);
-               if (pmd_none(pmd) || pmd_trans_splitting(pmd))
+               if (pmd_none(pmd))
                        return 0;
 
                if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) {
index f952f055fdcf571ec82893d388a73fa6b23b284b..b2db98136af9d574646ef3f25d3cf1d462c174c1 100644 (file)
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
 #include <linux/swap.h>
 #include <linux/shrinker.h>
 #include <linux/mm_inline.h>
+#include <linux/swapops.h>
 #include <linux/dax.h>
 #include <linux/kthread.h>
 #include <linux/khugepaged.h>
 #include <linux/freezer.h>
+#include <linux/pfn_t.h>
 #include <linux/mman.h>
+#include <linux/memremap.h>
 #include <linux/pagemap.h>
+#include <linux/debugfs.h>
 #include <linux/migrate.h>
 #include <linux/hashtable.h>
 #include <linux/userfaultfd_k.h>
@@ -45,6 +49,7 @@ enum scan_result {
        SCAN_PAGE_LRU,
        SCAN_PAGE_LOCK,
        SCAN_PAGE_ANON,
+       SCAN_PAGE_COMPOUND,
        SCAN_ANY_PROCESS,
        SCAN_VMA_NULL,
        SCAN_VMA_CHECK,
@@ -133,6 +138,10 @@ static struct khugepaged_scan khugepaged_scan = {
        .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
 };
 
+static DEFINE_SPINLOCK(split_queue_lock);
+static LIST_HEAD(split_queue);
+static unsigned long split_queue_len;
+static struct shrinker deferred_split_shrinker;
 
 static void set_recommended_min_free_kbytes(void)
 {
@@ -665,6 +674,9 @@ static int __init hugepage_init(void)
        err = register_shrinker(&huge_zero_page_shrinker);
        if (err)
                goto err_hzp_shrinker;
+       err = register_shrinker(&deferred_split_shrinker);
+       if (err)
+               goto err_split_shrinker;
 
        /*
         * By default disable transparent hugepages on smaller systems,
@@ -682,6 +694,8 @@ static int __init hugepage_init(void)
 
        return 0;
 err_khugepaged:
+       unregister_shrinker(&deferred_split_shrinker);
+err_split_shrinker:
        unregister_shrinker(&huge_zero_page_shrinker);
 err_hzp_shrinker:
        khugepaged_slab_exit();
@@ -738,6 +752,27 @@ static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot)
        return entry;
 }
 
+static inline struct list_head *page_deferred_list(struct page *page)
+{
+       /*
+        * ->lru in the tail pages is occupied by compound_head.
+        * Let's use ->mapping + ->index in the second tail page as list_head.
+        */
+       return (struct list_head *)&page[2].mapping;
+}
+
+void prep_transhuge_page(struct page *page)
+{
+       /*
+        * we use page->mapping and page->index in the second tail page
+        * as a list_head, assuming THP order >= 2
+        */
+       BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
+
+       INIT_LIST_HEAD(page_deferred_list(page));
+       set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
+}
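
The BUILD_BUG_ON holds because the overlay needs page[2] to exist, which requires at least an order-2 (4-page) compound page, and because ->mapping plus ->index supply the two adjacent words a list_head needs. A toy compile-time check of the size assumption, using hypothetical miniature structs rather than the kernel's:

#include <assert.h>
#include <stddef.h>

struct list_head_model { void *next, *prev; };
struct page_fields_model { void *mapping; unsigned long index; };

int main(void)
{
	/* the two-pointer list_head must fit in mapping + index */
	static_assert(sizeof(struct list_head_model) <=
		      sizeof(struct page_fields_model),
		      "list_head overlay would not fit");
	return 0;
}
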
+
 static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
                                        struct vm_area_struct *vma,
                                        unsigned long address, pmd_t *pmd,
@@ -751,7 +786,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 
        VM_BUG_ON_PAGE(!PageCompound(page), page);
 
-       if (mem_cgroup_try_charge(page, mm, gfp, &memcg)) {
+       if (mem_cgroup_try_charge(page, mm, gfp, &memcg, true)) {
                put_page(page);
                count_vm_event(THP_FAULT_FALLBACK);
                return VM_FAULT_FALLBACK;
@@ -759,7 +794,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 
        pgtable = pte_alloc_one(mm, haddr);
        if (unlikely(!pgtable)) {
-               mem_cgroup_cancel_charge(page, memcg);
+               mem_cgroup_cancel_charge(page, memcg, true);
                put_page(page);
                return VM_FAULT_OOM;
        }
@@ -775,7 +810,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
        ptl = pmd_lock(mm, pmd);
        if (unlikely(!pmd_none(*pmd))) {
                spin_unlock(ptl);
-               mem_cgroup_cancel_charge(page, memcg);
+               mem_cgroup_cancel_charge(page, memcg, true);
                put_page(page);
                pte_free(mm, pgtable);
        } else {
@@ -786,7 +821,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
                        int ret;
 
                        spin_unlock(ptl);
-                       mem_cgroup_cancel_charge(page, memcg);
+                       mem_cgroup_cancel_charge(page, memcg, true);
                        put_page(page);
                        pte_free(mm, pgtable);
                        ret = handle_userfault(vma, address, flags,
@@ -797,8 +832,8 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 
                entry = mk_huge_pmd(page, vma->vm_page_prot);
                entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
-               page_add_new_anon_rmap(page, vma, haddr);
-               mem_cgroup_commit_charge(page, memcg, false);
+               page_add_new_anon_rmap(page, vma, haddr, true);
+               mem_cgroup_commit_charge(page, memcg, false, true);
                lru_cache_add_active_or_unevictable(page, vma);
                pgtable_trans_huge_deposit(mm, pmd, pgtable);
                set_pmd_at(mm, haddr, pmd, entry);
@@ -892,32 +927,33 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
                count_vm_event(THP_FAULT_FALLBACK);
                return VM_FAULT_FALLBACK;
        }
+       prep_transhuge_page(page);
        return __do_huge_pmd_anonymous_page(mm, vma, address, pmd, page, gfp,
                                            flags);
 }
 
 static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
-               pmd_t *pmd, unsigned long pfn, pgprot_t prot, bool write)
+               pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write)
 {
        struct mm_struct *mm = vma->vm_mm;
        pmd_t entry;
        spinlock_t *ptl;
 
        ptl = pmd_lock(mm, pmd);
-       if (pmd_none(*pmd)) {
-               entry = pmd_mkhuge(pfn_pmd(pfn, prot));
-               if (write) {
-                       entry = pmd_mkyoung(pmd_mkdirty(entry));
-                       entry = maybe_pmd_mkwrite(entry, vma);
-               }
-               set_pmd_at(mm, addr, pmd, entry);
-               update_mmu_cache_pmd(vma, addr, pmd);
-       }
+       entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
+       if (pfn_t_devmap(pfn))
+               entry = pmd_mkdevmap(entry);
+       if (write) {
+               entry = pmd_mkyoung(pmd_mkdirty(entry));
+               entry = maybe_pmd_mkwrite(entry, vma);
+       }
+       set_pmd_at(mm, addr, pmd, entry);
+       update_mmu_cache_pmd(vma, addr, pmd);
        spin_unlock(ptl);
 }
 
 int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
-                       pmd_t *pmd, unsigned long pfn, bool write)
+                       pmd_t *pmd, pfn_t pfn, bool write)
 {
        pgprot_t pgprot = vma->vm_page_prot;
        /*
@@ -929,7 +965,7 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
        BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
                                                (VM_PFNMAP|VM_MIXEDMAP));
        BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
-       BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
+       BUG_ON(!pfn_t_devmap(pfn));
 
        if (addr < vma->vm_start || addr >= vma->vm_end)
                return VM_FAULT_SIGBUS;
@@ -939,6 +975,63 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
        return VM_FAULT_NOPAGE;
 }
 
+static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
+               pmd_t *pmd)
+{
+       pmd_t _pmd;
+
+       /*
+        * We should set the dirty bit only for FOLL_WRITE but for now
+        * the dirty bit in the pmd is meaningless.  If the dirty bit
+        * ever becomes meaningful and we only set it with FOLL_WRITE,
+        * an atomic set_bit will be required on the pmd to set the
+        * young bit, instead of the current set_pmd_at().
+        */
+       _pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
+       if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
+                               pmd, _pmd,  1))
+               update_mmu_cache_pmd(vma, addr, pmd);
+}
+
+struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
+               pmd_t *pmd, int flags)
+{
+       unsigned long pfn = pmd_pfn(*pmd);
+       struct mm_struct *mm = vma->vm_mm;
+       struct dev_pagemap *pgmap;
+       struct page *page;
+
+       assert_spin_locked(pmd_lockptr(mm, pmd));
+
+       if (flags & FOLL_WRITE && !pmd_write(*pmd))
+               return NULL;
+
+       if (pmd_present(*pmd) && pmd_devmap(*pmd))
+               /* pass */;
+       else
+               return NULL;
+
+       if (flags & FOLL_TOUCH)
+               touch_pmd(vma, addr, pmd);
+
+       /*
+        * device mapped pages can only be returned if the
+        * caller will manage the page reference count.
+        */
+       if (!(flags & FOLL_GET))
+               return ERR_PTR(-EEXIST);
+
+       pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
+       pgmap = get_dev_pagemap(pfn, NULL);
+       if (!pgmap)
+               return ERR_PTR(-EFAULT);
+       page = pfn_to_page(pfn);
+       get_page(page);
+       put_dev_pagemap(pgmap);
+
+       return page;
+}
+
 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
                  struct vm_area_struct *vma)
@@ -960,7 +1053,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 
        ret = -EAGAIN;
        pmd = *src_pmd;
-       if (unlikely(!pmd_trans_huge(pmd))) {
+       if (unlikely(!pmd_trans_huge(pmd) && !pmd_devmap(pmd))) {
                pte_free(dst_mm, pgtable);
                goto out_unlock;
        }
@@ -983,26 +1076,20 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                goto out_unlock;
        }
 
-       if (unlikely(pmd_trans_splitting(pmd))) {
-               /* split huge page running from under us */
-               spin_unlock(src_ptl);
-               spin_unlock(dst_ptl);
-               pte_free(dst_mm, pgtable);
-
-               wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */
-               goto out;
+       if (pmd_trans_huge(pmd)) {
+               /* thp accounting separate from pmd_devmap accounting */
+               src_page = pmd_page(pmd);
+               VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
+               get_page(src_page);
+               page_dup_rmap(src_page, true);
+               add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+               atomic_long_inc(&dst_mm->nr_ptes);
+               pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
        }
-       src_page = pmd_page(pmd);
-       VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
-       get_page(src_page);
-       page_dup_rmap(src_page);
-       add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
 
        pmdp_set_wrprotect(src_mm, addr, src_pmd);
        pmd = pmd_mkold(pmd_wrprotect(pmd));
-       pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
        set_pmd_at(dst_mm, addr, dst_pmd, pmd);
-       atomic_long_inc(&dst_mm->nr_ptes);
 
        ret = 0;
 out_unlock:
@@ -1035,37 +1122,6 @@ unlock:
        spin_unlock(ptl);
 }
 
-/*
- * Save CONFIG_DEBUG_PAGEALLOC from faulting falsely on tail pages
- * during copy_user_huge_page()'s copy_page_rep(): in the case when
- * the source page gets split and a tail freed before copy completes.
- * Called under pmd_lock of checked pmd, so safe from splitting itself.
- */
-static void get_user_huge_page(struct page *page)
-{
-       if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) {
-               struct page *endpage = page + HPAGE_PMD_NR;
-
-               atomic_add(HPAGE_PMD_NR, &page->_count);
-               while (++page < endpage)
-                       get_huge_page_tail(page);
-       } else {
-               get_page(page);
-       }
-}
-
-static void put_user_huge_page(struct page *page)
-{
-       if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) {
-               struct page *endpage = page + HPAGE_PMD_NR;
-
-               while (page < endpage)
-                       put_page(page++);
-       } else {
-               put_page(page);
-       }
-}
-
 static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
                                        struct vm_area_struct *vma,
                                        unsigned long address,
@@ -1095,13 +1151,14 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
                                               vma, address, page_to_nid(page));
                if (unlikely(!pages[i] ||
                             mem_cgroup_try_charge(pages[i], mm, GFP_KERNEL,
-                                                  &memcg))) {
+                                                  &memcg, false))) {
                        if (pages[i])
                                put_page(pages[i]);
                        while (--i >= 0) {
                                memcg = (void *)page_private(pages[i]);
                                set_page_private(pages[i], 0);
-                               mem_cgroup_cancel_charge(pages[i], memcg);
+                               mem_cgroup_cancel_charge(pages[i], memcg,
+                                               false);
                                put_page(pages[i]);
                        }
                        kfree(pages);
@@ -1139,8 +1196,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
                entry = maybe_mkwrite(pte_mkdirty(entry), vma);
                memcg = (void *)page_private(pages[i]);
                set_page_private(pages[i], 0);
-               page_add_new_anon_rmap(pages[i], vma, haddr);
-               mem_cgroup_commit_charge(pages[i], memcg, false);
+               page_add_new_anon_rmap(pages[i], vma, haddr, false);
+               mem_cgroup_commit_charge(pages[i], memcg, false, false);
                lru_cache_add_active_or_unevictable(pages[i], vma);
                pte = pte_offset_map(&_pmd, haddr);
                VM_BUG_ON(!pte_none(*pte));
@@ -1151,7 +1208,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
 
        smp_wmb(); /* make pte visible before pmd */
        pmd_populate(mm, pmd, pgtable);
-       page_remove_rmap(page);
+       page_remove_rmap(page, true);
        spin_unlock(ptl);
 
        mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
@@ -1168,7 +1225,7 @@ out_free_pages:
        for (i = 0; i < HPAGE_PMD_NR; i++) {
                memcg = (void *)page_private(pages[i]);
                set_page_private(pages[i], 0);
-               mem_cgroup_cancel_charge(pages[i], memcg);
+               mem_cgroup_cancel_charge(pages[i], memcg, false);
                put_page(pages[i]);
        }
        kfree(pages);
@@ -1198,7 +1255,17 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
        page = pmd_page(orig_pmd);
        VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
-       if (page_mapcount(page) == 1) {
+       /*
+        * We can only reuse the page if nobody else maps the huge page or its
+        * part. We can do it by checking page_mapcount() on each sub-page, but
+        * it's expensive.
+        * The cheaper way is to check that page_count() equals 1: every
+        * mapcount takes a page reference, so this way we can guarantee
+        * that the PMD is the only mapping.
+        * This can give a false negative if somebody pinned the page, but that's
+        * fine.
+        */
+       if (page_mapcount(page) == 1 && page_count(page) == 1) {
                pmd_t entry;
                entry = pmd_mkyoung(orig_pmd);
                entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
@@ -1207,7 +1274,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
                ret |= VM_FAULT_WRITE;
                goto out_unlock;
        }
-       get_user_huge_page(page);
+       get_page(page);
        spin_unlock(ptl);
 alloc:
        if (transparent_hugepage_enabled(vma) &&
@@ -1217,30 +1284,33 @@ alloc:
        } else
                new_page = NULL;
 
-       if (unlikely(!new_page)) {
+       if (likely(new_page)) {
+               prep_transhuge_page(new_page);
+       } else {
                if (!page) {
-                       split_huge_page_pmd(vma, address, pmd);
+                       split_huge_pmd(vma, pmd, address);
                        ret |= VM_FAULT_FALLBACK;
                } else {
                        ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
                                        pmd, orig_pmd, page, haddr);
                        if (ret & VM_FAULT_OOM) {
-                               split_huge_page(page);
+                               split_huge_pmd(vma, pmd, address);
                                ret |= VM_FAULT_FALLBACK;
                        }
-                       put_user_huge_page(page);
+                       put_page(page);
                }
                count_vm_event(THP_FAULT_FALLBACK);
                goto out;
        }
 
-       if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg))) {
+       if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg,
+                                          true))) {
                put_page(new_page);
                if (page) {
-                       split_huge_page(page);
-                       put_user_huge_page(page);
+                       split_huge_pmd(vma, pmd, address);
+                       put_page(page);
                } else
-                       split_huge_page_pmd(vma, address, pmd);
+                       split_huge_pmd(vma, pmd, address);
                ret |= VM_FAULT_FALLBACK;
                count_vm_event(THP_FAULT_FALLBACK);
                goto out;
@@ -1260,10 +1330,10 @@ alloc:
 
        spin_lock(ptl);
        if (page)
-               put_user_huge_page(page);
+               put_page(page);
        if (unlikely(!pmd_same(*pmd, orig_pmd))) {
                spin_unlock(ptl);
-               mem_cgroup_cancel_charge(new_page, memcg);
+               mem_cgroup_cancel_charge(new_page, memcg, true);
                put_page(new_page);
                goto out_mn;
        } else {
@@ -1271,8 +1341,8 @@ alloc:
                entry = mk_huge_pmd(new_page, vma->vm_page_prot);
                entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
                pmdp_huge_clear_flush_notify(vma, haddr, pmd);
-               page_add_new_anon_rmap(new_page, vma, haddr);
-               mem_cgroup_commit_charge(new_page, memcg, false);
+               page_add_new_anon_rmap(new_page, vma, haddr, true);
+               mem_cgroup_commit_charge(new_page, memcg, false, true);
                lru_cache_add_active_or_unevictable(new_page, vma);
                set_pmd_at(mm, haddr, pmd, entry);
                update_mmu_cache_pmd(vma, address, pmd);
@@ -1281,7 +1351,7 @@ alloc:
                        put_huge_zero_page();
                } else {
                        VM_BUG_ON_PAGE(!PageHead(page), page);
-                       page_remove_rmap(page);
+                       page_remove_rmap(page, true);
                        put_page(page);
                }
                ret |= VM_FAULT_WRITE;
@@ -1319,23 +1389,23 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 
        page = pmd_page(*pmd);
        VM_BUG_ON_PAGE(!PageHead(page), page);
-       if (flags & FOLL_TOUCH) {
-               pmd_t _pmd;
+       if (flags & FOLL_TOUCH)
+               touch_pmd(vma, addr, pmd);
+       if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
                /*
-                * We should set the dirty bit only for FOLL_WRITE but
-                * for now the dirty bit in the pmd is meaningless.
-                * And if the dirty bit will become meaningful and
-                * we'll only set it with FOLL_WRITE, an atomic
-                * set_bit will be required on the pmd to set the
-                * young bit, instead of the current set_pmd_at.
+                * We don't mlock() pte-mapped THPs. This way we can avoid
+                * leaking mlocked pages into non-VM_LOCKED VMAs.
+                *
+                * In most cases the pmd is the only mapping of the page as we
+                * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for
+                * writable private mappings in populate_vma_page_range().
+                *
+                * The only scenario where we have the page shared here is if we
+                * are mlocking a read-only mapping shared over fork(). We skip
+                * mlocking such pages.
                 */
-               _pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
-               if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
-                                         pmd, _pmd,  1))
-                       update_mmu_cache_pmd(vma, addr, pmd);
-       }
-       if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
-               if (page->mapping && trylock_page(page)) {
+               if (compound_mapcount(page) == 1 && !PageDoubleMap(page) &&
+                               page->mapping && trylock_page(page)) {
                        lru_add_drain();
                        if (page->mapping)
                                mlock_vma_page(page);
@@ -1345,7 +1415,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
        page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
        VM_BUG_ON_PAGE(!PageCompound(page), page);
        if (flags & FOLL_GET)
-               get_page_foll(page);
+               get_page(page);
 
 out:
        return page;
@@ -1480,13 +1550,84 @@ out:
        return 0;
 }
 
+int madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
+               pmd_t *pmd, unsigned long addr, unsigned long next)
+{
+       spinlock_t *ptl;
+       pmd_t orig_pmd;
+       struct page *page;
+       struct mm_struct *mm = tlb->mm;
+       int ret = 0;
+
+       if (!pmd_trans_huge_lock(pmd, vma, &ptl))
+               goto out;
+
+       orig_pmd = *pmd;
+       if (is_huge_zero_pmd(orig_pmd)) {
+               ret = 1;
+               goto out;
+       }
+
+       page = pmd_page(orig_pmd);
+       /*
+        * If other processes are mapping this page, we can't discard
+        * the page unless they all do MADV_FREE, so let's skip the page.
+        */
+       if (page_mapcount(page) != 1)
+               goto out;
+
+       if (!trylock_page(page))
+               goto out;
+
+       /*
+        * If the user wants to discard part of the THP, split it so MADV_FREE
+        * will deactivate only those sub-pages.
+        */
+       if (next - addr != HPAGE_PMD_SIZE) {
+               get_page(page);
+               spin_unlock(ptl);
+               if (split_huge_page(page)) {
+                       put_page(page);
+                       unlock_page(page);
+                       goto out_unlocked;
+               }
+               put_page(page);
+               unlock_page(page);
+               ret = 1;
+               goto out_unlocked;
+       }
+
+       if (PageDirty(page))
+               ClearPageDirty(page);
+       unlock_page(page);
+
+       if (PageActive(page))
+               deactivate_page(page);
+
+       if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
+               orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
+                       tlb->fullmm);
+               orig_pmd = pmd_mkold(orig_pmd);
+               orig_pmd = pmd_mkclean(orig_pmd);
+
+               set_pmd_at(mm, addr, pmd, orig_pmd);
+               tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
+       }
+       ret = 1;
+out:
+       spin_unlock(ptl);
+out_unlocked:
+       return ret;
+}
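
This helper implements the PMD-mapped THP leg of the MADV_FREE hint added elsewhere in this patch bomb; from userspace the hint is issued like any other madvise(2) call. A minimal (hedged) user, assuming 2MB THPs and headers new enough to define MADV_FREE:

#include <stddef.h>
#include <stdio.h>
#include <sys/mman.h>

#ifndef MADV_FREE
#define MADV_FREE 8	/* uapi value; headers older than 4.5 lack it */
#endif

int main(void)
{
	size_t len = 2UL << 20;		/* one PMD/THP-sized region */
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	p[0] = 1;			/* fault the memory in */
	/* lazily free: clean pages may be reclaimed instead of swapped,
	 * and read back as zero-fill unless re-dirtied first */
	if (madvise(p, len, MADV_FREE))
		perror("madvise(MADV_FREE)");
	return 0;
}
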
+
 int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
                 pmd_t *pmd, unsigned long addr)
 {
        pmd_t orig_pmd;
        spinlock_t *ptl;
 
-       if (__pmd_trans_huge_lock(pmd, vma, &ptl) != 1)
+       if (!__pmd_trans_huge_lock(pmd, vma, &ptl))
                return 0;
        /*
         * For architectures like ppc64 we look at deposited pgtable
@@ -1508,7 +1649,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
                put_huge_zero_page();
        } else {
                struct page *page = pmd_page(orig_pmd);
-               page_remove_rmap(page);
+               page_remove_rmap(page, true);
                VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
                add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
                VM_BUG_ON_PAGE(!PageHead(page), page);
@@ -1520,13 +1661,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
        return 1;
 }
 
-int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
+bool move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
                  unsigned long old_addr,
                  unsigned long new_addr, unsigned long old_end,
                  pmd_t *old_pmd, pmd_t *new_pmd)
 {
        spinlock_t *old_ptl, *new_ptl;
-       int ret = 0;
        pmd_t pmd;
 
        struct mm_struct *mm = vma->vm_mm;
@@ -1535,7 +1675,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
            (new_addr & ~HPAGE_PMD_MASK) ||
            old_end - old_addr < HPAGE_PMD_SIZE ||
            (new_vma->vm_flags & VM_NOHUGEPAGE))
-               goto out;
+               return false;
 
        /*
         * The destination pmd shouldn't be established, free_pgtables()
@@ -1543,15 +1683,14 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
         */
        if (WARN_ON(!pmd_none(*new_pmd))) {
                VM_BUG_ON(pmd_trans_huge(*new_pmd));
-               goto out;
+               return false;
        }
 
        /*
         * We don't have to worry about the ordering of src and dst
         * ptlocks because exclusive mmap_sem prevents deadlock.
         */
-       ret = __pmd_trans_huge_lock(old_pmd, vma, &old_ptl);
-       if (ret == 1) {
+       if (__pmd_trans_huge_lock(old_pmd, vma, &old_ptl)) {
                new_ptl = pmd_lockptr(mm, new_pmd);
                if (new_ptl != old_ptl)
                        spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
@@ -1567,9 +1706,9 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
                if (new_ptl != old_ptl)
                        spin_unlock(new_ptl);
                spin_unlock(old_ptl);
+               return true;
        }
-out:
-       return ret;
+       return false;
 }
 
 /*
@@ -1585,7 +1724,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
        spinlock_t *ptl;
        int ret = 0;
 
-       if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+       if (__pmd_trans_huge_lock(pmd, vma, &ptl)) {
                pmd_t entry;
                bool preserve_write = prot_numa && pmd_write(*pmd);
                ret = 1;
@@ -1616,495 +1755,109 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 }
 
 /*
- * Returns 1 if a given pmd maps a stable (not under splitting) thp.
- * Returns -1 if it maps a thp under splitting. Returns 0 otherwise.
+ * Returns true if a given pmd maps a thp or devmap entry, false otherwise.
  *
- * Note that if it returns 1, this routine returns without unlocking page
- * table locks. So callers must unlock them.
+ * Note that if it returns true, this routine returns without unlocking page
+ * table lock. So callers must unlock it.
  */
-int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
+bool __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
                spinlock_t **ptl)
 {
        *ptl = pmd_lock(vma->vm_mm, pmd);
-       if (likely(pmd_trans_huge(*pmd))) {
-               if (unlikely(pmd_trans_splitting(*pmd))) {
-                       spin_unlock(*ptl);
-                       wait_split_huge_page(vma->anon_vma, pmd);
-                       return -1;
-               } else {
-                       /* Thp mapped by 'pmd' is stable, so we can
-                        * handle it as it is. */
-                       return 1;
-               }
-       }
+       if (likely(pmd_trans_huge(*pmd) || pmd_devmap(*pmd)))
+               return true;
        spin_unlock(*ptl);
-       return 0;
+       return false;
 }
 
-/*
- * This function returns whether a given @page is mapped onto the @address
- * in the virtual space of @mm.
- *
- * When it's true, this function returns *pmd with holding the page table lock
- * and passing it back to the caller via @ptl.
- * If it's false, returns NULL without holding the page table lock.
- */
-pmd_t *page_check_address_pmd(struct page *page,
-                             struct mm_struct *mm,
-                             unsigned long address,
-                             enum page_check_address_pmd_flag flag,
-                             spinlock_t **ptl)
-{
-       pgd_t *pgd;
-       pud_t *pud;
-       pmd_t *pmd;
-
-       if (address & ~HPAGE_PMD_MASK)
-               return NULL;
-
-       pgd = pgd_offset(mm, address);
-       if (!pgd_present(*pgd))
-               return NULL;
-       pud = pud_offset(pgd, address);
-       if (!pud_present(*pud))
-               return NULL;
-       pmd = pmd_offset(pud, address);
-
-       *ptl = pmd_lock(mm, pmd);
-       if (!pmd_present(*pmd))
-               goto unlock;
-       if (pmd_page(*pmd) != page)
-               goto unlock;
-       /*
-        * split_vma() may create temporary aliased mappings. There is
-        * no risk as long as all huge pmd are found and have their
-        * splitting bit set before __split_huge_page_refcount
-        * runs. Finding the same huge pmd more than once during the
-        * same rmap walk is not a problem.
-        */
-       if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG &&
-           pmd_trans_splitting(*pmd))
-               goto unlock;
-       if (pmd_trans_huge(*pmd)) {
-               VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG &&
-                         !pmd_trans_splitting(*pmd));
-               return pmd;
-       }
-unlock:
-       spin_unlock(*ptl);
-       return NULL;
-}
+#define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE)
 
-static int __split_huge_page_splitting(struct page *page,
-                                      struct vm_area_struct *vma,
-                                      unsigned long address)
+int hugepage_madvise(struct vm_area_struct *vma,
+                    unsigned long *vm_flags, int advice)
 {
-       struct mm_struct *mm = vma->vm_mm;
-       spinlock_t *ptl;
-       pmd_t *pmd;
-       int ret = 0;
-       /* For mmu_notifiers */
-       const unsigned long mmun_start = address;
-       const unsigned long mmun_end   = address + HPAGE_PMD_SIZE;
-
-       mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
-       pmd = page_check_address_pmd(page, mm, address,
-                       PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG, &ptl);
-       if (pmd) {
+       switch (advice) {
+       case MADV_HUGEPAGE:
+#ifdef CONFIG_S390
+               /*
+                * qemu blindly sets MADV_HUGEPAGE on all allocations, but s390
+                * can't handle this properly after s390_enable_sie, so we simply
+                * ignore the madvise to prevent qemu from causing a SIGSEGV.
+                */
+               if (mm_has_pgste(vma->vm_mm))
+                       return 0;
+#endif
                /*
-                * We can't temporarily set the pmd to null in order
-                * to split it, the pmd must remain marked huge at all
-                * times or the VM won't take the pmd_trans_huge paths
-                * and it won't wait on the anon_vma->root->rwsem to
-                * serialize against split_huge_page*.
+                * Be somewhat over-protective like KSM for now!
                 */
-               pmdp_splitting_flush(vma, address, pmd);
-
-               ret = 1;
-               spin_unlock(ptl);
+               if (*vm_flags & VM_NO_THP)
+                       return -EINVAL;
+               *vm_flags &= ~VM_NOHUGEPAGE;
+               *vm_flags |= VM_HUGEPAGE;
+               /*
+                * If the vma becomes good for khugepaged to scan,
+                * register it here without waiting for a page fault that
+                * may not happen any time soon.
+                */
+               if (unlikely(khugepaged_enter_vma_merge(vma, *vm_flags)))
+                       return -ENOMEM;
+               break;
+       case MADV_NOHUGEPAGE:
+               /*
+                * Be somewhat over-protective like KSM for now!
+                */
+               if (*vm_flags & VM_NO_THP)
+                       return -EINVAL;
+               *vm_flags &= ~VM_HUGEPAGE;
+               *vm_flags |= VM_NOHUGEPAGE;
+               /*
+                * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
+                * this vma, even if the mm remains registered in khugepaged
+                * from before VM_NOHUGEPAGE was set.
+                */
+               break;
        }
-       mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 
-       return ret;
+       return 0;
 }
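
For context, MADV_HUGEPAGE/MADV_NOHUGEPAGE are the userspace knobs that drive
hugepage_madvise() above. A minimal sketch exercising both hints (the 2MB
region size and the anonymous mapping are illustrative assumptions, not part
of the patch):

    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>

    int main(void)
    {
            size_t len = 2UL << 20;         /* assumed PMD-sized region */
            void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

            if (p == MAP_FAILED)
                    return 1;
            /* Sets VM_HUGEPAGE, clears VM_NOHUGEPAGE, registers the mm. */
            if (madvise(p, len, MADV_HUGEPAGE))
                    perror("madvise(MADV_HUGEPAGE)");
            memset(p, 0, len);              /* give khugepaged work to scan */
            /* Sets VM_NOHUGEPAGE; khugepaged skips this vma from now on. */
            if (madvise(p, len, MADV_NOHUGEPAGE))
                    perror("madvise(MADV_NOHUGEPAGE)");
            munmap(p, len);
            return 0;
    }
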
 
-static void __split_huge_page_refcount(struct page *page,
-                                      struct list_head *list)
+static int __init khugepaged_slab_init(void)
 {
-       int i;
-       struct zone *zone = page_zone(page);
-       struct lruvec *lruvec;
-       int tail_count = 0;
-
-       /* prevent PageLRU to go away from under us, and freeze lru stats */
-       spin_lock_irq(&zone->lru_lock);
-       lruvec = mem_cgroup_page_lruvec(page, zone);
+       mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
+                                         sizeof(struct mm_slot),
+                                         __alignof__(struct mm_slot), 0, NULL);
+       if (!mm_slot_cache)
+               return -ENOMEM;
 
-       compound_lock(page);
-       /* complete memcg works before add pages to LRU */
-       mem_cgroup_split_huge_fixup(page);
+       return 0;
+}
 
-       for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
-               struct page *page_tail = page + i;
+static void __init khugepaged_slab_exit(void)
+{
+       kmem_cache_destroy(mm_slot_cache);
+}
 
-               /* tail_page->_mapcount cannot change */
-               BUG_ON(page_mapcount(page_tail) < 0);
-               tail_count += page_mapcount(page_tail);
-               /* check for overflow */
-               BUG_ON(tail_count < 0);
-               BUG_ON(atomic_read(&page_tail->_count) != 0);
-               /*
-                * tail_page->_count is zero and not changing from
-                * under us. But get_page_unless_zero() may be running
-                * from under us on the tail_page. If we used
-                * atomic_set() below instead of atomic_add(), we
-                * would then run atomic_set() concurrently with
-                * get_page_unless_zero(), and atomic_set() is
-                * implemented in C not using locked ops. spin_unlock
-                * on x86 sometime uses locked ops because of PPro
-                * errata 66, 92, so unless somebody can guarantee
-                * atomic_set() here would be safe on all archs (and
-                * not only on x86), it's safer to use atomic_add().
-                */
-               atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1,
-                          &page_tail->_count);
+static inline struct mm_slot *alloc_mm_slot(void)
+{
+       if (!mm_slot_cache)     /* initialization failed */
+               return NULL;
+       return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
+}
 
-               /* after clearing PageTail the gup refcount can be released */
-               smp_mb__after_atomic();
+static inline void free_mm_slot(struct mm_slot *mm_slot)
+{
+       kmem_cache_free(mm_slot_cache, mm_slot);
+}
 
-               page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
-               page_tail->flags |= (page->flags &
-                                    ((1L << PG_referenced) |
-                                     (1L << PG_swapbacked) |
-                                     (1L << PG_mlocked) |
-                                     (1L << PG_uptodate) |
-                                     (1L << PG_active) |
-                                     (1L << PG_unevictable)));
-               page_tail->flags |= (1L << PG_dirty);
+static struct mm_slot *get_mm_slot(struct mm_struct *mm)
+{
+       struct mm_slot *mm_slot;
 
-               clear_compound_head(page_tail);
+       hash_for_each_possible(mm_slots_hash, mm_slot, hash, (unsigned long)mm)
+               if (mm == mm_slot->mm)
+                       return mm_slot;
 
-               if (page_is_young(page))
-                       set_page_young(page_tail);
-               if (page_is_idle(page))
-                       set_page_idle(page_tail);
-
-               /*
-                * __split_huge_page_splitting() already set the
-                * splitting bit in all pmd that could map this
-                * hugepage, that will ensure no CPU can alter the
-                * mapcount on the head page. The mapcount is only
-                * accounted in the head page and it has to be
-                * transferred to all tail pages in the below code. So
-                * for this code to be safe, the split the mapcount
-                * can't change. But that doesn't mean userland can't
-                * keep changing and reading the page contents while
-                * we transfer the mapcount, so the pmd splitting
-                * status is achieved setting a reserved bit in the
-                * pmd, not by clearing the present bit.
-               */
-               page_tail->_mapcount = page->_mapcount;
-
-               BUG_ON(page_tail->mapping);
-               page_tail->mapping = page->mapping;
-
-               page_tail->index = page->index + i;
-               page_cpupid_xchg_last(page_tail, page_cpupid_last(page));
-
-               BUG_ON(!PageAnon(page_tail));
-               BUG_ON(!PageUptodate(page_tail));
-               BUG_ON(!PageDirty(page_tail));
-               BUG_ON(!PageSwapBacked(page_tail));
-
-               lru_add_page_tail(page, page_tail, lruvec, list);
-       }
-       atomic_sub(tail_count, &page->_count);
-       BUG_ON(atomic_read(&page->_count) <= 0);
-
-       __mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1);
-
-       ClearPageCompound(page);
-       compound_unlock(page);
-       spin_unlock_irq(&zone->lru_lock);
-
-       for (i = 1; i < HPAGE_PMD_NR; i++) {
-               struct page *page_tail = page + i;
-               BUG_ON(page_count(page_tail) <= 0);
-               /*
-                * Tail pages may be freed if there wasn't any mapping
-                * like if add_to_swap() is running on a lru page that
-                * had its mapping zapped. And freeing these pages
-                * requires taking the lru_lock so we do the put_page
-                * of the tail pages after the split is complete.
-                */
-               put_page(page_tail);
-       }
-
-       /*
-        * Only the head page (now become a regular page) is required
-        * to be pinned by the caller.
-        */
-       BUG_ON(page_count(page) <= 0);
-}
-
-static int __split_huge_page_map(struct page *page,
-                                struct vm_area_struct *vma,
-                                unsigned long address)
-{
-       struct mm_struct *mm = vma->vm_mm;
-       spinlock_t *ptl;
-       pmd_t *pmd, _pmd;
-       int ret = 0, i;
-       pgtable_t pgtable;
-       unsigned long haddr;
-
-       pmd = page_check_address_pmd(page, mm, address,
-                       PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG, &ptl);
-       if (pmd) {
-               pgtable = pgtable_trans_huge_withdraw(mm, pmd);
-               pmd_populate(mm, &_pmd, pgtable);
-               if (pmd_write(*pmd))
-                       BUG_ON(page_mapcount(page) != 1);
-
-               haddr = address;
-               for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
-                       pte_t *pte, entry;
-                       BUG_ON(PageCompound(page+i));
-                       /*
-                        * Note that NUMA hinting access restrictions are not
-                        * transferred to avoid any possibility of altering
-                        * permissions across VMAs.
-                        */
-                       entry = mk_pte(page + i, vma->vm_page_prot);
-                       entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-                       if (!pmd_write(*pmd))
-                               entry = pte_wrprotect(entry);
-                       if (!pmd_young(*pmd))
-                               entry = pte_mkold(entry);
-                       pte = pte_offset_map(&_pmd, haddr);
-                       BUG_ON(!pte_none(*pte));
-                       set_pte_at(mm, haddr, pte, entry);
-                       pte_unmap(pte);
-               }
-
-               smp_wmb(); /* make pte visible before pmd */
-               /*
-                * Up to this point the pmd is present and huge and
-                * userland has the whole access to the hugepage
-                * during the split (which happens in place). If we
-                * overwrite the pmd with the not-huge version
-                * pointing to the pte here (which of course we could
-                * if all CPUs were bug free), userland could trigger
-                * a small page size TLB miss on the small sized TLB
-                * while the hugepage TLB entry is still established
-                * in the huge TLB. Some CPU doesn't like that. See
-                * http://support.amd.com/us/Processor_TechDocs/41322.pdf,
-                * Erratum 383 on page 93. Intel should be safe but is
-                * also warns that it's only safe if the permission
-                * and cache attributes of the two entries loaded in
-                * the two TLB is identical (which should be the case
-                * here). But it is generally safer to never allow
-                * small and huge TLB entries for the same virtual
-                * address to be loaded simultaneously. So instead of
-                * doing "pmd_populate(); flush_pmd_tlb_range();" we first
-                * mark the current pmd notpresent (atomically because
-                * here the pmd_trans_huge and pmd_trans_splitting
-                * must remain set at all times on the pmd until the
-                * split is complete for this pmd), then we flush the
-                * SMP TLB and finally we write the non-huge version
-                * of the pmd entry with pmd_populate.
-                */
-               pmdp_invalidate(vma, address, pmd);
-               pmd_populate(mm, pmd, pgtable);
-               ret = 1;
-               spin_unlock(ptl);
-       }
-
-       return ret;
-}
-
-/* must be called with anon_vma->root->rwsem held */
-static void __split_huge_page(struct page *page,
-                             struct anon_vma *anon_vma,
-                             struct list_head *list)
-{
-       int mapcount, mapcount2;
-       pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
-       struct anon_vma_chain *avc;
-
-       BUG_ON(!PageHead(page));
-       BUG_ON(PageTail(page));
-
-       mapcount = 0;
-       anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
-               struct vm_area_struct *vma = avc->vma;
-               unsigned long addr = vma_address(page, vma);
-               BUG_ON(is_vma_temporary_stack(vma));
-               mapcount += __split_huge_page_splitting(page, vma, addr);
-       }
-       /*
-        * It is critical that new vmas are added to the tail of the
-        * anon_vma list. This guarantes that if copy_huge_pmd() runs
-        * and establishes a child pmd before
-        * __split_huge_page_splitting() freezes the parent pmd (so if
-        * we fail to prevent copy_huge_pmd() from running until the
-        * whole __split_huge_page() is complete), we will still see
-        * the newly established pmd of the child later during the
-        * walk, to be able to set it as pmd_trans_splitting too.
-        */
-       if (mapcount != page_mapcount(page)) {
-               pr_err("mapcount %d page_mapcount %d\n",
-                       mapcount, page_mapcount(page));
-               BUG();
-       }
-
-       __split_huge_page_refcount(page, list);
-
-       mapcount2 = 0;
-       anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
-               struct vm_area_struct *vma = avc->vma;
-               unsigned long addr = vma_address(page, vma);
-               BUG_ON(is_vma_temporary_stack(vma));
-               mapcount2 += __split_huge_page_map(page, vma, addr);
-       }
-       if (mapcount != mapcount2) {
-               pr_err("mapcount %d mapcount2 %d page_mapcount %d\n",
-                       mapcount, mapcount2, page_mapcount(page));
-               BUG();
-       }
-}
-
-/*
- * Split a hugepage into normal pages. This doesn't change the position of head
- * page. If @list is null, tail pages will be added to LRU list, otherwise, to
- * @list. Both head page and tail pages will inherit mapping, flags, and so on
- * from the hugepage.
- * Return 0 if the hugepage is split successfully otherwise return 1.
- */
-int split_huge_page_to_list(struct page *page, struct list_head *list)
-{
-       struct anon_vma *anon_vma;
-       int ret = 1;
-
-       BUG_ON(is_huge_zero_page(page));
-       BUG_ON(!PageAnon(page));
-
-       /*
-        * The caller does not necessarily hold an mmap_sem that would prevent
-        * the anon_vma disappearing so we first we take a reference to it
-        * and then lock the anon_vma for write. This is similar to
-        * page_lock_anon_vma_read except the write lock is taken to serialise
-        * against parallel split or collapse operations.
-        */
-       anon_vma = page_get_anon_vma(page);
-       if (!anon_vma)
-               goto out;
-       anon_vma_lock_write(anon_vma);
-
-       ret = 0;
-       if (!PageCompound(page))
-               goto out_unlock;
-
-       BUG_ON(!PageSwapBacked(page));
-       __split_huge_page(page, anon_vma, list);
-       count_vm_event(THP_SPLIT);
-
-       BUG_ON(PageCompound(page));
-out_unlock:
-       anon_vma_unlock_write(anon_vma);
-       put_anon_vma(anon_vma);
-out:
-       return ret;
-}
-
-#define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE)
-
-int hugepage_madvise(struct vm_area_struct *vma,
-                    unsigned long *vm_flags, int advice)
-{
-       switch (advice) {
-       case MADV_HUGEPAGE:
-#ifdef CONFIG_S390
-               /*
-                * qemu blindly sets MADV_HUGEPAGE on all allocations, but s390
-                * can't handle this properly after s390_enable_sie, so we simply
-                * ignore the madvise to prevent qemu from causing a SIGSEGV.
-                */
-               if (mm_has_pgste(vma->vm_mm))
-                       return 0;
-#endif
-               /*
-                * Be somewhat over-protective like KSM for now!
-                */
-               if (*vm_flags & VM_NO_THP)
-                       return -EINVAL;
-               *vm_flags &= ~VM_NOHUGEPAGE;
-               *vm_flags |= VM_HUGEPAGE;
-               /*
-                * If the vma become good for khugepaged to scan,
-                * register it here without waiting a page fault that
-                * may not happen any time soon.
-                */
-               if (unlikely(khugepaged_enter_vma_merge(vma, *vm_flags)))
-                       return -ENOMEM;
-               break;
-       case MADV_NOHUGEPAGE:
-               /*
-                * Be somewhat over-protective like KSM for now!
-                */
-               if (*vm_flags & VM_NO_THP)
-                       return -EINVAL;
-               *vm_flags &= ~VM_HUGEPAGE;
-               *vm_flags |= VM_NOHUGEPAGE;
-               /*
-                * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
-                * this vma even if we leave the mm registered in khugepaged if
-                * it got registered before VM_NOHUGEPAGE was set.
-                */
-               break;
-       }
-
-       return 0;
-}
-
-static int __init khugepaged_slab_init(void)
-{
-       mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
-                                         sizeof(struct mm_slot),
-                                         __alignof__(struct mm_slot), 0, NULL);
-       if (!mm_slot_cache)
-               return -ENOMEM;
-
-       return 0;
-}
-
-static void __init khugepaged_slab_exit(void)
-{
-       kmem_cache_destroy(mm_slot_cache);
-}
-
-static inline struct mm_slot *alloc_mm_slot(void)
-{
-       if (!mm_slot_cache)     /* initialization failed */
-               return NULL;
-       return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
-}
-
-static inline void free_mm_slot(struct mm_slot *mm_slot)
-{
-       kmem_cache_free(mm_slot_cache, mm_slot);
-}
-
-static struct mm_slot *get_mm_slot(struct mm_struct *mm)
-{
-       struct mm_slot *mm_slot;
-
-       hash_for_each_possible(mm_slots_hash, mm_slot, hash, (unsigned long)mm)
-               if (mm == mm_slot->mm)
-                       return mm_slot;
-
-       return NULL;
-}
+       return NULL;
+}
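
get_mm_slot() is the stock <linux/hashtable.h> open-chaining idiom keyed on
the mm_struct pointer. A self-contained sketch of the same pattern (all names
and the 10-bit table size are illustrative, not from this patch):

    #include <linux/hashtable.h>
    #include <linux/mm_types.h>

    /* Illustrative stand-in for mm_slots_hash; 10 bits is an assumption. */
    static DEFINE_HASHTABLE(example_slots_hash, 10);

    struct example_slot {
            struct hlist_node hash;
            struct mm_struct *mm;
    };

    static void example_insert(struct example_slot *slot, struct mm_struct *mm)
    {
            slot->mm = mm;
            /* Hash collisions are fine: the lookup re-checks slot->mm. */
            hash_add(example_slots_hash, &slot->hash, (unsigned long)mm);
    }

    static struct example_slot *example_lookup(struct mm_struct *mm)
    {
            struct example_slot *slot;

            hash_for_each_possible(example_slots_hash, slot, hash,
                            (unsigned long)mm)
                    if (slot->mm == mm)
                            return slot;
            return NULL;
    }
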
 
 static void insert_to_mm_slots_hash(struct mm_struct *mm,
                                    struct mm_slot *mm_slot)
@@ -2371,7 +2124,7 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
                         * superfluous.
                         */
                        pte_clear(vma->vm_mm, address, _pte);
-                       page_remove_rmap(src_page);
+                       page_remove_rmap(src_page, false);
                        spin_unlock(ptl);
                        free_page_and_swap_cache(src_page);
                }
@@ -2481,6 +2234,7 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
                return NULL;
        }
 
+       prep_transhuge_page(*hpage);
        count_vm_event(THP_COLLAPSE_ALLOC);
        return *hpage;
 }
@@ -2492,8 +2246,12 @@ static int khugepaged_find_target_node(void)
 
 static inline struct page *alloc_hugepage(int defrag)
 {
-       return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
-                          HPAGE_PMD_ORDER);
+       struct page *page;
+
+       page = alloc_pages(alloc_hugepage_gfpmask(defrag, 0), HPAGE_PMD_ORDER);
+       if (page)
+               prep_transhuge_page(page);
+       return page;
 }
 
 static struct page *khugepaged_alloc_hugepage(bool *wait)
@@ -2543,7 +2301,6 @@ static bool hugepage_vma_check(struct vm_area_struct *vma)
        if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
            (vma->vm_flags & VM_NOHUGEPAGE))
                return false;
-
        if (!vma->anon_vma || vma->vm_ops)
                return false;
        if (is_vma_temporary_stack(vma))
@@ -2583,7 +2340,7 @@ static void collapse_huge_page(struct mm_struct *mm,
                goto out_nolock;
        }
 
-       if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg))) {
+       if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) {
                result = SCAN_CGROUP_CHARGE_FAIL;
                goto out_nolock;
        }
@@ -2682,8 +2439,8 @@ static void collapse_huge_page(struct mm_struct *mm,
 
        spin_lock(pmd_ptl);
        BUG_ON(!pmd_none(*pmd));
-       page_add_new_anon_rmap(new_page, vma, address);
-       mem_cgroup_commit_charge(new_page, memcg, false);
+       page_add_new_anon_rmap(new_page, vma, address, true);
+       mem_cgroup_commit_charge(new_page, memcg, false, true);
        lru_cache_add_active_or_unevictable(new_page, vma);
        pgtable_trans_huge_deposit(mm, pmd, pgtable);
        set_pmd_at(mm, address, pmd, _pmd);
@@ -2703,7 +2460,7 @@ out_nolock:
        trace_mm_collapse_huge_page(mm, isolated, result);
        return;
 out:
-       mem_cgroup_cancel_charge(new_page, memcg);
+       mem_cgroup_cancel_charge(new_page, memcg, true);
        goto out_up_write;
 }
 
@@ -2755,6 +2512,13 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
                        result = SCAN_PAGE_NULL;
                        goto out_unmap;
                }
+
+               /* TODO: teach khugepaged to collapse THP mapped with pte */
+               if (PageCompound(page)) {
+                       result = SCAN_PAGE_COMPOUND;
+                       goto out_unmap;
+               }
+
                /*
                 * Record which node the original page is from and save this
                 * information to khugepaged_node_load[].
@@ -2767,7 +2531,6 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
                        goto out_unmap;
                }
                khugepaged_node_load[node]++;
-               VM_BUG_ON_PAGE(PageCompound(page), page);
                if (!PageLRU(page)) {
                        result = SCAN_SCAN_ABORT;
                        goto out_unmap;
@@ -3040,8 +2803,8 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
        pmd_t _pmd;
        int i;
 
-       pmdp_huge_clear_flush_notify(vma, haddr, pmd);
        /* leave pmd empty until pte is filled */
+       pmdp_huge_clear_flush_notify(vma, haddr, pmd);
 
        pgtable = pgtable_trans_huge_withdraw(mm, pmd);
        pmd_populate(mm, &_pmd, pgtable);
@@ -3060,66 +2823,153 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
        put_huge_zero_page();
 }
 
-void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
-               pmd_t *pmd)
+static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
+               unsigned long haddr, bool freeze)
 {
-       spinlock_t *ptl;
-       struct page *page = NULL;
        struct mm_struct *mm = vma->vm_mm;
-       unsigned long haddr = address & HPAGE_PMD_MASK;
-       unsigned long mmun_start;       /* For mmu_notifiers */
-       unsigned long mmun_end;         /* For mmu_notifiers */
+       struct page *page;
+       pgtable_t pgtable;
+       pmd_t _pmd;
+       bool young, write, dirty;
+       int i;
 
-       BUG_ON(vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE);
+       VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
+       VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
+       VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
+       VM_BUG_ON(!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd));
+
+       count_vm_event(THP_SPLIT_PMD);
 
-       mmun_start = haddr;
-       mmun_end   = haddr + HPAGE_PMD_SIZE;
-again:
-       mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
-       ptl = pmd_lock(mm, pmd);
-       if (unlikely(!pmd_trans_huge(*pmd)))
-               goto unlock;
        if (vma_is_dax(vma)) {
                pmd_t _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
                if (is_huge_zero_pmd(_pmd))
                        put_huge_zero_page();
+               return;
        } else if (is_huge_zero_pmd(*pmd)) {
-               __split_huge_zero_page_pmd(vma, haddr, pmd);
-       } else {
-               page = pmd_page(*pmd);
-               VM_BUG_ON_PAGE(!page_count(page), page);
-               get_page(page);
+               return __split_huge_zero_page_pmd(vma, haddr, pmd);
        }
- unlock:
-       spin_unlock(ptl);
-       mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 
-       if (!page)
-               return;
+       page = pmd_page(*pmd);
+       VM_BUG_ON_PAGE(!page_count(page), page);
+       atomic_add(HPAGE_PMD_NR - 1, &page->_count);
+       write = pmd_write(*pmd);
+       young = pmd_young(*pmd);
+       dirty = pmd_dirty(*pmd);
 
-       split_huge_page(page);
-       put_page(page);
+       pgtable = pgtable_trans_huge_withdraw(mm, pmd);
+       pmd_populate(mm, &_pmd, pgtable);
+
+       for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+               pte_t entry, *pte;
+               /*
+                * Note that NUMA hinting access restrictions are not
+                * transferred to avoid any possibility of altering
+                * permissions across VMAs.
+                */
+               if (freeze) {
+                       swp_entry_t swp_entry;
+                       swp_entry = make_migration_entry(page + i, write);
+                       entry = swp_entry_to_pte(swp_entry);
+               } else {
+                       entry = mk_pte(page + i, vma->vm_page_prot);
+                       entry = maybe_mkwrite(entry, vma);
+                       if (!write)
+                               entry = pte_wrprotect(entry);
+                       if (!young)
+                               entry = pte_mkold(entry);
+               }
+               if (dirty)
+                       SetPageDirty(page + i);
+               pte = pte_offset_map(&_pmd, haddr);
+               BUG_ON(!pte_none(*pte));
+               set_pte_at(mm, haddr, pte, entry);
+               atomic_inc(&page[i]._mapcount);
+               pte_unmap(pte);
+       }
+
+       /*
+        * Set PG_double_map before dropping compound_mapcount to avoid
+        * false-negative page_mapped().
+        */
+       if (compound_mapcount(page) > 1 && !TestSetPageDoubleMap(page)) {
+               for (i = 0; i < HPAGE_PMD_NR; i++)
+                       atomic_inc(&page[i]._mapcount);
+       }
+
+       if (atomic_add_negative(-1, compound_mapcount_ptr(page))) {
+               /* Last compound_mapcount is gone. */
+               __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
+               if (TestClearPageDoubleMap(page)) {
+                       /* No need for the mapcount reference anymore */
+                       for (i = 0; i < HPAGE_PMD_NR; i++)
+                               atomic_dec(&page[i]._mapcount);
+               }
+       }
 
+       smp_wmb(); /* make pte visible before pmd */
        /*
-        * We don't always have down_write of mmap_sem here: a racing
-        * do_huge_pmd_wp_page() might have copied-on-write to another
-        * huge page before our split_huge_page() got the anon_vma lock.
+        * Up to this point the pmd is present and huge and userland has the
+        * whole access to the hugepage during the split (which happens in
+        * place). If we overwrite the pmd with the not-huge version pointing
+        * to the pte here (which of course we could if all CPUs were bug
+        * free), userland could trigger a small page size TLB miss on the
+        * small sized TLB while the hugepage TLB entry is still established in
+        * the huge TLB. Some CPUs don't like that.
+        * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum
+        * 383 on page 93. Intel should be safe but also warns that it's
+        * only safe if the permission and cache attributes of the two entries
+        * loaded in the two TLBs are identical (which should be the case here).
+        * But it is generally safer to never allow small and huge TLB entries
+        * for the same virtual address to be loaded simultaneously. So instead
+        * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the
+        * current pmd notpresent (atomically, because pmd_trans_huge must
+        * remain set on the pmd at all times until the split is complete for
+        * this pmd), then we flush the SMP TLB
+        * and finally we write the non-huge version of the pmd entry with
+        * pmd_populate.
         */
-       if (unlikely(pmd_trans_huge(*pmd)))
-               goto again;
+       pmdp_invalidate(vma, haddr, pmd);
+       pmd_populate(mm, pmd, pgtable);
+
+       if (freeze) {
+               for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+                       page_remove_rmap(page + i, false);
+                       put_page(page + i);
+               }
+       }
 }
 
-void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
-               pmd_t *pmd)
+void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+               unsigned long address)
 {
-       struct vm_area_struct *vma;
+       spinlock_t *ptl;
+       struct mm_struct *mm = vma->vm_mm;
+       struct page *page = NULL;
+       unsigned long haddr = address & HPAGE_PMD_MASK;
 
-       vma = find_vma(mm, address);
-       BUG_ON(vma == NULL);
-       split_huge_page_pmd(vma, address, pmd);
+       mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PMD_SIZE);
+       ptl = pmd_lock(mm, pmd);
+       if (pmd_trans_huge(*pmd)) {
+               page = pmd_page(*pmd);
+               if (PageMlocked(page))
+                       get_page(page);
+               else
+                       page = NULL;
+       } else if (!pmd_devmap(*pmd))
+               goto out;
+       __split_huge_pmd_locked(vma, pmd, haddr, false);
+out:
+       spin_unlock(ptl);
+       mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PMD_SIZE);
+       if (page) {
+               lock_page(page);
+               munlock_vma_page(page);
+               unlock_page(page);
+               put_page(page);
+       }
 }
 
-static void split_huge_page_address(struct mm_struct *mm,
+static void split_huge_pmd_address(struct vm_area_struct *vma,
                                    unsigned long address)
 {
        pgd_t *pgd;
@@ -3128,7 +2978,7 @@ static void split_huge_page_address(struct mm_struct *mm,
 
        VM_BUG_ON(!(address & ~HPAGE_PMD_MASK));
 
-       pgd = pgd_offset(mm, address);
+       pgd = pgd_offset(vma->vm_mm, address);
        if (!pgd_present(*pgd))
                return;
 
@@ -3137,13 +2987,13 @@ static void split_huge_page_address(struct mm_struct *mm,
                return;
 
        pmd = pmd_offset(pud, address);
-       if (!pmd_present(*pmd))
+       if (!pmd_present(*pmd) || (!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd)))
                return;
        /*
         * Caller holds the mmap_sem write mode, so a huge pmd cannot
         * materialize from under us.
         */
-       split_huge_page_pmd_mm(mm, address, pmd);
+       split_huge_pmd(vma, pmd, address);
 }
 
 void vma_adjust_trans_huge(struct vm_area_struct *vma,
@@ -3159,7 +3009,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
        if (start & ~HPAGE_PMD_MASK &&
            (start & HPAGE_PMD_MASK) >= vma->vm_start &&
            (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
-               split_huge_page_address(vma->vm_mm, start);
+               split_huge_pmd_address(vma, start);
 
        /*
         * If the new end address isn't hpage aligned and it could
@@ -3169,7 +3019,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
        if (end & ~HPAGE_PMD_MASK &&
            (end & HPAGE_PMD_MASK) >= vma->vm_start &&
            (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
-               split_huge_page_address(vma->vm_mm, end);
+               split_huge_pmd_address(vma, end);
 
        /*
         * If we're also updating the vma->vm_next->vm_start, if the new
@@ -3183,6 +3033,540 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
                if (nstart & ~HPAGE_PMD_MASK &&
                    (nstart & HPAGE_PMD_MASK) >= next->vm_start &&
                    (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
-                       split_huge_page_address(next->vm_mm, nstart);
+                       split_huge_pmd_address(next, nstart);
+       }
+}
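
vma_adjust_trans_huge() is what makes partial-range operations on a THP work:
any new vma boundary that lands inside a huge pmd forces that pmd to be split.
A hedged userspace sketch that should exercise this path (assumes a 2MB PMD
size and that the region was actually faulted in as a THP):

    #include <stdlib.h>
    #include <string.h>
    #include <sys/mman.h>

    #define PMD_SZ (2UL << 20)      /* assumed huge page size */

    int main(void)
    {
            void *p;

            if (posix_memalign(&p, PMD_SZ, PMD_SZ))
                    return 1;
            madvise(p, PMD_SZ, MADV_HUGEPAGE);
            memset(p, 1, PMD_SZ);   /* fault in; ideally as one THP */

            /*
             * A 4KB mprotect() in the middle leaves both new vma boundaries
             * unaligned to HPAGE_PMD_MASK, so vma_adjust_trans_huge() has to
             * split the pmd via split_huge_pmd_address().
             */
            mprotect((char *)p + PMD_SZ / 2, 4096, PROT_READ);
            return 0;
    }
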
+
+static void freeze_page_vma(struct vm_area_struct *vma, struct page *page,
+               unsigned long address)
+{
+       unsigned long haddr = address & HPAGE_PMD_MASK;
+       spinlock_t *ptl;
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+       int i, nr = HPAGE_PMD_NR;
+
+       /* Skip pages that don't belong to the VMA */
+       if (address < vma->vm_start) {
+               int off = (vma->vm_start - address) >> PAGE_SHIFT;
+               page += off;
+               nr -= off;
+               address = vma->vm_start;
+       }
+
+       pgd = pgd_offset(vma->vm_mm, address);
+       if (!pgd_present(*pgd))
+               return;
+       pud = pud_offset(pgd, address);
+       if (!pud_present(*pud))
+               return;
+       pmd = pmd_offset(pud, address);
+       ptl = pmd_lock(vma->vm_mm, pmd);
+       if (!pmd_present(*pmd)) {
+               spin_unlock(ptl);
+               return;
+       }
+       if (pmd_trans_huge(*pmd)) {
+               if (page == pmd_page(*pmd))
+                       __split_huge_pmd_locked(vma, pmd, haddr, true);
+               spin_unlock(ptl);
+               return;
+       }
+       spin_unlock(ptl);
+
+       pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl);
+       for (i = 0; i < nr; i++, address += PAGE_SIZE, page++, pte++) {
+               pte_t entry, swp_pte;
+               swp_entry_t swp_entry;
+
+               /*
+                * We've just crossed a page table boundary: we need to map the
+                * next one. This can happen if the THP was mremap()ed to a
+                * non-PMD-aligned address.
+                */
+               if (unlikely(address == haddr + HPAGE_PMD_SIZE)) {
+                       pte_unmap_unlock(pte - 1, ptl);
+                       pmd = mm_find_pmd(vma->vm_mm, address);
+                       if (!pmd)
+                               return;
+                       pte = pte_offset_map_lock(vma->vm_mm, pmd,
+                                       address, &ptl);
+               }
+
+               if (!pte_present(*pte))
+                       continue;
+               if (page_to_pfn(page) != pte_pfn(*pte))
+                       continue;
+               flush_cache_page(vma, address, page_to_pfn(page));
+               entry = ptep_clear_flush(vma, address, pte);
+               if (pte_dirty(entry))
+                       SetPageDirty(page);
+               swp_entry = make_migration_entry(page, pte_write(entry));
+               swp_pte = swp_entry_to_pte(swp_entry);
+               if (pte_soft_dirty(entry))
+                       swp_pte = pte_swp_mksoft_dirty(swp_pte);
+               set_pte_at(vma->vm_mm, address, pte, swp_pte);
+               page_remove_rmap(page, false);
+               put_page(page);
+       }
+       pte_unmap_unlock(pte - 1, ptl);
+}
+
+static void freeze_page(struct anon_vma *anon_vma, struct page *page)
+{
+       struct anon_vma_chain *avc;
+       pgoff_t pgoff = page_to_pgoff(page);
+
+       VM_BUG_ON_PAGE(!PageHead(page), page);
+
+       anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff,
+                       pgoff + HPAGE_PMD_NR - 1) {
+               unsigned long address = __vma_address(page, avc->vma);
+
+               mmu_notifier_invalidate_range_start(avc->vma->vm_mm,
+                               address, address + HPAGE_PMD_SIZE);
+               freeze_page_vma(avc->vma, page, address);
+               mmu_notifier_invalidate_range_end(avc->vma->vm_mm,
+                               address, address + HPAGE_PMD_SIZE);
+       }
+}
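
freeze_page() stabilizes the page by turning every PTE mapping into a
migration entry, so no mapcount can change while the split runs. A short
sketch of how such a frozen pte is recognized, mirroring the checks
unfreeze_page_vma() performs below (illustrative helper, not part of the
patch):

    #include <linux/mm.h>
    #include <linux/swapops.h>

    static bool pte_is_frozen_for(struct page *page, pte_t pte)
    {
            swp_entry_t entry;

            if (!is_swap_pte(pte))
                    return false;
            entry = pte_to_swp_entry(pte);
            return is_migration_entry(entry) &&
                   migration_entry_to_page(entry) == page;
    }
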
+
+static void unfreeze_page_vma(struct vm_area_struct *vma, struct page *page,
+               unsigned long address)
+{
+       spinlock_t *ptl;
+       pmd_t *pmd;
+       pte_t *pte, entry;
+       swp_entry_t swp_entry;
+       unsigned long haddr = address & HPAGE_PMD_MASK;
+       int i, nr = HPAGE_PMD_NR;
+
+       /* Skip pages that don't belong to the VMA */
+       if (address < vma->vm_start) {
+               int off = (vma->vm_start - address) >> PAGE_SHIFT;
+               page += off;
+               nr -= off;
+               address = vma->vm_start;
+       }
+
+       pmd = mm_find_pmd(vma->vm_mm, address);
+       if (!pmd)
+               return;
+
+       pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl);
+       for (i = 0; i < nr; i++, address += PAGE_SIZE, page++, pte++) {
+               /*
+                * We've just crossed a page table boundary: we need to map the
+                * next one. This can happen if the THP was mremap()ed to a
+                * non-PMD-aligned address.
+                */
+               if (unlikely(address == haddr + HPAGE_PMD_SIZE)) {
+                       pte_unmap_unlock(pte - 1, ptl);
+                       pmd = mm_find_pmd(vma->vm_mm, address);
+                       if (!pmd)
+                               return;
+                       pte = pte_offset_map_lock(vma->vm_mm, pmd,
+                                       address, &ptl);
+               }
+
+               if (!is_swap_pte(*pte))
+                       continue;
+
+               swp_entry = pte_to_swp_entry(*pte);
+               if (!is_migration_entry(swp_entry))
+                       continue;
+               if (migration_entry_to_page(swp_entry) != page)
+                       continue;
+
+               get_page(page);
+               page_add_anon_rmap(page, vma, address, false);
+
+               entry = pte_mkold(mk_pte(page, vma->vm_page_prot));
+               if (PageDirty(page))
+                       entry = pte_mkdirty(entry);
+               if (is_write_migration_entry(swp_entry))
+                       entry = maybe_mkwrite(entry, vma);
+
+               flush_dcache_page(page);
+               set_pte_at(vma->vm_mm, address, pte, entry);
+
+               /* No need to invalidate - it was non-present before */
+               update_mmu_cache(vma, address, pte);
+       }
+       pte_unmap_unlock(pte - 1, ptl);
+}
+
+static void unfreeze_page(struct anon_vma *anon_vma, struct page *page)
+{
+       struct anon_vma_chain *avc;
+       pgoff_t pgoff = page_to_pgoff(page);
+
+       anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
+                       pgoff, pgoff + HPAGE_PMD_NR - 1) {
+               unsigned long address = __vma_address(page, avc->vma);
+
+               mmu_notifier_invalidate_range_start(avc->vma->vm_mm,
+                               address, address + HPAGE_PMD_SIZE);
+               unfreeze_page_vma(avc->vma, page, address);
+               mmu_notifier_invalidate_range_end(avc->vma->vm_mm,
+                               address, address + HPAGE_PMD_SIZE);
+       }
+}
+
+static int __split_huge_page_tail(struct page *head, int tail,
+               struct lruvec *lruvec, struct list_head *list)
+{
+       int mapcount;
+       struct page *page_tail = head + tail;
+
+       mapcount = atomic_read(&page_tail->_mapcount) + 1;
+       VM_BUG_ON_PAGE(atomic_read(&page_tail->_count) != 0, page_tail);
+
+       /*
+        * tail_page->_count is zero and not changing from under us. But
+        * get_page_unless_zero() may be running from under us on the
+        * tail_page. If we used atomic_set() below instead of atomic_add(), we
+        * would then run atomic_set() concurrently with
+        * get_page_unless_zero(), and atomic_set() is implemented in C not
+        * using locked ops. spin_unlock on x86 sometimes uses locked ops
+        * because of PPro errata 66, 92, so unless somebody can guarantee
+        * atomic_set() here would be safe on all archs (and not only on x86),
+        * it's safer to use atomic_add().
+        */
+       atomic_add(mapcount + 1, &page_tail->_count);
+
+       page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+       page_tail->flags |= (head->flags &
+                       ((1L << PG_referenced) |
+                        (1L << PG_swapbacked) |
+                        (1L << PG_mlocked) |
+                        (1L << PG_uptodate) |
+                        (1L << PG_active) |
+                        (1L << PG_locked) |
+                        (1L << PG_unevictable) |
+                        (1L << PG_dirty)));
+
+       /*
+        * After clearing PageTail the gup refcount can be released.
+        * Page flags must also be visible before we make the page non-compound.
+        */
+       smp_wmb();
+
+       clear_compound_head(page_tail);
+
+       if (page_is_young(head))
+               set_page_young(page_tail);
+       if (page_is_idle(head))
+               set_page_idle(page_tail);
+
+       /* ->mapping in first tail page is compound_mapcount */
+       VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
+                       page_tail);
+       page_tail->mapping = head->mapping;
+
+       page_tail->index = head->index + tail;
+       page_cpupid_xchg_last(page_tail, page_cpupid_last(head));
+       lru_add_page_tail(head, page_tail, lruvec, list);
+
+       return mapcount;
+}
+
+static void __split_huge_page(struct page *page, struct list_head *list)
+{
+       struct page *head = compound_head(page);
+       struct zone *zone = page_zone(head);
+       struct lruvec *lruvec;
+       int i, tail_mapcount;
+
+       /* prevent PageLRU from going away from under us, and freeze lru stats */
+       spin_lock_irq(&zone->lru_lock);
+       lruvec = mem_cgroup_page_lruvec(head, zone);
+
+       /* complete memcg works before add pages to LRU */
+       mem_cgroup_split_huge_fixup(head);
+
+       tail_mapcount = 0;
+       for (i = HPAGE_PMD_NR - 1; i >= 1; i--)
+               tail_mapcount += __split_huge_page_tail(head, i, lruvec, list);
+       atomic_sub(tail_mapcount, &head->_count);
+
+       ClearPageCompound(head);
+       spin_unlock_irq(&zone->lru_lock);
+
+       unfreeze_page(page_anon_vma(head), head);
+
+       for (i = 0; i < HPAGE_PMD_NR; i++) {
+               struct page *subpage = head + i;
+               if (subpage == page)
+                       continue;
+               unlock_page(subpage);
+
+               /*
+                * Subpages may be freed if there wasn't any mapping
+                * left, as when add_to_swap() is running on an LRU page
+                * that had its mapping zapped. And freeing these pages
+                * requires taking the lru_lock, so we do the put_page
+                * of the tail pages after the split is complete.
+                */
+               put_page(subpage);
        }
 }
+
+int total_mapcount(struct page *page)
+{
+       int i, ret;
+
+       VM_BUG_ON_PAGE(PageTail(page), page);
+
+       if (likely(!PageCompound(page)))
+               return atomic_read(&page->_mapcount) + 1;
+
+       ret = compound_mapcount(page);
+       if (PageHuge(page))
+               return ret;
+       for (i = 0; i < HPAGE_PMD_NR; i++)
+               ret += atomic_read(&page[i]._mapcount) + 1;
+       if (PageDoubleMap(page))
+               ret -= HPAGE_PMD_NR;
+       return ret;
+}
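
A worked example of the accounting in total_mapcount(): take a THP that is
PMD-mapped once and whose 512 subpages (assuming HPAGE_PMD_NR == 512) are each
PTE-mapped once in another process. Under the PG_double_map convention each
subpage's _mapcount then carries one extra reference, so each
atomic_read() + 1 term reads 2, and ret = 1 + 512 * 2 - 512 = 513, matching
the real total of one PMD mapping plus 512 PTE mappings.
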
+
+/*
+ * This function splits a huge page into normal pages. @page can point to any
+ * subpage of the huge page to split. The split doesn't change the position
+ * of @page.
+ *
+ * The caller must hold a pin on the @page, otherwise the split fails with
+ * -EBUSY.
+ * The huge page must be locked.
+ *
+ * If @list is null, tail pages will be added to LRU list, otherwise, to @list.
+ *
+ * Both head page and tail pages will inherit mapping, flags, and so on from
+ * the hugepage.
+ *
+ * The GUP pin and PG_locked are transferred to @page. The remaining subpages
+ * can be freed if they are not mapped.
+ *
+ * Returns 0 if the hugepage is split successfully.
+ * Returns -EBUSY if the page is pinned or if anon_vma disappeared from under
+ * us.
+ */
+int split_huge_page_to_list(struct page *page, struct list_head *list)
+{
+       struct page *head = compound_head(page);
+       struct anon_vma *anon_vma;
+       int count, mapcount, ret;
+       bool mlocked;
+
+       VM_BUG_ON_PAGE(is_huge_zero_page(page), page);
+       VM_BUG_ON_PAGE(!PageAnon(page), page);
+       VM_BUG_ON_PAGE(!PageLocked(page), page);
+       VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
+       VM_BUG_ON_PAGE(!PageCompound(page), page);
+
+       /*
+        * The caller does not necessarily hold an mmap_sem that would prevent
+        * the anon_vma disappearing, so we first take a reference to it
+        * and then lock the anon_vma for write. This is similar to
+        * page_lock_anon_vma_read except the write lock is taken to serialise
+        * against parallel split or collapse operations.
+        */
+       anon_vma = page_get_anon_vma(head);
+       if (!anon_vma) {
+               ret = -EBUSY;
+               goto out;
+       }
+       anon_vma_lock_write(anon_vma);
+
+       /*
+        * Racy check whether we can split the page, before freeze_page()
+        * splits the PMDs.
+        */
+       if (total_mapcount(head) != page_count(head) - 1) {
+               ret = -EBUSY;
+               goto out_unlock;
+       }
+
+       mlocked = PageMlocked(page);
+       freeze_page(anon_vma, head);
+       VM_BUG_ON_PAGE(compound_mapcount(head), head);
+
+       /* Make sure the page is not on a per-CPU pagevec, as it takes a pin */
+       if (mlocked)
+               lru_add_drain();
+
+       /* Prevent deferred_split_scan() from touching ->_count */
+       spin_lock(&split_queue_lock);
+       count = page_count(head);
+       mapcount = total_mapcount(head);
+       if (!mapcount && count == 1) {
+               if (!list_empty(page_deferred_list(head))) {
+                       split_queue_len--;
+                       list_del(page_deferred_list(head));
+               }
+               spin_unlock(&split_queue_lock);
+               __split_huge_page(page, list);
+               ret = 0;
+       } else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
+               spin_unlock(&split_queue_lock);
+               pr_alert("total_mapcount: %u, page_count(): %u\n",
+                               mapcount, count);
+               if (PageTail(page))
+                       dump_page(head, NULL);
+               dump_page(page, "total_mapcount(head) > 0");
+               BUG();
+       } else {
+               spin_unlock(&split_queue_lock);
+               unfreeze_page(anon_vma, head);
+               ret = -EBUSY;
+       }
+
+out_unlock:
+       anon_vma_unlock_write(anon_vma);
+       put_anon_vma(anon_vma);
+out:
+       count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
+       return ret;
+}
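
The new calling convention documented above (pin the page, lock it, then
split) is what the rest of the series converts callers to; a minimal sketch of
a conforming caller (illustrative, error handling reduced to the return code):

    /* Sketch of a split_huge_page() caller honoring the new convention. */
    static int try_split(struct page *page)
    {
            int ret = -EBUSY;

            if (!get_page_unless_zero(page))        /* the required pin */
                    return ret;
            if (trylock_page(page)) {       /* the huge page must be locked */
                    ret = split_huge_page(page); /* 0 on success, else -EBUSY */
                    unlock_page(page);
            }
            put_page(page);
            return ret;
    }
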
+
+void free_transhuge_page(struct page *page)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&split_queue_lock, flags);
+       if (!list_empty(page_deferred_list(page))) {
+               split_queue_len--;
+               list_del(page_deferred_list(page));
+       }
+       spin_unlock_irqrestore(&split_queue_lock, flags);
+       free_compound_page(page);
+}
+
+void deferred_split_huge_page(struct page *page)
+{
+       unsigned long flags;
+
+       VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+
+       spin_lock_irqsave(&split_queue_lock, flags);
+       if (list_empty(page_deferred_list(page))) {
+               list_add_tail(page_deferred_list(page), &split_queue);
+               split_queue_len++;
+       }
+       spin_unlock_irqrestore(&split_queue_lock, flags);
+}
+
+static unsigned long deferred_split_count(struct shrinker *shrink,
+               struct shrink_control *sc)
+{
+       /*
+        * Splitting a page from split_queue frees at least one page,
+        * at most HPAGE_PMD_NR - 1. We don't track the exact number,
+        * so use HPAGE_PMD_NR / 2 as a ballpark.
+        */
+       return ACCESS_ONCE(split_queue_len) * HPAGE_PMD_NR / 2;
+}
+
+static unsigned long deferred_split_scan(struct shrinker *shrink,
+               struct shrink_control *sc)
+{
+       unsigned long flags;
+       LIST_HEAD(list), *pos, *next;
+       struct page *page;
+       int split = 0;
+
+       spin_lock_irqsave(&split_queue_lock, flags);
+       list_splice_init(&split_queue, &list);
+
+       /* Take a pin on all head pages to avoid freeing them under us */
+       list_for_each_safe(pos, next, &list) {
+               page = list_entry((void *)pos, struct page, mapping);
+               page = compound_head(page);
+               /* race with put_compound_page() */
+               if (!get_page_unless_zero(page)) {
+                       list_del_init(page_deferred_list(page));
+                       split_queue_len--;
+               }
+       }
+       spin_unlock_irqrestore(&split_queue_lock, flags);
+
+       list_for_each_safe(pos, next, &list) {
+               page = list_entry((void *)pos, struct page, mapping);
+               lock_page(page);
+               /* split_huge_page() removes page from list on success */
+               if (!split_huge_page(page))
+                       split++;
+               unlock_page(page);
+               put_page(page);
+       }
+
+       spin_lock_irqsave(&split_queue_lock, flags);
+       list_splice_tail(&list, &split_queue);
+       spin_unlock_irqrestore(&split_queue_lock, flags);
+
+       return split * HPAGE_PMD_NR / 2;
+}
+
+static struct shrinker deferred_split_shrinker = {
+       .count_objects = deferred_split_count,
+       .scan_objects = deferred_split_scan,
+       .seeks = DEFAULT_SEEKS,
+};
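
The shrinker still has to be registered during THP init for
deferred_split_count()/deferred_split_scan() to ever run; a sketch of the
registration step (its placement in hugepage_init() and the error label are
assumptions here):

    /* In THP init code (hugepage_init() is an assumed location): */
    err = register_shrinker(&deferred_split_shrinker);
    if (err)
            goto err_split_shrinker;        /* illustrative error path */
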
+
+#ifdef CONFIG_DEBUG_FS
+static int split_huge_pages_set(void *data, u64 val)
+{
+       struct zone *zone;
+       struct page *page;
+       unsigned long pfn, max_zone_pfn;
+       unsigned long total = 0, split = 0;
+
+       if (val != 1)
+               return -EINVAL;
+
+       for_each_populated_zone(zone) {
+               max_zone_pfn = zone_end_pfn(zone);
+               for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
+                       if (!pfn_valid(pfn))
+                               continue;
+
+                       page = pfn_to_page(pfn);
+                       if (!get_page_unless_zero(page))
+                               continue;
+
+                       if (zone != page_zone(page))
+                               goto next;
+
+                       if (!PageHead(page) || !PageAnon(page) ||
+                                       PageHuge(page))
+                               goto next;
+
+                       total++;
+                       lock_page(page);
+                       if (!split_huge_page(page))
+                               split++;
+                       unlock_page(page);
+next:
+                       put_page(page);
+               }
+       }
+
+       pr_info("%lu of %lu THP split\n", split, total);
+
+       return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set,
+               "%llu\n");
+
+static int __init split_huge_pages_debugfs(void)
+{
+       void *ret;
+
+       ret = debugfs_create_file("split_huge_pages", 0644, NULL, NULL,
+                       &split_huge_pages_fops);
+       if (!ret)
+               pr_warn("Failed to create split_huge_pages in debugfs\n");
+       return 0;
+}
+late_initcall(split_huge_pages_debugfs);
+#endif
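
With CONFIG_DEBUG_FS enabled, writing 1 to the new file walks every populated
zone and attempts to split every anonymous THP it finds; any other value is
rejected with -EINVAL. A trivial driver for it (needs root and a mounted
debugfs):

    #include <stdio.h>

    int main(void)
    {
            FILE *f = fopen("/sys/kernel/debug/split_huge_pages", "w");

            if (!f)
                    return 1;       /* needs root and CONFIG_DEBUG_FS */
            fputs("1", f);          /* anything other than 1 is -EINVAL */
            fclose(f);
            return 0;
    }
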
index be934df69b85983c950dfcfcfa04406edb50b23f..12908dcf58316afd3f4b14d379a8e139249cd396 100644 (file)
@@ -1267,8 +1267,8 @@ static void prep_compound_gigantic_page(struct page *page, unsigned int order)
 
        /* we rely on prep_new_huge_page to set the destructor */
        set_compound_order(page, order);
-       __SetPageHead(page);
        __ClearPageReserved(page);
+       __SetPageHead(page);
        for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
                /*
                 * For gigantic hugepages allocated through bootmem at
@@ -3102,7 +3102,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                        entry = huge_ptep_get(src_pte);
                        ptepage = pte_page(entry);
                        get_page(ptepage);
-                       page_dup_rmap(ptepage);
+                       page_dup_rmap(ptepage, true);
                        set_huge_pte_at(dst, addr, dst_pte, entry);
                        hugetlb_count_add(pages_per_huge_page(h), dst);
                }
@@ -3186,7 +3186,7 @@ again:
                        set_page_dirty(page);
 
                hugetlb_count_sub(pages_per_huge_page(h), mm);
-               page_remove_rmap(page);
+               page_remove_rmap(page, true);
                force_flush = !__tlb_remove_page(tlb, page);
                if (force_flush) {
                        address += sz;
@@ -3415,7 +3415,7 @@ retry_avoidcopy:
                mmu_notifier_invalidate_range(mm, mmun_start, mmun_end);
                set_huge_pte_at(mm, address, ptep,
                                make_huge_pte(vma, new_page, 1));
-               page_remove_rmap(old_page);
+               page_remove_rmap(old_page, true);
                hugepage_add_new_anon_rmap(new_page, vma, address);
                /* Make the old page be freed below */
                new_page = old_page;
@@ -3585,7 +3585,7 @@ retry:
                ClearPagePrivate(page);
                hugepage_add_new_anon_rmap(page, vma, address);
        } else
-               page_dup_rmap(page);
+               page_dup_rmap(page, true);
        new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
                                && (vma->vm_flags & VM_SHARED)));
        set_huge_pte_at(mm, address, ptep, new_pte);
@@ -3865,7 +3865,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 same_page:
                if (pages) {
                        pages[i] = mem_map_offset(page, pfn_offset);
-                       get_page_foll(pages[i]);
+                       get_page(pages[i]);
                }
 
                if (vmas)
index 38e24b89e4c400394212941a1789dd75bb902198..ed8b5ffcf9b16fbfcf3ccba0d182957980ad45ab 100644 (file)
@@ -13,6 +13,7 @@
 
 #include <linux/fs.h>
 #include <linux/mm.h>
+#include <linux/pagemap.h>
 
 /*
  * The set of flags that only affect watermark checking and reclaim
@@ -66,50 +67,6 @@ static inline void set_page_refcounted(struct page *page)
        set_page_count(page, 1);
 }
 
-static inline void __get_page_tail_foll(struct page *page,
-                                       bool get_page_head)
-{
-       /*
-        * If we're getting a tail page, the elevated page->_count is
-        * required only in the head page and we will elevate the head
-        * page->_count and tail page->_mapcount.
-        *
-        * We elevate page_tail->_mapcount for tail pages to force
-        * page_tail->_count to be zero at all times to avoid getting
-        * false positives from get_page_unless_zero() with
-        * speculative page access (like in
-        * page_cache_get_speculative()) on tail pages.
-        */
-       VM_BUG_ON_PAGE(atomic_read(&compound_head(page)->_count) <= 0, page);
-       if (get_page_head)
-               atomic_inc(&compound_head(page)->_count);
-       get_huge_page_tail(page);
-}
-
-/*
- * This is meant to be called as the FOLL_GET operation of
- * follow_page() and it must be called while holding the proper PT
- * lock while the pte (or pmd_trans_huge) is still mapping the page.
- */
-static inline void get_page_foll(struct page *page)
-{
-       if (unlikely(PageTail(page)))
-               /*
-                * This is safe only because
-                * __split_huge_page_refcount() can't run under
-                * get_page_foll() because we hold the proper PT lock.
-                */
-               __get_page_tail_foll(page, true);
-       else {
-               /*
-                * Getting a normal page or the head of a compound page
-                * requires to already have an elevated page->_count.
-                */
-               VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page);
-               atomic_inc(&page->_count);
-       }
-}
-
 extern unsigned long highest_memmap_pfn;
 
 /*
@@ -309,10 +266,27 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)
 
 extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-extern unsigned long vma_address(struct page *page,
-                                struct vm_area_struct *vma);
-#endif
+/*
+ * At what user virtual address is page expected in @vma?
+ */
+static inline unsigned long
+__vma_address(struct page *page, struct vm_area_struct *vma)
+{
+       pgoff_t pgoff = page_to_pgoff(page);
+       return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+}
+
+static inline unsigned long
+vma_address(struct page *page, struct vm_area_struct *vma)
+{
+       unsigned long address = __vma_address(page, vma);
+
+       /* page should be within @vma mapping range */
+       VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
+
+       return address;
+}
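
A quick numeric check of __vma_address(): for a vma with
vm_start == 0x700000000000 and vm_pgoff == 0x10, a page whose page_to_pgoff()
returns 0x13 is expected at 0x700000000000 + ((0x13 - 0x10) << 12) =
0x700000003000, assuming 4KB pages (PAGE_SHIFT == 12).
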
+
 #else /* !CONFIG_MMU */
 static inline void clear_page_mlock(struct page *page) { }
 static inline void mlock_vma_page(struct page *page) { }
index 2d162c5625f6a1b2564f2f2a4b1db87702eeaa18..ca6d2a06a6157fabdc01aafa06b797320364a87c 100644 (file)
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -441,20 +441,6 @@ static void break_cow(struct rmap_item *rmap_item)
        up_read(&mm->mmap_sem);
 }
 
-static struct page *page_trans_compound_anon(struct page *page)
-{
-       if (PageTransCompound(page)) {
-               struct page *head = compound_head(page);
-               /*
-                * head may actually be splitted and freed from under
-                * us but it's ok here.
-                */
-               if (PageAnon(head))
-                       return head;
-       }
-       return NULL;
-}
-
 static struct page *get_mergeable_page(struct rmap_item *rmap_item)
 {
        struct mm_struct *mm = rmap_item->mm;
@@ -470,7 +456,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
        page = follow_page(vma, addr, FOLL_GET);
        if (IS_ERR_OR_NULL(page))
                goto out;
-       if (PageAnon(page) || page_trans_compound_anon(page)) {
+       if (PageAnon(page)) {
                flush_anon_page(vma, page, addr);
                flush_dcache_page(page);
        } else {
@@ -956,13 +942,13 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
        }
 
        get_page(kpage);
-       page_add_anon_rmap(kpage, vma, addr);
+       page_add_anon_rmap(kpage, vma, addr, false);
 
        flush_cache_page(vma, addr, pte_pfn(*ptep));
        ptep_clear_flush_notify(vma, addr, ptep);
        set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
 
-       page_remove_rmap(page);
+       page_remove_rmap(page, false);
        if (!page_mapped(page))
                try_to_free_swap(page);
        put_page(page);
@@ -975,33 +961,6 @@ out:
        return err;
 }
 
-static int page_trans_compound_anon_split(struct page *page)
-{
-       int ret = 0;
-       struct page *transhuge_head = page_trans_compound_anon(page);
-       if (transhuge_head) {
-               /* Get the reference on the head to split it. */
-               if (get_page_unless_zero(transhuge_head)) {
-                       /*
-                        * Recheck we got the reference while the head
-                        * was still anonymous.
-                        */
-                       if (PageAnon(transhuge_head))
-                               ret = split_huge_page(transhuge_head);
-                       else
-                               /*
-                                * Retry later if split_huge_page run
-                                * from under us.
-                                */
-                               ret = 1;
-                       put_page(transhuge_head);
-               } else
-                       /* Retry later if split_huge_page run from under us. */
-                       ret = 1;
-       }
-       return ret;
-}
-
 /*
  * try_to_merge_one_page - take two pages and merge them into one
  * @vma: the vma that holds the pte pointing to page
@@ -1020,9 +979,6 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
        if (page == kpage)                      /* ksm page forked */
                return 0;
 
-       if (PageTransCompound(page) && page_trans_compound_anon_split(page))
-               goto out;
-       BUG_ON(PageTransCompound(page));
        if (!PageAnon(page))
                goto out;
 
@@ -1035,6 +991,13 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
         */
        if (!trylock_page(page))
                goto out;
+
+       if (PageTransCompound(page)) {
+               err = split_huge_page(page);
+               if (err)
+                       goto out_unlock;
+       }
+
        /*
         * If this anonymous page is mapped only here, its pte may need
         * to be write-protected.  If it's mapped elsewhere, all of its
@@ -1050,6 +1013,12 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
                         */
                        set_page_stable_node(page, NULL);
                        mark_page_accessed(page);
+                       /*
+                        * Page reclaim just frees a clean page with no dirty
+                        * ptes: make sure that the ksm page will be swapped out.
+                        */
+                       if (!PageDirty(page))
+                               SetPageDirty(page);
                        err = 0;
                } else if (pages_identical(page, kpage))
                        err = replace_page(vma, page, kpage, orig_pte);
@@ -1065,6 +1034,7 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
                }
        }
 
+out_unlock:
        unlock_page(page);
 out:
        return err;
@@ -1635,8 +1605,7 @@ next_mm:
                                cond_resched();
                                continue;
                        }
-                       if (PageAnon(*page) ||
-                           page_trans_compound_anon(*page)) {
+                       if (PageAnon(*page)) {
                                flush_anon_page(vma, *page, ksm_scan.address);
                                flush_dcache_page(*page);
                                rmap_item = get_next_rmap_item(slot,
@@ -1899,7 +1868,7 @@ struct page *ksm_might_need_to_copy(struct page *page,
 
                SetPageDirty(new_page);
                __SetPageUptodate(new_page);
-               __set_page_locked(new_page);
+               __SetPageLocked(new_page);
        }
 
        return new_page;
index c889fcbb530e98d8779ef75750e1fde08bf786cf..f56825b6d2e103c920d2b21cfb56e316dcea9418 100644 (file)
@@ -20,6 +20,9 @@
 #include <linux/backing-dev.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
+#include <linux/mmu_notifier.h>
+
+#include <asm/tlb.h>
 
 /*
  * Any behaviour which results in changes to the vma->vm_flags needs to
@@ -32,6 +35,7 @@ static int madvise_need_mmap_write(int behavior)
        case MADV_REMOVE:
        case MADV_WILLNEED:
        case MADV_DONTNEED:
+       case MADV_FREE:
                return 0;
        default:
                /* be safe, default to 1. list exceptions explicitly */
@@ -256,6 +260,194 @@ static long madvise_willneed(struct vm_area_struct *vma,
        return 0;
 }
 
+static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
+                               unsigned long end, struct mm_walk *walk)
+
+{
+       struct mmu_gather *tlb = walk->private;
+       struct mm_struct *mm = tlb->mm;
+       struct vm_area_struct *vma = walk->vma;
+       spinlock_t *ptl;
+       pte_t *orig_pte, *pte, ptent;
+       struct page *page;
+       int nr_swap = 0;
+       unsigned long next;
+
+       next = pmd_addr_end(addr, end);
+       if (pmd_trans_huge(*pmd))
+               if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
+                       goto next;
+
+       if (pmd_trans_unstable(pmd))
+               return 0;
+
+       orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+       arch_enter_lazy_mmu_mode();
+       for (; addr != end; pte++, addr += PAGE_SIZE) {
+               ptent = *pte;
+
+               if (pte_none(ptent))
+                       continue;
+               /*
+                * If the pte has a swp_entry, just clear the page table entry
+                * to prevent swap-in, which is more expensive than
+                * (page allocation + zeroing).
+                */
+               if (!pte_present(ptent)) {
+                       swp_entry_t entry;
+
+                       entry = pte_to_swp_entry(ptent);
+                       if (non_swap_entry(entry))
+                               continue;
+                       nr_swap--;
+                       free_swap_and_cache(entry);
+                       pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
+                       continue;
+               }
+
+               page = vm_normal_page(vma, addr, ptent);
+               if (!page)
+                       continue;
+
+               /*
+                * If the pmd isn't transhuge but the page is a THP and
+                * is owned only by this process, split it and
+                * deactivate all its pages.
+                */
+               if (PageTransCompound(page)) {
+                       if (page_mapcount(page) != 1)
+                               goto out;
+                       get_page(page);
+                       if (!trylock_page(page)) {
+                               put_page(page);
+                               goto out;
+                       }
+                       pte_unmap_unlock(orig_pte, ptl);
+                       if (split_huge_page(page)) {
+                               unlock_page(page);
+                               put_page(page);
+                               pte_offset_map_lock(mm, pmd, addr, &ptl);
+                               goto out;
+                       }
+                       put_page(page);
+                       unlock_page(page);
+                       pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+                       pte--;
+                       addr -= PAGE_SIZE;
+                       continue;
+               }
+
+               VM_BUG_ON_PAGE(PageTransCompound(page), page);
+
+               if (PageSwapCache(page) || PageDirty(page)) {
+                       if (!trylock_page(page))
+                               continue;
+                       /*
+                        * If the page is shared with others, we can't clear
+                        * its PG_dirty bit.
+                        */
+                       if (page_mapcount(page) != 1) {
+                               unlock_page(page);
+                               continue;
+                       }
+
+                       if (PageSwapCache(page) && !try_to_free_swap(page)) {
+                               unlock_page(page);
+                               continue;
+                       }
+
+                       ClearPageDirty(page);
+                       unlock_page(page);
+               }
+
+               if (pte_young(ptent) || pte_dirty(ptent)) {
+                       /*
+                        * Some architectures (e.g. PPC) don't update the TLB
+                        * with set_pte_at and tlb_remove_tlb_entry, so for
+                        * portability, remap the pte as old|clean after
+                        * clearing it.
+                        */
+                       ptent = ptep_get_and_clear_full(mm, addr, pte,
+                                                       tlb->fullmm);
+
+                       ptent = pte_mkold(ptent);
+                       ptent = pte_mkclean(ptent);
+                       set_pte_at(mm, addr, pte, ptent);
+                       if (PageActive(page))
+                               deactivate_page(page);
+                       tlb_remove_tlb_entry(tlb, pte, addr);
+               }
+       }
+out:
+       if (nr_swap) {
+               if (current->mm == mm)
+                       sync_mm_rss(mm);
+
+               add_mm_counter(mm, MM_SWAPENTS, nr_swap);
+       }
+       arch_leave_lazy_mmu_mode();
+       pte_unmap_unlock(orig_pte, ptl);
+       cond_resched();
+next:
+       return 0;
+}
+
+static void madvise_free_page_range(struct mmu_gather *tlb,
+                            struct vm_area_struct *vma,
+                            unsigned long addr, unsigned long end)
+{
+       struct mm_walk free_walk = {
+               .pmd_entry = madvise_free_pte_range,
+               .mm = vma->vm_mm,
+               .private = tlb,
+       };
+
+       tlb_start_vma(tlb, vma);
+       walk_page_range(addr, end, &free_walk);
+       tlb_end_vma(tlb, vma);
+}
+
+static int madvise_free_single_vma(struct vm_area_struct *vma,
+                       unsigned long start_addr, unsigned long end_addr)
+{
+       unsigned long start, end;
+       struct mm_struct *mm = vma->vm_mm;
+       struct mmu_gather tlb;
+
+       if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
+               return -EINVAL;
+
+       /* MADV_FREE works for only anon vma at the moment */
+       if (!vma_is_anonymous(vma))
+               return -EINVAL;
+
+       start = max(vma->vm_start, start_addr);
+       if (start >= vma->vm_end)
+               return -EINVAL;
+       end = min(vma->vm_end, end_addr);
+       if (end <= vma->vm_start)
+               return -EINVAL;
+
+       lru_add_drain();
+       tlb_gather_mmu(&tlb, mm, start, end);
+       update_hiwater_rss(mm);
+
+       mmu_notifier_invalidate_range_start(mm, start, end);
+       madvise_free_page_range(&tlb, vma, start, end);
+       mmu_notifier_invalidate_range_end(mm, start, end);
+       tlb_finish_mmu(&tlb, start, end);
+
+       return 0;
+}
+
+static long madvise_free(struct vm_area_struct *vma,
+                            struct vm_area_struct **prev,
+                            unsigned long start, unsigned long end)
+{
+       *prev = vma;
+       return madvise_free_single_vma(vma, start, end);
+}
+
 /*
  * Application no longer needs these pages.  If the pages are dirty,
  * it's OK to just throw them away.  The app will be more careful about
@@ -379,6 +571,14 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
                return madvise_remove(vma, prev, start, end);
        case MADV_WILLNEED:
                return madvise_willneed(vma, prev, start, end);
+       case MADV_FREE:
+               /*
+                * XXX: In this implementation, MADV_FREE works like
+                * MADV_DONTNEED on a swapless system or when swap is full.
+                */
+               if (get_nr_swap_pages() > 0)
+                       return madvise_free(vma, prev, start, end);
+               /* passthrough */
        case MADV_DONTNEED:
                return madvise_dontneed(vma, prev, start, end);
        default:
@@ -398,6 +598,7 @@ madvise_behavior_valid(int behavior)
        case MADV_REMOVE:
        case MADV_WILLNEED:
        case MADV_DONTNEED:
+       case MADV_FREE:
 #ifdef CONFIG_KSM
        case MADV_MERGEABLE:
        case MADV_UNMERGEABLE:
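Aside (illustration only, not part of this patch): a minimal userspace sketch of how the MADV_FREE advice introduced above could be exercised. It assumes headers that define MADV_FREE; the fallback value 8 mirrors the asm-generic uapi definition added by this series and is an assumption for older toolchains, as is the mapping size.

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MADV_FREE
#define MADV_FREE 8		/* assumed uapi value */
#endif

int main(void)
{
	size_t len = 4UL << 20;	/* 4 MiB, arbitrary */
	/* Anonymous private memory: the only kind MADV_FREE handles here. */
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED)
		return 1;

	memset(buf, 0xaa, len);		/* dirty the pages */

	/*
	 * Hint that the contents are no longer needed; the kernel may
	 * reclaim the pages lazily.  On a swapless system this falls
	 * back to MADV_DONTNEED behaviour (see madvise_vma() above).
	 */
	if (madvise(buf, len, MADV_FREE))
		perror("madvise(MADV_FREE)");

	munmap(buf, len);
	return 0;
}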
index 54eae4f19d803a21321721f267ca9872764629dd..0eda67376df4323a62f8268975468a8dd4d91e04 100644 (file)
@@ -382,14 +382,11 @@ struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
 {
        struct mem_cgroup *memcg;
 
-       rcu_read_lock();
-
        memcg = page->mem_cgroup;
 
        if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
                memcg = root_mem_cgroup;
 
-       rcu_read_unlock();
        return &memcg->css;
 }
 
@@ -647,7 +644,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
 
 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
                                         struct page *page,
-                                        int nr_pages)
+                                        bool compound, int nr_pages)
 {
        /*
         * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
@@ -660,9 +657,11 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
                __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
                                nr_pages);
 
-       if (PageTransHuge(page))
+       if (compound) {
+               VM_BUG_ON_PAGE(!PageTransHuge(page), page);
                __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
                                nr_pages);
+       }
 
        /* pagein of a big page is an event. So, ignore page size */
        if (nr_pages > 0)
@@ -2431,9 +2430,7 @@ void __memcg_kmem_uncharge(struct page *page, int order)
 
 /*
  * Because tail pages are not marked as "used", set it. We're under
- * zone->lru_lock, 'splitting on pmd' and compound_lock.
- * charge/uncharge will be never happen and move_account() is done under
- * compound_lock(), so we don't have to take care of races.
+ * zone->lru_lock, and migration entries are set up in all page mappings.
  */
 void mem_cgroup_split_huge_fixup(struct page *head)
 {
@@ -3494,16 +3491,17 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
 swap_buffers:
        /* Swap primary and spare array */
        thresholds->spare = thresholds->primary;
-       /* If all events are unregistered, free the spare array */
-       if (!new) {
-               kfree(thresholds->spare);
-               thresholds->spare = NULL;
-       }
 
        rcu_assign_pointer(thresholds->primary, new);
 
        /* To be sure that nobody uses thresholds */
        synchronize_rcu();
+
+       /* If all events are unregistered, free the spare array */
+       if (!new) {
+               kfree(thresholds->spare);
+               thresholds->spare = NULL;
+       }
 unlock:
        mutex_unlock(&memcg->thresholds_lock);
 }
@@ -4505,38 +4503,30 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
  * @from: mem_cgroup which the page is moved from.
  * @to:        mem_cgroup which the page is moved to. @from != @to.
  *
- * The caller must confirm following.
- * - page is not on LRU (isolate_page() is useful.)
- * - compound_lock is held when nr_pages > 1
+ * The caller must make sure the page is not on LRU (isolate_page() is useful.)
  *
  * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
  * from old cgroup.
  */
 static int mem_cgroup_move_account(struct page *page,
-                                  unsigned int nr_pages,
+                                  bool compound,
                                   struct mem_cgroup *from,
                                   struct mem_cgroup *to)
 {
        unsigned long flags;
+       unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
        int ret;
        bool anon;
 
        VM_BUG_ON(from == to);
        VM_BUG_ON_PAGE(PageLRU(page), page);
-       /*
-        * The page is isolated from LRU. So, collapse function
-        * will not handle this page. But page splitting can happen.
-        * Do this check under compound_page_lock(). The caller should
-        * hold it.
-        */
-       ret = -EBUSY;
-       if (nr_pages > 1 && !PageTransHuge(page))
-               goto out;
+       VM_BUG_ON(compound && !PageTransHuge(page));
 
        /*
         * Prevent mem_cgroup_replace_page() from looking at
         * page->mem_cgroup of its source page while we change it.
         */
+       ret = -EBUSY;
        if (!trylock_page(page))
                goto out;
 
@@ -4591,9 +4581,9 @@ static int mem_cgroup_move_account(struct page *page,
        ret = 0;
 
        local_irq_disable();
-       mem_cgroup_charge_statistics(to, page, nr_pages);
+       mem_cgroup_charge_statistics(to, page, compound, nr_pages);
        memcg_check_events(to, page);
-       mem_cgroup_charge_statistics(from, page, -nr_pages);
+       mem_cgroup_charge_statistics(from, page, compound, -nr_pages);
        memcg_check_events(from, page);
        local_irq_enable();
 out_unlock:
@@ -4683,7 +4673,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
        pte_t *pte;
        spinlock_t *ptl;
 
-       if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+       if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
                if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
                        mc.precharge += HPAGE_PMD_NR;
                spin_unlock(ptl);
@@ -4871,17 +4861,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
        union mc_target target;
        struct page *page;
 
-       /*
-        * We don't take compound_lock() here but no race with splitting thp
-        * happens because:
-        *  - if pmd_trans_huge_lock() returns 1, the relevant thp is not
-        *    under splitting, which means there's no concurrent thp split,
-        *  - if another thread runs into split_huge_page() just after we
-        *    entered this if-block, the thread must wait for page table lock
-        *    to be unlocked in __split_huge_page_splitting(), where the main
-        *    part of thp split is not executed yet.
-        */
-       if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+       if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
                if (mc.precharge < HPAGE_PMD_NR) {
                        spin_unlock(ptl);
                        return 0;
@@ -4890,7 +4870,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
                if (target_type == MC_TARGET_PAGE) {
                        page = target.page;
                        if (!isolate_lru_page(page)) {
-                               if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
+                               if (!mem_cgroup_move_account(page, true,
                                                             mc.from, mc.to)) {
                                        mc.precharge -= HPAGE_PMD_NR;
                                        mc.moved_charge += HPAGE_PMD_NR;
@@ -4917,9 +4897,18 @@ retry:
                switch (get_mctgt_type(vma, addr, ptent, &target)) {
                case MC_TARGET_PAGE:
                        page = target.page;
+                       /*
+                        * We can have a part of the split pmd here. Moving it
+                        * can be done but it would be too convoluted, so simply
+                        * ignore such a partial THP and keep it in the original
+                        * memcg. There should be somebody mapping the head.
+                        */
+                       if (PageTransCompound(page))
+                               goto put;
                        if (isolate_lru_page(page))
                                goto put;
-                       if (!mem_cgroup_move_account(page, 1, mc.from, mc.to)) {
+                       if (!mem_cgroup_move_account(page, false,
+                                               mc.from, mc.to)) {
                                mc.precharge--;
                                /* we uncharge from mc.from later. */
                                mc.moved_charge++;
@@ -5258,10 +5247,11 @@ bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
  * with mem_cgroup_cancel_charge() in case page instantiation fails.
  */
 int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
-                         gfp_t gfp_mask, struct mem_cgroup **memcgp)
+                         gfp_t gfp_mask, struct mem_cgroup **memcgp,
+                         bool compound)
 {
        struct mem_cgroup *memcg = NULL;
-       unsigned int nr_pages = 1;
+       unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
        int ret = 0;
 
        if (mem_cgroup_disabled())
@@ -5291,11 +5281,6 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
                }
        }
 
-       if (PageTransHuge(page)) {
-               nr_pages <<= compound_order(page);
-               VM_BUG_ON_PAGE(!PageTransHuge(page), page);
-       }
-
        if (!memcg)
                memcg = get_mem_cgroup_from_mm(mm);
 
@@ -5324,9 +5309,9 @@ out:
  * Use mem_cgroup_cancel_charge() to cancel the transaction instead.
  */
 void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
-                             bool lrucare)
+                             bool lrucare, bool compound)
 {
-       unsigned int nr_pages = 1;
+       unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
 
        VM_BUG_ON_PAGE(!page->mapping, page);
        VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page);
@@ -5343,13 +5328,8 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
 
        commit_charge(page, memcg, lrucare);
 
-       if (PageTransHuge(page)) {
-               nr_pages <<= compound_order(page);
-               VM_BUG_ON_PAGE(!PageTransHuge(page), page);
-       }
-
        local_irq_disable();
-       mem_cgroup_charge_statistics(memcg, page, nr_pages);
+       mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
        memcg_check_events(memcg, page);
        local_irq_enable();
 
@@ -5371,9 +5351,10 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
  *
  * Cancel a charge transaction started by mem_cgroup_try_charge().
  */
-void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg)
+void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
+               bool compound)
 {
-       unsigned int nr_pages = 1;
+       unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
 
        if (mem_cgroup_disabled())
                return;
@@ -5385,11 +5366,6 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg)
        if (!memcg)
                return;
 
-       if (PageTransHuge(page)) {
-               nr_pages <<= compound_order(page);
-               VM_BUG_ON_PAGE(!PageTransHuge(page), page);
-       }
-
        cancel_charge(memcg, nr_pages);
 }
 
@@ -5750,7 +5726,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
         * only synchronisation we have for updating the per-CPU variables.
         */
        VM_BUG_ON(!irqs_disabled());
-       mem_cgroup_charge_statistics(memcg, page, -1);
+       mem_cgroup_charge_statistics(memcg, page, false, -1);
        memcg_check_events(memcg, page);
 }
 
index 8424b64711ac35955772078804b2e19f7cd99620..ac595e7a3a955d457e8ff3b6934e8728e8db6d5d 100644 (file)
@@ -882,15 +882,7 @@ int get_hwpoison_page(struct page *page)
 {
        struct page *head = compound_head(page);
 
-       if (PageHuge(head))
-               return get_page_unless_zero(head);
-
-       /*
-        * Thp tail page has special refcounting rule (refcount of tail pages
-        * is stored in ->_mapcount,) so we can't call get_page_unless_zero()
-        * directly for tail pages.
-        */
-       if (PageTransHuge(head)) {
+       if (!PageHuge(head) && PageTransHuge(head)) {
                /*
                 * Non anonymous thp exists only in allocation/free time. We
                 * can't handle such a case correctly, so let's give it up.
@@ -902,41 +894,12 @@ int get_hwpoison_page(struct page *page)
                                page_to_pfn(page));
                        return 0;
                }
-
-               if (get_page_unless_zero(head)) {
-                       if (PageTail(page))
-                               get_page(page);
-                       return 1;
-               } else {
-                       return 0;
-               }
        }
 
-       return get_page_unless_zero(page);
+       return get_page_unless_zero(head);
 }
 EXPORT_SYMBOL_GPL(get_hwpoison_page);
 
-/**
- * put_hwpoison_page() - Put refcount for memory error handling:
- * @page:      raw error page (hit by memory error)
- */
-void put_hwpoison_page(struct page *page)
-{
-       struct page *head = compound_head(page);
-
-       if (PageHuge(head)) {
-               put_page(head);
-               return;
-       }
-
-       if (PageTransHuge(head))
-               if (page != head)
-                       put_page(head);
-
-       put_page(page);
-}
-EXPORT_SYMBOL_GPL(put_hwpoison_page);
-
 /*
  * Do all that is necessary to remove user space mappings. Unmap
  * the pages and send SIGBUS to the processes if the data was dirty.
@@ -1149,7 +1112,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
        }
 
        if (!PageHuge(p) && PageTransHuge(hpage)) {
+               lock_page(hpage);
                if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) {
+                       unlock_page(hpage);
                        if (!PageAnon(hpage))
                                pr_err("MCE: %#lx: non anonymous thp\n", pfn);
                        else
@@ -1159,6 +1124,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
                        put_hwpoison_page(p);
                        return -EBUSY;
                }
+               unlock_page(hpage);
+               get_hwpoison_page(p);
+               put_hwpoison_page(hpage);
                VM_BUG_ON_PAGE(!page_count(p), p);
                hpage = compound_head(p);
        }
@@ -1166,7 +1134,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
        /*
         * We ignore non-LRU pages for good reasons.
         * - PG_locked is only well defined for LRU pages and a few others
-        * - to avoid races with __set_page_locked()
+        * - to avoid races with __SetPageLocked()
         * - to avoid races with __SetPageSlab*() (and more non-atomic ops)
         * The check (unnecessarily) ignores LRU pages being isolated and
         * walked by the page reclaim code, however that's not a big loss.
@@ -1572,7 +1540,7 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags)
                 * Did it turn free?
                 */
                ret = __get_any_page(page, pfn, 0);
-               if (!PageLRU(page)) {
+               if (ret == 1 && !PageLRU(page)) {
                        /* Drop page reference which is from __get_any_page() */
                        put_hwpoison_page(page);
                        pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
@@ -1716,6 +1684,49 @@ static int __soft_offline_page(struct page *page, int flags)
        return ret;
 }
 
+static int soft_offline_in_use_page(struct page *page, int flags)
+{
+       int ret;
+       struct page *hpage = compound_head(page);
+
+       if (!PageHuge(page) && PageTransHuge(hpage)) {
+               lock_page(hpage);
+               if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) {
+                       unlock_page(hpage);
+                       if (!PageAnon(hpage))
+                               pr_info("soft offline: %#lx: non anonymous thp\n", page_to_pfn(page));
+                       else
+                               pr_info("soft offline: %#lx: thp split failed\n", page_to_pfn(page));
+                       put_hwpoison_page(hpage);
+                       return -EBUSY;
+               }
+               unlock_page(hpage);
+               get_hwpoison_page(page);
+               put_hwpoison_page(hpage);
+       }
+
+       if (PageHuge(page))
+               ret = soft_offline_huge_page(page, flags);
+       else
+               ret = __soft_offline_page(page, flags);
+
+       return ret;
+}
+
+static void soft_offline_free_page(struct page *page)
+{
+       if (PageHuge(page)) {
+               struct page *hpage = compound_head(page);
+
+               set_page_hwpoison_huge_page(hpage);
+               if (!dequeue_hwpoisoned_huge_page(hpage))
+                       num_poisoned_pages_add(1 << compound_order(hpage));
+       } else {
+               if (!TestSetPageHWPoison(page))
+                       num_poisoned_pages_inc();
+       }
+}
+
 /**
  * soft_offline_page - Soft offline a page.
  * @page: page to offline
@@ -1742,7 +1753,6 @@ int soft_offline_page(struct page *page, int flags)
 {
        int ret;
        unsigned long pfn = page_to_pfn(page);
-       struct page *hpage = compound_head(page);
 
        if (PageHWPoison(page)) {
                pr_info("soft offline: %#lx page already poisoned\n", pfn);
@@ -1750,34 +1760,15 @@ int soft_offline_page(struct page *page, int flags)
                        put_hwpoison_page(page);
                return -EBUSY;
        }
-       if (!PageHuge(page) && PageTransHuge(hpage)) {
-               if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
-                       pr_info("soft offline: %#lx: failed to split THP\n",
-                               pfn);
-                       if (flags & MF_COUNT_INCREASED)
-                               put_hwpoison_page(page);
-                       return -EBUSY;
-               }
-       }
 
        get_online_mems();
-
        ret = get_any_page(page, pfn, flags);
        put_online_mems();
-       if (ret > 0) { /* for in-use pages */
-               if (PageHuge(page))
-                       ret = soft_offline_huge_page(page, flags);
-               else
-                       ret = __soft_offline_page(page, flags);
-       } else if (ret == 0) { /* for free pages */
-               if (PageHuge(page)) {
-                       set_page_hwpoison_huge_page(hpage);
-                       if (!dequeue_hwpoisoned_huge_page(hpage))
-                               num_poisoned_pages_add(1 << compound_order(hpage));
-               } else {
-                       if (!TestSetPageHWPoison(page))
-                               num_poisoned_pages_inc();
-               }
-       }
+
+       if (ret > 0)
+               ret = soft_offline_in_use_page(page, flags);
+       else if (ret == 0)
+               soft_offline_free_page(page);
+
        return ret;
 }
index d4e4d37c1989545f27743b640d6340f702ccd432..ff17850a52d92c37817dd3dd45aba37b6bb4da7f 100644 (file)
@@ -50,6 +50,7 @@
 #include <linux/export.h>
 #include <linux/delayacct.h>
 #include <linux/init.h>
+#include <linux/pfn_t.h>
 #include <linux/writeback.h>
 #include <linux/memcontrol.h>
 #include <linux/mmu_notifier.h>
@@ -566,7 +567,6 @@ int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
 {
        spinlock_t *ptl;
        pgtable_t new = pte_alloc_one(mm, address);
-       int wait_split_huge_page;
        if (!new)
                return -ENOMEM;
 
@@ -586,18 +586,14 @@ int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
        smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
 
        ptl = pmd_lock(mm, pmd);
-       wait_split_huge_page = 0;
        if (likely(pmd_none(*pmd))) {   /* Has another populated it ? */
                atomic_long_inc(&mm->nr_ptes);
                pmd_populate(mm, pmd, new);
                new = NULL;
-       } else if (unlikely(pmd_trans_splitting(*pmd)))
-               wait_split_huge_page = 1;
+       }
        spin_unlock(ptl);
        if (new)
                pte_free(mm, new);
-       if (wait_split_huge_page)
-               wait_split_huge_page(vma->anon_vma, pmd);
        return 0;
 }
 
@@ -613,8 +609,7 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
        if (likely(pmd_none(*pmd))) {   /* Has another populated it ? */
                pmd_populate_kernel(&init_mm, pmd, new);
                new = NULL;
-       } else
-               VM_BUG_ON(pmd_trans_splitting(*pmd));
+       }
        spin_unlock(&init_mm.page_table_lock);
        if (new)
                pte_free_kernel(&init_mm, new);
@@ -870,7 +865,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
        page = vm_normal_page(vma, addr, pte);
        if (page) {
                get_page(page);
-               page_dup_rmap(page);
+               page_dup_rmap(page, false);
                rss[mm_counter(page)]++;
        }
 
@@ -955,7 +950,7 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src
        src_pmd = pmd_offset(src_pud, addr);
        do {
                next = pmd_addr_end(addr, end);
-               if (pmd_trans_huge(*src_pmd)) {
+               if (pmd_trans_huge(*src_pmd) || pmd_devmap(*src_pmd)) {
                        int err;
                        VM_BUG_ON(next-addr != HPAGE_PMD_SIZE);
                        err = copy_huge_pmd(dst_mm, src_mm,
@@ -1118,7 +1113,7 @@ again:
                                        mark_page_accessed(page);
                        }
                        rss[mm_counter(page)]--;
-                       page_remove_rmap(page);
+                       page_remove_rmap(page, false);
                        if (unlikely(page_mapcount(page) < 0))
                                print_bad_pte(vma, addr, ptent, page);
                        if (unlikely(!__tlb_remove_page(tlb, page))) {
@@ -1182,7 +1177,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
        pmd = pmd_offset(pud, addr);
        do {
                next = pmd_addr_end(addr, end);
-               if (pmd_trans_huge(*pmd)) {
+               if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
                        if (next - addr != HPAGE_PMD_SIZE) {
 #ifdef CONFIG_DEBUG_VM
                                if (!rwsem_is_locked(&tlb->mm->mmap_sem)) {
@@ -1193,7 +1188,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
                                        BUG();
                                }
 #endif
-                               split_huge_page_pmd(vma, addr, pmd);
+                               split_huge_pmd(vma, pmd, addr);
                        } else if (zap_huge_pmd(tlb, vma, pmd, addr))
                                goto next;
                        /* fall through */
@@ -1506,7 +1501,7 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
 EXPORT_SYMBOL(vm_insert_page);
 
 static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
-                       unsigned long pfn, pgprot_t prot)
+                       pfn_t pfn, pgprot_t prot)
 {
        struct mm_struct *mm = vma->vm_mm;
        int retval;
@@ -1522,7 +1517,10 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
                goto out_unlock;
 
        /* Ok, finally just insert the thing.. */
-       entry = pte_mkspecial(pfn_pte(pfn, prot));
+       if (pfn_t_devmap(pfn))
+               entry = pte_mkdevmap(pfn_t_pte(pfn, prot));
+       else
+               entry = pte_mkspecial(pfn_t_pte(pfn, prot));
        set_pte_at(mm, addr, pte, entry);
        update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */
 
@@ -1569,17 +1567,17 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
 
        if (addr < vma->vm_start || addr >= vma->vm_end)
                return -EFAULT;
-       if (track_pfn_insert(vma, &pgprot, pfn))
+       if (track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV)))
                return -EINVAL;
 
-       ret = insert_pfn(vma, addr, pfn, pgprot);
+       ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot);
 
        return ret;
 }
 EXPORT_SYMBOL(vm_insert_pfn);
 
 int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
-                       unsigned long pfn)
+                       pfn_t pfn)
 {
        BUG_ON(!(vma->vm_flags & VM_MIXEDMAP));
 
@@ -1593,10 +1591,10 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
         * than insert_pfn).  If a zero_pfn were inserted into a VM_MIXEDMAP
         * without pte special, it would there be refcounted as a normal page.
         */
-       if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) {
+       if (!HAVE_PTE_SPECIAL && pfn_t_valid(pfn)) {
                struct page *page;
 
-               page = pfn_to_page(pfn);
+               page = pfn_t_to_page(pfn);
                return insert_page(vma, addr, page, vma->vm_page_prot);
        }
        return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
@@ -2087,7 +2085,7 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
                cow_user_page(new_page, old_page, address, vma);
        }
 
-       if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg))
+       if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false))
                goto oom_free_new;
 
        __SetPageUptodate(new_page);
@@ -2118,8 +2116,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
                 * thread doing COW.
                 */
                ptep_clear_flush_notify(vma, address, page_table);
-               page_add_new_anon_rmap(new_page, vma, address);
-               mem_cgroup_commit_charge(new_page, memcg, false);
+               page_add_new_anon_rmap(new_page, vma, address, false);
+               mem_cgroup_commit_charge(new_page, memcg, false, false);
                lru_cache_add_active_or_unevictable(new_page, vma);
                /*
                 * We call the notify macro here because, when using secondary
@@ -2151,14 +2149,14 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
                         * mapcount is visible. So transitively, TLBs to
                         * old page will be flushed before it can be reused.
                         */
-                       page_remove_rmap(old_page);
+                       page_remove_rmap(old_page, false);
                }
 
                /* Free the old page.. */
                new_page = old_page;
                page_copied = 1;
        } else {
-               mem_cgroup_cancel_charge(new_page, memcg);
+               mem_cgroup_cancel_charge(new_page, memcg, false);
        }
 
        if (new_page)
@@ -2173,7 +2171,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
                 */
                if (page_copied && (vma->vm_flags & VM_LOCKED)) {
                        lock_page(old_page);    /* LRU manipulation */
-                       munlock_vma_page(old_page);
+                       if (PageMlocked(old_page))
+                               munlock_vma_page(old_page);
                        unlock_page(old_page);
                }
                page_cache_release(old_page);
@@ -2533,7 +2532,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
                goto out_page;
        }
 
-       if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) {
+       if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false)) {
                ret = VM_FAULT_OOM;
                goto out_page;
        }
@@ -2567,7 +2566,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
                pte = maybe_mkwrite(pte_mkdirty(pte), vma);
                flags &= ~FAULT_FLAG_WRITE;
                ret |= VM_FAULT_WRITE;
-               exclusive = 1;
+               exclusive = RMAP_EXCLUSIVE;
        }
        flush_icache_page(vma, page);
        if (pte_swp_soft_dirty(orig_pte))
@@ -2575,10 +2574,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
        set_pte_at(mm, address, page_table, pte);
        if (page == swapcache) {
                do_page_add_anon_rmap(page, vma, address, exclusive);
-               mem_cgroup_commit_charge(page, memcg, true);
+               mem_cgroup_commit_charge(page, memcg, true, false);
        } else { /* ksm created a completely new copy */
-               page_add_new_anon_rmap(page, vma, address);
-               mem_cgroup_commit_charge(page, memcg, false);
+               page_add_new_anon_rmap(page, vma, address, false);
+               mem_cgroup_commit_charge(page, memcg, false, false);
                lru_cache_add_active_or_unevictable(page, vma);
        }
 
@@ -2613,7 +2612,7 @@ unlock:
 out:
        return ret;
 out_nomap:
-       mem_cgroup_cancel_charge(page, memcg);
+       mem_cgroup_cancel_charge(page, memcg, false);
        pte_unmap_unlock(page_table, ptl);
 out_page:
        unlock_page(page);
@@ -2707,7 +2706,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
        if (!page)
                goto oom;
 
-       if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg))
+       if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false))
                goto oom_free_page;
 
        /*
@@ -2728,15 +2727,15 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
        /* Deliver the page fault to userland, check inside PT lock */
        if (userfaultfd_missing(vma)) {
                pte_unmap_unlock(page_table, ptl);
-               mem_cgroup_cancel_charge(page, memcg);
+               mem_cgroup_cancel_charge(page, memcg, false);
                page_cache_release(page);
                return handle_userfault(vma, address, flags,
                                        VM_UFFD_MISSING);
        }
 
        inc_mm_counter_fast(mm, MM_ANONPAGES);
-       page_add_new_anon_rmap(page, vma, address);
-       mem_cgroup_commit_charge(page, memcg, false);
+       page_add_new_anon_rmap(page, vma, address, false);
+       mem_cgroup_commit_charge(page, memcg, false, false);
        lru_cache_add_active_or_unevictable(page, vma);
 setpte:
        set_pte_at(mm, address, page_table, entry);
@@ -2747,7 +2746,7 @@ unlock:
        pte_unmap_unlock(page_table, ptl);
        return 0;
 release:
-       mem_cgroup_cancel_charge(page, memcg);
+       mem_cgroup_cancel_charge(page, memcg, false);
        page_cache_release(page);
        goto unlock;
 oom_free_page:
@@ -2824,7 +2823,7 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address,
                entry = maybe_mkwrite(pte_mkdirty(entry), vma);
        if (anon) {
                inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
-               page_add_new_anon_rmap(page, vma, address);
+               page_add_new_anon_rmap(page, vma, address, false);
        } else {
                inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
                page_add_file_rmap(page);
@@ -3000,7 +2999,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        if (!new_page)
                return VM_FAULT_OOM;
 
-       if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) {
+       if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) {
                page_cache_release(new_page);
                return VM_FAULT_OOM;
        }
@@ -3029,7 +3028,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                goto uncharge_out;
        }
        do_set_pte(vma, address, new_page, pte, true, true);
-       mem_cgroup_commit_charge(new_page, memcg, false);
+       mem_cgroup_commit_charge(new_page, memcg, false, false);
        lru_cache_add_active_or_unevictable(new_page, vma);
        pte_unmap_unlock(pte, ptl);
        if (fault_page) {
@@ -3044,7 +3043,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        }
        return ret;
 uncharge_out:
-       mem_cgroup_cancel_charge(new_page, memcg);
+       mem_cgroup_cancel_charge(new_page, memcg, false);
        page_cache_release(new_page);
        return ret;
 }
@@ -3096,7 +3095,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
         * pinned by vma->vm_file's reference.  We rely on unlock_page()'s
         * release semantics to prevent the compiler from undoing this copying.
         */
-       mapping = fault_page->mapping;
+       mapping = page_rmapping(fault_page);
        unlock_page(fault_page);
        if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) {
                /*
@@ -3198,6 +3197,12 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
                return 0;
        }
 
+       /* TODO: handle PTE-mapped THP */
+       if (PageCompound(page)) {
+               pte_unmap_unlock(ptep, ptl);
+               return 0;
+       }
+
        /*
         * Avoid grouping on RO pages in general. RO pages shouldn't hurt as
         * much anyway since they can be in shared cache state. This misses
@@ -3370,17 +3375,9 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                int ret;
 
                barrier();
-               if (pmd_trans_huge(orig_pmd)) {
+               if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
                        unsigned int dirty = flags & FAULT_FLAG_WRITE;
 
-                       /*
-                        * If the pmd is splitting, return and retry the
-                        * the fault.  Alternative: wait until the split
-                        * is done, and goto retry.
-                        */
-                       if (pmd_trans_splitting(orig_pmd))
-                               return 0;
-
                        if (pmd_protnone(orig_pmd))
                                return do_huge_pmd_numa_page(mm, vma, address,
                                                             orig_pmd, pmd);
@@ -3407,7 +3404,7 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
            unlikely(__pte_alloc(mm, vma, pmd, address)))
                return VM_FAULT_OOM;
        /* if an huge pmd materialized from under us just retry later */
-       if (unlikely(pmd_trans_huge(*pmd)))
+       if (unlikely(pmd_trans_huge(*pmd) || pmd_devmap(*pmd)))
                return 0;
        /*
         * A regular pmd is established and it can't morph into a huge pmd
index 92f95952692b0f9aa20bec8afa3cd1725e573452..4af58a3a8ffa345c16fe190be76ba9f9e83113d4 100644 (file)
@@ -17,6 +17,7 @@
 #include <linux/sysctl.h>
 #include <linux/cpu.h>
 #include <linux/memory.h>
+#include <linux/memremap.h>
 #include <linux/memory_hotplug.h>
 #include <linux/highmem.h>
 #include <linux/vmalloc.h>
@@ -506,10 +507,25 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
        unsigned long i;
        int err = 0;
        int start_sec, end_sec;
+       struct vmem_altmap *altmap;
+
        /* during initialize mem_map, align hot-added range to section */
        start_sec = pfn_to_section_nr(phys_start_pfn);
        end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
 
+       altmap = to_vmem_altmap((unsigned long) pfn_to_page(phys_start_pfn));
+       if (altmap) {
+               /*
+                * Validate altmap is within bounds of the total request
+                */
+               if (altmap->base_pfn != phys_start_pfn
+                               || vmem_altmap_offset(altmap) > nr_pages) {
+                       pr_warn_once("memory add fail, invalid altmap\n");
+                       return -EINVAL;
+               }
+               altmap->alloc = 0;
+       }
+
        for (i = start_sec; i <= end_sec; i++) {
                err = __add_section(nid, zone, section_nr_to_pfn(i));
 
@@ -731,7 +747,8 @@ static void __remove_zone(struct zone *zone, unsigned long start_pfn)
        pgdat_resize_unlock(zone->zone_pgdat, &flags);
 }
 
-static int __remove_section(struct zone *zone, struct mem_section *ms)
+static int __remove_section(struct zone *zone, struct mem_section *ms,
+               unsigned long map_offset)
 {
        unsigned long start_pfn;
        int scn_nr;
@@ -748,7 +765,7 @@ static int __remove_section(struct zone *zone, struct mem_section *ms)
        start_pfn = section_nr_to_pfn(scn_nr);
        __remove_zone(zone, start_pfn);
 
-       sparse_remove_one_section(zone, ms);
+       sparse_remove_one_section(zone, ms, map_offset);
        return 0;
 }
 
@@ -767,9 +784,32 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
                 unsigned long nr_pages)
 {
        unsigned long i;
-       int sections_to_remove;
-       resource_size_t start, size;
-       int ret = 0;
+       unsigned long map_offset = 0;
+       int sections_to_remove, ret = 0;
+
+       /* In the ZONE_DEVICE case device driver owns the memory region */
+       if (is_dev_zone(zone)) {
+               struct page *page = pfn_to_page(phys_start_pfn);
+               struct vmem_altmap *altmap;
+
+               altmap = to_vmem_altmap((unsigned long) page);
+               if (altmap)
+                       map_offset = vmem_altmap_offset(altmap);
+       } else {
+               resource_size_t start, size;
+
+               start = phys_start_pfn << PAGE_SHIFT;
+               size = nr_pages * PAGE_SIZE;
+
+               ret = release_mem_region_adjustable(&iomem_resource, start,
+                                       size);
+               if (ret) {
+                       resource_size_t endres = start + size - 1;
+
+                       pr_warn("Unable to release resource <%pa-%pa> (%d)\n",
+                                       &start, &endres, ret);
+               }
+       }
 
        /*
         * We can only remove entire sections
@@ -777,23 +817,12 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
        BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
        BUG_ON(nr_pages % PAGES_PER_SECTION);
 
-       start = phys_start_pfn << PAGE_SHIFT;
-       size = nr_pages * PAGE_SIZE;
-
-       /* in the ZONE_DEVICE case device driver owns the memory region */
-       if (!is_dev_zone(zone))
-               ret = release_mem_region_adjustable(&iomem_resource, start, size);
-       if (ret) {
-               resource_size_t endres = start + size - 1;
-
-               pr_warn("Unable to release resource <%pa-%pa> (%d)\n",
-                               &start, &endres, ret);
-       }
-
        sections_to_remove = nr_pages / PAGES_PER_SECTION;
        for (i = 0; i < sections_to_remove; i++) {
                unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
-               ret = __remove_section(zone, __pfn_to_section(pfn));
+
+               ret = __remove_section(zone, __pfn_to_section(pfn), map_offset);
+               map_offset = 0;
                if (ret)
                        break;
        }
index d8caff071a3053642eece912609a3877cb503be5..27d135408a22057a5b166e7eb2bf2e080bbd33f2 100644 (file)
@@ -489,14 +489,33 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
        struct page *page;
        struct queue_pages *qp = walk->private;
        unsigned long flags = qp->flags;
-       int nid;
+       int nid, ret;
        pte_t *pte;
        spinlock_t *ptl;
 
-       split_huge_page_pmd(vma, addr, pmd);
-       if (pmd_trans_unstable(pmd))
-               return 0;
+       if (pmd_trans_huge(*pmd)) {
+               ptl = pmd_lock(walk->mm, pmd);
+               if (pmd_trans_huge(*pmd)) {
+                       page = pmd_page(*pmd);
+                       if (is_huge_zero_page(page)) {
+                               spin_unlock(ptl);
+                               split_huge_pmd(vma, pmd, addr);
+                       } else {
+                               get_page(page);
+                               spin_unlock(ptl);
+                               lock_page(page);
+                               ret = split_huge_page(page);
+                               unlock_page(page);
+                               put_page(page);
+                               if (ret)
+                                       return 0;
+                       }
+               } else {
+                       spin_unlock(ptl);
+               }
+       }
 
+retry:
        pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
        for (; addr != end; pte++, addr += PAGE_SIZE) {
                if (!pte_present(*pte))
@@ -513,6 +532,21 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
                nid = page_to_nid(page);
                if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
                        continue;
+               if (PageTail(page) && PageAnon(page)) {
+                       get_page(page);
+                       pte_unmap_unlock(pte, ptl);
+                       lock_page(page);
+                       ret = split_huge_page(page);
+                       unlock_page(page);
+                       put_page(page);
+                       /* Failed to split -- skip. */
+                       if (ret) {
+                               pte = pte_offset_map_lock(walk->mm, pmd,
+                                               addr, &ptl);
+                               continue;
+                       }
+                       goto retry;
+               }
 
                if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
                        migrate_page_add(page, qp->pagelist, flags);
@@ -610,7 +644,8 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
 
        if (flags & MPOL_MF_LAZY) {
                /* Similar to task_numa_work, skip inaccessible VMAs */
-               if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
+               if (vma_migratable(vma) &&
+                       vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
                        change_prot_numa(vma, start, endvma);
                return 1;
        }
index 7890d0bb5e23c3db75cc682878a2c5ec89b0e513..b1034f9c77e7d5a9bdbe60692396e5584c6991fc 100644 (file)
@@ -165,9 +165,9 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
                if (PageAnon(new))
                        hugepage_add_anon_rmap(new, vma, addr);
                else
-                       page_dup_rmap(new);
+                       page_dup_rmap(new, true);
        } else if (PageAnon(new))
-               page_add_anon_rmap(new, vma, addr);
+               page_add_anon_rmap(new, vma, addr, false);
        else
                page_add_file_rmap(new);
 
@@ -943,9 +943,13 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page,
                goto out;
        }
 
-       if (unlikely(PageTransHuge(page)))
-               if (unlikely(split_huge_page(page)))
+       if (unlikely(PageTransHuge(page))) {
+               lock_page(page);
+               rc = split_huge_page(page);
+               unlock_page(page);
+               if (rc)
                        goto out;
+       }
 
        rc = __unmap_and_move(page, newpage, force, mode);
        if (rc == MIGRATEPAGE_SUCCESS)
@@ -1756,6 +1760,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
                HPAGE_PMD_ORDER);
        if (!new_page)
                goto out_fail;
+       prep_transhuge_page(new_page);
 
        isolated = numamigrate_isolate_page(pgdat, page);
        if (!isolated) {
@@ -1767,7 +1772,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
                flush_tlb_range(vma, mmun_start, mmun_end);
 
        /* Prepare a page as a migration target */
-       __set_page_locked(new_page);
+       __SetPageLocked(new_page);
        SetPageSwapBacked(new_page);
 
        /* anon mapping, we can simply copy page->mapping to the new page: */
@@ -1815,7 +1820,7 @@ fail_putback:
         * guarantee the copy is visible before the pagetable update.
         */
        flush_cache_range(vma, mmun_start, mmun_end);
-       page_add_anon_rmap(new_page, vma, mmun_start);
+       page_add_anon_rmap(new_page, vma, mmun_start, true);
        pmdp_huge_clear_flush_notify(vma, mmun_start, pmd);
        set_pmd_at(mm, mmun_start, pmd, entry);
        flush_tlb_range(vma, mmun_start, mmun_end);
@@ -1826,14 +1831,14 @@ fail_putback:
                flush_tlb_range(vma, mmun_start, mmun_end);
                mmu_notifier_invalidate_range(mm, mmun_start, mmun_end);
                update_mmu_cache_pmd(vma, address, &entry);
-               page_remove_rmap(new_page);
+               page_remove_rmap(new_page, true);
                goto fail_putback;
        }
 
        mlock_migrate_page(new_page, page);
        set_page_memcg(new_page, page_memcg(page));
        set_page_memcg(page, NULL);
-       page_remove_rmap(page);
+       page_remove_rmap(page, true);
 
        spin_unlock(ptl);
        mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
index 14bb9fb37f0ca4d91989bb9e37c269fbd6ec7d98..2a565ed8bb4907398a0d3d2619fd4939df777970 100644 (file)
@@ -117,7 +117,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
        unsigned char *vec = walk->private;
        int nr = (end - addr) >> PAGE_SHIFT;
 
-       if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+       if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
                memset(vec, 1, nr);
                spin_unlock(ptl);
                goto out;
index 9cb87cbc40715c72261189822aa43b3b30ecd284..e1e2b1207bf2ee00604468d027b3879381e1b089 100644 (file)
 
 #include "internal.h"
 
-int can_do_mlock(void)
+bool can_do_mlock(void)
 {
        if (rlimit(RLIMIT_MEMLOCK) != 0)
-               return 1;
+               return true;
        if (capable(CAP_IPC_LOCK))
-               return 1;
-       return 0;
+               return true;
+       return false;
 }
 EXPORT_SYMBOL(can_do_mlock);
 
@@ -82,6 +82,9 @@ void mlock_vma_page(struct page *page)
        /* Serialize with page migration */
        BUG_ON(!PageLocked(page));
 
+       VM_BUG_ON_PAGE(PageTail(page), page);
+       VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page);
+
        if (!TestSetPageMlocked(page)) {
                mod_zone_page_state(page_zone(page), NR_MLOCK,
                                    hpage_nr_pages(page));
@@ -178,6 +181,8 @@ unsigned int munlock_vma_page(struct page *page)
        /* For try_to_munlock() and to serialize with page migration */
        BUG_ON(!PageLocked(page));
 
+       VM_BUG_ON_PAGE(PageTail(page), page);
+
        /*
         * Serialize with any parallel __split_huge_page_refcount() which
         * might otherwise copy PageMlocked to part of the tail pages before
@@ -388,6 +393,13 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
                if (!page || page_zone_id(page) != zoneid)
                        break;
 
+               /*
+                * Do not use pagevec for PTE-mapped THP,
+                * munlock_vma_pages_range() will handle them.
+                */
+               if (PageTransCompound(page))
+                       break;
+
                get_page(page);
                /*
                 * Increase the address that will be returned *before* the
@@ -444,7 +456,10 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
                                &page_mask);
 
                if (page && !IS_ERR(page)) {
-                       if (PageTransHuge(page)) {
+                       if (PageTransTail(page)) {
+                               VM_BUG_ON_PAGE(PageMlocked(page), page);
+                               put_page(page); /* follow_page_mask() */
+                       } else if (PageTransHuge(page)) {
                                lock_page(page);
                                /*
                                 * Any THP page found by follow_page_mask() may
@@ -477,8 +492,6 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
                                goto next;
                        }
                }
-               /* It's a bug to munlock in the middle of a THP page */
-               VM_BUG_ON((start >> PAGE_SHIFT) & page_mask);
                page_increm = 1 + page_mask;
                start += page_increm * PAGE_SIZE;
 next:
index b3f00b616b810e4362effddd437b320acdcedb7e..84b12624ceb01d83762172634179825b086961fd 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3184,10 +3184,16 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
  * mapping->flags avoid to take the same lock twice, if more than one
  * vma in this mm is backed by the same anon_vma or address_space.
  *
- * We can take all the locks in random order because the VM code
- * taking i_mmap_rwsem or anon_vma->rwsem outside the mmap_sem never
- * takes more than one of them in a row. Secondly we're protected
- * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex.
+ * We take locks in following order, accordingly to comment at beginning
+ * of mm/rmap.c:
+ *   - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for
+ *     hugetlb mapping);
+ *   - all i_mmap_rwsem locks;
+ *   - all anon_vma->rwsem locks.
+ *
+ * We can take all locks within these classes in any order because the VM
+ * code doesn't nest them and we are protected from parallel
+ * mm_take_all_locks() by mm_all_locks_mutex.
  *
  * mm_take_all_locks() and mm_drop_all_locks are expensive operations
  * that may have to take thousand of locks.
@@ -3206,7 +3212,16 @@ int mm_take_all_locks(struct mm_struct *mm)
        for (vma = mm->mmap; vma; vma = vma->vm_next) {
                if (signal_pending(current))
                        goto out_unlock;
-               if (vma->vm_file && vma->vm_file->f_mapping)
+               if (vma->vm_file && vma->vm_file->f_mapping &&
+                               is_vm_hugetlb_page(vma))
+                       vm_lock_mapping(mm, vma->vm_file->f_mapping);
+       }
+
+       for (vma = mm->mmap; vma; vma = vma->vm_next) {
+               if (signal_pending(current))
+                       goto out_unlock;
+               if (vma->vm_file && vma->vm_file->f_mapping &&
+                               !is_vm_hugetlb_page(vma))
                        vm_lock_mapping(mm, vma->vm_file->f_mapping);
        }
 
index c764402c464f10471d4b267fa26911f46e87e149..8eb7bb40dc40b6e8e89d05fbb7e05f8c836e05da 100644 (file)
@@ -149,7 +149,8 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
                unsigned long this_pages;
 
                next = pmd_addr_end(addr, end);
-               if (!pmd_trans_huge(*pmd) && pmd_none_or_clear_bad(pmd))
+               if (!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd)
+                               && pmd_none_or_clear_bad(pmd))
                        continue;
 
                /* invoke the mmu notifier if the pmd is populated */
@@ -158,9 +159,9 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
                        mmu_notifier_invalidate_range_start(mm, mni_start, end);
                }
 
-               if (pmd_trans_huge(*pmd)) {
+               if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
                        if (next - addr != HPAGE_PMD_SIZE)
-                               split_huge_page_pmd(vma, addr, pmd);
+                               split_huge_pmd(vma, pmd, addr);
                        else {
                                int nr_ptes = change_huge_pmd(vma, pmd, addr,
                                                newprot, prot_numa);
index e55b157865d5cfc437c4eacd0496dbad69d5d0ab..d77946a997f798ecc25ccaee2886139ff9d9a587 100644 (file)
@@ -192,25 +192,24 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
                if (!new_pmd)
                        break;
                if (pmd_trans_huge(*old_pmd)) {
-                       int err = 0;
                        if (extent == HPAGE_PMD_SIZE) {
+                               bool moved;
                                VM_BUG_ON_VMA(vma->vm_file || !vma->anon_vma,
                                              vma);
                                /* See comment in move_ptes() */
                                if (need_rmap_locks)
                                        anon_vma_lock_write(vma->anon_vma);
-                               err = move_huge_pmd(vma, new_vma, old_addr,
+                               moved = move_huge_pmd(vma, new_vma, old_addr,
                                                    new_addr, old_end,
                                                    old_pmd, new_pmd);
                                if (need_rmap_locks)
                                        anon_vma_unlock_write(vma->anon_vma);
+                               if (moved) {
+                                       need_flush = true;
+                                       continue;
+                               }
                        }
-                       if (err > 0) {
-                               need_flush = true;
-                               continue;
-                       } else if (!err) {
-                               split_huge_page_pmd(vma, old_addr, old_pmd);
-                       }
+                       split_huge_pmd(vma, old_pmd, old_addr);
                        VM_BUG_ON(pmd_trans_huge(*old_pmd));
                }
                if (pmd_none(*new_pmd) && __pte_alloc(new_vma->vm_mm, new_vma,
index ce63d603820f1963fac120ce76e20583ca695f6e..63358d9f9aa98eff0848879b0503b5d80b9ea8d0 100644 (file)
@@ -43,6 +43,7 @@
 #include <linux/vmalloc.h>
 #include <linux/vmstat.h>
 #include <linux/mempolicy.h>
+#include <linux/memremap.h>
 #include <linux/stop_machine.h>
 #include <linux/sort.h>
 #include <linux/pfn.h>
@@ -222,13 +223,15 @@ static char * const zone_names[MAX_NR_ZONES] = {
 #endif
 };
 
-static void free_compound_page(struct page *page);
 compound_page_dtor * const compound_page_dtors[] = {
        NULL,
        free_compound_page,
 #ifdef CONFIG_HUGETLB_PAGE
        free_huge_page,
 #endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       free_transhuge_page,
+#endif
 };
 
 int min_free_kbytes = 1024;
@@ -450,7 +453,7 @@ out:
  * This usage means that zero-order pages may not be compound.
  */
 
-static void free_compound_page(struct page *page)
+void free_compound_page(struct page *page)
 {
        __free_pages_ok(page, compound_order(page));
 }
@@ -466,8 +469,10 @@ void prep_compound_page(struct page *page, unsigned int order)
        for (i = 1; i < nr_pages; i++) {
                struct page *p = page + i;
                set_page_count(p, 0);
+               p->mapping = TAIL_MAPPING;
                set_compound_head(p, page);
        }
+       atomic_set(compound_mapcount_ptr(page), -1);
 }
 
 #ifdef CONFIG_DEBUG_PAGEALLOC
@@ -732,7 +737,7 @@ static inline int free_pages_check(struct page *page)
        const char *bad_reason = NULL;
        unsigned long bad_flags = 0;
 
-       if (unlikely(page_mapcount(page)))
+       if (unlikely(atomic_read(&page->_mapcount) != -1))
                bad_reason = "nonzero mapcount";
        if (unlikely(page->mapping != NULL))
                bad_reason = "non-NULL mapping";
@@ -856,6 +861,27 @@ static int free_tail_pages_check(struct page *head_page, struct page *page)
                ret = 0;
                goto out;
        }
+       switch (page - head_page) {
+       case 1:
+               /* the first tail page: ->mapping is compound_mapcount() */
+               if (unlikely(compound_mapcount(page))) {
+                       bad_page(page, "nonzero compound_mapcount", 0);
+                       goto out;
+               }
+               break;
+       case 2:
+               /*
+                * the second tail page: ->mapping is
+                * page_deferred_list().next -- ignore value.
+                */
+               break;
+       default:
+               if (page->mapping != TAIL_MAPPING) {
+                       bad_page(page, "corrupted mapping in tail page", 0);
+                       goto out;
+               }
+               break;
+       }
        if (unlikely(!PageTail(page))) {
                bad_page(page, "PageTail not set", 0);
                goto out;
@@ -866,6 +892,7 @@ static int free_tail_pages_check(struct page *head_page, struct page *page)
        }
        ret = 0;
 out:
+       page->mapping = NULL;
        clear_compound_head(page);
        return ret;
 }
@@ -1329,7 +1356,7 @@ static inline int check_new_page(struct page *page)
        const char *bad_reason = NULL;
        unsigned long bad_flags = 0;
 
-       if (unlikely(page_mapcount(page)))
+       if (unlikely(atomic_read(&page->_mapcount) != -1))
                bad_reason = "nonzero mapcount";
        if (unlikely(page->mapping != NULL))
                bad_reason = "non-NULL mapping";
@@ -4459,16 +4486,22 @@ static inline unsigned long wait_table_bits(unsigned long size)
 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
                unsigned long start_pfn, enum memmap_context context)
 {
-       pg_data_t *pgdat = NODE_DATA(nid);
+       struct vmem_altmap *altmap = to_vmem_altmap(__pfn_to_phys(start_pfn));
        unsigned long end_pfn = start_pfn + size;
+       pg_data_t *pgdat = NODE_DATA(nid);
        unsigned long pfn;
-       struct zone *z;
        unsigned long nr_initialised = 0;
 
        if (highest_memmap_pfn < end_pfn - 1)
                highest_memmap_pfn = end_pfn - 1;
 
-       z = &pgdat->node_zones[zone];
+       /*
+        * Honor reservation requested by the driver for this ZONE_DEVICE
+        * memory
+        */
+       if (altmap && start_pfn == altmap->base_pfn)
+               start_pfn += altmap->reserve;
+
        for (pfn = start_pfn; pfn < end_pfn; pfn++) {
                /*
                 * There can be holes in boot-time mem_map[]s
index d5dd79041484588cdfb409e7610d9ea23ee3c370..4ea9c4ef5146b8b784848a6710b70e5fc0dfcd41 100644 (file)
@@ -55,25 +55,26 @@ static int page_idle_clear_pte_refs_one(struct page *page,
                                        unsigned long addr, void *arg)
 {
        struct mm_struct *mm = vma->vm_mm;
-       spinlock_t *ptl;
        pmd_t *pmd;
        pte_t *pte;
+       spinlock_t *ptl;
        bool referenced = false;
 
-       if (unlikely(PageTransHuge(page))) {
-               pmd = page_check_address_pmd(page, mm, addr,
-                                            PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl);
-               if (pmd) {
-                       referenced = pmdp_clear_young_notify(vma, addr, pmd);
-                       spin_unlock(ptl);
-               }
+       if (!page_check_address_transhuge(page, mm, addr, &pmd, &pte, &ptl))
+               return SWAP_AGAIN;
+
+       if (pte) {
+               referenced = ptep_clear_young_notify(vma, addr, pte);
+               pte_unmap(pte);
+       } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
+               referenced = pmdp_clear_young_notify(vma, addr, pmd);
        } else {
-               pte = page_check_address(page, mm, addr, &ptl, 0);
-               if (pte) {
-                       referenced = ptep_clear_young_notify(vma, addr, pte);
-                       pte_unmap_unlock(pte, ptl);
-               }
+               /* unexpected pmd-mapped page? */
+               WARN_ON_ONCE(1);
        }
+
+       spin_unlock(ptl);
+
        if (referenced) {
                clear_page_idle(page);
                /*
index 5e139fec6c6cc95a68c01d0841a5d41b05a20120..92c4c36501e7c55dc50b9ae330299c978c329d0a 100644 (file)
@@ -196,8 +196,10 @@ int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
 {
        unsigned long pfn;
        struct page *page;
-       BUG_ON((start_pfn) & (pageblock_nr_pages - 1));
-       BUG_ON((end_pfn) & (pageblock_nr_pages - 1));
+
+       BUG_ON(!IS_ALIGNED(start_pfn, pageblock_nr_pages));
+       BUG_ON(!IS_ALIGNED(end_pfn, pageblock_nr_pages));
+
        for (pfn = start_pfn;
             pfn < end_pfn;
             pfn += pageblock_nr_pages) {
index 29f2f8b853ae51be4f9e35fbc1495ad69297ff82..207244489a681d10f16c318c0a6ff0423b5fe30b 100644 (file)
@@ -58,7 +58,7 @@ again:
                if (!walk->pte_entry)
                        continue;
 
-               split_huge_page_pmd_mm(walk->mm, addr, pmd);
+               split_huge_pmd(walk->vma, pmd, addr);
                if (pmd_trans_unstable(pmd))
                        goto again;
                err = walk_pte_range(pmd, addr, next, walk);
index 4c681baff3632aafc7ed16bb16543cc1c7b7f596..9d4767698a1cd6988d4f71b37ef3f384eff5b3b1 100644 (file)
@@ -132,25 +132,13 @@ pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
 {
        pmd_t pmd;
        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
-       VM_BUG_ON(!pmd_trans_huge(*pmdp));
+       VM_BUG_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
        pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
        flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
        return pmd;
 }
 #endif
 
-#ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH
-void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
-                         pmd_t *pmdp)
-{
-       pmd_t pmd = pmd_mksplitting(*pmdp);
-       VM_BUG_ON(address & ~HPAGE_PMD_MASK);
-       set_pmd_at(vma->vm_mm, address, pmdp, pmd);
-       /* tlb flush only to serialize against gup-fast */
-       flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
-}
-#endif
-
 #ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
 void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
                                pgtable_t pgtable)
index 622756c16ac84ccc905355d3d6b59a9b9dca35e4..79f3bf047f38497bdcdae741c2941370353a3ea7 100644 (file)
--- a/mm/rmap.c
+++ b/mm/rmap.c
  * inode->i_mutex      (while writing or truncating, not reading or faulting)
  *   mm->mmap_sem
  *     page->flags PG_locked (lock_page)
- *       mapping->i_mmap_rwsem
- *         anon_vma->rwsem
- *           mm->page_table_lock or pte_lock
- *             zone->lru_lock (in mark_page_accessed, isolate_lru_page)
- *             swap_lock (in swap_duplicate, swap_info_get)
- *               mmlist_lock (in mmput, drain_mmlist and others)
- *               mapping->private_lock (in __set_page_dirty_buffers)
- *                 mem_cgroup_{begin,end}_page_stat (memcg->move_lock)
- *                   mapping->tree_lock (widely used)
- *               inode->i_lock (in set_page_dirty's __mark_inode_dirty)
- *               bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
- *                 sb_lock (within inode_lock in fs/fs-writeback.c)
- *                 mapping->tree_lock (widely used, in set_page_dirty,
- *                           in arch-dependent flush_dcache_mmap_lock,
- *                           within bdi.wb->list_lock in __sync_single_inode)
+ *       hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
+ *         mapping->i_mmap_rwsem
+ *           anon_vma->rwsem
+ *             mm->page_table_lock or pte_lock
+ *               zone->lru_lock (in mark_page_accessed, isolate_lru_page)
+ *               swap_lock (in swap_duplicate, swap_info_get)
+ *                 mmlist_lock (in mmput, drain_mmlist and others)
+ *                 mapping->private_lock (in __set_page_dirty_buffers)
+ *                   mem_cgroup_{begin,end}_page_stat (memcg->move_lock)
+ *                     mapping->tree_lock (widely used)
+ *                 inode->i_lock (in set_page_dirty's __mark_inode_dirty)
+ *                 bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
+ *                   sb_lock (within inode_lock in fs/fs-writeback.c)
+ *                   mapping->tree_lock (widely used, in set_page_dirty,
+ *                             in arch-dependent flush_dcache_mmap_lock,
+ *                             within bdi.wb->list_lock in __sync_single_inode)
  *
  * anon_vma->rwsem,mapping->i_mutex      (memory_failure, collect_procs_anon)
  *   ->tasklist_lock
@@ -567,27 +568,6 @@ void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
        anon_vma_unlock_read(anon_vma);
 }
 
-/*
- * At what user virtual address is page expected in @vma?
- */
-static inline unsigned long
-__vma_address(struct page *page, struct vm_area_struct *vma)
-{
-       pgoff_t pgoff = page_to_pgoff(page);
-       return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
-}
-
-inline unsigned long
-vma_address(struct page *page, struct vm_area_struct *vma)
-{
-       unsigned long address = __vma_address(page, vma);
-
-       /* page should be within @vma mapping range */
-       VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
-
-       return address;
-}
-
 #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
 static void percpu_flush_tlb_batch_pages(void *data)
 {
@@ -819,6 +799,96 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
        return 1;
 }
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/*
+ * Check that @page is mapped at @address into @mm. In contrast to
+ * page_check_address(), this function can handle transparent huge pages.
+ *
+ * On success returns true with pte mapped and locked. For PMD-mapped
+ * transparent huge pages *@ptep is set to NULL.
+ */
+bool page_check_address_transhuge(struct page *page, struct mm_struct *mm,
+                                 unsigned long address, pmd_t **pmdp,
+                                 pte_t **ptep, spinlock_t **ptlp)
+{
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+       spinlock_t *ptl;
+
+       if (unlikely(PageHuge(page))) {
+               /* when pud is not present, pte will be NULL */
+               pte = huge_pte_offset(mm, address);
+               if (!pte)
+                       return false;
+
+               ptl = huge_pte_lockptr(page_hstate(page), mm, pte);
+               pmd = NULL;
+               goto check_pte;
+       }
+
+       pgd = pgd_offset(mm, address);
+       if (!pgd_present(*pgd))
+               return false;
+       pud = pud_offset(pgd, address);
+       if (!pud_present(*pud))
+               return false;
+       pmd = pmd_offset(pud, address);
+
+       if (pmd_trans_huge(*pmd)) {
+               ptl = pmd_lock(mm, pmd);
+               if (!pmd_present(*pmd))
+                       goto unlock_pmd;
+               if (unlikely(!pmd_trans_huge(*pmd))) {
+                       spin_unlock(ptl);
+                       goto map_pte;
+               }
+
+               if (pmd_page(*pmd) != page)
+                       goto unlock_pmd;
+
+               pte = NULL;
+               goto found;
+unlock_pmd:
+               spin_unlock(ptl);
+               return false;
+       } else {
+               pmd_t pmde = *pmd;
+
+               barrier();
+               if (!pmd_present(pmde) || pmd_trans_huge(pmde))
+                       return false;
+       }
+map_pte:
+       pte = pte_offset_map(pmd, address);
+       if (!pte_present(*pte)) {
+               pte_unmap(pte);
+               return false;
+       }
+
+       ptl = pte_lockptr(mm, pmd);
+check_pte:
+       spin_lock(ptl);
+
+       if (!pte_present(*pte)) {
+               pte_unmap_unlock(pte, ptl);
+               return false;
+       }
+
+       /* THP can be referenced by any subpage */
+       if (pte_pfn(*pte) - page_to_pfn(page) >= hpage_nr_pages(page)) {
+               pte_unmap_unlock(pte, ptl);
+               return false;
+       }
+found:
+       *ptep = pte;
+       *pmdp = pmd;
+       *ptlp = ptl;
+       return true;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
 struct page_referenced_arg {
        int mapcount;
        int referenced;
@@ -832,49 +902,24 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
                        unsigned long address, void *arg)
 {
        struct mm_struct *mm = vma->vm_mm;
+       struct page_referenced_arg *pra = arg;
+       pmd_t *pmd;
+       pte_t *pte;
        spinlock_t *ptl;
        int referenced = 0;
-       struct page_referenced_arg *pra = arg;
-
-       if (unlikely(PageTransHuge(page))) {
-               pmd_t *pmd;
 
-               /*
-                * rmap might return false positives; we must filter
-                * these out using page_check_address_pmd().
-                */
-               pmd = page_check_address_pmd(page, mm, address,
-                                            PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl);
-               if (!pmd)
-                       return SWAP_AGAIN;
-
-               if (vma->vm_flags & VM_LOCKED) {
-                       spin_unlock(ptl);
-                       pra->vm_flags |= VM_LOCKED;
-                       return SWAP_FAIL; /* To break the loop */
-               }
+       if (!page_check_address_transhuge(page, mm, address, &pmd, &pte, &ptl))
+               return SWAP_AGAIN;
 
-               /* go ahead even if the pmd is pmd_trans_splitting() */
-               if (pmdp_clear_flush_young_notify(vma, address, pmd))
-                       referenced++;
+       if (vma->vm_flags & VM_LOCKED) {
+               if (pte)
+                       pte_unmap(pte);
                spin_unlock(ptl);
-       } else {
-               pte_t *pte;
-
-               /*
-                * rmap might return false positives; we must filter
-                * these out using page_check_address().
-                */
-               pte = page_check_address(page, mm, address, &ptl, 0);
-               if (!pte)
-                       return SWAP_AGAIN;
-
-               if (vma->vm_flags & VM_LOCKED) {
-                       pte_unmap_unlock(pte, ptl);
-                       pra->vm_flags |= VM_LOCKED;
-                       return SWAP_FAIL; /* To break the loop */
-               }
+               pra->vm_flags |= VM_LOCKED;
+               return SWAP_FAIL; /* To break the loop */
+       }
 
+       if (pte) {
                if (ptep_clear_flush_young_notify(vma, address, pte)) {
                        /*
                         * Don't treat a reference through a sequentially read
@@ -886,8 +931,15 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
                        if (likely(!(vma->vm_flags & VM_SEQ_READ)))
                                referenced++;
                }
-               pte_unmap_unlock(pte, ptl);
+               pte_unmap(pte);
+       } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
+               if (pmdp_clear_flush_young_notify(vma, address, pmd))
+                       referenced++;
+       } else {
+               /* unexpected pmd-mapped page? */
+               WARN_ON_ONCE(1);
        }
+       spin_unlock(ptl);
 
        if (referenced)
                clear_page_idle(page);
@@ -935,7 +987,7 @@ int page_referenced(struct page *page,
        int ret;
        int we_locked = 0;
        struct page_referenced_arg pra = {
-               .mapcount = page_mapcount(page),
+               .mapcount = total_mapcount(page),
                .memcg = memcg,
        };
        struct rmap_walk_control rwc = {
@@ -1124,7 +1176,7 @@ static void __page_check_anon_rmap(struct page *page,
         * over the call to page_add_new_anon_rmap.
         */
        BUG_ON(page_anon_vma(page)->root != vma->anon_vma->root);
-       BUG_ON(page->index != linear_page_index(vma, address));
+       BUG_ON(page_to_pgoff(page) != linear_page_index(vma, address));
 #endif
 }
 
@@ -1133,6 +1185,7 @@ static void __page_check_anon_rmap(struct page *page,
  * @page:      the page to add the mapping to
  * @vma:       the vm area in which the mapping is added
  * @address:   the user virtual address mapped
+ * @compound:  charge the page as compound or small page
  *
  * The caller needs to hold the pte lock, and the page must be locked in
  * the anon_vma case: to serialize mapping,index checking after setting,
@@ -1140,9 +1193,9 @@ static void __page_check_anon_rmap(struct page *page,
  * (but PageKsm is never downgraded to PageAnon).
  */
 void page_add_anon_rmap(struct page *page,
-       struct vm_area_struct *vma, unsigned long address)
+       struct vm_area_struct *vma, unsigned long address, bool compound)
 {
-       do_page_add_anon_rmap(page, vma, address, 0);
+       do_page_add_anon_rmap(page, vma, address, compound ? RMAP_COMPOUND : 0);
 }
 
 /*
@@ -1151,29 +1204,44 @@ void page_add_anon_rmap(struct page *page,
  * Everybody else should continue to use page_add_anon_rmap above.
  */
 void do_page_add_anon_rmap(struct page *page,
-       struct vm_area_struct *vma, unsigned long address, int exclusive)
+       struct vm_area_struct *vma, unsigned long address, int flags)
 {
-       int first = atomic_inc_and_test(&page->_mapcount);
+       bool compound = flags & RMAP_COMPOUND;
+       bool first;
+
+       if (compound) {
+               atomic_t *mapcount;
+               VM_BUG_ON_PAGE(!PageLocked(page), page);
+               VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+               mapcount = compound_mapcount_ptr(page);
+               first = atomic_inc_and_test(mapcount);
+       } else {
+               first = atomic_inc_and_test(&page->_mapcount);
+       }
+
        if (first) {
+               int nr = compound ? hpage_nr_pages(page) : 1;
                /*
                 * We use the irq-unsafe __{inc|mod}_zone_page_stat because
                 * these counters are not modified in interrupt context, and
                 * pte lock(a spinlock) is held, which implies preemption
                 * disabled.
                 */
-               if (PageTransHuge(page))
+               if (compound) {
                        __inc_zone_page_state(page,
                                              NR_ANON_TRANSPARENT_HUGEPAGES);
-               __mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
-                               hpage_nr_pages(page));
+               }
+               __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, nr);
        }
        if (unlikely(PageKsm(page)))
                return;
 
        VM_BUG_ON_PAGE(!PageLocked(page), page);
+
        /* address might be in next vma when migration races vma_adjust */
        if (first)
-               __page_set_anon_rmap(page, vma, address, exclusive);
+               __page_set_anon_rmap(page, vma, address,
+                               flags & RMAP_EXCLUSIVE);
        else
                __page_check_anon_rmap(page, vma, address);
 }
@@ -1183,21 +1251,31 @@ void do_page_add_anon_rmap(struct page *page,
  * @page:      the page to add the mapping to
  * @vma:       the vm area in which the mapping is added
  * @address:   the user virtual address mapped
+ * @compound:  charge the page as compound or small page
  *
  * Same as page_add_anon_rmap but must only be called on *new* pages.
  * This means the inc-and-test can be bypassed.
  * Page does not have to be locked.
  */
 void page_add_new_anon_rmap(struct page *page,
-       struct vm_area_struct *vma, unsigned long address)
+       struct vm_area_struct *vma, unsigned long address, bool compound)
 {
+       int nr = compound ? hpage_nr_pages(page) : 1;
+
        VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
        SetPageSwapBacked(page);
-       atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
-       if (PageTransHuge(page))
+       if (compound) {
+               VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+               /* increment count (starts at -1) */
+               atomic_set(compound_mapcount_ptr(page), 0);
                __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
-       __mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
-                       hpage_nr_pages(page));
+       } else {
+               /* Anon THP always mapped first with PMD */
+               VM_BUG_ON_PAGE(PageTransCompound(page), page);
+               /* increment count (starts at -1) */
+               atomic_set(&page->_mapcount, 0);
+       }
+       __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, nr);
        __page_set_anon_rmap(page, vma, address, 1);
 }
 
@@ -1225,12 +1303,15 @@ static void page_remove_file_rmap(struct page *page)
 
        memcg = mem_cgroup_begin_page_stat(page);
 
-       /* page still mapped by someone else? */
-       if (!atomic_add_negative(-1, &page->_mapcount))
+       /* Hugepages are not counted in NR_FILE_MAPPED for now. */
+       if (unlikely(PageHuge(page))) {
+               /* hugetlb pages are always mapped with pmds */
+               atomic_dec(compound_mapcount_ptr(page));
                goto out;
+       }
 
-       /* Hugepages are not counted in NR_FILE_MAPPED for now. */
-       if (unlikely(PageHuge(page)))
+       /* page still mapped by someone else? */
+       if (!atomic_add_negative(-1, &page->_mapcount))
                goto out;
 
        /*
@@ -1247,41 +1328,79 @@ out:
        mem_cgroup_end_page_stat(memcg);
 }
 
+static void page_remove_anon_compound_rmap(struct page *page)
+{
+       int i, nr;
+
+       if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
+               return;
+
+       /* Hugepages are not counted in NR_ANON_PAGES for now. */
+       if (unlikely(PageHuge(page)))
+               return;
+
+       if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
+               return;
+
+       __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
+
+       if (TestClearPageDoubleMap(page)) {
+               /*
+                * Subpages can be mapped with PTEs too. Check how many of
+                * them are still mapped.
+                */
+               for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
+                       if (atomic_add_negative(-1, &page[i]._mapcount))
+                               nr++;
+               }
+       } else {
+               nr = HPAGE_PMD_NR;
+       }
+
+       if (unlikely(PageMlocked(page)))
+               clear_page_mlock(page);
+
+       if (nr) {
+               __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, -nr);
+               deferred_split_huge_page(page);
+       }
+}
+
 /**
  * page_remove_rmap - take down pte mapping from a page
- * @page: page to remove mapping from
+ * @page:      page to remove mapping from
+ * @compound:  uncharge the page as compound or small page
  *
  * The caller needs to hold the pte lock.
  */
-void page_remove_rmap(struct page *page)
+void page_remove_rmap(struct page *page, bool compound)
 {
        if (!PageAnon(page)) {
+               VM_BUG_ON_PAGE(compound && !PageHuge(page), page);
                page_remove_file_rmap(page);
                return;
        }
 
+       if (compound)
+               return page_remove_anon_compound_rmap(page);
+
        /* page still mapped by someone else? */
        if (!atomic_add_negative(-1, &page->_mapcount))
                return;
 
-       /* Hugepages are not counted in NR_ANON_PAGES for now. */
-       if (unlikely(PageHuge(page)))
-               return;
-
        /*
         * We use the irq-unsafe __{inc|mod}_zone_page_stat because
         * these counters are not modified in interrupt context, and
         * pte lock(a spinlock) is held, which implies preemption disabled.
         */
-       if (PageTransHuge(page))
-               __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
-
-       __mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
-                             -hpage_nr_pages(page));
+       __dec_zone_page_state(page, NR_ANON_PAGES);
 
        if (unlikely(PageMlocked(page)))
                clear_page_mlock(page);
 
+       if (PageTransCompound(page))
+               deferred_split_huge_page(compound_head(page));
+
        /*
         * It would be tidy to reset the PageAnon mapping here,
         * but that might overwrite a racing page_add_anon_rmap
@@ -1293,6 +1412,11 @@ void page_remove_rmap(struct page *page)
         */
 }
 
+struct rmap_private {
+       enum ttu_flags flags;
+       int lazyfreed;
+};
+
 /*
  * @arg: enum ttu_flags will be passed to this argument
  */
@@ -1304,7 +1428,8 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
        pte_t pteval;
        spinlock_t *ptl;
        int ret = SWAP_AGAIN;
-       enum ttu_flags flags = (enum ttu_flags)arg;
+       struct rmap_private *rp = arg;
+       enum ttu_flags flags = rp->flags;
 
        /* munlock has nothing to gain from examining un-locked vmas */
        if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
@@ -1396,6 +1521,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                 * See handle_pte_fault() ...
                 */
                VM_BUG_ON_PAGE(!PageSwapCache(page), page);
+
+               if (!PageDirty(page) && (flags & TTU_LZFREE)) {
+                       /* It's a freeable page by MADV_FREE */
+                       dec_mm_counter(mm, MM_ANONPAGES);
+                       rp->lazyfreed++;
+                       goto discard;
+               }
+
                if (swap_duplicate(entry) < 0) {
                        set_pte_at(mm, address, pte, pteval);
                        ret = SWAP_FAIL;
@@ -1416,7 +1549,8 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
        } else
                dec_mm_counter(mm, mm_counter_file(page));
 
-       page_remove_rmap(page);
+discard:
+       page_remove_rmap(page, PageHuge(page));
        page_cache_release(page);
 
 out_unmap:
@@ -1468,9 +1602,14 @@ static int page_not_mapped(struct page *page)
 int try_to_unmap(struct page *page, enum ttu_flags flags)
 {
        int ret;
+       struct rmap_private rp = {
+               .flags = flags,
+               .lazyfreed = 0,
+       };
+
        struct rmap_walk_control rwc = {
                .rmap_one = try_to_unmap_one,
-               .arg = (void *)flags,
+               .arg = &rp,
                .done = page_not_mapped,
                .anon_lock = page_lock_anon_vma_read,
        };
@@ -1490,8 +1629,11 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
 
        ret = rmap_walk(page, &rwc);
 
-       if (ret != SWAP_MLOCK && !page_mapped(page))
+       if (ret != SWAP_MLOCK && !page_mapped(page)) {
                ret = SWAP_SUCCESS;
+               if (rp.lazyfreed && !PageDirty(page))
+                       ret = SWAP_LZFREE;
+       }
        return ret;
 }
 
@@ -1513,9 +1655,14 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
 int try_to_munlock(struct page *page)
 {
        int ret;
+       struct rmap_private rp = {
+               .flags = TTU_MUNLOCK,
+               .lazyfreed = 0,
+       };
+
        struct rmap_walk_control rwc = {
                .rmap_one = try_to_unmap_one,
-               .arg = (void *)TTU_MUNLOCK,
+               .arg = &rp,
                .done = page_not_mapped,
                .anon_lock = page_lock_anon_vma_read,
 
@@ -1698,7 +1845,7 @@ void hugepage_add_anon_rmap(struct page *page,
        BUG_ON(!PageLocked(page));
        BUG_ON(!anon_vma);
        /* address might be in next vma when migration races vma_adjust */
-       first = atomic_inc_and_test(&page->_mapcount);
+       first = atomic_inc_and_test(compound_mapcount_ptr(page));
        if (first)
                __hugepage_set_anon_rmap(page, vma, address, 0);
 }
@@ -1707,7 +1854,7 @@ void hugepage_add_new_anon_rmap(struct page *page,
                        struct vm_area_struct *vma, unsigned long address)
 {
        BUG_ON(address < vma->vm_start || address >= vma->vm_end);
-       atomic_set(&page->_mapcount, 0);
+       atomic_set(compound_mapcount_ptr(page), 0);
        __hugepage_set_anon_rmap(page, vma, address, 1);
 }
 #endif /* CONFIG_HUGETLB_PAGE */
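
Most of the rmap churn above follows from one data-structure change: an anon THP now has a compound mapcount (kept in the first tail page and initialised to -1, like ->_mapcount) counting PMD mappings, while each subpage keeps its own _mapcount for PTE mappings. A stand-alone sketch of why atomic_inc_and_test() spots the first mapping and atomic_add_negative(-1, ...) spots the last one; this uses plain C11 atomics and an invented struct, not kernel code.

    /* User-space model of the two-level THP mapcount described above. */
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct fake_page {
        atomic_int _mapcount;         /* PTE mappings of one subpage, starts at -1 */
        atomic_int compound_mapcount; /* PMD mappings of the whole THP, starts at -1 */
    };

    /* true when this is the first PMD mapping (counter went -1 -> 0),
     * mirroring atomic_inc_and_test() on compound_mapcount_ptr(). */
    static bool map_compound(struct fake_page *p)
    {
        return atomic_fetch_add(&p->compound_mapcount, 1) == -1;
    }

    /* true when the last PMD mapping went away (counter went 0 -> -1),
     * mirroring atomic_add_negative(-1, compound_mapcount_ptr()). */
    static bool unmap_compound(struct fake_page *p)
    {
        return atomic_fetch_sub(&p->compound_mapcount, 1) == 0;
    }

    int main(void)
    {
        struct fake_page head = { ._mapcount = -1, .compound_mapcount = -1 };

        printf("first map:  %d\n", map_compound(&head));   /* 1 */
        printf("second map: %d\n", map_compound(&head));   /* 0 */
        printf("unmap:      %d\n", unmap_compound(&head)); /* 0 */
        printf("last unmap: %d\n", unmap_compound(&head)); /* 1 */

        /* a PTE mapping of one subpage bumps only that subpage's _mapcount */
        printf("first pte map of subpage: %d\n",
               atomic_fetch_add(&head._mapcount, 1) == -1); /* 1 */
        return 0;
    }

total_mapcount(), which page_referenced() now uses above, is then roughly the compound count plus the per-subpage counts (with an adjustment for PageDoubleMap pages).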
index 970ff5b80853e131ee1e4abd618340faccf00f44..b98e1011858cdefc67108291a3717a48d0324096 100644 (file)
@@ -810,7 +810,8 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
         * the shmem_swaplist_mutex which might hold up shmem_writepage().
         * Charged back to the user (not to caller) when swap account is used.
         */
-       error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg);
+       error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg,
+                       false);
        if (error)
                goto out;
        /* No radix_tree_preload: swap entry keeps a place for page in tree */
@@ -833,9 +834,9 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
        if (error) {
                if (error != -ENOMEM)
                        error = 0;
-               mem_cgroup_cancel_charge(page, memcg);
+               mem_cgroup_cancel_charge(page, memcg, false);
        } else
-               mem_cgroup_commit_charge(page, memcg, true);
+               mem_cgroup_commit_charge(page, memcg, true, false);
 out:
        unlock_page(page);
        page_cache_release(page);
@@ -1085,7 +1086,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
        copy_highpage(newpage, oldpage);
        flush_dcache_page(newpage);
 
-       __set_page_locked(newpage);
+       __SetPageLocked(newpage);
        SetPageUptodate(newpage);
        SetPageSwapBacked(newpage);
        set_page_private(newpage, swap_index);
@@ -1218,7 +1219,8 @@ repeat:
                                goto failed;
                }
 
-               error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg);
+               error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg,
+                               false);
                if (!error) {
                        error = shmem_add_to_page_cache(page, mapping, index,
                                                swp_to_radix_entry(swap));
@@ -1235,14 +1237,14 @@ repeat:
                         * "repeat": reading a hole and writing should succeed.
                         */
                        if (error) {
-                               mem_cgroup_cancel_charge(page, memcg);
+                               mem_cgroup_cancel_charge(page, memcg, false);
                                delete_from_swap_cache(page);
                        }
                }
                if (error)
                        goto failed;
 
-               mem_cgroup_commit_charge(page, memcg, true);
+               mem_cgroup_commit_charge(page, memcg, true, false);
 
                spin_lock(&info->lock);
                info->swapped--;
@@ -1277,11 +1279,12 @@ repeat:
                }
 
                __SetPageSwapBacked(page);
-               __set_page_locked(page);
+               __SetPageLocked(page);
                if (sgp == SGP_WRITE)
                        __SetPageReferenced(page);
 
-               error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg);
+               error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg,
+                               false);
                if (error)
                        goto decused;
                error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK);
@@ -1291,10 +1294,10 @@ repeat:
                        radix_tree_preload_end();
                }
                if (error) {
-                       mem_cgroup_cancel_charge(page, memcg);
+                       mem_cgroup_cancel_charge(page, memcg, false);
                        goto decused;
                }
-               mem_cgroup_commit_charge(page, memcg, false);
+               mem_cgroup_commit_charge(page, memcg, false, false);
                lru_cache_add_anon(page);
 
                spin_lock(&info->lock);
index 2d0e610d195ae908586953126f0f125aebf94574..b21fd24b08b1fc886a4d43ca8930e84a810bb5ea 100644 (file)
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -338,11 +338,13 @@ static inline int oo_objects(struct kmem_cache_order_objects x)
  */
 static __always_inline void slab_lock(struct page *page)
 {
+       VM_BUG_ON_PAGE(PageTail(page), page);
        bit_spin_lock(PG_locked, &page->flags);
 }
 
 static __always_inline void slab_unlock(struct page *page)
 {
+       VM_BUG_ON_PAGE(PageTail(page), page);
        __bit_spin_unlock(PG_locked, &page->flags);
 }
 
index 4cba9c2783a147077150505dcf6114cf4592843a..b60802b3e5ead91c346ffd1b47d90b77155bdf79 100644 (file)
@@ -20,6 +20,7 @@
 #include <linux/mm.h>
 #include <linux/mmzone.h>
 #include <linux/bootmem.h>
+#include <linux/memremap.h>
 #include <linux/highmem.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
@@ -70,7 +71,7 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node)
 }
 
 /* need to make sure size is all the same during early stage */
-void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node)
+static void * __meminit alloc_block_buf(unsigned long size, int node)
 {
        void *ptr;
 
@@ -87,6 +88,77 @@ void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node)
        return ptr;
 }
 
+static unsigned long __meminit vmem_altmap_next_pfn(struct vmem_altmap *altmap)
+{
+       return altmap->base_pfn + altmap->reserve + altmap->alloc
+               + altmap->align;
+}
+
+static unsigned long __meminit vmem_altmap_nr_free(struct vmem_altmap *altmap)
+{
+       unsigned long allocated = altmap->alloc + altmap->align;
+
+       if (altmap->free > allocated)
+               return altmap->free - allocated;
+       return 0;
+}
+
+/**
+ * vmem_altmap_alloc - allocate pages from the vmem_altmap reservation
+ * @altmap - reserved page pool for the allocation
+ * @nr_pfns - size (in pages) of the allocation
+ *
+ * Allocations are aligned to the size of the request
+ */
+static unsigned long __meminit vmem_altmap_alloc(struct vmem_altmap *altmap,
+               unsigned long nr_pfns)
+{
+       unsigned long pfn = vmem_altmap_next_pfn(altmap);
+       unsigned long nr_align;
+
+       nr_align = 1UL << find_first_bit(&nr_pfns, BITS_PER_LONG);
+       nr_align = ALIGN(pfn, nr_align) - pfn;
+
+       if (nr_pfns + nr_align > vmem_altmap_nr_free(altmap))
+               return ULONG_MAX;
+       altmap->alloc += nr_pfns;
+       altmap->align += nr_align;
+       return pfn + nr_align;
+}
+
+static void * __meminit altmap_alloc_block_buf(unsigned long size,
+               struct vmem_altmap *altmap)
+{
+       unsigned long pfn, nr_pfns;
+       void *ptr;
+
+       if (size & ~PAGE_MASK) {
+               pr_warn_once("%s: allocations must be multiple of PAGE_SIZE (%ld)\n",
+                               __func__, size);
+               return NULL;
+       }
+
+       nr_pfns = size >> PAGE_SHIFT;
+       pfn = vmem_altmap_alloc(altmap, nr_pfns);
+       if (pfn < ULONG_MAX)
+               ptr = __va(__pfn_to_phys(pfn));
+       else
+               ptr = NULL;
+       pr_debug("%s: pfn: %#lx alloc: %ld align: %ld nr: %#lx\n",
+                       __func__, pfn, altmap->alloc, altmap->align, nr_pfns);
+
+       return ptr;
+}
+
+/* need to make sure size is all the same during early stage */
+void * __meminit __vmemmap_alloc_block_buf(unsigned long size, int node,
+               struct vmem_altmap *altmap)
+{
+       if (altmap)
+               return altmap_alloc_block_buf(size, altmap);
+       return alloc_block_buf(size, node);
+}
+
 void __meminit vmemmap_verify(pte_t *pte, int node,
                                unsigned long start, unsigned long end)
 {
@@ -103,7 +175,7 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node)
        pte_t *pte = pte_offset_kernel(pmd, addr);
        if (pte_none(*pte)) {
                pte_t entry;
-               void *p = vmemmap_alloc_block_buf(PAGE_SIZE, node);
+               void *p = alloc_block_buf(PAGE_SIZE, node);
                if (!p)
                        return NULL;
                entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
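
vmem_altmap_alloc() above aligns each request to the largest power of two that divides its size in pages; find_first_bit() on a single word is just the count of trailing zero bits. A quick stand-alone check of that arithmetic: ALIGN is copied from the kernel definition, __builtin_ctzl (a GCC/Clang builtin) stands in for find_first_bit, and the rest is invented.

    /* Stand-alone check of the alignment arithmetic in vmem_altmap_alloc(). */
    #include <stdio.h>

    #define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

    /* returns the number of pages skipped to align the allocation;
     * nr_pfns must be non-zero (ctz is undefined for 0) */
    static unsigned long altmap_align_pad(unsigned long next_pfn, unsigned long nr_pfns)
    {
        unsigned long nr_align = 1UL << __builtin_ctzl(nr_pfns);

        return ALIGN(next_pfn, nr_align) - next_pfn;
    }

    int main(void)
    {
        /* a 512-page (2MB) request starting at pfn 0x12345 */
        printf("%lu\n", altmap_align_pad(0x12345, 512)); /* 187 */
        return 0;
    }

The practical effect is that, for example, a 512-page request comes back 2MB-aligned, which is what lets the vmemmap for device memory be mapped with huge pages.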
index d1b48b691ac8c20040a262337cc7e0cbf566420f..3717ceed4177c9183626673134116b6e0f87adda 100644 (file)
@@ -748,7 +748,7 @@ static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
        if (!memmap)
                return;
 
-       for (i = 0; i < PAGES_PER_SECTION; i++) {
+       for (i = 0; i < nr_pages; i++) {
                if (PageHWPoison(&memmap[i])) {
                        atomic_long_sub(1, &num_poisoned_pages);
                        ClearPageHWPoison(&memmap[i]);
@@ -788,7 +788,8 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap)
                free_map_bootmem(memmap);
 }
 
-void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
+void sparse_remove_one_section(struct zone *zone, struct mem_section *ms,
+               unsigned long map_offset)
 {
        struct page *memmap = NULL;
        unsigned long *usemap = NULL, flags;
@@ -804,7 +805,8 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
        }
        pgdat_resize_unlock(pgdat, &flags);
 
-       clear_hwpoisoned_pages(memmap, PAGES_PER_SECTION);
+       clear_hwpoisoned_pages(memmap + map_offset,
+                       PAGES_PER_SECTION - map_offset);
        free_section_usemap(memmap, usemap);
 }
 #endif /* CONFIG_MEMORY_HOTREMOVE */
index 39395fb549c013e20b34d5fe0be8afa6eee5afbf..09fe5e97714a2ac9756a537c78b617d9ed7a1ffc 100644 (file)
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -24,6 +24,7 @@
 #include <linux/export.h>
 #include <linux/mm_inline.h>
 #include <linux/percpu_counter.h>
+#include <linux/memremap.h>
 #include <linux/percpu.h>
 #include <linux/cpu.h>
 #include <linux/notifier.h>
@@ -45,6 +46,7 @@ int page_cluster;
 static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
 static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
 static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs);
+static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
 
 /*
  * This path almost never happens for VM activity - pages are normally
@@ -89,260 +91,14 @@ static void __put_compound_page(struct page *page)
        (*dtor)(page);
 }
 
-/**
- * Two special cases here: we could avoid taking compound_lock_irqsave
- * and could skip the tail refcounting(in _mapcount).
- *
- * 1. Hugetlbfs page:
- *
- *    PageHeadHuge will remain true until the compound page
- *    is released and enters the buddy allocator, and it could
- *    not be split by __split_huge_page_refcount().
- *
- *    So if we see PageHeadHuge set, and we have the tail page pin,
- *    then we could safely put head page.
- *
- * 2. Slab THP page:
- *
- *    PG_slab is cleared before the slab frees the head page, and
- *    tail pin cannot be the last reference left on the head page,
- *    because the slab code is free to reuse the compound page
- *    after a kfree/kmem_cache_free without having to check if
- *    there's any tail pin left.  In turn all tail pinsmust be always
- *    released while the head is still pinned by the slab code
- *    and so we know PG_slab will be still set too.
- *
- *    So if we see PageSlab set, and we have the tail page pin,
- *    then we could safely put head page.
- */
-static __always_inline
-void put_unrefcounted_compound_page(struct page *page_head, struct page *page)
-{
-       /*
-        * If @page is a THP tail, we must read the tail page
-        * flags after the head page flags. The
-        * __split_huge_page_refcount side enforces write memory barriers
-        * between clearing PageTail and before the head page
-        * can be freed and reallocated.
-        */
-       smp_rmb();
-       if (likely(PageTail(page))) {
-               /*
-                * __split_huge_page_refcount cannot race
-                * here, see the comment above this function.
-                */
-               VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
-               if (put_page_testzero(page_head)) {
-                       /*
-                        * If this is the tail of a slab THP page,
-                        * the tail pin must not be the last reference
-                        * held on the page, because the PG_slab cannot
-                        * be cleared before all tail pins (which skips
-                        * the _mapcount tail refcounting) have been
-                        * released.
-                        *
-                        * If this is the tail of a hugetlbfs page,
-                        * the tail pin may be the last reference on
-                        * the page instead, because PageHeadHuge will
-                        * not go away until the compound page enters
-                        * the buddy allocator.
-                        */
-                       VM_BUG_ON_PAGE(PageSlab(page_head), page_head);
-                       __put_compound_page(page_head);
-               }
-       } else
-               /*
-                * __split_huge_page_refcount run before us,
-                * @page was a THP tail. The split @page_head
-                * has been freed and reallocated as slab or
-                * hugetlbfs page of smaller order (only
-                * possible if reallocated as slab on x86).
-                */
-               if (put_page_testzero(page))
-                       __put_single_page(page);
-}
-
-static __always_inline
-void put_refcounted_compound_page(struct page *page_head, struct page *page)
-{
-       if (likely(page != page_head && get_page_unless_zero(page_head))) {
-               unsigned long flags;
-
-               /*
-                * @page_head wasn't a dangling pointer but it may not
-                * be a head page anymore by the time we obtain the
-                * lock. That is ok as long as it can't be freed from
-                * under us.
-                */
-               flags = compound_lock_irqsave(page_head);
-               if (unlikely(!PageTail(page))) {
-                       /* __split_huge_page_refcount run before us */
-                       compound_unlock_irqrestore(page_head, flags);
-                       if (put_page_testzero(page_head)) {
-                               /*
-                                * The @page_head may have been freed
-                                * and reallocated as a compound page
-                                * of smaller order and then freed
-                                * again.  All we know is that it
-                                * cannot have become: a THP page, a
-                                * compound page of higher order, a
-                                * tail page.  That is because we
-                                * still hold the refcount of the
-                                * split THP tail and page_head was
-                                * the THP head before the split.
-                                */
-                               if (PageHead(page_head))
-                                       __put_compound_page(page_head);
-                               else
-                                       __put_single_page(page_head);
-                       }
-out_put_single:
-                       if (put_page_testzero(page))
-                               __put_single_page(page);
-                       return;
-               }
-               VM_BUG_ON_PAGE(page_head != compound_head(page), page);
-               /*
-                * We can release the refcount taken by
-                * get_page_unless_zero() now that
-                * __split_huge_page_refcount() is blocked on the
-                * compound_lock.
-                */
-               if (put_page_testzero(page_head))
-                       VM_BUG_ON_PAGE(1, page_head);
-               /* __split_huge_page_refcount will wait now */
-               VM_BUG_ON_PAGE(page_mapcount(page) <= 0, page);
-               atomic_dec(&page->_mapcount);
-               VM_BUG_ON_PAGE(atomic_read(&page_head->_count) <= 0, page_head);
-               VM_BUG_ON_PAGE(atomic_read(&page->_count) != 0, page);
-               compound_unlock_irqrestore(page_head, flags);
-
-               if (put_page_testzero(page_head)) {
-                       if (PageHead(page_head))
-                               __put_compound_page(page_head);
-                       else
-                               __put_single_page(page_head);
-               }
-       } else {
-               /* @page_head is a dangling pointer */
-               VM_BUG_ON_PAGE(PageTail(page), page);
-               goto out_put_single;
-       }
-}
-
-static void put_compound_page(struct page *page)
-{
-       struct page *page_head;
-
-       /*
-        * We see the PageCompound set and PageTail not set, so @page maybe:
-        *  1. hugetlbfs head page, or
-        *  2. THP head page.
-        */
-       if (likely(!PageTail(page))) {
-               if (put_page_testzero(page)) {
-                       /*
-                        * By the time all refcounts have been released
-                        * split_huge_page cannot run anymore from under us.
-                        */
-                       if (PageHead(page))
-                               __put_compound_page(page);
-                       else
-                               __put_single_page(page);
-               }
-               return;
-       }
-
-       /*
-        * We see the PageCompound set and PageTail set, so @page maybe:
-        *  1. a tail hugetlbfs page, or
-        *  2. a tail THP page, or
-        *  3. a split THP page.
-        *
-        *  Case 3 is possible, as we may race with
-        *  __split_huge_page_refcount tearing down a THP page.
-        */
-       page_head = compound_head(page);
-       if (!__compound_tail_refcounted(page_head))
-               put_unrefcounted_compound_page(page_head, page);
-       else
-               put_refcounted_compound_page(page_head, page);
-}
-
-void put_page(struct page *page)
+void __put_page(struct page *page)
 {
        if (unlikely(PageCompound(page)))
-               put_compound_page(page);
-       else if (put_page_testzero(page))
+               __put_compound_page(page);
+       else
                __put_single_page(page);
 }
-EXPORT_SYMBOL(put_page);
-
-/*
- * This function is exported but must not be called by anything other
- * than get_page(). It implements the slow path of get_page().
- */
-bool __get_page_tail(struct page *page)
-{
-       /*
-        * This takes care of get_page() if run on a tail page
-        * returned by one of the get_user_pages/follow_page variants.
-        * get_user_pages/follow_page itself doesn't need the compound
-        * lock because it runs __get_page_tail_foll() under the
-        * proper PT lock that already serializes against
-        * split_huge_page().
-        */
-       unsigned long flags;
-       bool got;
-       struct page *page_head = compound_head(page);
-
-       /* Ref to put_compound_page() comment. */
-       if (!__compound_tail_refcounted(page_head)) {
-               smp_rmb();
-               if (likely(PageTail(page))) {
-                       /*
-                        * This is a hugetlbfs page or a slab
-                        * page. __split_huge_page_refcount
-                        * cannot race here.
-                        */
-                       VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
-                       __get_page_tail_foll(page, true);
-                       return true;
-               } else {
-                       /*
-                        * __split_huge_page_refcount run
-                        * before us, "page" was a THP
-                        * tail. The split page_head has been
-                        * freed and reallocated as slab or
-                        * hugetlbfs page of smaller order
-                        * (only possible if reallocated as
-                        * slab on x86).
-                        */
-                       return false;
-               }
-       }
-
-       got = false;
-       if (likely(page != page_head && get_page_unless_zero(page_head))) {
-               /*
-                * page_head wasn't a dangling pointer but it
-                * may not be a head page anymore by the time
-                * we obtain the lock. That is ok as long as it
-                * can't be freed from under us.
-                */
-               flags = compound_lock_irqsave(page_head);
-               /* here __split_huge_page_refcount won't run anymore */
-               if (likely(PageTail(page))) {
-                       __get_page_tail_foll(page, false);
-                       got = true;
-               }
-               compound_unlock_irqrestore(page_head, flags);
-               if (unlikely(!got))
-                       put_page(page_head);
-       }
-       return got;
-}
-EXPORT_SYMBOL(__get_page_tail);
+EXPORT_SYMBOL(__put_page);
 
 /**
  * put_pages_list() - release a list of pages
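With tail-page refcounting gone, put_page() no longer needs the long compound slow path removed above: the caller's reference always lives on the head page, and __put_page() is only reached once the last reference drops. A rough sketch of the corresponding fast path as this series moves it into a header (an approximation for orientation, not part of this hunk):

/* Approximate shape of the reworked put_page() fast path (sketch only). */
static inline void put_page(struct page *page)
{
	page = compound_head(page);	/* tail pages no longer carry refcounts */

	if (put_page_testzero(page))
		__put_page(page);	/* slow path defined in mm/swap.c above */
}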
@@ -604,6 +360,7 @@ static void __lru_cache_activate_page(struct page *page)
  */
 void mark_page_accessed(struct page *page)
 {
+       page = compound_head(page);
        if (!PageActive(page) && !PageUnevictable(page) &&
                        PageReferenced(page)) {
 
@@ -799,6 +556,24 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
        update_page_reclaim_stat(lruvec, file, 0);
 }
 
+
+static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
+                           void *arg)
+{
+       if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
+               int file = page_is_file_cache(page);
+               int lru = page_lru_base_type(page);
+
+               del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE);
+               ClearPageActive(page);
+               ClearPageReferenced(page);
+               add_page_to_lru_list(page, lruvec, lru);
+
+               __count_vm_event(PGDEACTIVATE);
+               update_page_reclaim_stat(lruvec, file, 0);
+       }
+}
+
 /*
  * Drain pages out of the cpu's pagevecs.
  * Either "cpu" is the current CPU, and preemption has already been
@@ -825,6 +600,10 @@ void lru_add_drain_cpu(int cpu)
        if (pagevec_count(pvec))
                pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
 
+       pvec = &per_cpu(lru_deactivate_pvecs, cpu);
+       if (pagevec_count(pvec))
+               pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
+
        activate_page_drain(cpu);
 }
 
@@ -854,6 +633,26 @@ void deactivate_file_page(struct page *page)
        }
 }
 
+/**
+ * deactivate_page - deactivate a page
+ * @page: page to deactivate
+ *
+ * deactivate_page() moves @page to the inactive list if @page was on the active
+ * list and was not an unevictable page.  This is done to accelerate the reclaim
+ * of @page.
+ */
+void deactivate_page(struct page *page)
+{
+       if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
+               struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
+
+               page_cache_get(page);
+               if (!pagevec_add(pvec, page))
+                       pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
+               put_cpu_var(lru_deactivate_pvecs);
+       }
+}
+
 void lru_add_drain(void)
 {
        lru_add_drain_cpu(get_cpu());
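deactivate_page() reuses the per-cpu pagevec pattern of the surrounding helpers: take a reference, queue the page on lru_deactivate_pvecs, and only touch the zone lru_lock when the pagevec fills up or is drained. A hypothetical caller could batch pages and then force a drain; the sketch below is illustrative only (demote_idle_pages is not a function in this diff):

/* Hypothetical caller, for illustration: queue pages for deactivation,
 * then flush this CPU's pagevecs so the LRU moves happen immediately.
 */
static void demote_idle_pages(struct page **pages, int nr)
{
	int i;

	for (i = 0; i < nr; i++)
		deactivate_page(pages[i]);	/* batched via lru_deactivate_pvecs */

	lru_add_drain();
}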
@@ -883,6 +682,7 @@ void lru_add_drain_all(void)
                if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
                    pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
                    pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
+                   pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
                    need_activate_page_drain(cpu)) {
                        INIT_WORK(work, lru_add_drain_per_cpu);
                        schedule_work_on(cpu, work);
@@ -918,15 +718,6 @@ void release_pages(struct page **pages, int nr, bool cold)
        for (i = 0; i < nr; i++) {
                struct page *page = pages[i];
 
-               if (unlikely(PageCompound(page))) {
-                       if (zone) {
-                               spin_unlock_irqrestore(&zone->lru_lock, flags);
-                               zone = NULL;
-                       }
-                       put_compound_page(page);
-                       continue;
-               }
-
                /*
                 * Make sure the IRQ-safe lock-holding time does not get
                 * excessive with a continuous string of pages from the
@@ -937,9 +728,19 @@ void release_pages(struct page **pages, int nr, bool cold)
                        zone = NULL;
                }
 
+               page = compound_head(page);
                if (!put_page_testzero(page))
                        continue;
 
+               if (PageCompound(page)) {
+                       if (zone) {
+                               spin_unlock_irqrestore(&zone->lru_lock, flags);
+                               zone = NULL;
+                       }
+                       __put_compound_page(page);
+                       continue;
+               }
+
                if (PageLRU(page)) {
                        struct zone *pagezone = page_zone(page);
 
index d504adb7fa5f08ced98eeb2a285976c0db64a9ae..676ff2991380120275ba5d81d9660592bdc15b75 100644 (file)
@@ -185,13 +185,12 @@ int add_to_swap(struct page *page, struct list_head *list)
         * deadlock in the swap out path.
         */
        /*
-        * Add it to the swap cache and mark it dirty
+        * Add it to the swap cache.
         */
        err = add_to_swap_cache(page, entry,
                        __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN);
 
-       if (!err) {     /* Success */
-               SetPageDirty(page);
+       if (!err) {
                return 1;
        } else {        /* -ENOMEM radix-tree allocation failure */
                /*
@@ -353,7 +352,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
                }
 
                /* May fail (-ENOMEM) if radix-tree node allocation failed. */
-               __set_page_locked(new_page);
+               __SetPageLocked(new_page);
                SetPageSwapBacked(new_page);
                err = __add_to_swap_cache(new_page, entry);
                if (likely(!err)) {
@@ -367,7 +366,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
                }
                radix_tree_preload_end();
                ClearPageSwapBacked(new_page);
-               __clear_page_locked(new_page);
+               __ClearPageLocked(new_page);
                /*
                 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
                 * clear SWAP_HAS_CACHE flag.
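__set_page_locked()/__clear_page_locked() are replaced here by the macro-generated __SetPageLocked()/__ClearPageLocked() from the page-flags rework; the non-atomic variants remain correct because the page is not yet (or no longer) visible to other users. Roughly what the generated helpers boil down to (paraphrased; the real macros in include/linux/page-flags.h also encode the head/tail policy for each flag):

/* Paraphrase of what the __SETPAGEFLAG/__CLEARPAGEFLAG expansion amounts to. */
static inline void __SetPageLocked(struct page *page)
{
	__set_bit(PG_locked, &page->flags);	/* non-atomic bit op */
}

static inline void __ClearPageLocked(struct page *page)
{
	__clear_bit(PG_locked, &page->flags);
}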
index e6b8591a3ed27641b70179265609671135ae4127..2bb30aa3a4123a547bd29e1585a1234644a92152 100644 (file)
@@ -926,6 +926,9 @@ int reuse_swap_page(struct page *page)
        VM_BUG_ON_PAGE(!PageLocked(page), page);
        if (unlikely(PageKsm(page)))
                return 0;
+       /* The page is part of THP and cannot be reused */
+       if (PageTransCompound(page))
+               return 0;
        count = page_mapcount(page);
        if (count <= 1 && PageSwapCache(page)) {
                count += page_swapcount(page);
@@ -1108,19 +1111,9 @@ unsigned int count_swap_pages(int type, int free)
 }
 #endif /* CONFIG_HIBERNATION */
 
-static inline int maybe_same_pte(pte_t pte, pte_t swp_pte)
+static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte)
 {
-#ifdef CONFIG_MEM_SOFT_DIRTY
-       /*
-        * When pte keeps soft dirty bit the pte generated
-        * from swap entry does not has it, still it's same
-        * pte from logical point of view.
-        */
-       pte_t swp_pte_dirty = pte_swp_mksoft_dirty(swp_pte);
-       return pte_same(pte, swp_pte) || pte_same(pte, swp_pte_dirty);
-#else
-       return pte_same(pte, swp_pte);
-#endif
+       return pte_same(pte_swp_clear_soft_dirty(pte), swp_pte);
 }
 
 /*
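The #ifdef CONFIG_MEM_SOFT_DIRTY special case collapses because pte_swp_clear_soft_dirty() is expected to be an identity function on configurations without soft-dirty support, so comparing the cleared pte with the generated swap pte covers both cases. The assumed fallback definition, shown only to make the refactor plausible:

/* Assumed fallback when soft-dirty tracking is not configured: clearing the
 * (non-existent) soft-dirty bit is a no-op.  The exact config guard in the
 * generic pgtable headers may differ.
 */
#ifndef CONFIG_MEM_SOFT_DIRTY
static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
{
	return pte;
}
#endif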
@@ -1142,14 +1135,15 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
        if (unlikely(!page))
                return -ENOMEM;
 
-       if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg)) {
+       if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL,
+                               &memcg, false)) {
                ret = -ENOMEM;
                goto out_nolock;
        }
 
        pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
-       if (unlikely(!maybe_same_pte(*pte, swp_entry_to_pte(entry)))) {
-               mem_cgroup_cancel_charge(page, memcg);
+       if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) {
+               mem_cgroup_cancel_charge(page, memcg, false);
                ret = 0;
                goto out;
        }
@@ -1160,11 +1154,11 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
        set_pte_at(vma->vm_mm, addr, pte,
                   pte_mkold(mk_pte(page, vma->vm_page_prot)));
        if (page == swapcache) {
-               page_add_anon_rmap(page, vma, addr);
-               mem_cgroup_commit_charge(page, memcg, true);
+               page_add_anon_rmap(page, vma, addr, false);
+               mem_cgroup_commit_charge(page, memcg, true, false);
        } else { /* ksm created a completely new copy */
-               page_add_new_anon_rmap(page, vma, addr);
-               mem_cgroup_commit_charge(page, memcg, false);
+               page_add_new_anon_rmap(page, vma, addr, false);
+               mem_cgroup_commit_charge(page, memcg, false, false);
                lru_cache_add_active_or_unevictable(page, vma);
        }
        swap_free(entry);
@@ -1206,7 +1200,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                 * swapoff spends a _lot_ of time in this loop!
                 * Test inline before going to call unuse_pte.
                 */
-               if (unlikely(maybe_same_pte(*pte, swp_pte))) {
+               if (unlikely(pte_same_as_swp(*pte, swp_pte))) {
                        pte_unmap(pte);
                        ret = unuse_pte(vma, pmd, addr, entry, page);
                        if (ret)
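The extra boolean threaded through the memcg charge calls and page_add_anon_rmap()/page_add_new_anon_rmap() is the new "compound" flag from the THP refcounting rework; swapoff only handles small pages, so every call in this hunk passes false. For contrast, a hypothetical THP-aware caller would pass true on the charge, commit and rmap side (sketch only; charge_and_map_thp is not a function in this diff and the prototypes are assumed from the rest of the series):

/* Hypothetical THP-aware charge+map sequence, for contrast (sketch only). */
static int charge_and_map_thp(struct page *page, struct vm_area_struct *vma,
			      unsigned long haddr)
{
	struct mem_cgroup *memcg;

	if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, true))
		return -ENOMEM;			/* 'true' = compound charge */

	page_add_new_anon_rmap(page, vma, haddr, true);	/* map the whole THP */
	mem_cgroup_commit_charge(page, memcg, false, true);
	lru_cache_add_active_or_unevictable(page, vma);
	return 0;
}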
index 77fee9325a5727825b9adcdac156b4e22247b6cf..806b0c758c5b6113de9b921ff0cfd01fd45b6424 100644 (file)
@@ -63,7 +63,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
        __SetPageUptodate(page);
 
        ret = -ENOMEM;
-       if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg))
+       if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg, false))
                goto out_release;
 
        _dst_pte = mk_pte(page, dst_vma->vm_page_prot);
@@ -76,8 +76,8 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
                goto out_release_uncharge_unlock;
 
        inc_mm_counter(dst_mm, MM_ANONPAGES);
-       page_add_new_anon_rmap(page, dst_vma, dst_addr);
-       mem_cgroup_commit_charge(page, memcg, false);
+       page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
+       mem_cgroup_commit_charge(page, memcg, false, false);
        lru_cache_add_active_or_unevictable(page, dst_vma);
 
        set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
@@ -91,7 +91,7 @@ out:
        return ret;
 out_release_uncharge_unlock:
        pte_unmap_unlock(dst_pte, ptl);
-       mem_cgroup_cancel_charge(page, memcg);
+       mem_cgroup_cancel_charge(page, memcg, false);
 out_release:
        page_cache_release(page);
        goto out;
index 2d28f79300431422397f0c3d82f637bc9f8d5a87..6d1f9200f74e794c403ec687bae513e4f6e40700 100644 (file)
--- a/mm/util.c
+++ b/mm/util.c
@@ -386,7 +386,9 @@ struct anon_vma *page_anon_vma(struct page *page)
 
 struct address_space *page_mapping(struct page *page)
 {
-       unsigned long mapping;
+       struct address_space *mapping;
+
+       page = compound_head(page);
 
        /* This happens if someone calls flush_dcache_page on slab page */
        if (unlikely(PageSlab(page)))
@@ -399,11 +401,25 @@ struct address_space *page_mapping(struct page *page)
                return swap_address_space(entry);
        }
 
-       mapping = (unsigned long)page->mapping;
-       if (mapping & PAGE_MAPPING_FLAGS)
+       mapping = page->mapping;
+       if ((unsigned long)mapping & PAGE_MAPPING_FLAGS)
                return NULL;
-       return page->mapping;
+       return mapping;
+}
+
+/* Slow path of page_mapcount() for compound pages */
+int __page_mapcount(struct page *page)
+{
+       int ret;
+
+       ret = atomic_read(&page->_mapcount) + 1;
+       page = compound_head(page);
+       ret += atomic_read(compound_mapcount_ptr(page)) + 1;
+       if (PageDoubleMap(page))
+               ret--;
+       return ret;
 }
+EXPORT_SYMBOL_GPL(__page_mapcount);
 
 int overcommit_ratio_handler(struct ctl_table *table, int write,
                             void __user *buffer, size_t *lenp,
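__page_mapcount() is the new slow path for compound pages: a subpage's own _mapcount (its PTE mappings) plus the head's compound_mapcount (PMD mappings), with a one-off correction when PageDoubleMap signals the page is mapped both ways. The fast path presumably stays inline; an approximation of the wrapper this series keeps in the headers:

/* Approximate fast-path wrapper (sketch; the real inline lives in mm.h). */
static inline int page_mapcount(struct page *page)
{
	if (unlikely(PageCompound(page)))
		return __page_mapcount(page);	/* slow path above */

	return atomic_read(&page->_mapcount) + 1;
}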
index 58ceeb107960b367f88bebcbb067ded860b1d010..fb42a5bffe4733f6e5b1e65d09009934ef2b781b 100644 (file)
@@ -455,7 +455,7 @@ found:
        free_vmap_cache = &va->rb_node;
        spin_unlock(&vmap_area_lock);
 
-       BUG_ON(va->va_start & (align-1));
+       BUG_ON(!IS_ALIGNED(va->va_start, align));
        BUG_ON(va->va_start < vstart);
        BUG_ON(va->va_end > vend);
 
@@ -1086,7 +1086,7 @@ void vm_unmap_ram(const void *mem, unsigned int count)
        BUG_ON(!addr);
        BUG_ON(addr < VMALLOC_START);
        BUG_ON(addr > VMALLOC_END);
-       BUG_ON(addr & (PAGE_SIZE-1));
+       BUG_ON(!IS_ALIGNED(addr, PAGE_SIZE));
 
        debug_check_no_locks_freed(mem, size);
        vmap_debug_free_range(addr, addr+size);
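Swapping the open-coded "& (align - 1)" masks for IS_ALIGNED() is purely a readability change; the helper is the long-standing macro from include/linux/kernel.h, roughly:

/* Roughly the standard alignment helper these BUG_ON()s now use. */
#define IS_ALIGNED(x, a)	(((x) & ((typeof(x))(a) - 1)) == 0)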
index 108bd119f2f6ba3f55247d7136684f937243b0c8..5ac86956ff9dc09d7dd2f16f02486756b7453320 100644 (file)
@@ -906,6 +906,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                int may_enter_fs;
                enum page_references references = PAGEREF_RECLAIM_CLEAN;
                bool dirty, writeback;
+               bool lazyfree = false;
+               int ret = SWAP_SUCCESS;
 
                cond_resched();
 
@@ -1049,6 +1051,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                                goto keep_locked;
                        if (!add_to_swap(page, page_list))
                                goto activate_locked;
+                       lazyfree = true;
                        may_enter_fs = 1;
 
                        /* Adding to swap updated mapping */
@@ -1060,14 +1063,17 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                 * processes. Try to unmap it here.
                 */
                if (page_mapped(page) && mapping) {
-                       switch (try_to_unmap(page,
-                                       ttu_flags|TTU_BATCH_FLUSH)) {
+                       switch (ret = try_to_unmap(page, lazyfree ?
+                               (ttu_flags | TTU_BATCH_FLUSH | TTU_LZFREE) :
+                               (ttu_flags | TTU_BATCH_FLUSH))) {
                        case SWAP_FAIL:
                                goto activate_locked;
                        case SWAP_AGAIN:
                                goto keep_locked;
                        case SWAP_MLOCK:
                                goto cull_mlocked;
+                       case SWAP_LZFREE:
+                               goto lazyfree;
                        case SWAP_SUCCESS:
                                ; /* try to free the page below */
                        }
@@ -1174,6 +1180,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                        }
                }
 
+lazyfree:
                if (!mapping || !__remove_mapping(mapping, page, true))
                        goto keep_locked;
 
@@ -1184,8 +1191,11 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                 * we obviously don't have to worry about waking up a process
                 * waiting on the page lock, because there are no references.
                 */
-               __clear_page_locked(page);
+               __ClearPageLocked(page);
 free_it:
+               if (ret == SWAP_LZFREE)
+                       count_vm_event(PGLAZYFREED);
+
                nr_reclaimed++;
 
                /*
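This is the reclaim side of MADV_FREE: lazily freed anonymous pages reach add_to_swap() without being marked dirty (see the swap-cache hunk earlier in this diff), try_to_unmap() is invoked with TTU_LZFREE, and if the page is still clean the unmap code reports SWAP_LZFREE so shrink_page_list() can jump straight to the lazyfree: label and drop the page without any swap I/O, counting it as pglazyfreed. A condensed sketch of the unmap-side decision, under the assumption that it keys off PageDirty() (simplified; not the actual try_to_unmap_one() code):

/* Condensed sketch of the assumed TTU_LZFREE handling on the rmap side. */
static bool can_lazyfree_discard(struct page *page, struct mm_struct *mm,
				 enum ttu_flags flags)
{
	if ((flags & TTU_LZFREE) && !PageDirty(page)) {
		/* MADV_FREE page that was never re-dirtied: discard, no I/O. */
		dec_mm_counter(mm, MM_ANONPAGES);
		return true;		/* reported back as SWAP_LZFREE */
	}

	/* Re-dirtied after MADV_FREE (or not lazyfree at all): keep the
	 * normal swap-out path.
	 */
	return false;
}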
index 83a003bc3cae54e3c2b1071249a5c282f70d3220..64bd0aa13f75cc25247609542f0949c18995c8c1 100644 (file)
@@ -783,6 +783,7 @@ const char * const vmstat_text[] = {
 
        "pgfault",
        "pgmajfault",
+       "pglazyfreed",
 
        TEXTS_FOR_ZONES("pgrefill")
        TEXTS_FOR_ZONES("pgsteal_kswapd")
@@ -844,7 +845,9 @@ const char * const vmstat_text[] = {
        "thp_fault_fallback",
        "thp_collapse_alloc",
        "thp_collapse_alloc_failed",
-       "thp_split",
+       "thp_split_page",
+       "thp_split_page_failed",
+       "thp_split_pmd",
        "thp_zero_page_alloc",
        "thp_zero_page_alloc_failed",
 #endif
index 262889046703ea02055fcec52cfc6b3e2ebf1dd0..76f131ebc192228e6928bcc3cedcd47034d7a3b7 100755 (executable)
@@ -193,7 +193,6 @@ exuberant()
        --regex-c++='/CLEARPAGEFLAG_NOOP\(([^,)]*).*/ClearPage\1/'      \
        --regex-c++='/__CLEARPAGEFLAG_NOOP\(([^,)]*).*/__ClearPage\1/'  \
        --regex-c++='/TESTCLEARFLAG_FALSE\(([^,)]*).*/TestClearPage\1/' \
-       --regex-c++='/__TESTCLEARFLAG_FALSE\(([^,)]*).*/__TestClearPage\1/' \
        --regex-c++='/_PE\(([^,)]*).*/PEVENT_ERRNO__\1/'                \
        --regex-c++='/TASK_PFA_TEST\([^,]*,\s*([^)]*)\)/task_\1/'       \
        --regex-c++='/TASK_PFA_SET\([^,]*,\s*([^)]*)\)/task_set_\1/'    \
@@ -260,7 +259,6 @@ emacs()
        --regex='/CLEARPAGEFLAG_NOOP(\([^,)]*\).*/ClearPage\1/' \
        --regex='/__CLEARPAGEFLAG_NOOP(\([^,)]*\).*/__ClearPage\1/' \
        --regex='/TESTCLEARFLAG_FALSE(\([^,)]*\).*/TestClearPage\1/' \
-       --regex='/__TESTCLEARFLAG_FALSE(\([^,)]*\).*/__TestClearPage\1/' \
        --regex='/TASK_PFA_TEST\([^,]*,\s*([^)]*)\)/task_\1/'           \
        --regex='/TASK_PFA_SET\([^,]*,\s*([^)]*)\)/task_set_\1/'        \
        --regex='/TASK_PFA_CLEAR\([^,]*,\s*([^)]*)\)/task_clear_\1/'    \
index 314c7774652e761bd299716fab1e1cffe9da8050..a11cfd20a6a0d2aa86b1d06b8552bc41a8bfd8c9 100644 (file)
@@ -111,7 +111,7 @@ static void hardware_disable_all(void);
 
 static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
 
-static void kvm_release_pfn_dirty(pfn_t pfn);
+static void kvm_release_pfn_dirty(kvm_pfn_t pfn);
 static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn);
 
 __visible bool kvm_rebooting;
@@ -119,7 +119,7 @@ EXPORT_SYMBOL_GPL(kvm_rebooting);
 
 static bool largepages_enabled = true;
 
-bool kvm_is_reserved_pfn(pfn_t pfn)
+bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
 {
        if (pfn_valid(pfn))
                return PageReserved(pfn_to_page(pfn));
@@ -1289,7 +1289,7 @@ static inline int check_user_page_hwpoison(unsigned long addr)
  * true indicates success, otherwise false is returned.
  */
 static bool hva_to_pfn_fast(unsigned long addr, bool atomic, bool *async,
-                           bool write_fault, bool *writable, pfn_t *pfn)
+                           bool write_fault, bool *writable, kvm_pfn_t *pfn)
 {
        struct page *page[1];
        int npages;
@@ -1322,7 +1322,7 @@ static bool hva_to_pfn_fast(unsigned long addr, bool atomic, bool *async,
  * 1 indicates success, -errno is returned if error is detected.
  */
 static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
-                          bool *writable, pfn_t *pfn)
+                          bool *writable, kvm_pfn_t *pfn)
 {
        struct page *page[1];
        int npages = 0;
@@ -1386,11 +1386,11 @@ static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
  * 2): @write_fault = false && @writable, @writable will tell the caller
  *     whether the mapping is writable.
  */
-static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
+static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
                        bool write_fault, bool *writable)
 {
        struct vm_area_struct *vma;
-       pfn_t pfn = 0;
+       kvm_pfn_t pfn = 0;
        int npages;
 
        /* we can do it either atomically or asynchronously, not both */
@@ -1431,8 +1431,9 @@ exit:
        return pfn;
 }
 
-pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic,
-                          bool *async, bool write_fault, bool *writable)
+kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
+                              bool atomic, bool *async, bool write_fault,
+                              bool *writable)
 {
        unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
 
@@ -1453,7 +1454,7 @@ pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic,
 }
 EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);
 
-pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
+kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
                      bool *writable)
 {
        return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL,
@@ -1461,37 +1462,37 @@ pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
 
-pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
+kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
 {
        return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL);
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);
 
-pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)
+kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)
 {
        return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL);
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
 
-pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
+kvm_pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
 {
        return gfn_to_pfn_memslot_atomic(gfn_to_memslot(kvm, gfn), gfn);
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic);
 
-pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn)
+kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn)
 {
        return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic);
 
-pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
+kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
 {
        return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn);
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn);
 
-pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn)
+kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn)
 {
        return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
 }
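The mechanical pfn_t to kvm_pfn_t rename keeps KVM's private frame-number type from clashing with the generic pfn_t that the DAX work elsewhere in this patch-bomb introduces for the core MM; only the name changes, not the representation. The renamed typedef presumably looks like this in include/linux/kvm_types.h (paraphrased, shown as an assumption):

/* Assumed shape of the renamed typedef (paraphrased from kvm_types.h). */
typedef u64	hfn_t;			/* host frame number */
typedef hfn_t	kvm_pfn_t;		/* previously: typedef hfn_t pfn_t; */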
@@ -1514,7 +1515,7 @@ int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
 }
 EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
 
-static struct page *kvm_pfn_to_page(pfn_t pfn)
+static struct page *kvm_pfn_to_page(kvm_pfn_t pfn)
 {
        if (is_error_noslot_pfn(pfn))
                return KVM_ERR_PTR_BAD_PAGE;
@@ -1529,7 +1530,7 @@ static struct page *kvm_pfn_to_page(pfn_t pfn)
 
 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
 {
-       pfn_t pfn;
+       kvm_pfn_t pfn;
 
        pfn = gfn_to_pfn(kvm, gfn);
 
@@ -1539,7 +1540,7 @@ EXPORT_SYMBOL_GPL(gfn_to_page);
 
 struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn)
 {
-       pfn_t pfn;
+       kvm_pfn_t pfn;
 
        pfn = kvm_vcpu_gfn_to_pfn(vcpu, gfn);
 
@@ -1555,7 +1556,7 @@ void kvm_release_page_clean(struct page *page)
 }
 EXPORT_SYMBOL_GPL(kvm_release_page_clean);
 
-void kvm_release_pfn_clean(pfn_t pfn)
+void kvm_release_pfn_clean(kvm_pfn_t pfn)
 {
        if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn))
                put_page(pfn_to_page(pfn));
@@ -1570,13 +1571,13 @@ void kvm_release_page_dirty(struct page *page)
 }
 EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
 
-static void kvm_release_pfn_dirty(pfn_t pfn)
+static void kvm_release_pfn_dirty(kvm_pfn_t pfn)
 {
        kvm_set_pfn_dirty(pfn);
        kvm_release_pfn_clean(pfn);
 }
 
-void kvm_set_pfn_dirty(pfn_t pfn)
+void kvm_set_pfn_dirty(kvm_pfn_t pfn)
 {
        if (!kvm_is_reserved_pfn(pfn)) {
                struct page *page = pfn_to_page(pfn);
@@ -1587,14 +1588,14 @@ void kvm_set_pfn_dirty(pfn_t pfn)
 }
 EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
 
-void kvm_set_pfn_accessed(pfn_t pfn)
+void kvm_set_pfn_accessed(kvm_pfn_t pfn)
 {
        if (!kvm_is_reserved_pfn(pfn))
                mark_page_accessed(pfn_to_page(pfn));
 }
 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
 
-void kvm_get_pfn(pfn_t pfn)
+void kvm_get_pfn(kvm_pfn_t pfn)
 {
        if (!kvm_is_reserved_pfn(pfn))
                get_page(pfn_to_page(pfn));