powerpc/mm: Move book3s64 specifics in subdirectory mm/book3s64
Author:     Christophe Leroy <christophe.leroy@c-s.fr>
AuthorDate: Fri, 29 Mar 2019 10:00:00 +0000 (10:00 +0000)
Commit:     Michael Ellerman <mpe@ellerman.id.au>
CommitDate: Thu, 2 May 2019 15:18:38 +0000 (01:18 +1000)
Many files in arch/powerpc/mm are only for book3s64. This patch
creates a subdirectory for them.

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
[mpe: Update the selftest sym links, shorten new filenames, cleanup some
      whitespace and formatting in the new files.]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
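
The mechanical core of the move is a kbuild change: the parent arch/powerpc/mm/Makefile
stops listing the Book3S-64 objects individually and instead descends into the new
directory, which carries its own object list under the shortened file names. A condensed
sketch of that pattern, drawn from the two Makefile hunks below (illustrative excerpt,
not the complete rules):

    # arch/powerpc/mm/Makefile: descend into the subdirectory for Book3S-64 builds
    obj-$(CONFIG_PPC_BOOK3S_64)    += book3s64/

    # arch/powerpc/mm/book3s64/Makefile: the relocated and renamed objects
    obj-y                          += hash_pgtable.o hash_utils.o slb.o \
                                      mmu_context.o pgtable.o hash_tlb.o
    obj-$(CONFIG_PPC_RADIX_MMU)    += radix_pgtable.o radix_tlb.o
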
43 files changed:
arch/powerpc/mm/Makefile
arch/powerpc/mm/book3s64/Makefile [new file with mode: 0644]
arch/powerpc/mm/book3s64/hash_4k.c [new file with mode: 0644]
arch/powerpc/mm/book3s64/hash_64k.c [new file with mode: 0644]
arch/powerpc/mm/book3s64/hash_hugepage.c [new file with mode: 0644]
arch/powerpc/mm/book3s64/hash_hugetlbpage.c [new file with mode: 0644]
arch/powerpc/mm/book3s64/hash_native.c [new file with mode: 0644]
arch/powerpc/mm/book3s64/hash_pgtable.c [new file with mode: 0644]
arch/powerpc/mm/book3s64/hash_tlb.c [new file with mode: 0644]
arch/powerpc/mm/book3s64/hash_utils.c [new file with mode: 0644]
arch/powerpc/mm/book3s64/iommu_api.c [new file with mode: 0644]
arch/powerpc/mm/book3s64/mmu_context.c [new file with mode: 0644]
arch/powerpc/mm/book3s64/pgtable.c [new file with mode: 0644]
arch/powerpc/mm/book3s64/pkeys.c [new file with mode: 0644]
arch/powerpc/mm/book3s64/radix_hugetlbpage.c [new file with mode: 0644]
arch/powerpc/mm/book3s64/radix_pgtable.c [new file with mode: 0644]
arch/powerpc/mm/book3s64/radix_tlb.c [new file with mode: 0644]
arch/powerpc/mm/book3s64/slb.c [new file with mode: 0644]
arch/powerpc/mm/book3s64/subpage_prot.c [new file with mode: 0644]
arch/powerpc/mm/book3s64/vphn.c [new file with mode: 0644]
arch/powerpc/mm/book3s64/vphn.h [new file with mode: 0644]
arch/powerpc/mm/hash64_4k.c [deleted file]
arch/powerpc/mm/hash64_64k.c [deleted file]
arch/powerpc/mm/hash_native_64.c [deleted file]
arch/powerpc/mm/hash_utils_64.c [deleted file]
arch/powerpc/mm/hugepage-hash64.c [deleted file]
arch/powerpc/mm/hugetlbpage-hash64.c [deleted file]
arch/powerpc/mm/hugetlbpage-radix.c [deleted file]
arch/powerpc/mm/mmu_context_book3s64.c [deleted file]
arch/powerpc/mm/mmu_context_iommu.c [deleted file]
arch/powerpc/mm/numa.c
arch/powerpc/mm/pgtable-book3s64.c [deleted file]
arch/powerpc/mm/pgtable-hash64.c [deleted file]
arch/powerpc/mm/pgtable-radix.c [deleted file]
arch/powerpc/mm/pkeys.c [deleted file]
arch/powerpc/mm/slb.c [deleted file]
arch/powerpc/mm/subpage-prot.c [deleted file]
arch/powerpc/mm/tlb-radix.c [deleted file]
arch/powerpc/mm/tlb_hash64.c [deleted file]
arch/powerpc/mm/vphn.c [deleted file]
arch/powerpc/mm/vphn.h [deleted file]
tools/testing/selftests/powerpc/vphn/vphn.c
tools/testing/selftests/powerpc/vphn/vphn.h

diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 3c1bd9fa23cd9610c7e8013771119913167bf1ef..a137fdf775e25adc5ac9383bee7ec09ea4e7e2d8 100644
@@ -5,53 +5,34 @@
 
 ccflags-$(CONFIG_PPC64)        := $(NO_MINIMAL_TOC)
 
-CFLAGS_REMOVE_slb.o = $(CC_FLAGS_FTRACE)
-
 obj-y                          := fault.o mem.o pgtable.o mmap.o \
                                   init_$(BITS).o pgtable_$(BITS).o \
                                   init-common.o mmu_context.o drmem.o
 obj-$(CONFIG_PPC_MMU_NOHASH)   += mmu_context_nohash.o tlb_nohash.o \
                                   tlb_nohash_low.o
 obj-$(CONFIG_PPC_BOOK3E)       += tlb_low_$(BITS)e.o
-hash64-$(CONFIG_PPC_NATIVE)    := hash_native_64.o
 obj-$(CONFIG_PPC_BOOK3E_64)   += pgtable-book3e.o
-obj-$(CONFIG_PPC_BOOK3S_64)    += pgtable-hash64.o hash_utils_64.o slb.o \
-                                  $(hash64-y) mmu_context_book3s64.o \
-                                  pgtable-book3s64.o pgtable-frag.o
+obj-$(CONFIG_PPC_BOOK3S_64)    += book3s64/
+obj-$(CONFIG_PPC_BOOK3S_64)    += pgtable-frag.o
 obj-$(CONFIG_PPC32)            += pgtable-frag.o
-obj-$(CONFIG_PPC_RADIX_MMU)    += pgtable-radix.o tlb-radix.o
 obj-$(CONFIG_PPC_BOOK3S_32)    += ppc_mmu_32.o hash_low_32.o mmu_context_hash32.o
-obj-$(CONFIG_PPC_BOOK3S)       += tlb_hash$(BITS).o
-ifdef CONFIG_PPC_BOOK3S_64
-obj-$(CONFIG_PPC_4K_PAGES)     += hash64_4k.o
-obj-$(CONFIG_PPC_64K_PAGES)    += hash64_64k.o
-endif
+obj-$(CONFIG_PPC_BOOK3S_32)    += tlb_hash32.o
 obj-$(CONFIG_40x)              += 40x_mmu.o
 obj-$(CONFIG_44x)              += 44x_mmu.o
 obj-$(CONFIG_PPC_8xx)          += 8xx_mmu.o
 obj-$(CONFIG_PPC_FSL_BOOK3E)   += fsl_booke_mmu.o
 obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o
-obj-$(CONFIG_PPC_SPLPAR)       += vphn.o
 obj-$(CONFIG_PPC_MM_SLICES)    += slice.o
 obj-y                          += hugetlbpage.o
 ifdef CONFIG_HUGETLB_PAGE
-obj-$(CONFIG_PPC_BOOK3S_64)    += hugetlbpage-hash64.o
-obj-$(CONFIG_PPC_RADIX_MMU)    += hugetlbpage-radix.o
 obj-$(CONFIG_PPC_BOOK3E_MMU)   += hugetlbpage-book3e.o
 endif
-obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += hugepage-hash64.o
-obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage-prot.o
 obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
 obj-$(CONFIG_HIGHMEM)          += highmem.o
 obj-$(CONFIG_PPC_COPRO_BASE)   += copro_fault.o
-obj-$(CONFIG_SPAPR_TCE_IOMMU)  += mmu_context_iommu.o
 obj-$(CONFIG_PPC_PTDUMP)       += ptdump/
-obj-$(CONFIG_PPC_MEM_KEYS)     += pkeys.o
 
 # Disable kcov instrumentation on sensitive code
 # This is necessary for booting with kcov enabled on book3e machines
 KCOV_INSTRUMENT_tlb_nohash.o := n
 KCOV_INSTRUMENT_fsl_booke_mmu.o := n
-
-# Instrumenting the SLB fault path can lead to duplicate SLB entries
-KCOV_INSTRUMENT_slb.o := n
diff --git a/arch/powerpc/mm/book3s64/Makefile b/arch/powerpc/mm/book3s64/Makefile
new file mode 100644
index 0000000..974b4fc
--- /dev/null
@@ -0,0 +1,24 @@
+# SPDX-License-Identifier: GPL-2.0
+
+ccflags-y      := $(NO_MINIMAL_TOC)
+
+CFLAGS_REMOVE_slb.o = $(CC_FLAGS_FTRACE)
+
+obj-y                          += hash_pgtable.o hash_utils.o slb.o \
+                                  mmu_context.o pgtable.o hash_tlb.o
+obj-$(CONFIG_PPC_NATIVE)       += hash_native.o
+obj-$(CONFIG_PPC_RADIX_MMU)    += radix_pgtable.o radix_tlb.o
+obj-$(CONFIG_PPC_4K_PAGES)     += hash_4k.o
+obj-$(CONFIG_PPC_64K_PAGES)    += hash_64k.o
+obj-$(CONFIG_PPC_SPLPAR)       += vphn.o
+obj-$(CONFIG_HUGETLB_PAGE)     += hash_hugetlbpage.o
+ifdef CONFIG_HUGETLB_PAGE
+obj-$(CONFIG_PPC_RADIX_MMU)    += radix_hugetlbpage.o
+endif
+obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += hash_hugepage.o
+obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage_prot.o
+obj-$(CONFIG_SPAPR_TCE_IOMMU)  += iommu_api.o
+obj-$(CONFIG_PPC_MEM_KEYS)     += pkeys.o
+
+# Instrumenting the SLB fault path can lead to duplicate SLB entries
+KCOV_INSTRUMENT_slb.o := n
diff --git a/arch/powerpc/mm/book3s64/hash_4k.c b/arch/powerpc/mm/book3s64/hash_4k.c
new file mode 100644
index 0000000..22e7871
--- /dev/null
@@ -0,0 +1,124 @@
+/*
+ * Copyright IBM Corporation, 2015
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+#include <linux/mm.h>
+#include <asm/machdep.h>
+#include <asm/mmu.h>
+
+int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
+                  pte_t *ptep, unsigned long trap, unsigned long flags,
+                  int ssize, int subpg_prot)
+{
+       real_pte_t rpte;
+       unsigned long hpte_group;
+       unsigned long rflags, pa;
+       unsigned long old_pte, new_pte;
+       unsigned long vpn, hash, slot;
+       unsigned long shift = mmu_psize_defs[MMU_PAGE_4K].shift;
+
+       /*
+        * atomically mark the linux large page PTE busy and dirty
+        */
+       do {
+               pte_t pte = READ_ONCE(*ptep);
+
+               old_pte = pte_val(pte);
+               /* If PTE busy, retry the access */
+               if (unlikely(old_pte & H_PAGE_BUSY))
+                       return 0;
+               /* If PTE permissions don't match, take page fault */
+               if (unlikely(!check_pte_access(access, old_pte)))
+                       return 1;
+               /*
+                * Try to lock the PTE, add ACCESSED and DIRTY if it was
+                * a write access. (Unlike the 64K base page case, there is
+                * no H_PAGE_COMBO to add here.)
+                */
+               new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED;
+               if (access & _PAGE_WRITE)
+                       new_pte |= _PAGE_DIRTY;
+       } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
+
+       /*
+        * PP bits. _PAGE_USER is already PP bit 0x2, so we only
+        * need to add in 0x1 if it's a read-only user page
+        */
+       rflags = htab_convert_pte_flags(new_pte);
+       rpte = __real_pte(__pte(old_pte), ptep, PTRS_PER_PTE);
+
+       if (cpu_has_feature(CPU_FTR_NOEXECUTE) &&
+           !cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
+               rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
+
+       vpn  = hpt_vpn(ea, vsid, ssize);
+       if (unlikely(old_pte & H_PAGE_HASHPTE)) {
+               /*
+                * There MIGHT be an HPTE for this pte
+                */
+               unsigned long gslot = pte_get_hash_gslot(vpn, shift, ssize,
+                                                        rpte, 0);
+
+               if (mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn, MMU_PAGE_4K,
+                                              MMU_PAGE_4K, ssize, flags) == -1)
+                       old_pte &= ~_PAGE_HPTEFLAGS;
+       }
+
+       if (likely(!(old_pte & H_PAGE_HASHPTE))) {
+
+               pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
+               hash = hpt_hash(vpn, shift, ssize);
+
+repeat:
+               hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+
+               /* Insert into the hash table, primary slot */
+               slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0,
+                                               MMU_PAGE_4K, MMU_PAGE_4K, ssize);
+               /*
+                * Primary is full, try the secondary
+                */
+               if (unlikely(slot == -1)) {
+                       hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP;
+                       slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa,
+                                                       rflags,
+                                                       HPTE_V_SECONDARY,
+                                                       MMU_PAGE_4K,
+                                                       MMU_PAGE_4K, ssize);
+                       if (slot == -1) {
+                               if (mftb() & 0x1)
+                                       hpte_group = (hash & htab_hash_mask) *
+                                                       HPTES_PER_GROUP;
+                               mmu_hash_ops.hpte_remove(hpte_group);
+                               /*
+                                * FIXME!! Should we try the group from which we removed?
+                                */
+                               goto repeat;
+                       }
+               }
+               /*
+                * Hypervisor failure. Restore old pte and return -1
+                * similar to __hash_page_*
+                */
+               if (unlikely(slot == -2)) {
+                       *ptep = __pte(old_pte);
+                       hash_failure_debug(ea, access, vsid, trap, ssize,
+                                          MMU_PAGE_4K, MMU_PAGE_4K, old_pte);
+                       return -1;
+               }
+               new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE;
+               new_pte |= pte_set_hidx(ptep, rpte, 0, slot, PTRS_PER_PTE);
+       }
+       *ptep = __pte(new_pte & ~H_PAGE_BUSY);
+       return 0;
+}
diff --git a/arch/powerpc/mm/book3s64/hash_64k.c b/arch/powerpc/mm/book3s64/hash_64k.c
new file mode 100644
index 0000000..7084ce2
--- /dev/null
@@ -0,0 +1,333 @@
+/*
+ * Copyright IBM Corporation, 2015
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+#include <linux/mm.h>
+#include <asm/machdep.h>
+#include <asm/mmu.h>
+
+/*
+ * Return true, if the entry has a slot value which
+ * the software considers as invalid.
+ */
+static inline bool hpte_soft_invalid(unsigned long hidx)
+{
+       return ((hidx & 0xfUL) == 0xfUL);
+}
+
+/*
+ * index from 0 - 15
+ */
+bool __rpte_sub_valid(real_pte_t rpte, unsigned long index)
+{
+       return !(hpte_soft_invalid(__rpte_to_hidx(rpte, index)));
+}
+
+int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
+                  pte_t *ptep, unsigned long trap, unsigned long flags,
+                  int ssize, int subpg_prot)
+{
+       real_pte_t rpte;
+       unsigned long hpte_group;
+       unsigned int subpg_index;
+       unsigned long rflags, pa;
+       unsigned long old_pte, new_pte, subpg_pte;
+       unsigned long vpn, hash, slot, gslot;
+       unsigned long shift = mmu_psize_defs[MMU_PAGE_4K].shift;
+
+       /*
+        * atomically mark the linux large page PTE busy and dirty
+        */
+       do {
+               pte_t pte = READ_ONCE(*ptep);
+
+               old_pte = pte_val(pte);
+               /* If PTE busy, retry the access */
+               if (unlikely(old_pte & H_PAGE_BUSY))
+                       return 0;
+               /* If PTE permissions don't match, take page fault */
+               if (unlikely(!check_pte_access(access, old_pte)))
+                       return 1;
+               /*
+                * Try to lock the PTE, add ACCESSED and DIRTY if it was
+                * a write access. Since this is 4K insert of 64K page size
+                * also add H_PAGE_COMBO
+                */
+               new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED | H_PAGE_COMBO;
+               if (access & _PAGE_WRITE)
+                       new_pte |= _PAGE_DIRTY;
+       } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
+
+       /*
+        * Handle the subpage protection bits
+        */
+       subpg_pte = new_pte & ~subpg_prot;
+       rflags = htab_convert_pte_flags(subpg_pte);
+
+       if (cpu_has_feature(CPU_FTR_NOEXECUTE) &&
+           !cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) {
+
+               /*
+                * No CPU has hugepages but lacks no execute, so we
+                * don't need to worry about that case
+                */
+               rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
+       }
+
+       subpg_index = (ea & (PAGE_SIZE - 1)) >> shift;
+       vpn  = hpt_vpn(ea, vsid, ssize);
+       rpte = __real_pte(__pte(old_pte), ptep, PTRS_PER_PTE);
+       /*
+        * None of the sub 4k pages is hashed
+        */
+       if (!(old_pte & H_PAGE_HASHPTE))
+               goto htab_insert_hpte;
+       /*
+        * Check if the pte was already inserted into the hash table
+        * as a 64k HW page, and invalidate the 64k HPTE if so.
+        */
+       if (!(old_pte & H_PAGE_COMBO)) {
+               flush_hash_page(vpn, rpte, MMU_PAGE_64K, ssize, flags);
+               /*
+                * clear the old slot details from the old and new pte.
+                * On hash insert failure we use the old pte value and we don't
+                * want slot information there if we have an insert failure.
+                */
+               old_pte &= ~H_PAGE_HASHPTE;
+               new_pte &= ~H_PAGE_HASHPTE;
+               goto htab_insert_hpte;
+       }
+       /*
+        * Check for sub page valid and update
+        */
+       if (__rpte_sub_valid(rpte, subpg_index)) {
+               int ret;
+
+               gslot = pte_get_hash_gslot(vpn, shift, ssize, rpte,
+                                          subpg_index);
+               ret = mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn,
+                                                MMU_PAGE_4K, MMU_PAGE_4K,
+                                                ssize, flags);
+
+               /*
+                * If we failed, typically because the HPTE wasn't really
+                * there, we try an insertion.
+                */
+               if (ret == -1)
+                       goto htab_insert_hpte;
+
+               *ptep = __pte(new_pte & ~H_PAGE_BUSY);
+               return 0;
+       }
+
+htab_insert_hpte:
+
+       /*
+        * Initialize all hidx entries to invalid value, the first time
+        * the PTE is about to allocate a 4K HPTE.
+        */
+       if (!(old_pte & H_PAGE_COMBO))
+               rpte.hidx = INVALID_RPTE_HIDX;
+
+       /*
+        * handle H_PAGE_4K_PFN case
+        */
+       if (old_pte & H_PAGE_4K_PFN) {
+               /*
+                * All the sub 4k page have the same
+                * physical address.
+                */
+               pa = pte_pfn(__pte(old_pte)) << HW_PAGE_SHIFT;
+       } else {
+               pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
+               pa += (subpg_index << shift);
+       }
+       hash = hpt_hash(vpn, shift, ssize);
+repeat:
+       hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+
+       /* Insert into the hash table, primary slot */
+       slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0,
+                                       MMU_PAGE_4K, MMU_PAGE_4K, ssize);
+       /*
+        * Primary is full, try the secondary
+        */
+       if (unlikely(slot == -1)) {
+               bool soft_invalid;
+
+               hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP;
+               slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa,
+                                               rflags, HPTE_V_SECONDARY,
+                                               MMU_PAGE_4K, MMU_PAGE_4K,
+                                               ssize);
+
+               soft_invalid = hpte_soft_invalid(slot);
+               if (unlikely(soft_invalid)) {
+                       /*
+                        * We got a valid slot from a hardware point of view,
+                        * but we cannot use it: this special value, as defined
+                        * by hpte_soft_invalid(), is used to track invalid
+                        * slots. So invalidate it.
+                        */
+                       gslot = slot & _PTEIDX_GROUP_IX;
+                       mmu_hash_ops.hpte_invalidate(hpte_group + gslot, vpn,
+                                                    MMU_PAGE_4K, MMU_PAGE_4K,
+                                                    ssize, 0);
+               }
+
+               if (unlikely(slot == -1 || soft_invalid)) {
+                       /*
+                        * For soft invalid slot, let's ensure that we release a
+                        * slot from the primary, with the hope that we will
+                        * acquire that slot next time we try. This will ensure
+                        * that we do not get the same soft-invalid slot.
+                        */
+                       if (soft_invalid || (mftb() & 0x1))
+                               hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+
+                       mmu_hash_ops.hpte_remove(hpte_group);
+                       /*
+                        * FIXME!! Should we try the group from which we removed?
+                        */
+                       goto repeat;
+               }
+       }
+       /*
+        * Hypervisor failure. Restore old pte and return -1
+        * similar to __hash_page_*
+        */
+       if (unlikely(slot == -2)) {
+               *ptep = __pte(old_pte);
+               hash_failure_debug(ea, access, vsid, trap, ssize,
+                                  MMU_PAGE_4K, MMU_PAGE_4K, old_pte);
+               return -1;
+       }
+
+       new_pte |= pte_set_hidx(ptep, rpte, subpg_index, slot, PTRS_PER_PTE);
+       new_pte |= H_PAGE_HASHPTE;
+
+       *ptep = __pte(new_pte & ~H_PAGE_BUSY);
+       return 0;
+}
+
+int __hash_page_64K(unsigned long ea, unsigned long access,
+                   unsigned long vsid, pte_t *ptep, unsigned long trap,
+                   unsigned long flags, int ssize)
+{
+       real_pte_t rpte;
+       unsigned long hpte_group;
+       unsigned long rflags, pa;
+       unsigned long old_pte, new_pte;
+       unsigned long vpn, hash, slot;
+       unsigned long shift = mmu_psize_defs[MMU_PAGE_64K].shift;
+
+       /*
+        * atomically mark the linux large page PTE busy and dirty
+        */
+       do {
+               pte_t pte = READ_ONCE(*ptep);
+
+               old_pte = pte_val(pte);
+               /* If PTE busy, retry the access */
+               if (unlikely(old_pte & H_PAGE_BUSY))
+                       return 0;
+               /* If PTE permissions don't match, take page fault */
+               if (unlikely(!check_pte_access(access, old_pte)))
+                       return 1;
+               /*
+                * Check if PTE has the cache-inhibit bit set
+                * If so, bail out and refault as a 4k page
+                */
+               if (!mmu_has_feature(MMU_FTR_CI_LARGE_PAGE) &&
+                   unlikely(pte_ci(pte)))
+                       return 0;
+               /*
+                * Try to lock the PTE, add ACCESSED and DIRTY if it was
+                * a write access.
+                */
+               new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED;
+               if (access & _PAGE_WRITE)
+                       new_pte |= _PAGE_DIRTY;
+       } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
+
+       rflags = htab_convert_pte_flags(new_pte);
+       rpte = __real_pte(__pte(old_pte), ptep, PTRS_PER_PTE);
+
+       if (cpu_has_feature(CPU_FTR_NOEXECUTE) &&
+           !cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
+               rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
+
+       vpn  = hpt_vpn(ea, vsid, ssize);
+       if (unlikely(old_pte & H_PAGE_HASHPTE)) {
+               unsigned long gslot;
+
+               /*
+                * There MIGHT be an HPTE for this pte
+                */
+               gslot = pte_get_hash_gslot(vpn, shift, ssize, rpte, 0);
+               if (mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn, MMU_PAGE_64K,
+                                              MMU_PAGE_64K, ssize,
+                                              flags) == -1)
+                       old_pte &= ~_PAGE_HPTEFLAGS;
+       }
+
+       if (likely(!(old_pte & H_PAGE_HASHPTE))) {
+
+               pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
+               hash = hpt_hash(vpn, shift, ssize);
+
+repeat:
+               hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+
+               /* Insert into the hash table, primary slot */
+               slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0,
+                                               MMU_PAGE_64K, MMU_PAGE_64K,
+                                               ssize);
+               /*
+                * Primary is full, try the secondary
+                */
+               if (unlikely(slot == -1)) {
+                       hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP;
+                       slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa,
+                                                       rflags,
+                                                       HPTE_V_SECONDARY,
+                                                       MMU_PAGE_64K,
+                                                       MMU_PAGE_64K, ssize);
+                       if (slot == -1) {
+                               if (mftb() & 0x1)
+                                       hpte_group = (hash & htab_hash_mask) *
+                                                       HPTES_PER_GROUP;
+                               mmu_hash_ops.hpte_remove(hpte_group);
+                               /*
+                                * FIXME!! Should we try the group from which we removed?
+                                */
+                               goto repeat;
+                       }
+               }
+               /*
+                * Hypervisor failure. Restore old pte and return -1
+                * similar to __hash_page_*
+                */
+               if (unlikely(slot == -2)) {
+                       *ptep = __pte(old_pte);
+                       hash_failure_debug(ea, access, vsid, trap, ssize,
+                                          MMU_PAGE_64K, MMU_PAGE_64K, old_pte);
+                       return -1;
+               }
+
+               new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE;
+               new_pte |= pte_set_hidx(ptep, rpte, 0, slot, PTRS_PER_PTE);
+       }
+       *ptep = __pte(new_pte & ~H_PAGE_BUSY);
+       return 0;
+}
diff --git a/arch/powerpc/mm/book3s64/hash_hugepage.c b/arch/powerpc/mm/book3s64/hash_hugepage.c
new file mode 100644
index 0000000..4408237
--- /dev/null
@@ -0,0 +1,191 @@
+/*
+ * Copyright IBM Corporation, 2013
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+/*
+ * PPC64 THP Support for hash based MMUs
+ */
+#include <linux/mm.h>
+#include <asm/machdep.h>
+
+int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
+                   pmd_t *pmdp, unsigned long trap, unsigned long flags,
+                   int ssize, unsigned int psize)
+{
+       unsigned int index, valid;
+       unsigned char *hpte_slot_array;
+       unsigned long rflags, pa, hidx;
+       unsigned long old_pmd, new_pmd;
+       int ret, lpsize = MMU_PAGE_16M;
+       unsigned long vpn, hash, shift, slot;
+
+       /*
+        * atomically mark the linux large page PMD busy and dirty
+        */
+       do {
+               pmd_t pmd = READ_ONCE(*pmdp);
+
+               old_pmd = pmd_val(pmd);
+               /* If PMD busy, retry the access */
+               if (unlikely(old_pmd & H_PAGE_BUSY))
+                       return 0;
+               /* If PMD permissions don't match, take page fault */
+               if (unlikely(!check_pte_access(access, old_pmd)))
+                       return 1;
+               /*
+                * Try to lock the PTE, add ACCESSED and DIRTY if it was
+                * a write access
+                */
+               new_pmd = old_pmd | H_PAGE_BUSY | _PAGE_ACCESSED;
+               if (access & _PAGE_WRITE)
+                       new_pmd |= _PAGE_DIRTY;
+       } while (!pmd_xchg(pmdp, __pmd(old_pmd), __pmd(new_pmd)));
+
+       /*
+        * Make sure this is thp or devmap entry
+        */
+       if (!(old_pmd & (H_PAGE_THP_HUGE | _PAGE_DEVMAP)))
+               return 0;
+
+       rflags = htab_convert_pte_flags(new_pmd);
+
+#if 0
+       if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) {
+
+               /*
+                * No CPU has hugepages but lacks no execute, so we
+                * don't need to worry about that case
+                */
+               rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
+       }
+#endif
+       /*
+        * Find the slot index details for this ea, using base page size.
+        */
+       shift = mmu_psize_defs[psize].shift;
+       index = (ea & ~HPAGE_PMD_MASK) >> shift;
+       BUG_ON(index >= PTE_FRAG_SIZE);
+
+       vpn = hpt_vpn(ea, vsid, ssize);
+       hpte_slot_array = get_hpte_slot_array(pmdp);
+       if (psize == MMU_PAGE_4K) {
+               /*
+                * invalidate the old hpte entry if we have that mapped via 64K
+                * base page size. This is because demote_segment won't flush
+                * hash page table entries.
+                */
+               if ((old_pmd & H_PAGE_HASHPTE) && !(old_pmd & H_PAGE_COMBO)) {
+                       flush_hash_hugepage(vsid, ea, pmdp, MMU_PAGE_64K,
+                                           ssize, flags);
+                       /*
+                        * With THP, we also clear the slot information with
+                        * respect to all the 64K hash pte mapping the 16MB
+                        * page. They are all invalid now. This makes sure we
+                        * don't find the slot valid when we fault with 4k
+                        * base page size.
+                        *
+                        */
+                       memset(hpte_slot_array, 0, PTE_FRAG_SIZE);
+               }
+       }
+
+       valid = hpte_valid(hpte_slot_array, index);
+       if (valid) {
+               /* update the hpte bits */
+               hash = hpt_hash(vpn, shift, ssize);
+               hidx =  hpte_hash_index(hpte_slot_array, index);
+               if (hidx & _PTEIDX_SECONDARY)
+                       hash = ~hash;
+               slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+               slot += hidx & _PTEIDX_GROUP_IX;
+
+               ret = mmu_hash_ops.hpte_updatepp(slot, rflags, vpn,
+                                                psize, lpsize, ssize, flags);
+               /*
+                * We failed to update, try to insert a new entry.
+                */
+               if (ret == -1) {
+                       /*
+                        * large pte is marked busy, so we can be sure
+                        * nobody is looking at hpte_slot_array. hence we can
+                        * safely update this here.
+                        */
+                       valid = 0;
+                       hpte_slot_array[index] = 0;
+               }
+       }
+
+       if (!valid) {
+               unsigned long hpte_group;
+
+               hash = hpt_hash(vpn, shift, ssize);
+               /* insert new entry */
+               pa = pmd_pfn(__pmd(old_pmd)) << PAGE_SHIFT;
+               new_pmd |= H_PAGE_HASHPTE;
+
+repeat:
+               hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+
+               /* Insert into the hash table, primary slot */
+               slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0,
+                                               psize, lpsize, ssize);
+               /*
+                * Primary is full, try the secondary
+                */
+               if (unlikely(slot == -1)) {
+                       hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP;
+                       slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa,
+                                                       rflags,
+                                                       HPTE_V_SECONDARY,
+                                                       psize, lpsize, ssize);
+                       if (slot == -1) {
+                               if (mftb() & 0x1)
+                                       hpte_group = (hash & htab_hash_mask) *
+                                                       HPTES_PER_GROUP;
+
+                               mmu_hash_ops.hpte_remove(hpte_group);
+                               goto repeat;
+                       }
+               }
+               /*
+                * Hypervisor failure. Restore old pmd and return -1
+                * similar to __hash_page_*
+                */
+               if (unlikely(slot == -2)) {
+                       *pmdp = __pmd(old_pmd);
+                       hash_failure_debug(ea, access, vsid, trap, ssize,
+                                          psize, lpsize, old_pmd);
+                       return -1;
+               }
+               /*
+                * large pte is marked busy, so we can be sure
+                * nobody is looking at hpte_slot_array. hence we can
+                * safely update this here.
+                */
+               mark_hpte_slot_valid(hpte_slot_array, index, slot);
+       }
+       /*
+        * Mark the pte with H_PAGE_COMBO, if we are trying to hash it with
+        * base page size 4k.
+        */
+       if (psize == MMU_PAGE_4K)
+               new_pmd |= H_PAGE_COMBO;
+       /*
+        * The hpte valid is stored in the pgtable whose address is in the
+        * second half of the PMD. Order this against clearing of the busy bit in
+        * huge pmd.
+        */
+       smp_wmb();
+       *pmdp = __pmd(new_pmd & ~H_PAGE_BUSY);
+       return 0;
+}
diff --git a/arch/powerpc/mm/book3s64/hash_hugetlbpage.c b/arch/powerpc/mm/book3s64/hash_hugetlbpage.c
new file mode 100644
index 0000000..2d4e02a
--- /dev/null
@@ -0,0 +1,152 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * PPC64 Huge TLB Page Support for hash based MMUs (POWER4 and later)
+ *
+ * Copyright (C) 2003 David Gibson, IBM Corporation.
+ *
+ * Based on the IA-32 version:
+ * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
+ */
+
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/cacheflush.h>
+#include <asm/machdep.h>
+
+extern long hpte_insert_repeating(unsigned long hash, unsigned long vpn,
+                                 unsigned long pa, unsigned long rflags,
+                                 unsigned long vflags, int psize, int ssize);
+
+int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
+                    pte_t *ptep, unsigned long trap, unsigned long flags,
+                    int ssize, unsigned int shift, unsigned int mmu_psize)
+{
+       real_pte_t rpte;
+       unsigned long vpn;
+       unsigned long old_pte, new_pte;
+       unsigned long rflags, pa;
+       long slot, offset;
+
+       BUG_ON(shift != mmu_psize_defs[mmu_psize].shift);
+
+       /* Search the Linux page table for a match with va */
+       vpn = hpt_vpn(ea, vsid, ssize);
+
+       /*
+        * At this point, we have a pte (old_pte) which can be used to build
+        * or update an HPTE. There are 2 cases:
+        *
+        * 1. There is a valid (present) pte with no associated HPTE (this is
+        *      the most common case)
+        * 2. There is a valid (present) pte with an associated HPTE. The
+        *      current values of the pp bits in the HPTE prevent access
+        *      because we are doing software DIRTY bit management and the
+        *      page is currently not DIRTY.
+        */
+
+
+       do {
+               old_pte = pte_val(*ptep);
+               /* If PTE busy, retry the access */
+               if (unlikely(old_pte & H_PAGE_BUSY))
+                       return 0;
+               /* If PTE permissions don't match, take page fault */
+               if (unlikely(!check_pte_access(access, old_pte)))
+                       return 1;
+
+               /*
+                * Try to lock the PTE, add ACCESSED and DIRTY if it was
+                * a write access
+                */
+               new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED;
+               if (access & _PAGE_WRITE)
+                       new_pte |= _PAGE_DIRTY;
+       } while(!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
+
+       /* Make sure this is a hugetlb entry */
+       if (old_pte & (H_PAGE_THP_HUGE | _PAGE_DEVMAP))
+               return 0;
+
+       rflags = htab_convert_pte_flags(new_pte);
+       if (unlikely(mmu_psize == MMU_PAGE_16G))
+               offset = PTRS_PER_PUD;
+       else
+               offset = PTRS_PER_PMD;
+       rpte = __real_pte(__pte(old_pte), ptep, offset);
+
+       if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
+               /*
+                * No CPU has hugepages but lacks no execute, so we
+                * don't need to worry about that case
+                */
+               rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
+
+       /* Check if pte already has an hpte (case 2) */
+       if (unlikely(old_pte & H_PAGE_HASHPTE)) {
+               /* There MIGHT be an HPTE for this pte */
+               unsigned long gslot;
+
+               gslot = pte_get_hash_gslot(vpn, shift, ssize, rpte, 0);
+               if (mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn, mmu_psize,
+                                              mmu_psize, ssize, flags) == -1)
+                       old_pte &= ~_PAGE_HPTEFLAGS;
+       }
+
+       if (likely(!(old_pte & H_PAGE_HASHPTE))) {
+               unsigned long hash = hpt_hash(vpn, shift, ssize);
+
+               pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
+
+               /* clear HPTE slot information in the new PTE */
+               new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE;
+
+               slot = hpte_insert_repeating(hash, vpn, pa, rflags, 0,
+                                            mmu_psize, ssize);
+
+               /*
+                * Hypervisor failure. Restore old pte and return -1
+                * similar to __hash_page_*
+                */
+               if (unlikely(slot == -2)) {
+                       *ptep = __pte(old_pte);
+                       hash_failure_debug(ea, access, vsid, trap, ssize,
+                                          mmu_psize, mmu_psize, old_pte);
+                       return -1;
+               }
+
+               new_pte |= pte_set_hidx(ptep, rpte, 0, slot, offset);
+       }
+
+       /*
+        * No need to use ldarx/stdcx here
+        */
+       *ptep = __pte(new_pte & ~H_PAGE_BUSY);
+       return 0;
+}
+
+pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma,
+                                 unsigned long addr, pte_t *ptep)
+{
+       unsigned long pte_val;
+       /*
+        * Clear the _PAGE_PRESENT so that no hardware parallel update is
+        * possible. Also keep the pte_present true so that we don't take
+        * wrong fault.
+        */
+       pte_val = pte_update(vma->vm_mm, addr, ptep,
+                            _PAGE_PRESENT, _PAGE_INVALID, 1);
+
+       return __pte(pte_val);
+}
+
+void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
+                                 pte_t *ptep, pte_t old_pte, pte_t pte)
+{
+
+       if (radix_enabled())
+               return radix__huge_ptep_modify_prot_commit(vma, addr, ptep,
+                                                          old_pte, pte);
+       set_huge_pte_at(vma->vm_mm, addr, ptep, pte);
+}
diff --git a/arch/powerpc/mm/book3s64/hash_native.c b/arch/powerpc/mm/book3s64/hash_native.c
new file mode 100644
index 0000000..aaa28fd
--- /dev/null
@@ -0,0 +1,884 @@
+/*
+ * native hashtable management.
+ *
+ * SMP scalability work:
+ *    Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
+ * 
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#undef DEBUG_LOW
+
+#include <linux/spinlock.h>
+#include <linux/bitops.h>
+#include <linux/of.h>
+#include <linux/processor.h>
+#include <linux/threads.h>
+#include <linux/smp.h>
+
+#include <asm/machdep.h>
+#include <asm/mmu.h>
+#include <asm/mmu_context.h>
+#include <asm/pgtable.h>
+#include <asm/trace.h>
+#include <asm/tlb.h>
+#include <asm/cputable.h>
+#include <asm/udbg.h>
+#include <asm/kexec.h>
+#include <asm/ppc-opcode.h>
+#include <asm/feature-fixups.h>
+
+#include <misc/cxl-base.h>
+
+#ifdef DEBUG_LOW
+#define DBG_LOW(fmt...) udbg_printf(fmt)
+#else
+#define DBG_LOW(fmt...)
+#endif
+
+#ifdef __BIG_ENDIAN__
+#define HPTE_LOCK_BIT 3
+#else
+#define HPTE_LOCK_BIT (56+3)
+#endif
+
+DEFINE_RAW_SPINLOCK(native_tlbie_lock);
+
+static inline void tlbiel_hash_set_isa206(unsigned int set, unsigned int is)
+{
+       unsigned long rb;
+
+       rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53));
+
+       asm volatile("tlbiel %0" : : "r" (rb));
+}
+
+/*
+ * tlbiel instruction for hash, set invalidation
+ * i.e., r=1 and is=01 or is=10 or is=11
+ */
+static inline void tlbiel_hash_set_isa300(unsigned int set, unsigned int is,
+                                       unsigned int pid,
+                                       unsigned int ric, unsigned int prs)
+{
+       unsigned long rb;
+       unsigned long rs;
+       unsigned int r = 0; /* hash format */
+
+       rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53));
+       rs = ((unsigned long)pid << PPC_BITLSHIFT(31));
+
+       asm volatile(PPC_TLBIEL(%0, %1, %2, %3, %4)
+                    : : "r"(rb), "r"(rs), "i"(ric), "i"(prs), "r"(r)
+                    : "memory");
+}
+
+
+static void tlbiel_all_isa206(unsigned int num_sets, unsigned int is)
+{
+       unsigned int set;
+
+       asm volatile("ptesync": : :"memory");
+
+       for (set = 0; set < num_sets; set++)
+               tlbiel_hash_set_isa206(set, is);
+
+       asm volatile("ptesync": : :"memory");
+}
+
+static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is)
+{
+       unsigned int set;
+
+       asm volatile("ptesync": : :"memory");
+
+       /*
+        * Flush the first set of the TLB, and any caching of partition table
+        * entries. Then flush the remaining sets of the TLB. Hash mode uses
+        * partition scoped TLB translations.
+        */
+       tlbiel_hash_set_isa300(0, is, 0, 2, 0);
+       for (set = 1; set < num_sets; set++)
+               tlbiel_hash_set_isa300(set, is, 0, 0, 0);
+
+       /*
+        * Now invalidate the process table cache.
+        *
+        * From ISA v3.0B p. 1078:
+        *     The following forms are invalid.
+        *      * PRS=1, R=0, and RIC!=2 (The only process-scoped
+        *        HPT caching is of the Process Table.)
+        */
+       tlbiel_hash_set_isa300(0, is, 0, 2, 1);
+
+       asm volatile("ptesync": : :"memory");
+
+       asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
+}
+
+void hash__tlbiel_all(unsigned int action)
+{
+       unsigned int is;
+
+       switch (action) {
+       case TLB_INVAL_SCOPE_GLOBAL:
+               is = 3;
+               break;
+       case TLB_INVAL_SCOPE_LPID:
+               is = 2;
+               break;
+       default:
+               BUG();
+       }
+
+       if (early_cpu_has_feature(CPU_FTR_ARCH_300))
+               tlbiel_all_isa300(POWER9_TLB_SETS_HASH, is);
+       else if (early_cpu_has_feature(CPU_FTR_ARCH_207S))
+               tlbiel_all_isa206(POWER8_TLB_SETS, is);
+       else if (early_cpu_has_feature(CPU_FTR_ARCH_206))
+               tlbiel_all_isa206(POWER7_TLB_SETS, is);
+       else
+               WARN(1, "%s called on pre-POWER7 CPU\n", __func__);
+}
+
+static inline unsigned long  ___tlbie(unsigned long vpn, int psize,
+                                               int apsize, int ssize)
+{
+       unsigned long va;
+       unsigned int penc;
+       unsigned long sllp;
+
+       /*
+        * We need 14 to 65 bits of va for a tlbie of a 4K page.
+        * With vpn we ignore the lower VPN_SHIFT bits already.
+        * And the top two bits are already ignored because we can
+        * only accommodate 76 bits in a 64 bit vpn with a VPN_SHIFT
+        * of 12.
+        */
+       va = vpn << VPN_SHIFT;
+       /*
+        * clear top 16 bits of 64bit va, non SLS segment
+        * Older versions of the architecture (2.02 and earlier) require the
+        * masking of the top 16 bits.
+        */
+       if (mmu_has_feature(MMU_FTR_TLBIE_CROP_VA))
+               va &= ~(0xffffULL << 48);
+
+       switch (psize) {
+       case MMU_PAGE_4K:
+               /* clear out bits after (52) [0....52.....63] */
+               va &= ~((1ul << (64 - 52)) - 1);
+               va |= ssize << 8;
+               sllp = get_sllp_encoding(apsize);
+               va |= sllp << 5;
+               asm volatile(ASM_FTR_IFCLR("tlbie %0,0", PPC_TLBIE(%1,%0), %2)
+                            : : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206)
+                            : "memory");
+               break;
+       default:
+               /* We need 14 to 14 + i bits of va */
+               penc = mmu_psize_defs[psize].penc[apsize];
+               va &= ~((1ul << mmu_psize_defs[apsize].shift) - 1);
+               va |= penc << 12;
+               va |= ssize << 8;
+               /*
+                * AVAL bits:
+                * We don't need all the bits, but the rest of the bits
+                * must be ignored by the processor.
+                * vpn covers up to 65 bits of va (0...65) and we need
+                * 58..64 bits of va.
+                */
+               va |= (vpn & 0xfe); /* AVAL */
+               va |= 1; /* L */
+               asm volatile(ASM_FTR_IFCLR("tlbie %0,1", PPC_TLBIE(%1,%0), %2)
+                            : : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206)
+                            : "memory");
+               break;
+       }
+       return va;
+}
+
+static inline void fixup_tlbie(unsigned long vpn, int psize, int apsize, int ssize)
+{
+       if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) {
+               /* Need the extra ptesync to ensure we don't reorder tlbie*/
+               asm volatile("ptesync": : :"memory");
+               ___tlbie(vpn, psize, apsize, ssize);
+       }
+}
+
+static inline void __tlbie(unsigned long vpn, int psize, int apsize, int ssize)
+{
+       unsigned long rb;
+
+       rb = ___tlbie(vpn, psize, apsize, ssize);
+       trace_tlbie(0, 0, rb, 0, 0, 0, 0);
+}
+
+static inline void __tlbiel(unsigned long vpn, int psize, int apsize, int ssize)
+{
+       unsigned long va;
+       unsigned int penc;
+       unsigned long sllp;
+
+       /* VPN_SHIFT can be at most 12 */
+       va = vpn << VPN_SHIFT;
+       /*
+        * clear top 16 bits of 64 bit va, non SLS segment
+        * Older versions of the architecture (2.02 and earlier) require the
+        * masking of the top 16 bits.
+        */
+       if (mmu_has_feature(MMU_FTR_TLBIE_CROP_VA))
+               va &= ~(0xffffULL << 48);
+
+       switch (psize) {
+       case MMU_PAGE_4K:
+               /* clear out bits after(52) [0....52.....63] */
+               va &= ~((1ul << (64 - 52)) - 1);
+               va |= ssize << 8;
+               sllp = get_sllp_encoding(apsize);
+               va |= sllp << 5;
+               asm volatile(ASM_FTR_IFSET("tlbiel %0", "tlbiel %0,0", %1)
+                            : : "r" (va), "i" (CPU_FTR_ARCH_206)
+                            : "memory");
+               break;
+       default:
+               /* We need 14 to 14 + i bits of va */
+               penc = mmu_psize_defs[psize].penc[apsize];
+               va &= ~((1ul << mmu_psize_defs[apsize].shift) - 1);
+               va |= penc << 12;
+               va |= ssize << 8;
+               /*
+                * AVAL bits:
+                * We don't need all the bits, but the rest of the bits
+                * must be ignored by the processor.
+                * vpn covers up to 65 bits of va (0...65) and we need
+                * 58..64 bits of va.
+                */
+               va |= (vpn & 0xfe);
+               va |= 1; /* L */
+               asm volatile(ASM_FTR_IFSET("tlbiel %0", "tlbiel %0,1", %1)
+                            : : "r" (va), "i" (CPU_FTR_ARCH_206)
+                            : "memory");
+               break;
+       }
+       trace_tlbie(0, 1, va, 0, 0, 0, 0);
+
+}
+
+static inline void tlbie(unsigned long vpn, int psize, int apsize,
+                        int ssize, int local)
+{
+       unsigned int use_local;
+       int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
+
+       use_local = local && mmu_has_feature(MMU_FTR_TLBIEL) && !cxl_ctx_in_use();
+
+       if (use_local)
+               use_local = mmu_psize_defs[psize].tlbiel;
+       if (lock_tlbie && !use_local)
+               raw_spin_lock(&native_tlbie_lock);
+       asm volatile("ptesync": : :"memory");
+       if (use_local) {
+               __tlbiel(vpn, psize, apsize, ssize);
+               asm volatile("ptesync": : :"memory");
+       } else {
+               __tlbie(vpn, psize, apsize, ssize);
+               fixup_tlbie(vpn, psize, apsize, ssize);
+               asm volatile("eieio; tlbsync; ptesync": : :"memory");
+       }
+       if (lock_tlbie && !use_local)
+               raw_spin_unlock(&native_tlbie_lock);
+}
+
+static inline void native_lock_hpte(struct hash_pte *hptep)
+{
+       unsigned long *word = (unsigned long *)&hptep->v;
+
+       while (1) {
+               if (!test_and_set_bit_lock(HPTE_LOCK_BIT, word))
+                       break;
+               spin_begin();
+               while(test_bit(HPTE_LOCK_BIT, word))
+                       spin_cpu_relax();
+               spin_end();
+       }
+}
+
+static inline void native_unlock_hpte(struct hash_pte *hptep)
+{
+       unsigned long *word = (unsigned long *)&hptep->v;
+
+       clear_bit_unlock(HPTE_LOCK_BIT, word);
+}
+
+static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn,
+                       unsigned long pa, unsigned long rflags,
+                       unsigned long vflags, int psize, int apsize, int ssize)
+{
+       struct hash_pte *hptep = htab_address + hpte_group;
+       unsigned long hpte_v, hpte_r;
+       int i;
+
+       if (!(vflags & HPTE_V_BOLTED)) {
+               DBG_LOW("    insert(group=%lx, vpn=%016lx, pa=%016lx,"
+                       " rflags=%lx, vflags=%lx, psize=%d)\n",
+                       hpte_group, vpn, pa, rflags, vflags, psize);
+       }
+
+       for (i = 0; i < HPTES_PER_GROUP; i++) {
+               if (! (be64_to_cpu(hptep->v) & HPTE_V_VALID)) {
+                       /* retry with lock held */
+                       native_lock_hpte(hptep);
+                       if (! (be64_to_cpu(hptep->v) & HPTE_V_VALID))
+                               break;
+                       native_unlock_hpte(hptep);
+               }
+
+               hptep++;
+       }
+
+       if (i == HPTES_PER_GROUP)
+               return -1;
+
+       hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID;
+       hpte_r = hpte_encode_r(pa, psize, apsize) | rflags;
+
+       if (!(vflags & HPTE_V_BOLTED)) {
+               DBG_LOW(" i=%x hpte_v=%016lx, hpte_r=%016lx\n",
+                       i, hpte_v, hpte_r);
+       }
+
+       if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+               hpte_r = hpte_old_to_new_r(hpte_v, hpte_r);
+               hpte_v = hpte_old_to_new_v(hpte_v);
+       }
+
+       hptep->r = cpu_to_be64(hpte_r);
+       /* Guarantee the second dword is visible before the valid bit */
+       eieio();
+       /*
+        * Now set the first dword including the valid bit
+        * NOTE: this also unlocks the hpte
+        */
+       hptep->v = cpu_to_be64(hpte_v);
+
+       __asm__ __volatile__ ("ptesync" : : : "memory");
+
+       return i | (!!(vflags & HPTE_V_SECONDARY) << 3);
+}
+
+static long native_hpte_remove(unsigned long hpte_group)
+{
+       struct hash_pte *hptep;
+       int i;
+       int slot_offset;
+       unsigned long hpte_v;
+
+       DBG_LOW("    remove(group=%lx)\n", hpte_group);
+
+       /* pick a random entry to start at */
+       slot_offset = mftb() & 0x7;
+
+       for (i = 0; i < HPTES_PER_GROUP; i++) {
+               hptep = htab_address + hpte_group + slot_offset;
+               hpte_v = be64_to_cpu(hptep->v);
+
+               if ((hpte_v & HPTE_V_VALID) && !(hpte_v & HPTE_V_BOLTED)) {
+                       /* retry with lock held */
+                       native_lock_hpte(hptep);
+                       hpte_v = be64_to_cpu(hptep->v);
+                       if ((hpte_v & HPTE_V_VALID)
+                           && !(hpte_v & HPTE_V_BOLTED))
+                               break;
+                       native_unlock_hpte(hptep);
+               }
+
+               slot_offset++;
+               slot_offset &= 0x7;
+       }
+
+       if (i == HPTES_PER_GROUP)
+               return -1;
+
+       /* Invalidate the hpte. NOTE: this also unlocks it */
+       hptep->v = 0;
+
+       return i;
+}
+
+static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
+                                unsigned long vpn, int bpsize,
+                                int apsize, int ssize, unsigned long flags)
+{
+       struct hash_pte *hptep = htab_address + slot;
+       unsigned long hpte_v, want_v;
+       int ret = 0, local = 0;
+
+       want_v = hpte_encode_avpn(vpn, bpsize, ssize);
+
+       DBG_LOW("    update(vpn=%016lx, avpnv=%016lx, group=%lx, newpp=%lx)",
+               vpn, want_v & HPTE_V_AVPN, slot, newpp);
+
+       hpte_v = hpte_get_old_v(hptep);
+       /*
+        * We need to invalidate the TLB always because hpte_remove doesn't do
+        * a tlb invalidate. If a hash bucket gets full, we "evict" a more/less
+        * random entry from it. When we do that we don't invalidate the TLB
+        * (hpte_remove) because we assume the old translation is still
+        * technically "valid".
+        */
+       if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) {
+               DBG_LOW(" -> miss\n");
+               ret = -1;
+       } else {
+               native_lock_hpte(hptep);
+               /* recheck with locks held */
+               hpte_v = hpte_get_old_v(hptep);
+               if (unlikely(!HPTE_V_COMPARE(hpte_v, want_v) ||
+                            !(hpte_v & HPTE_V_VALID))) {
+                       ret = -1;
+               } else {
+                       DBG_LOW(" -> hit\n");
+                       /* Update the HPTE */
+                       hptep->r = cpu_to_be64((be64_to_cpu(hptep->r) &
+                                               ~(HPTE_R_PPP | HPTE_R_N)) |
+                                              (newpp & (HPTE_R_PPP | HPTE_R_N |
+                                                        HPTE_R_C)));
+               }
+               native_unlock_hpte(hptep);
+       }
+
+       if (flags & HPTE_LOCAL_UPDATE)
+               local = 1;
+       /*
+        * Ensure it is out of the tlb too if it is not a nohpte fault
+        */
+       if (!(flags & HPTE_NOHPTE_UPDATE))
+               tlbie(vpn, bpsize, apsize, ssize, local);
+
+       return ret;
+}
+
+static long native_hpte_find(unsigned long vpn, int psize, int ssize)
+{
+       struct hash_pte *hptep;
+       unsigned long hash;
+       unsigned long i;
+       long slot;
+       unsigned long want_v, hpte_v;
+
+       hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize);
+       want_v = hpte_encode_avpn(vpn, psize, ssize);
+
+       /* Bolted mappings are only ever in the primary group */
+       slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+       for (i = 0; i < HPTES_PER_GROUP; i++) {
+
+               hptep = htab_address + slot;
+               hpte_v = hpte_get_old_v(hptep);
+               if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID))
+                       /* HPTE matches */
+                       return slot;
+               ++slot;
+       }
+
+       return -1;
+}
+
+/*
+ * Update the page protection bits. Intended to be used to create
+ * guard pages for kernel data structures on pages which are bolted
+ * in the HPT. Assumes pages being operated on will not be stolen.
+ *
+ * No need to lock here because we should be the only user.
+ */
+static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea,
+                                      int psize, int ssize)
+{
+       unsigned long vpn;
+       unsigned long vsid;
+       long slot;
+       struct hash_pte *hptep;
+
+       vsid = get_kernel_vsid(ea, ssize);
+       vpn = hpt_vpn(ea, vsid, ssize);
+
+       slot = native_hpte_find(vpn, psize, ssize);
+       if (slot == -1)
+               panic("could not find page to bolt\n");
+       hptep = htab_address + slot;
+
+       /* Update the HPTE */
+       hptep->r = cpu_to_be64((be64_to_cpu(hptep->r) &
+                               ~(HPTE_R_PPP | HPTE_R_N)) |
+                              (newpp & (HPTE_R_PPP | HPTE_R_N)));
+       /*
+        * Ensure it is out of the tlb too. For bolted entries the base and
+        * actual page size will be the same.
+        */
+       tlbie(vpn, psize, psize, ssize, 0);
+}
+
+/*
+ * Remove a bolted kernel entry. Memory hotplug uses this.
+ *
+ * No need to lock here because we should be the only user.
+ */
+static int native_hpte_removebolted(unsigned long ea, int psize, int ssize)
+{
+       unsigned long vpn;
+       unsigned long vsid;
+       long slot;
+       struct hash_pte *hptep;
+
+       vsid = get_kernel_vsid(ea, ssize);
+       vpn = hpt_vpn(ea, vsid, ssize);
+
+       slot = native_hpte_find(vpn, psize, ssize);
+       if (slot == -1)
+               return -ENOENT;
+
+       hptep = htab_address + slot;
+
+       VM_WARN_ON(!(be64_to_cpu(hptep->v) & HPTE_V_BOLTED));
+
+       /* Invalidate the hpte */
+       hptep->v = 0;
+
+       /* Invalidate the TLB */
+       tlbie(vpn, psize, psize, ssize, 0);
+       return 0;
+}
+
+
+static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
+                                  int bpsize, int apsize, int ssize, int local)
+{
+       struct hash_pte *hptep = htab_address + slot;
+       unsigned long hpte_v;
+       unsigned long want_v;
+       unsigned long flags;
+
+       local_irq_save(flags);
+
+       DBG_LOW("    invalidate(vpn=%016lx, hash: %lx)\n", vpn, slot);
+
+       want_v = hpte_encode_avpn(vpn, bpsize, ssize);
+       hpte_v = hpte_get_old_v(hptep);
+
+       if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) {
+               native_lock_hpte(hptep);
+               /* recheck with locks held */
+               hpte_v = hpte_get_old_v(hptep);
+
+               if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID))
+                       /* Invalidate the hpte. NOTE: this also unlocks it */
+                       hptep->v = 0;
+               else
+                       native_unlock_hpte(hptep);
+       }
+       /*
+        * We always need to invalidate the TLB because hpte_remove doesn't do
+        * a tlb invalidate. If a hash bucket gets full, we "evict" a more or
+        * less random entry from it. When we do that we don't invalidate the
+        * TLB (hpte_remove) because we assume the old translation is still
+        * technically "valid".
+        */
+       tlbie(vpn, bpsize, apsize, ssize, local);
+
+       local_irq_restore(flags);
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static void native_hugepage_invalidate(unsigned long vsid,
+                                      unsigned long addr,
+                                      unsigned char *hpte_slot_array,
+                                      int psize, int ssize, int local)
+{
+       int i;
+       struct hash_pte *hptep;
+       int actual_psize = MMU_PAGE_16M;
+       unsigned int max_hpte_count, valid;
+       unsigned long flags, s_addr = addr;
+       unsigned long hpte_v, want_v, shift;
+       unsigned long hidx, vpn = 0, hash, slot;
+
+       shift = mmu_psize_defs[psize].shift;
+       max_hpte_count = 1U << (PMD_SHIFT - shift);
+
+       local_irq_save(flags);
+       for (i = 0; i < max_hpte_count; i++) {
+               valid = hpte_valid(hpte_slot_array, i);
+               if (!valid)
+                       continue;
+               hidx =  hpte_hash_index(hpte_slot_array, i);
+
+               /* get the vpn */
+               addr = s_addr + (i * (1ul << shift));
+               vpn = hpt_vpn(addr, vsid, ssize);
+               hash = hpt_hash(vpn, shift, ssize);
+               if (hidx & _PTEIDX_SECONDARY)
+                       hash = ~hash;
+
+               slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+               slot += hidx & _PTEIDX_GROUP_IX;
+
+               hptep = htab_address + slot;
+               want_v = hpte_encode_avpn(vpn, psize, ssize);
+               hpte_v = hpte_get_old_v(hptep);
+
+               /* Even if we miss, we need to invalidate the TLB */
+               if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) {
+                       /* recheck with locks held */
+                       native_lock_hpte(hptep);
+                       hpte_v = hpte_get_old_v(hptep);
+
+                       if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) {
+                               /*
+                                * Invalidate the hpte. NOTE: this also unlocks it
+                                */
+
+                               hptep->v = 0;
+                       } else
+                               native_unlock_hpte(hptep);
+               }
+               /*
+                * We need to do a tlb invalidate for every address, because the
+                * tlbie instruction compares the entry's VA in the TLB with the
+                * VA specified here.
+                */
+               tlbie(vpn, psize, actual_psize, ssize, local);
+       }
+       local_irq_restore(flags);
+}
+#else
+static void native_hugepage_invalidate(unsigned long vsid,
+                                      unsigned long addr,
+                                      unsigned char *hpte_slot_array,
+                                      int psize, int ssize, int local)
+{
+       WARN(1, "%s called without THP support\n", __func__);
+}
+#endif
+
+static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
+                       int *psize, int *apsize, int *ssize, unsigned long *vpn)
+{
+       unsigned long avpn, pteg, vpi;
+       unsigned long hpte_v = be64_to_cpu(hpte->v);
+       unsigned long hpte_r = be64_to_cpu(hpte->r);
+       unsigned long vsid, seg_off;
+       int size, a_size, shift;
+       /* Look at the 8 bit LP value */
+       unsigned int lp = (hpte_r >> LP_SHIFT) & ((1 << LP_BITS) - 1);
+
+       if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+               hpte_v = hpte_new_to_old_v(hpte_v, hpte_r);
+               hpte_r = hpte_new_to_old_r(hpte_r);
+       }
+       if (!(hpte_v & HPTE_V_LARGE)) {
+               size   = MMU_PAGE_4K;
+               a_size = MMU_PAGE_4K;
+       } else {
+               size = hpte_page_sizes[lp] & 0xf;
+               a_size = hpte_page_sizes[lp] >> 4;
+       }
+       /* This works for all page sizes, and for 256M and 1T segments */
+       *ssize = hpte_v >> HPTE_V_SSIZE_SHIFT;
+       shift = mmu_psize_defs[size].shift;
+
+       avpn = (HPTE_V_AVPN_VAL(hpte_v) & ~mmu_psize_defs[size].avpnm);
+       pteg = slot / HPTES_PER_GROUP;
+       if (hpte_v & HPTE_V_SECONDARY)
+               pteg = ~pteg;
+
+       switch (*ssize) {
+       case MMU_SEGSIZE_256M:
+               /* We only have 28 - 23 bits of seg_off in avpn */
+               seg_off = (avpn & 0x1f) << 23;
+               vsid    =  avpn >> 5;
+               /* We can find more bits from the pteg value */
+               if (shift < 23) {
+                       vpi = (vsid ^ pteg) & htab_hash_mask;
+                       seg_off |= vpi << shift;
+               }
+               *vpn = vsid << (SID_SHIFT - VPN_SHIFT) | seg_off >> VPN_SHIFT;
+               break;
+       case MMU_SEGSIZE_1T:
+               /* We only have 40 - 23 bits of seg_off in avpn */
+               seg_off = (avpn & 0x1ffff) << 23;
+               vsid    = avpn >> 17;
+               if (shift < 23) {
+                       vpi = (vsid ^ (vsid << 25) ^ pteg) & htab_hash_mask;
+                       seg_off |= vpi << shift;
+               }
+               *vpn = vsid << (SID_SHIFT_1T - VPN_SHIFT) | seg_off >> VPN_SHIFT;
+               break;
+       default:
+               *vpn = size = 0;
+       }
+       *psize  = size;
+       *apsize = a_size;
+}
+
+/*
+ * Clear all mappings on kexec. All CPUs are in real mode (or they will
+ * be when they isi), and we are the only one left. We rely on our kernel
+ * mapping being 0xC0's and the hardware ignoring those two real bits.
+ *
+ * This must be called with interrupts disabled.
+ *
+ * Taking the native_tlbie_lock is unsafe here due to the possibility of
+ * lockdep being on. On pre-POWER5 hardware, not taking the lock could
+ * cause deadlock. On POWER5 and newer, not taking the lock is fine. This
+ * only gets called during boot before secondary CPUs have come up and
+ * during crashdump, when all bets are off anyway.
+ *
+ * TODO: add batching support when enabled. Remember, no dynamic memory
+ * here, although there is the control page available...
+ */
+static void native_hpte_clear(void)
+{
+       unsigned long vpn = 0;
+       unsigned long slot, slots;
+       struct hash_pte *hptep = htab_address;
+       unsigned long hpte_v;
+       unsigned long pteg_count;
+       int psize, apsize, ssize;
+
+       pteg_count = htab_hash_mask + 1;
+
+       slots = pteg_count * HPTES_PER_GROUP;
+
+       for (slot = 0; slot < slots; slot++, hptep++) {
+               /*
+                * We could lock the pte here, but we are the only cpu
+                * running, right? And for a crash dump, we probably
+                * don't want to wait for a possibly bad cpu.
+                */
+               hpte_v = be64_to_cpu(hptep->v);
+
+               /*
+                * Call __tlbie() here rather than tlbie() since we can't take the
+                * native_tlbie_lock.
+                */
+               if (hpte_v & HPTE_V_VALID) {
+                       hpte_decode(hptep, slot, &psize, &apsize, &ssize, &vpn);
+                       hptep->v = 0;
+                       ___tlbie(vpn, psize, apsize, ssize);
+               }
+       }
+
+       asm volatile("eieio; tlbsync; ptesync":::"memory");
+}
+
+/*
+ * Batched hash table flush, we batch the tlbie's to avoid taking/releasing
+ * the lock all the time
+ */
+static void native_flush_hash_range(unsigned long number, int local)
+{
+       unsigned long vpn = 0;
+       unsigned long hash, index, hidx, shift, slot;
+       struct hash_pte *hptep;
+       unsigned long hpte_v;
+       unsigned long want_v;
+       unsigned long flags;
+       real_pte_t pte;
+       struct ppc64_tlb_batch *batch = this_cpu_ptr(&ppc64_tlb_batch);
+       unsigned long psize = batch->psize;
+       int ssize = batch->ssize;
+       int i;
+       unsigned int use_local;
+
+       use_local = local && mmu_has_feature(MMU_FTR_TLBIEL) &&
+               mmu_psize_defs[psize].tlbiel && !cxl_ctx_in_use();
+
+       local_irq_save(flags);
+
+       for (i = 0; i < number; i++) {
+               vpn = batch->vpn[i];
+               pte = batch->pte[i];
+
+               pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) {
+                       hash = hpt_hash(vpn, shift, ssize);
+                       hidx = __rpte_to_hidx(pte, index);
+                       if (hidx & _PTEIDX_SECONDARY)
+                               hash = ~hash;
+                       slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+                       slot += hidx & _PTEIDX_GROUP_IX;
+                       hptep = htab_address + slot;
+                       want_v = hpte_encode_avpn(vpn, psize, ssize);
+                       hpte_v = hpte_get_old_v(hptep);
+
+                       if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))
+                               continue;
+                       /* lock and try again */
+                       native_lock_hpte(hptep);
+                       hpte_v = hpte_get_old_v(hptep);
+
+                       if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))
+                               native_unlock_hpte(hptep);
+                       else
+                               hptep->v = 0;
+
+               } pte_iterate_hashed_end();
+       }
+
+       if (use_local) {
+               asm volatile("ptesync":::"memory");
+               for (i = 0; i < number; i++) {
+                       vpn = batch->vpn[i];
+                       pte = batch->pte[i];
+
+                       pte_iterate_hashed_subpages(pte, psize,
+                                                   vpn, index, shift) {
+                               __tlbiel(vpn, psize, psize, ssize);
+                       } pte_iterate_hashed_end();
+               }
+               asm volatile("ptesync":::"memory");
+       } else {
+               int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
+
+               if (lock_tlbie)
+                       raw_spin_lock(&native_tlbie_lock);
+
+               asm volatile("ptesync":::"memory");
+               for (i = 0; i < number; i++) {
+                       vpn = batch->vpn[i];
+                       pte = batch->pte[i];
+
+                       pte_iterate_hashed_subpages(pte, psize,
+                                                   vpn, index, shift) {
+                               __tlbie(vpn, psize, psize, ssize);
+                       } pte_iterate_hashed_end();
+               }
+               /*
+                * Just do one more with the last used values.
+                */
+               fixup_tlbie(vpn, psize, psize, ssize);
+               asm volatile("eieio; tlbsync; ptesync":::"memory");
+
+               if (lock_tlbie)
+                       raw_spin_unlock(&native_tlbie_lock);
+       }
+
+       local_irq_restore(flags);
+}
+
+void __init hpte_init_native(void)
+{
+       mmu_hash_ops.hpte_invalidate    = native_hpte_invalidate;
+       mmu_hash_ops.hpte_updatepp      = native_hpte_updatepp;
+       mmu_hash_ops.hpte_updateboltedpp = native_hpte_updateboltedpp;
+       mmu_hash_ops.hpte_removebolted = native_hpte_removebolted;
+       mmu_hash_ops.hpte_insert        = native_hpte_insert;
+       mmu_hash_ops.hpte_remove        = native_hpte_remove;
+       mmu_hash_ops.hpte_clear_all     = native_hpte_clear;
+       mmu_hash_ops.flush_hash_range = native_flush_hash_range;
+       mmu_hash_ops.hugepage_invalidate   = native_hugepage_invalidate;
+}
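+
+/*
+ * Illustrative note (sketch only): generic hash MMU code does not call the
+ * native_* routines above directly. Once hpte_init_native() has populated
+ * mmu_hash_ops, everything is dispatched through the ops table, so
+ * re-protecting a bolted kernel mapping looks roughly like:
+ *
+ *     mmu_hash_ops.hpte_updateboltedpp(newpp, ea, mmu_linear_psize,
+ *                                      mmu_kernel_ssize);
+ */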
diff --git a/arch/powerpc/mm/book3s64/hash_pgtable.c b/arch/powerpc/mm/book3s64/hash_pgtable.c
new file mode 100644 (file)
index 0000000..1fd025d
--- /dev/null
@@ -0,0 +1,463 @@
+/*
+ * Copyright 2005, Paul Mackerras, IBM Corporation.
+ * Copyright 2009, Benjamin Herrenschmidt, IBM Corporation.
+ * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/sched.h>
+#include <linux/mm_types.h>
+#include <linux/mm.h>
+
+#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
+#include <asm/sections.h>
+#include <asm/mmu.h>
+#include <asm/tlb.h>
+
+#include <mm/mmu_decl.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/thp.h>
+
+#if H_PGTABLE_RANGE > (USER_VSID_RANGE * (TASK_SIZE_USER64 / TASK_CONTEXT_SIZE))
+#warning Limited user VSID range means pagetable space is wasted
+#endif
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+/*
+ * vmemmap is the starting address of the virtual address space where
+ * struct pages are allocated for all possible PFNs present on the system
+ * including holes and bad memory (hence sparse). These virtual struct
+ * pages are stored in sequence in this virtual address space irrespective
+ * of the fact whether the corresponding PFN is valid or not. This achieves
+ * constant relationship between address of struct page and its PFN.
+ *
+ * During boot or memory hotplug operation when a new memory section is
+ * added, physical memory allocation (including hash table bolting) will
+ * be performed for the set of struct pages which are part of the memory
+ * section. This saves memory by not allocating struct pages for PFNs
+ * which are not valid.
+ *
+ *             ----------------------------------------------
+ *             | PHYSICAL ALLOCATION OF VIRTUAL STRUCT PAGES|
+ *             ----------------------------------------------
+ *
+ *        f000000000000000                  c000000000000000
+ * vmemmap +--------------+                  +--------------+
+ *  +      |  page struct | +--------------> |  page struct |
+ *  |      +--------------+                  +--------------+
+ *  |      |  page struct | +--------------> |  page struct |
+ *  |      +--------------+ |                +--------------+
+ *  |      |  page struct | +       +------> |  page struct |
+ *  |      +--------------+         |        +--------------+
+ *  |      |  page struct |         |   +--> |  page struct |
+ *  |      +--------------+         |   |    +--------------+
+ *  |      |  page struct |         |   |
+ *  |      +--------------+         |   |
+ *  |      |  page struct |         |   |
+ *  |      +--------------+         |   |
+ *  |      |  page struct |         |   |
+ *  |      +--------------+         |   |
+ *  |      |  page struct |         |   |
+ *  |      +--------------+         |   |
+ *  |      |  page struct | +-------+   |
+ *  |      +--------------+             |
+ *  |      |  page struct | +-----------+
+ *  |      +--------------+
+ *  |      |  page struct | No mapping
+ *  |      +--------------+
+ *  |      |  page struct | No mapping
+ *  v      +--------------+
+ *
+ *             -----------------------------------------
+ *             | RELATION BETWEEN STRUCT PAGES AND PFNS|
+ *             -----------------------------------------
+ *
+ * vmemmap +--------------+                 +---------------+
+ *  +      |  page struct | +-------------> |      PFN      |
+ *  |      +--------------+                 +---------------+
+ *  |      |  page struct | +-------------> |      PFN      |
+ *  |      +--------------+                 +---------------+
+ *  |      |  page struct | +-------------> |      PFN      |
+ *  |      +--------------+                 +---------------+
+ *  |      |  page struct | +-------------> |      PFN      |
+ *  |      +--------------+                 +---------------+
+ *  |      |              |
+ *  |      +--------------+
+ *  |      |              |
+ *  |      +--------------+
+ *  |      |              |
+ *  |      +--------------+                 +---------------+
+ *  |      |  page struct | +-------------> |      PFN      |
+ *  |      +--------------+                 +---------------+
+ *  |      |              |
+ *  |      +--------------+
+ *  |      |              |
+ *  |      +--------------+                 +---------------+
+ *  |      |  page struct | +-------------> |      PFN      |
+ *  |      +--------------+                 +---------------+
+ *  |      |  page struct | +-------------> |      PFN      |
+ *  v      +--------------+                 +---------------+
+ */
+/*
+ * On hash-based CPUs, the vmemmap is bolted in the hash table.
+ */
+int __meminit hash__vmemmap_create_mapping(unsigned long start,
+                                      unsigned long page_size,
+                                      unsigned long phys)
+{
+       int rc;
+
+       if ((start + page_size) >= H_VMEMMAP_END) {
+               pr_warn("Outside the supported range\n");
+               return -1;
+       }
+
+       rc = htab_bolt_mapping(start, start + page_size, phys,
+                              pgprot_val(PAGE_KERNEL),
+                              mmu_vmemmap_psize, mmu_kernel_ssize);
+       if (rc < 0) {
+               int rc2 = htab_remove_mapping(start, start + page_size,
+                                             mmu_vmemmap_psize,
+                                             mmu_kernel_ssize);
+               BUG_ON(rc2 && (rc2 != -ENOENT));
+       }
+       return rc;
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+void hash__vmemmap_remove_mapping(unsigned long start,
+                             unsigned long page_size)
+{
+       int rc = htab_remove_mapping(start, start + page_size,
+                                    mmu_vmemmap_psize,
+                                    mmu_kernel_ssize);
+       BUG_ON((rc < 0) && (rc != -ENOENT));
+       WARN_ON(rc == -ENOENT);
+}
+#endif
+#endif /* CONFIG_SPARSEMEM_VMEMMAP */
+
+/*
+ * map_kernel_page is currently only called by __ioremap.
+ * It adds an entry to the ioremap page table and an entry to the HPT,
+ * possibly bolting it.
+ */
+int hash__map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot)
+{
+       pgd_t *pgdp;
+       pud_t *pudp;
+       pmd_t *pmdp;
+       pte_t *ptep;
+
+       BUILD_BUG_ON(TASK_SIZE_USER64 > H_PGTABLE_RANGE);
+       if (slab_is_available()) {
+               pgdp = pgd_offset_k(ea);
+               pudp = pud_alloc(&init_mm, pgdp, ea);
+               if (!pudp)
+                       return -ENOMEM;
+               pmdp = pmd_alloc(&init_mm, pudp, ea);
+               if (!pmdp)
+                       return -ENOMEM;
+               ptep = pte_alloc_kernel(pmdp, ea);
+               if (!ptep)
+                       return -ENOMEM;
+               set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, prot));
+       } else {
+               /*
+                * If the mm subsystem is not fully up, we cannot create a
+                * linux page table entry for this mapping. Simply bolt an
+                * entry in the hardware page table.
+                */
+               if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, pgprot_val(prot),
+                                     mmu_io_psize, mmu_kernel_ssize)) {
+                       printk(KERN_ERR "Failed to do bolted mapping IO "
+                              "memory at %016lx !\n", pa);
+                       return -ENOMEM;
+               }
+       }
+
+       smp_wmb();
+       return 0;
+}
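+
+/*
+ * Minimal usage sketch (hypothetical values): mapping a single I/O page at
+ * effective address ea to physical address pa with kernel protections:
+ *
+ *     if (hash__map_kernel_page(ea, pa, PAGE_KERNEL))
+ *             pr_err("mapping of %016lx failed\n", pa);
+ *
+ * Before slab is available this bolts an HPTE directly; afterwards it also
+ * populates the Linux page tables so the hash fault path can fill the HPT.
+ */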
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+
+unsigned long hash__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
+                                   pmd_t *pmdp, unsigned long clr,
+                                   unsigned long set)
+{
+       __be64 old_be, tmp;
+       unsigned long old;
+
+#ifdef CONFIG_DEBUG_VM
+       WARN_ON(!hash__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
+       assert_spin_locked(pmd_lockptr(mm, pmdp));
+#endif
+
+       __asm__ __volatile__(
+       "1:     ldarx   %0,0,%3\n\
+               and.    %1,%0,%6\n\
+               bne-    1b \n\
+               andc    %1,%0,%4 \n\
+               or      %1,%1,%7\n\
+               stdcx.  %1,0,%3 \n\
+               bne-    1b"
+       : "=&r" (old_be), "=&r" (tmp), "=m" (*pmdp)
+       : "r" (pmdp), "r" (cpu_to_be64(clr)), "m" (*pmdp),
+         "r" (cpu_to_be64(H_PAGE_BUSY)), "r" (cpu_to_be64(set))
+       : "cc" );
+
+       old = be64_to_cpu(old_be);
+
+       trace_hugepage_update(addr, old, clr, set);
+       if (old & H_PAGE_HASHPTE)
+               hpte_do_hugepage_flush(mm, addr, pmdp, old);
+       return old;
+}
+
+pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
+                           pmd_t *pmdp)
+{
+       pmd_t pmd;
+
+       VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+       VM_BUG_ON(pmd_trans_huge(*pmdp));
+       VM_BUG_ON(pmd_devmap(*pmdp));
+
+       pmd = *pmdp;
+       pmd_clear(pmdp);
+       /*
+        * Wait for all pending hash_page to finish. This is needed
+        * in case of subpage collapse. When we collapse normal pages
+        * to hugepage, we first clear the pmd, then invalidate all
+        * the PTE entries. The assumption here is that any low level
+        * page fault will see a none pmd and take the slow path that
+        * will wait on mmap_sem. But we could very well be in a
+        * hash_page with local ptep pointer value. Such a hash page
+        * can result in adding new HPTE entries for normal subpages.
+        * That means we could be modifying the page content as we
+        * copy them to a huge page. So wait for parallel hash_page
+        * to finish before invalidating HPTE entries. We can do this
+        * by sending an IPI to all the cpus and executing a dummy
+        * function there.
+        */
+       serialize_against_pte_lookup(vma->vm_mm);
+       /*
+        * Now invalidate the hpte entries in the range
+        * covered by the pmd. This makes sure we take a
+        * fault and will find the pmd as none, which will
+        * result in a major fault which takes mmap_sem and
+        * hence waits for collapse to complete. Without this
+        * the __collapse_huge_page_copy can result in copying
+        * the old content.
+        */
+       flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
+       return pmd;
+}
+
+/*
+ * We want to put the pgtable in the pmd and use the pgtable to track
+ * the base page size hptes.
+ */
+void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
+                                 pgtable_t pgtable)
+{
+       pgtable_t *pgtable_slot;
+
+       assert_spin_locked(pmd_lockptr(mm, pmdp));
+       /*
+        * We store the pgtable in the second half of the PMD.
+        */
+       pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
+       *pgtable_slot = pgtable;
+       /*
+        * Expose the deposited pgtable to other cpus before we set the
+        * hugepage PTE at the pmd level. The hash fault code looks at the
+        * deposited pgtable to store hash index values.
+        */
+       smp_wmb();
+}
+
+pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
+{
+       pgtable_t pgtable;
+       pgtable_t *pgtable_slot;
+
+       assert_spin_locked(pmd_lockptr(mm, pmdp));
+
+       pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
+       pgtable = *pgtable_slot;
+       /*
+        * Once we withdraw, mark the entry NULL.
+        */
+       *pgtable_slot = NULL;
+       /*
+        * We store HPTE information in the deposited PTE fragment.
+        * Zero out the contents on withdraw.
+        */
+       memset(pgtable, 0, PTE_FRAG_SIZE);
+       return pgtable;
+}
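+
+/*
+ * Illustrative pairing (sketch only): the THP code deposits a page table
+ * when it installs a hugepage PMD and withdraws it when the hugepage is
+ * split or zapped, always under the pmd lock:
+ *
+ *     hash__pgtable_trans_huge_deposit(mm, pmdp, pgtable);
+ *     ...
+ *     pgtable = hash__pgtable_trans_huge_withdraw(mm, pmdp);
+ *
+ * Between the two calls the hash fault code uses the deposited fragment to
+ * remember per-subpage hash slot information.
+ */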
+
+/*
+ * A linux hugepage PMD was changed and the corresponding hash table entries
+ * need to be flushed.
+ */
+void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
+                           pmd_t *pmdp, unsigned long old_pmd)
+{
+       int ssize;
+       unsigned int psize;
+       unsigned long vsid;
+       unsigned long flags = 0;
+
+       /* get the base page size, vsid and segment size */
+#ifdef CONFIG_DEBUG_VM
+       psize = get_slice_psize(mm, addr);
+       BUG_ON(psize == MMU_PAGE_16M);
+#endif
+       if (old_pmd & H_PAGE_COMBO)
+               psize = MMU_PAGE_4K;
+       else
+               psize = MMU_PAGE_64K;
+
+       if (!is_kernel_addr(addr)) {
+               ssize = user_segment_size(addr);
+               vsid = get_user_vsid(&mm->context, addr, ssize);
+               WARN_ON(vsid == 0);
+       } else {
+               vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
+               ssize = mmu_kernel_ssize;
+       }
+
+       if (mm_is_thread_local(mm))
+               flags |= HPTE_LOCAL_UPDATE;
+
+       return flush_hash_hugepage(vsid, addr, pmdp, psize, ssize, flags);
+}
+
+pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm,
+                               unsigned long addr, pmd_t *pmdp)
+{
+       pmd_t old_pmd;
+       pgtable_t pgtable;
+       unsigned long old;
+       pgtable_t *pgtable_slot;
+
+       old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
+       old_pmd = __pmd(old);
+       /*
+        * We have pmd == none and we are holding page_table_lock.
+        * So we can safely go and clear the pgtable hash
+        * index info.
+        */
+       pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
+       pgtable = *pgtable_slot;
+       /*
+        * Zero out the old valid and hash index details so that the hash
+        * fault code doesn't look at stale values.
+        */
+       memset(pgtable, 0, PTE_FRAG_SIZE);
+       /*
+        * Serialize against find_current_mm_pte variants which do lock-less
+        * lookups in page tables with local interrupts disabled. For huge pages
+        * it casts pmd_t to pte_t. Since the format of pte_t is different from
+        * pmd_t we want to prevent transit from pmd pointing to page table
+        * to pmd pointing to huge page (and back) while interrupts are disabled.
+        * We clear pmd to possibly replace it with a page table pointer in
+        * different code paths. So make sure we wait for the parallel
+        * find_current_mm_pte to finish.
+        */
+       serialize_against_pte_lookup(mm);
+       return old_pmd;
+}
+
+int hash__has_transparent_hugepage(void)
+{
+
+       if (!mmu_has_feature(MMU_FTR_16M_PAGE))
+               return 0;
+       /*
+        * We support THP only if PMD_SIZE is 16MB.
+        */
+       if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT)
+               return 0;
+       /*
+        * We need to make sure that we support 16MB hugepages in a segment
+        * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE
+        * of 64K.
+        */
+       /*
+        * If we have 64K HPTE, we will be using that by default
+        */
+       if (mmu_psize_defs[MMU_PAGE_64K].shift &&
+           (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1))
+               return 0;
+       /*
+        * Ok we only have 4K HPTE
+        */
+       if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1)
+               return 0;
+
+       return 1;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+#ifdef CONFIG_STRICT_KERNEL_RWX
+static bool hash__change_memory_range(unsigned long start, unsigned long end,
+                                     unsigned long newpp)
+{
+       unsigned long idx;
+       unsigned int step, shift;
+
+       shift = mmu_psize_defs[mmu_linear_psize].shift;
+       step = 1 << shift;
+
+       start = ALIGN_DOWN(start, step);
+       end = ALIGN(end, step); // aligns up
+
+       if (start >= end)
+               return false;
+
+       pr_debug("Changing page protection on range 0x%lx-0x%lx, to 0x%lx, step 0x%x\n",
+                start, end, newpp, step);
+
+       for (idx = start; idx < end; idx += step)
+               /* Not sure if we can do much with the return value */
+               mmu_hash_ops.hpte_updateboltedpp(newpp, idx, mmu_linear_psize,
+                                                       mmu_kernel_ssize);
+
+       return true;
+}
+
+void hash__mark_rodata_ro(void)
+{
+       unsigned long start, end;
+
+       start = (unsigned long)_stext;
+       end = (unsigned long)__init_begin;
+
+       WARN_ON(!hash__change_memory_range(start, end, PP_RXXX));
+}
+
+void hash__mark_initmem_nx(void)
+{
+       unsigned long start, end, pp;
+
+       start = (unsigned long)__init_begin;
+       end = (unsigned long)__init_end;
+
+       pp = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL));
+
+       WARN_ON(!hash__change_memory_range(start, end, pp));
+}
+#endif
diff --git a/arch/powerpc/mm/book3s64/hash_tlb.c b/arch/powerpc/mm/book3s64/hash_tlb.c
new file mode 100644 (file)
index 0000000..d4f0101
--- /dev/null
@@ -0,0 +1,265 @@
+/*
+ * This file contains the routines for flushing entries from the
+ * TLB and MMU hash table.
+ *
+ *  Derived from arch/ppc64/mm/init.c:
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ *  Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ *  and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ *    Copyright (C) 1996 Paul Mackerras
+ *
+ *  Derived from "arch/i386/mm/init.c"
+ *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
+ *
+ *  Dave Engebretsen <engebret@us.ibm.com>
+ *      Rework for PPC64 port.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+#include <asm/tlb.h>
+#include <asm/bug.h>
+#include <asm/pte-walk.h>
+
+
+#include <trace/events/thp.h>
+
+DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch);
+
+/*
+ * A linux PTE was changed and the corresponding hash table entry
+ * needs to be flushed. This function will either perform the flush
+ * immediately or will batch it up if the current CPU has an active
+ * batch on it.
+ */
+void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
+                    pte_t *ptep, unsigned long pte, int huge)
+{
+       unsigned long vpn;
+       struct ppc64_tlb_batch *batch = &get_cpu_var(ppc64_tlb_batch);
+       unsigned long vsid;
+       unsigned int psize;
+       int ssize;
+       real_pte_t rpte;
+       int i, offset;
+
+       i = batch->index;
+
+       /*
+        * Get page size (maybe move back to caller).
+        *
+        * NOTE: when using special 64K mappings in 4K environment like
+        * for SPEs, we obtain the page size from the slice, which thus
+        * must still exist (and thus the VMA not reused) at the time
+        * of this call
+        */
+       if (huge) {
+#ifdef CONFIG_HUGETLB_PAGE
+               psize = get_slice_psize(mm, addr);
+               /* Mask the address for the correct page size */
+               addr &= ~((1UL << mmu_psize_defs[psize].shift) - 1);
+               if (unlikely(psize == MMU_PAGE_16G))
+                       offset = PTRS_PER_PUD;
+               else
+                       offset = PTRS_PER_PMD;
+#else
+               BUG();
+               psize = pte_pagesize_index(mm, addr, pte); /* shutup gcc */
+#endif
+       } else {
+               psize = pte_pagesize_index(mm, addr, pte);
+               /*
+                * Mask the address for the standard page size.  If we
+                * have a 64k page kernel, but the hardware does not
+                * support 64k pages, this might be different from the
+                * hardware page size encoded in the slice table.
+                */
+               addr &= PAGE_MASK;
+               offset = PTRS_PER_PTE;
+       }
+
+
+       /* Build full vaddr */
+       if (!is_kernel_addr(addr)) {
+               ssize = user_segment_size(addr);
+               vsid = get_user_vsid(&mm->context, addr, ssize);
+       } else {
+               vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
+               ssize = mmu_kernel_ssize;
+       }
+       WARN_ON(vsid == 0);
+       vpn = hpt_vpn(addr, vsid, ssize);
+       rpte = __real_pte(__pte(pte), ptep, offset);
+
+       /*
+        * Check if we have an active batch on this CPU. If not, just
+        * flush now and return.
+        */
+       if (!batch->active) {
+               flush_hash_page(vpn, rpte, psize, ssize, mm_is_thread_local(mm));
+               put_cpu_var(ppc64_tlb_batch);
+               return;
+       }
+
+       /*
+        * This can happen when we are in the middle of a TLB batch and
+        * we encounter memory pressure (eg copy_page_range when it tries
+        * to allocate a new pte). If we have to reclaim memory and end
+        * up scanning and resetting referenced bits then our batch context
+        * will change mid stream.
+        *
+        * We also need to ensure only one page size is present in a given
+        * batch
+        */
+       if (i != 0 && (mm != batch->mm || batch->psize != psize ||
+                      batch->ssize != ssize)) {
+               __flush_tlb_pending(batch);
+               i = 0;
+       }
+       if (i == 0) {
+               batch->mm = mm;
+               batch->psize = psize;
+               batch->ssize = ssize;
+       }
+       batch->pte[i] = rpte;
+       batch->vpn[i] = vpn;
+       batch->index = ++i;
+       if (i >= PPC64_TLB_BATCH_NR)
+               __flush_tlb_pending(batch);
+       put_cpu_var(ppc64_tlb_batch);
+}
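+
+/*
+ * Rough flow of the batching scheme (illustrative sketch): while a batch is
+ * active (batch->active is expected to be set by arch_enter_lazy_mmu_mode()),
+ * each hpte_need_flush() call only records a (vpn, rpte) pair; the hash and
+ * TLB flush happens once the batch reaches PPC64_TLB_BATCH_NR entries or is
+ * drained explicitly:
+ *
+ *     hpte_need_flush(mm, addr, ptep, pte, 0);
+ *     ...
+ *     __flush_tlb_pending(batch);
+ */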
+
+/*
+ * This function is called when terminating an mmu batch or when a batch
+ * is full. It will perform the flush of all the entries currently stored
+ * in a batch.
+ *
+ * Must be called from within some kind of spinlock/non-preempt region...
+ */
+void __flush_tlb_pending(struct ppc64_tlb_batch *batch)
+{
+       int i, local;
+
+       i = batch->index;
+       local = mm_is_thread_local(batch->mm);
+       if (i == 1)
+               flush_hash_page(batch->vpn[0], batch->pte[0],
+                               batch->psize, batch->ssize, local);
+       else
+               flush_hash_range(i, local);
+       batch->index = 0;
+}
+
+void hash__tlb_flush(struct mmu_gather *tlb)
+{
+       struct ppc64_tlb_batch *tlbbatch = &get_cpu_var(ppc64_tlb_batch);
+
+       /*
+        * If there's a TLB batch pending, then we must flush it because the
+        * pages are going to be freed and we really don't want to have a CPU
+        * access a freed page because it has a stale TLB entry.
+        */
+       if (tlbbatch->index)
+               __flush_tlb_pending(tlbbatch);
+
+       put_cpu_var(ppc64_tlb_batch);
+}
+
+/**
+ * __flush_hash_table_range - Flush all HPTEs for a given address range
+ *                            from the hash table (and the TLB). But keeps
+ *                            the linux PTEs intact.
+ *
+ * @mm         : mm_struct of the target address space (generally init_mm)
+ * @start      : starting address
+ * @end         : ending address (not included in the flush)
+ *
+ * This function is mostly to be used by some IO hotplug code in order
+ * to remove all hash entries from a given address range used to map IO
+ * space on a removed PCI-PCI bridge without tearing down the full mapping
+ * since 64K pages may overlap with other bridges when using 64K pages
+ * with 4K HW pages on IO space.
+ *
+ * Because of that usage pattern, it is implemented for small size rather
+ * than speed.
+ */
+void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
+                             unsigned long end)
+{
+       bool is_thp;
+       int hugepage_shift;
+       unsigned long flags;
+
+       start = _ALIGN_DOWN(start, PAGE_SIZE);
+       end = _ALIGN_UP(end, PAGE_SIZE);
+
+       BUG_ON(!mm->pgd);
+
+       /*
+        * Note: Normally, we should only ever use a batch within a
+        * PTE locked section. This violates the rule, but will work
+        * since we don't actually modify the PTEs, we just flush the
+        * hash while leaving the PTEs intact (including their reference
+        * to being hashed). This is not the most performance oriented
+        * way to do things but is fine for our needs here.
+        */
+       local_irq_save(flags);
+       arch_enter_lazy_mmu_mode();
+       for (; start < end; start += PAGE_SIZE) {
+               pte_t *ptep = find_current_mm_pte(mm->pgd, start, &is_thp,
+                                                 &hugepage_shift);
+               unsigned long pte;
+
+               if (ptep == NULL)
+                       continue;
+               pte = pte_val(*ptep);
+               if (is_thp)
+                       trace_hugepage_invalidate(start, pte);
+               if (!(pte & H_PAGE_HASHPTE))
+                       continue;
+               if (unlikely(is_thp))
+                       hpte_do_hugepage_flush(mm, start, (pmd_t *)ptep, pte);
+               else
+                       hpte_need_flush(mm, start, ptep, pte, hugepage_shift);
+       }
+       arch_leave_lazy_mmu_mode();
+       local_irq_restore(flags);
+}
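+
+/*
+ * Illustrative call (sketch, hypothetical io_start/io_end): the IO hotplug
+ * case described above flushes the bolted hash entries of an unplugged
+ * bridge window while leaving the Linux PTEs untouched:
+ *
+ *     __flush_hash_table_range(&init_mm, io_start, io_end);
+ */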
+
+void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr)
+{
+       pte_t *pte;
+       pte_t *start_pte;
+       unsigned long flags;
+
+       addr = _ALIGN_DOWN(addr, PMD_SIZE);
+       /*
+        * Note: Normally, we should only ever use a batch within a
+        * PTE locked section. This violates the rule, but will work
+        * since we don't actually modify the PTEs, we just flush the
+        * hash while leaving the PTEs intact (including their reference
+        * to being hashed). This is not the most performance oriented
+        * way to do things but is fine for our needs here.
+        */
+       local_irq_save(flags);
+       arch_enter_lazy_mmu_mode();
+       start_pte = pte_offset_map(pmd, addr);
+       for (pte = start_pte; pte < start_pte + PTRS_PER_PTE; pte++) {
+               unsigned long pteval = pte_val(*pte);
+               if (pteval & H_PAGE_HASHPTE)
+                       hpte_need_flush(mm, addr, pte, pteval, 0);
+               addr += PAGE_SIZE;
+       }
+       arch_leave_lazy_mmu_mode();
+       local_irq_restore(flags);
+}
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c
new file mode 100644 (file)
index 0000000..b21a81d
--- /dev/null
@@ -0,0 +1,1946 @@
+/*
+ * PowerPC64 port by Mike Corrigan and Dave Engebretsen
+ *   {mikejc|engebret}@us.ibm.com
+ *
+ *    Copyright (c) 2000 Mike Corrigan <mikejc@us.ibm.com>
+ *
+ * SMP scalability work:
+ *    Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
+ * 
+ *    Module name: htab.c
+ *
+ *    Description:
+ *      PowerPC Hashed Page Table functions
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#undef DEBUG
+#undef DEBUG_LOW
+
+#define pr_fmt(fmt) "hash-mmu: " fmt
+#include <linux/spinlock.h>
+#include <linux/errno.h>
+#include <linux/sched/mm.h>
+#include <linux/proc_fs.h>
+#include <linux/stat.h>
+#include <linux/sysctl.h>
+#include <linux/export.h>
+#include <linux/ctype.h>
+#include <linux/cache.h>
+#include <linux/init.h>
+#include <linux/signal.h>
+#include <linux/memblock.h>
+#include <linux/context_tracking.h>
+#include <linux/libfdt.h>
+#include <linux/pkeys.h>
+
+#include <asm/debugfs.h>
+#include <asm/processor.h>
+#include <asm/pgtable.h>
+#include <asm/mmu.h>
+#include <asm/mmu_context.h>
+#include <asm/page.h>
+#include <asm/types.h>
+#include <linux/uaccess.h>
+#include <asm/machdep.h>
+#include <asm/prom.h>
+#include <asm/io.h>
+#include <asm/eeh.h>
+#include <asm/tlb.h>
+#include <asm/cacheflush.h>
+#include <asm/cputable.h>
+#include <asm/sections.h>
+#include <asm/copro.h>
+#include <asm/udbg.h>
+#include <asm/code-patching.h>
+#include <asm/fadump.h>
+#include <asm/firmware.h>
+#include <asm/tm.h>
+#include <asm/trace.h>
+#include <asm/ps3.h>
+#include <asm/pte-walk.h>
+#include <asm/asm-prototypes.h>
+
+#ifdef DEBUG
+#define DBG(fmt...) udbg_printf(fmt)
+#else
+#define DBG(fmt...)
+#endif
+
+#ifdef DEBUG_LOW
+#define DBG_LOW(fmt...) udbg_printf(fmt)
+#else
+#define DBG_LOW(fmt...)
+#endif
+
+#define KB (1024)
+#define MB (1024*KB)
+#define GB (1024L*MB)
+
+/*
+ * Note:  pte   --> Linux PTE
+ *        HPTE  --> PowerPC Hashed Page Table Entry
+ *
+ * Execution context:
+ *   htab_initialize is called with the MMU off (of course), but
+ *   the kernel has been copied down to zero so it can directly
+ *   reference global data.  At this point it is very difficult
+ *   to print debug info.
+ *
+ */
+
+static unsigned long _SDR1;
+struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT];
+EXPORT_SYMBOL_GPL(mmu_psize_defs);
+
+u8 hpte_page_sizes[1 << LP_BITS];
+EXPORT_SYMBOL_GPL(hpte_page_sizes);
+
+struct hash_pte *htab_address;
+unsigned long htab_size_bytes;
+unsigned long htab_hash_mask;
+EXPORT_SYMBOL_GPL(htab_hash_mask);
+int mmu_linear_psize = MMU_PAGE_4K;
+EXPORT_SYMBOL_GPL(mmu_linear_psize);
+int mmu_virtual_psize = MMU_PAGE_4K;
+int mmu_vmalloc_psize = MMU_PAGE_4K;
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+int mmu_vmemmap_psize = MMU_PAGE_4K;
+#endif
+int mmu_io_psize = MMU_PAGE_4K;
+int mmu_kernel_ssize = MMU_SEGSIZE_256M;
+EXPORT_SYMBOL_GPL(mmu_kernel_ssize);
+int mmu_highuser_ssize = MMU_SEGSIZE_256M;
+u16 mmu_slb_size = 64;
+EXPORT_SYMBOL_GPL(mmu_slb_size);
+#ifdef CONFIG_PPC_64K_PAGES
+int mmu_ci_restrictions;
+#endif
+#ifdef CONFIG_DEBUG_PAGEALLOC
+static u8 *linear_map_hash_slots;
+static unsigned long linear_map_hash_count;
+static DEFINE_SPINLOCK(linear_map_hash_lock);
+#endif /* CONFIG_DEBUG_PAGEALLOC */
+struct mmu_hash_ops mmu_hash_ops;
+EXPORT_SYMBOL(mmu_hash_ops);
+
+/*
+ * These are definitions of page size arrays to be used when none
+ * are provided by the firmware.
+ */
+
+/*
+ * Fallback (4k pages only)
+ */
+static struct mmu_psize_def mmu_psize_defaults[] = {
+       [MMU_PAGE_4K] = {
+               .shift  = 12,
+               .sllp   = 0,
+               .penc   = {[MMU_PAGE_4K] = 0, [1 ... MMU_PAGE_COUNT - 1] = -1},
+               .avpnm  = 0,
+               .tlbiel = 0,
+       },
+};
+
+/*
+ * POWER4, GPUL, POWER5
+ *
+ * Support for 16MB large pages
+ */
+static struct mmu_psize_def mmu_psize_defaults_gp[] = {
+       [MMU_PAGE_4K] = {
+               .shift  = 12,
+               .sllp   = 0,
+               .penc   = {[MMU_PAGE_4K] = 0, [1 ... MMU_PAGE_COUNT - 1] = -1},
+               .avpnm  = 0,
+               .tlbiel = 1,
+       },
+       [MMU_PAGE_16M] = {
+               .shift  = 24,
+               .sllp   = SLB_VSID_L,
+               .penc   = {[0 ... MMU_PAGE_16M - 1] = -1, [MMU_PAGE_16M] = 0,
+                           [MMU_PAGE_16M + 1 ... MMU_PAGE_COUNT - 1] = -1 },
+               .avpnm  = 0x1UL,
+               .tlbiel = 0,
+       },
+};
+
+/*
+ * 'R' and 'C' update notes:
+ *  - Under pHyp or KVM, the updatepp path will not set C, thus it *will*
+ *    create writeable HPTEs without C set, because the hcall H_PROTECT
+ *    that we use in that case will not update C
+ *  - The above is however not a problem, because we also don't do that
+ *    fancy "no flush" variant of eviction and we use H_REMOVE which will
+ *    do the right thing and thus we don't have the race I described earlier
+ *
+ *  - Under bare metal, we do have the race, so we need R and C set
+ *  - We make sure R is always set and never lost
+ *  - C is _PAGE_DIRTY, and *should* always be set for a writeable mapping
+ */
+unsigned long htab_convert_pte_flags(unsigned long pteflags)
+{
+       unsigned long rflags = 0;
+
+       /* _PAGE_EXEC -> NOEXEC */
+       if ((pteflags & _PAGE_EXEC) == 0)
+               rflags |= HPTE_R_N;
+       /*
+        * PPP bits:
+        * Linux uses slb key 0 for kernel and 1 for user.
+        * kernel RW areas are mapped with PPP=0b000
+        * User area is mapped with PPP=0b010 for read/write
+        * or PPP=0b011 for read-only (including writeable but clean pages).
+        */
+       if (pteflags & _PAGE_PRIVILEGED) {
+               /*
+                * Kernel read only mapped with ppp bits 0b110
+                */
+               if (!(pteflags & _PAGE_WRITE)) {
+                       if (mmu_has_feature(MMU_FTR_KERNEL_RO))
+                               rflags |= (HPTE_R_PP0 | 0x2);
+                       else
+                               rflags |= 0x3;
+               }
+       } else {
+               if (pteflags & _PAGE_RWX)
+                       rflags |= 0x2;
+               if (!((pteflags & _PAGE_WRITE) && (pteflags & _PAGE_DIRTY)))
+                       rflags |= 0x1;
+       }
+       /*
+        * We can't allow hardware to update hpte bits. Hence always
+        * set 'R' bit and set 'C' if it is a write fault
+        */
+       rflags |=  HPTE_R_R;
+
+       if (pteflags & _PAGE_DIRTY)
+               rflags |= HPTE_R_C;
+       /*
+        * Add in WIG bits
+        */
+
+       if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_TOLERANT)
+               rflags |= HPTE_R_I;
+       else if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_NON_IDEMPOTENT)
+               rflags |= (HPTE_R_I | HPTE_R_G);
+       else if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_SAO)
+               rflags |= (HPTE_R_W | HPTE_R_I | HPTE_R_M);
+       else
+               /*
+                * Add memory coherence if cache inhibited is not set
+                */
+               rflags |= HPTE_R_M;
+
+       rflags |= pte_to_hpte_pkey_bits(pteflags);
+       return rflags;
+}
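+
+/*
+ * Worked example (derived from the rules above): a clean, user read-only,
+ * non-executable, normal cacheable page (readable, so _PAGE_RWX is non-zero,
+ * but no _PAGE_PRIVILEGED, _PAGE_WRITE, _PAGE_EXEC or _PAGE_DIRTY) converts
+ * to HPTE_R_N | 0x2 | 0x1 | HPTE_R_R | HPTE_R_M plus any pkey bits, i.e.
+ * PPP = 0b011 as described in the PPP comment above, with the no-execute,
+ * reference and memory-coherence bits set.
+ */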
+
+int htab_bolt_mapping(unsigned long vstart, unsigned long vend,
+                     unsigned long pstart, unsigned long prot,
+                     int psize, int ssize)
+{
+       unsigned long vaddr, paddr;
+       unsigned int step, shift;
+       int ret = 0;
+
+       shift = mmu_psize_defs[psize].shift;
+       step = 1 << shift;
+
+       prot = htab_convert_pte_flags(prot);
+
+       DBG("htab_bolt_mapping(%lx..%lx -> %lx (%lx,%d,%d)\n",
+           vstart, vend, pstart, prot, psize, ssize);
+
+       for (vaddr = vstart, paddr = pstart; vaddr < vend;
+            vaddr += step, paddr += step) {
+               unsigned long hash, hpteg;
+               unsigned long vsid = get_kernel_vsid(vaddr, ssize);
+               unsigned long vpn  = hpt_vpn(vaddr, vsid, ssize);
+               unsigned long tprot = prot;
+
+               /*
+                * If we hit a bad address return error.
+                */
+               if (!vsid)
+                       return -1;
+               /* Make kernel text executable */
+               if (overlaps_kernel_text(vaddr, vaddr + step))
+                       tprot &= ~HPTE_R_N;
+
+               /* Make kvm guest trampolines executable */
+               if (overlaps_kvm_tmp(vaddr, vaddr + step))
+                       tprot &= ~HPTE_R_N;
+
+               /*
+                * If relocatable, check if it overlaps interrupt vectors that
+                * are copied down to real 0. For a relocatable kernel
+                * (e.g. the kdump case) we copy interrupt vectors down to real
+                * address 0 and mark that region as executable. This is
+                * because on a p8 system with the relocation on exception
+                * feature enabled, exceptions are raised with the MMU
+                * (IR=DR=1) ON. Hence, in order to execute the interrupt
+                * handlers in virtual mode, the vector region needs to be
+                * marked as executable.
+                */
+               if ((PHYSICAL_START > MEMORY_START) &&
+                       overlaps_interrupt_vector_text(vaddr, vaddr + step))
+                               tprot &= ~HPTE_R_N;
+
+               hash = hpt_hash(vpn, shift, ssize);
+               hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP);
+
+               BUG_ON(!mmu_hash_ops.hpte_insert);
+               ret = mmu_hash_ops.hpte_insert(hpteg, vpn, paddr, tprot,
+                                              HPTE_V_BOLTED, psize, psize,
+                                              ssize);
+
+               if (ret < 0)
+                       break;
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+               if (debug_pagealloc_enabled() &&
+                       (paddr >> PAGE_SHIFT) < linear_map_hash_count)
+                       linear_map_hash_slots[paddr >> PAGE_SHIFT] = ret | 0x80;
+#endif /* CONFIG_DEBUG_PAGEALLOC */
+       }
+       return ret < 0 ? ret : 0;
+}
+
+int htab_remove_mapping(unsigned long vstart, unsigned long vend,
+                     int psize, int ssize)
+{
+       unsigned long vaddr;
+       unsigned int step, shift;
+       int rc;
+       int ret = 0;
+
+       shift = mmu_psize_defs[psize].shift;
+       step = 1 << shift;
+
+       if (!mmu_hash_ops.hpte_removebolted)
+               return -ENODEV;
+
+       for (vaddr = vstart; vaddr < vend; vaddr += step) {
+               rc = mmu_hash_ops.hpte_removebolted(vaddr, psize, ssize);
+               if (rc == -ENOENT) {
+                       ret = -ENOENT;
+                       continue;
+               }
+               if (rc < 0)
+                       return rc;
+       }
+
+       return ret;
+}
+
+static bool disable_1tb_segments = false;
+
+static int __init parse_disable_1tb_segments(char *p)
+{
+       disable_1tb_segments = true;
+       return 0;
+}
+early_param("disable_1tb_segments", parse_disable_1tb_segments);
+
+static int __init htab_dt_scan_seg_sizes(unsigned long node,
+                                        const char *uname, int depth,
+                                        void *data)
+{
+       const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
+       const __be32 *prop;
+       int size = 0;
+
+       /* We are scanning "cpu" nodes only */
+       if (type == NULL || strcmp(type, "cpu") != 0)
+               return 0;
+
+       prop = of_get_flat_dt_prop(node, "ibm,processor-segment-sizes", &size);
+       if (prop == NULL)
+               return 0;
+       for (; size >= 4; size -= 4, ++prop) {
+               if (be32_to_cpu(prop[0]) == 40) {
+                       DBG("1T segment support detected\n");
+
+                       if (disable_1tb_segments) {
+                               DBG("1T segments disabled by command line\n");
+                               break;
+                       }
+
+                       cur_cpu_spec->mmu_features |= MMU_FTR_1T_SEGMENT;
+                       return 1;
+               }
+       }
+       cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B;
+       return 0;
+}
+
+static int __init get_idx_from_shift(unsigned int shift)
+{
+       int idx = -1;
+
+       switch (shift) {
+       case 0xc:
+               idx = MMU_PAGE_4K;
+               break;
+       case 0x10:
+               idx = MMU_PAGE_64K;
+               break;
+       case 0x14:
+               idx = MMU_PAGE_1M;
+               break;
+       case 0x18:
+               idx = MMU_PAGE_16M;
+               break;
+       case 0x22:
+               idx = MMU_PAGE_16G;
+               break;
+       }
+       return idx;
+}
+
+static int __init htab_dt_scan_page_sizes(unsigned long node,
+                                         const char *uname, int depth,
+                                         void *data)
+{
+       const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
+       const __be32 *prop;
+       int size = 0;
+
+       /* We are scanning "cpu" nodes only */
+       if (type == NULL || strcmp(type, "cpu") != 0)
+               return 0;
+
+       prop = of_get_flat_dt_prop(node, "ibm,segment-page-sizes", &size);
+       if (!prop)
+               return 0;
+
+       pr_info("Page sizes from device-tree:\n");
+       size /= 4;
+       cur_cpu_spec->mmu_features &= ~(MMU_FTR_16M_PAGE);
+       while (size > 0) {
+               unsigned int base_shift = be32_to_cpu(prop[0]);
+               unsigned int slbenc = be32_to_cpu(prop[1]);
+               unsigned int lpnum = be32_to_cpu(prop[2]);
+               struct mmu_psize_def *def;
+               int idx, base_idx;
+
+               size -= 3; prop += 3;
+               base_idx = get_idx_from_shift(base_shift);
+               if (base_idx < 0) {
+                       /* skip the pte encoding also */
+                       prop += lpnum * 2; size -= lpnum * 2;
+                       continue;
+               }
+               def = &mmu_psize_defs[base_idx];
+               if (base_idx == MMU_PAGE_16M)
+                       cur_cpu_spec->mmu_features |= MMU_FTR_16M_PAGE;
+
+               def->shift = base_shift;
+               if (base_shift <= 23)
+                       def->avpnm = 0;
+               else
+                       def->avpnm = (1 << (base_shift - 23)) - 1;
+               def->sllp = slbenc;
+               /*
+                * We don't know for sure what's up with tlbiel, so
+                * for now we only set it for 4K and 64K pages
+                */
+               if (base_idx == MMU_PAGE_4K || base_idx == MMU_PAGE_64K)
+                       def->tlbiel = 1;
+               else
+                       def->tlbiel = 0;
+
+               while (size > 0 && lpnum) {
+                       unsigned int shift = be32_to_cpu(prop[0]);
+                       int penc  = be32_to_cpu(prop[1]);
+
+                       prop += 2; size -= 2;
+                       lpnum--;
+
+                       idx = get_idx_from_shift(shift);
+                       if (idx < 0)
+                               continue;
+
+                       if (penc == -1)
+                               pr_err("Invalid penc for base_shift=%d "
+                                      "shift=%d\n", base_shift, shift);
+
+                       def->penc[idx] = penc;
+                       pr_info("base_shift=%d: shift=%d, sllp=0x%04lx,"
+                               " avpnm=0x%08lx, tlbiel=%d, penc=%d\n",
+                               base_shift, shift, def->sllp,
+                               def->avpnm, def->tlbiel, def->penc[idx]);
+               }
+       }
+
+       return 1;
+}
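+
+/*
+ * Example property layout (illustrative, values assumed): an
+ * "ibm,segment-page-sizes" entry of
+ *
+ *     <0x0c 0x000 2  0x0c 0x0  0x10 0x7>
+ *
+ * is parsed above as a 4K base page size (base_shift = 0x0c, sllp = 0,
+ * lpnum = 2) supporting 4K actual pages with penc 0 and 64K actual pages
+ * with penc 7.
+ */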
+
+#ifdef CONFIG_HUGETLB_PAGE
+/*
+ * Scan for 16G memory blocks that have been set aside for huge pages
+ * and reserve those blocks for 16G huge pages.
+ */
+static int __init htab_dt_scan_hugepage_blocks(unsigned long node,
+                                       const char *uname, int depth,
+                                       void *data) {
+       const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
+       const __be64 *addr_prop;
+       const __be32 *page_count_prop;
+       unsigned int expected_pages;
+       long unsigned int phys_addr;
+       long unsigned int block_size;
+
+       /* We are scanning "memory" nodes only */
+       if (type == NULL || strcmp(type, "memory") != 0)
+               return 0;
+
+       /*
+        * This property is the log base 2 of the number of virtual pages that
+        * will represent this memory block.
+        */
+       page_count_prop = of_get_flat_dt_prop(node, "ibm,expected#pages", NULL);
+       if (page_count_prop == NULL)
+               return 0;
+       expected_pages = (1 << be32_to_cpu(page_count_prop[0]));
+       addr_prop = of_get_flat_dt_prop(node, "reg", NULL);
+       if (addr_prop == NULL)
+               return 0;
+       phys_addr = be64_to_cpu(addr_prop[0]);
+       block_size = be64_to_cpu(addr_prop[1]);
+       if (block_size != (16 * GB))
+               return 0;
+       printk(KERN_INFO "Huge page(16GB) memory: "
+                       "addr = 0x%lX size = 0x%lX pages = %d\n",
+                       phys_addr, block_size, expected_pages);
+       if (phys_addr + block_size * expected_pages <= memblock_end_of_DRAM()) {
+               memblock_reserve(phys_addr, block_size * expected_pages);
+               pseries_add_gpage(phys_addr, block_size, expected_pages);
+       }
+       return 0;
+}
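+
+/*
+ * Worked example (illustrative only, values are hypothetical): a memory node
+ * describing a 16G block at 0x100000000 with ibm,expected#pages = <2> asks
+ * for 1 << 2 = 4 huge pages, so 4 * 16G = 64G starting at 0x100000000 is
+ * reserved and handed to pseries_add_gpage(), provided the whole range fits
+ * below the end of DRAM.
+ */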
+#endif /* CONFIG_HUGETLB_PAGE */
+
+static void mmu_psize_set_default_penc(void)
+{
+       int bpsize, apsize;
+       for (bpsize = 0; bpsize < MMU_PAGE_COUNT; bpsize++)
+               for (apsize = 0; apsize < MMU_PAGE_COUNT; apsize++)
+                       mmu_psize_defs[bpsize].penc[apsize] = -1;
+}
+
+#ifdef CONFIG_PPC_64K_PAGES
+
+static bool might_have_hea(void)
+{
+       /*
+        * The HEA ethernet adapter requires awareness of the
+        * GX bus. Without that awareness we can easily assume
+        * we will never see an HEA ethernet device.
+        */
+#ifdef CONFIG_IBMEBUS
+       return !cpu_has_feature(CPU_FTR_ARCH_207S) &&
+               firmware_has_feature(FW_FEATURE_SPLPAR);
+#else
+       return false;
+#endif
+}
+
+#endif /* #ifdef CONFIG_PPC_64K_PAGES */
+
+static void __init htab_scan_page_sizes(void)
+{
+       int rc;
+
+       /* set the invalid penc to -1 */
+       mmu_psize_set_default_penc();
+
+       /* Default to 4K pages only */
+       memcpy(mmu_psize_defs, mmu_psize_defaults,
+              sizeof(mmu_psize_defaults));
+
+       /*
+        * Try to find the available page sizes in the device-tree
+        */
+       rc = of_scan_flat_dt(htab_dt_scan_page_sizes, NULL);
+       if (rc == 0 && early_mmu_has_feature(MMU_FTR_16M_PAGE)) {
+               /*
+                * Nothing in the device-tree, but the CPU supports 16M pages,
+                * so let's fall back on a known size list for 16M capable CPUs.
+                */
+               memcpy(mmu_psize_defs, mmu_psize_defaults_gp,
+                      sizeof(mmu_psize_defaults_gp));
+       }
+
+#ifdef CONFIG_HUGETLB_PAGE
+       if (!hugetlb_disabled) {
+               /* Reserve 16G huge page memory sections for huge pages */
+               of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL);
+       }
+#endif /* CONFIG_HUGETLB_PAGE */
+}
+
+/*
+ * Fill in the hpte_page_sizes[] array.
+ * We go through the mmu_psize_defs[] array looking for all the
+ * supported base/actual page size combinations.  Each combination
+ * has a unique pagesize encoding (penc) value in the low bits of
+ * the LP field of the HPTE.  For actual page sizes less than 1MB,
+ * some of the upper LP bits are used for RPN bits, meaning that
+ * we need to fill in several entries in hpte_page_sizes[].
+ *
+ * In diagrammatic form, with r = RPN bits and z = page size bits:
+ *        PTE LP     actual page size
+ *    rrrr rrrz                >=8KB
+ *    rrrr rrzz                >=16KB
+ *    rrrr rzzz                >=32KB
+ *    rrrr zzzz                >=64KB
+ *    ...
+ *
+ * The zzzz bits are implementation-specific but are chosen so that
+ * no encoding for a larger page size uses the same value in its
+ * low-order N bits as the encoding for the 2^(12+N) byte page size
+ * (if it exists).
+ */
+static void init_hpte_page_sizes(void)
+{
+       long int ap, bp;
+       long int shift, penc;
+
+       for (bp = 0; bp < MMU_PAGE_COUNT; ++bp) {
+               if (!mmu_psize_defs[bp].shift)
+                       continue;       /* not a supported page size */
+               for (ap = bp; ap < MMU_PAGE_COUNT; ++ap) {
+                       penc = mmu_psize_defs[bp].penc[ap];
+                       if (penc == -1 || !mmu_psize_defs[ap].shift)
+                               continue;
+                       shift = mmu_psize_defs[ap].shift - LP_SHIFT;
+                       if (shift <= 0)
+                               continue;       /* should never happen */
+                       /*
+                        * For page sizes less than 1MB, this loop
+                        * replicates the entry for all possible values
+                        * of the rrrr bits.
+                        */
+                       while (penc < (1 << LP_BITS)) {
+                               hpte_page_sizes[penc] = (ap << 4) | bp;
+                               penc += 1 << shift;
+                       }
+               }
+       }
+}
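+
+/*
+ * Worked example for the replication loop above (illustrative, assuming
+ * LP_SHIFT = 12 and LP_BITS = 8): for a 64K actual page size the shift is
+ * 16 - 12 = 4, so only the low four LP bits encode the page size and the
+ * remaining bits carry RPN bits.  A penc of 1 is therefore written to
+ * hpte_page_sizes[0x01], [0x11], [0x21], ... [0xf1], one entry for every
+ * possible value of the rrrr bits.
+ */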
+
+static void __init htab_init_page_sizes(void)
+{
+       init_hpte_page_sizes();
+
+       if (!debug_pagealloc_enabled()) {
+               /*
+                * Pick a size for the linear mapping. Currently, we only
+                * support 16M, 1M and 4K which is the default
+                */
+               if (mmu_psize_defs[MMU_PAGE_16M].shift)
+                       mmu_linear_psize = MMU_PAGE_16M;
+               else if (mmu_psize_defs[MMU_PAGE_1M].shift)
+                       mmu_linear_psize = MMU_PAGE_1M;
+       }
+
+#ifdef CONFIG_PPC_64K_PAGES
+       /*
+        * Pick a size for the ordinary pages. Default is 4K; we support
+        * 64K for user mappings and vmalloc if supported by the processor.
+        * We only use 64k for ioremap if the processor
+        * (and firmware) support cache-inhibited large pages.
+        * If not, we use 4k and set mmu_ci_restrictions so that
+        * hash_page knows to switch processes that use cache-inhibited
+        * mappings to 4k pages.
+        */
+       if (mmu_psize_defs[MMU_PAGE_64K].shift) {
+               mmu_virtual_psize = MMU_PAGE_64K;
+               mmu_vmalloc_psize = MMU_PAGE_64K;
+               if (mmu_linear_psize == MMU_PAGE_4K)
+                       mmu_linear_psize = MMU_PAGE_64K;
+               if (mmu_has_feature(MMU_FTR_CI_LARGE_PAGE)) {
+                       /*
+                        * When running on pSeries, using 64k pages for ioremap
+                        * would stop us from accessing the HEA ethernet. So if we
+                        * have the chance of ever seeing one, stay at 4k.
+                        */
+                       if (!might_have_hea())
+                               mmu_io_psize = MMU_PAGE_64K;
+               } else
+                       mmu_ci_restrictions = 1;
+       }
+#endif /* CONFIG_PPC_64K_PAGES */
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+       /*
+        * We try to use 16M pages for vmemmap if that is supported
+        * and we have at least 1G of RAM at boot
+        */
+       if (mmu_psize_defs[MMU_PAGE_16M].shift &&
+           memblock_phys_mem_size() >= 0x40000000)
+               mmu_vmemmap_psize = MMU_PAGE_16M;
+       else if (mmu_psize_defs[MMU_PAGE_64K].shift)
+               mmu_vmemmap_psize = MMU_PAGE_64K;
+       else
+               mmu_vmemmap_psize = MMU_PAGE_4K;
+#endif /* CONFIG_SPARSEMEM_VMEMMAP */
+
+       printk(KERN_DEBUG "Page orders: linear mapping = %d, "
+              "virtual = %d, io = %d"
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+              ", vmemmap = %d"
+#endif
+              "\n",
+              mmu_psize_defs[mmu_linear_psize].shift,
+              mmu_psize_defs[mmu_virtual_psize].shift,
+              mmu_psize_defs[mmu_io_psize].shift
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+              ,mmu_psize_defs[mmu_vmemmap_psize].shift
+#endif
+              );
+}
+
+static int __init htab_dt_scan_pftsize(unsigned long node,
+                                      const char *uname, int depth,
+                                      void *data)
+{
+       const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
+       const __be32 *prop;
+
+       /* We are scanning "cpu" nodes only */
+       if (type == NULL || strcmp(type, "cpu") != 0)
+               return 0;
+
+       prop = of_get_flat_dt_prop(node, "ibm,pft-size", NULL);
+       if (prop != NULL) {
+               /* pft_size[0] is the NUMA CEC cookie */
+               ppc64_pft_size = be32_to_cpu(prop[1]);
+               return 1;
+       }
+       return 0;
+}
+
+unsigned htab_shift_for_mem_size(unsigned long mem_size)
+{
+       unsigned memshift = __ilog2(mem_size);
+       unsigned pshift = mmu_psize_defs[mmu_virtual_psize].shift;
+       unsigned pteg_shift;
+
+       /* round mem_size up to next power of 2 */
+       if ((1UL << memshift) < mem_size)
+               memshift += 1;
+
+       /* aim for 2 pages / pteg */
+       pteg_shift = memshift - (pshift + 1);
+
+       /*
+        * 2^11 PTEGs of 128 bytes each, i.e. 2^18 bytes, is the minimum htab
+        * size permitted by the architecture.
+        */
+       return max(pteg_shift + 7, 18U);
+}
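+
+/*
+ * Worked example (illustrative): with 8GB of RAM (memshift = 33) and a 64K
+ * virtual page size (pshift = 16), pteg_shift = 33 - 17 = 16, i.e. 2^16
+ * PTEGs for 2^17 pages, two pages per PTEG.  The returned shift is
+ * 16 + 7 = 23, an 8MB hash table; anything smaller is rounded up to the
+ * architectural minimum of 2^18 bytes.
+ */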
+
+static unsigned long __init htab_get_table_size(void)
+{
+       /*
+        * If hash size isn't already provided by the platform, we try to
+        * retrieve it from the device-tree. If it's not there either, we
+        * calculate it now based on the total RAM size
+        */
+       if (ppc64_pft_size == 0)
+               of_scan_flat_dt(htab_dt_scan_pftsize, NULL);
+       if (ppc64_pft_size)
+               return 1UL << ppc64_pft_size;
+
+       return 1UL << htab_shift_for_mem_size(memblock_phys_mem_size());
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+int resize_hpt_for_hotplug(unsigned long new_mem_size)
+{
+       unsigned target_hpt_shift;
+
+       if (!mmu_hash_ops.resize_hpt)
+               return 0;
+
+       target_hpt_shift = htab_shift_for_mem_size(new_mem_size);
+
+       /*
+        * To avoid lots of HPT resizes if memory size is fluctuating
+        * across a boundary, we deliberately have some hysteresis
+        * here: we immediately increase the HPT size if the target
+        * shift exceeds the current shift, but we won't attempt to
+        * reduce unless the target shift is at least 2 below the
+        * current shift
+        */
+       if (target_hpt_shift > ppc64_pft_size ||
+           target_hpt_shift < ppc64_pft_size - 1)
+               return mmu_hash_ops.resize_hpt(target_hpt_shift);
+
+       return 0;
+}
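+
+/*
+ * Worked example of the hysteresis above (illustrative): with a current
+ * ppc64_pft_size of 24 (a 16MB HPT), any target shift of 25 or more grows
+ * the HPT immediately, a shrink is only attempted once the target shift
+ * drops to 22 or below, and a target of 23 or 24 leaves the HPT alone.
+ */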
+
+int hash__create_section_mapping(unsigned long start, unsigned long end, int nid)
+{
+       int rc;
+
+       if (end >= H_VMALLOC_START) {
+               pr_warn("Outside the supported range\n");
+               return -1;
+       }
+
+       rc = htab_bolt_mapping(start, end, __pa(start),
+                              pgprot_val(PAGE_KERNEL), mmu_linear_psize,
+                              mmu_kernel_ssize);
+
+       if (rc < 0) {
+               int rc2 = htab_remove_mapping(start, end, mmu_linear_psize,
+                                             mmu_kernel_ssize);
+               BUG_ON(rc2 && (rc2 != -ENOENT));
+       }
+       return rc;
+}
+
+int hash__remove_section_mapping(unsigned long start, unsigned long end)
+{
+       int rc = htab_remove_mapping(start, end, mmu_linear_psize,
+                                    mmu_kernel_ssize);
+       WARN_ON(rc < 0);
+       return rc;
+}
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
+static void __init hash_init_partition_table(phys_addr_t hash_table,
+                                            unsigned long htab_size)
+{
+       mmu_partition_table_init();
+
+       /*
+        * PS field (VRMA page size) is not used for LPID 0, hence set to 0.
+        * For now, UPRT is 0 and we have no segment table.
+        */
+       htab_size =  __ilog2(htab_size) - 18;
+       mmu_partition_table_set_entry(0, hash_table | htab_size, 0);
+       pr_info("Partition table %p\n", partition_tb);
+}
+
+static void __init htab_initialize(void)
+{
+       unsigned long table;
+       unsigned long pteg_count;
+       unsigned long prot;
+       unsigned long base = 0, size = 0;
+       struct memblock_region *reg;
+
+       DBG(" -> htab_initialize()\n");
+
+       if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) {
+               mmu_kernel_ssize = MMU_SEGSIZE_1T;
+               mmu_highuser_ssize = MMU_SEGSIZE_1T;
+               printk(KERN_INFO "Using 1TB segments\n");
+       }
+
+       /*
+        * Calculate the required size of the htab.  We want the number of
+        * PTEGs to equal one half the number of real pages.
+        */ 
+       htab_size_bytes = htab_get_table_size();
+       pteg_count = htab_size_bytes >> 7;
+
+       htab_hash_mask = pteg_count - 1;
+
+       if (firmware_has_feature(FW_FEATURE_LPAR) ||
+           firmware_has_feature(FW_FEATURE_PS3_LV1)) {
+               /* Using a hypervisor which owns the htab */
+               htab_address = NULL;
+               _SDR1 = 0; 
+               /*
+                * On POWER9, we need to do a H_REGISTER_PROC_TBL hcall
+                * to inform the hypervisor that we wish to use the HPT.
+                */
+               if (cpu_has_feature(CPU_FTR_ARCH_300))
+                       register_process_table(0, 0, 0);
+#ifdef CONFIG_FA_DUMP
+               /*
+                * If firmware-assisted dump is active, firmware preserves
+                * the contents of the htab along with the entire partition memory.
+                * Clear the htab if firmware-assisted dump is active so
+                * that we don't end up using old mappings.
+                */
+               if (is_fadump_active() && mmu_hash_ops.hpte_clear_all)
+                       mmu_hash_ops.hpte_clear_all();
+#endif
+       } else {
+               unsigned long limit = MEMBLOCK_ALLOC_ANYWHERE;
+
+#ifdef CONFIG_PPC_CELL
+               /*
+                * Cell may require the hash table down low when using the
+                * Axon IOMMU in order to fit the dynamic region over it, see
+                * comments in cell/iommu.c
+                */
+               if (fdt_subnode_offset(initial_boot_params, 0, "axon") > 0) {
+                       limit = 0x80000000;
+                       pr_info("Hash table forced below 2G for Axon IOMMU\n");
+               }
+#endif /* CONFIG_PPC_CELL */
+
+               table = memblock_phys_alloc_range(htab_size_bytes,
+                                                 htab_size_bytes,
+                                                 0, limit);
+               if (!table)
+                       panic("ERROR: Failed to allocate %pa bytes below %pa\n",
+                             &htab_size_bytes, &limit);
+
+               DBG("Hash table allocated at %lx, size: %lx\n", table,
+                   htab_size_bytes);
+
+               htab_address = __va(table);
+
+               /* htab absolute addr + encoded htabsize */
+               _SDR1 = table + __ilog2(htab_size_bytes) - 18;
+
+               /* Initialize the HPT with no entries */
+               memset((void *)table, 0, htab_size_bytes);
+
+               if (!cpu_has_feature(CPU_FTR_ARCH_300))
+                       /* Set SDR1 */
+                       mtspr(SPRN_SDR1, _SDR1);
+               else
+                       hash_init_partition_table(table, htab_size_bytes);
+       }
+
+       prot = pgprot_val(PAGE_KERNEL);
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+       if (debug_pagealloc_enabled()) {
+               linear_map_hash_count = memblock_end_of_DRAM() >> PAGE_SHIFT;
+               linear_map_hash_slots = memblock_alloc_try_nid(
+                               linear_map_hash_count, 1, MEMBLOCK_LOW_LIMIT,
+                               ppc64_rma_size, NUMA_NO_NODE);
+               if (!linear_map_hash_slots)
+                       panic("%s: Failed to allocate %lu bytes max_addr=%pa\n",
+                             __func__, linear_map_hash_count, &ppc64_rma_size);
+       }
+#endif /* CONFIG_DEBUG_PAGEALLOC */
+
+       /* create the bolted linear mapping in the hash table */
+       for_each_memblock(memory, reg) {
+               base = (unsigned long)__va(reg->base);
+               size = reg->size;
+
+               DBG("creating mapping for region: %lx..%lx (prot: %lx)\n",
+                   base, size, prot);
+
+               if ((base + size) >= H_VMALLOC_START) {
+                       pr_warn("Outside the supported range\n");
+                       continue;
+               }
+
+               BUG_ON(htab_bolt_mapping(base, base + size, __pa(base),
+                               prot, mmu_linear_psize, mmu_kernel_ssize));
+       }
+       memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);
+
+       /*
+        * If we have a memory_limit and we've allocated TCEs then we need to
+        * explicitly map the TCE area at the top of RAM. We also cope with the
+        * case that the TCEs start below memory_limit.
+        * tce_alloc_start/end are 16MB aligned so the mapping should work
+        * for either 4K or 16MB pages.
+        */
+       if (tce_alloc_start) {
+               tce_alloc_start = (unsigned long)__va(tce_alloc_start);
+               tce_alloc_end = (unsigned long)__va(tce_alloc_end);
+
+               if (base + size >= tce_alloc_start)
+                       tce_alloc_start = base + size + 1;
+
+               BUG_ON(htab_bolt_mapping(tce_alloc_start, tce_alloc_end,
+                                        __pa(tce_alloc_start), prot,
+                                        mmu_linear_psize, mmu_kernel_ssize));
+       }
+
+       DBG(" <- htab_initialize()\n");
+}
+#undef KB
+#undef MB
+
+void __init hash__early_init_devtree(void)
+{
+       /* Initialize segment sizes */
+       of_scan_flat_dt(htab_dt_scan_seg_sizes, NULL);
+
+       /* Initialize page sizes */
+       htab_scan_page_sizes();
+}
+
+struct hash_mm_context init_hash_mm_context;
+void __init hash__early_init_mmu(void)
+{
+#ifndef CONFIG_PPC_64K_PAGES
+       /*
+        * We have code in __hash_page_4K() and elsewhere, which assumes it can
+        * do the following:
+        *   new_pte |= (slot << H_PAGE_F_GIX_SHIFT) & (H_PAGE_F_SECOND | H_PAGE_F_GIX);
+        *
+        * Where the slot number is between 0-15, and values of 8-15 indicate
+        * the secondary bucket. For that code to work H_PAGE_F_SECOND and
+        * H_PAGE_F_GIX must occupy four contiguous bits in the PTE, and
+        * H_PAGE_F_SECOND must be placed above H_PAGE_F_GIX. Assert that here
+        * with a BUILD_BUG_ON().
+        */
+       BUILD_BUG_ON(H_PAGE_F_SECOND != (1ul  << (H_PAGE_F_GIX_SHIFT + 3)));
+#endif /* CONFIG_PPC_64K_PAGES */
+
+       htab_init_page_sizes();
+
+       /*
+        * initialize page table size
+        */
+       __pte_frag_nr = H_PTE_FRAG_NR;
+       __pte_frag_size_shift = H_PTE_FRAG_SIZE_SHIFT;
+       __pmd_frag_nr = H_PMD_FRAG_NR;
+       __pmd_frag_size_shift = H_PMD_FRAG_SIZE_SHIFT;
+
+       __pte_index_size = H_PTE_INDEX_SIZE;
+       __pmd_index_size = H_PMD_INDEX_SIZE;
+       __pud_index_size = H_PUD_INDEX_SIZE;
+       __pgd_index_size = H_PGD_INDEX_SIZE;
+       __pud_cache_index = H_PUD_CACHE_INDEX;
+       __pte_table_size = H_PTE_TABLE_SIZE;
+       __pmd_table_size = H_PMD_TABLE_SIZE;
+       __pud_table_size = H_PUD_TABLE_SIZE;
+       __pgd_table_size = H_PGD_TABLE_SIZE;
+       /*
+        * 4K uses the hugepd format, so for hash set them to
+        * zero
+        */
+       __pmd_val_bits = HASH_PMD_VAL_BITS;
+       __pud_val_bits = HASH_PUD_VAL_BITS;
+       __pgd_val_bits = HASH_PGD_VAL_BITS;
+
+       __kernel_virt_start = H_KERN_VIRT_START;
+       __vmalloc_start = H_VMALLOC_START;
+       __vmalloc_end = H_VMALLOC_END;
+       __kernel_io_start = H_KERN_IO_START;
+       __kernel_io_end = H_KERN_IO_END;
+       vmemmap = (struct page *)H_VMEMMAP_START;
+       ioremap_bot = IOREMAP_BASE;
+
+#ifdef CONFIG_PCI
+       pci_io_base = ISA_IO_BASE;
+#endif
+
+       /* Select appropriate backend */
+       if (firmware_has_feature(FW_FEATURE_PS3_LV1))
+               ps3_early_mm_init();
+       else if (firmware_has_feature(FW_FEATURE_LPAR))
+               hpte_init_pseries();
+       else if (IS_ENABLED(CONFIG_PPC_NATIVE))
+               hpte_init_native();
+
+       if (!mmu_hash_ops.hpte_insert)
+               panic("hash__early_init_mmu: No MMU hash ops defined!\n");
+
+       /*
+        * Initialize the MMU Hash table and create the linear mapping
+        * of memory. Has to be done before SLB initialization as this is
+        * currently where the page size encoding is obtained.
+        */
+       htab_initialize();
+
+       init_mm.context.hash_context = &init_hash_mm_context;
+       init_mm.context.hash_context->slb_addr_limit = DEFAULT_MAP_WINDOW_USER64;
+
+       pr_info("Initializing hash mmu with SLB\n");
+       /* Initialize SLB management */
+       slb_initialize();
+
+       if (cpu_has_feature(CPU_FTR_ARCH_206)
+                       && cpu_has_feature(CPU_FTR_HVMODE))
+               tlbiel_all();
+}
+
+#ifdef CONFIG_SMP
+void hash__early_init_mmu_secondary(void)
+{
+       /* Initialize hash table for that CPU */
+       if (!firmware_has_feature(FW_FEATURE_LPAR)) {
+
+               if (!cpu_has_feature(CPU_FTR_ARCH_300))
+                       mtspr(SPRN_SDR1, _SDR1);
+               else
+                       mtspr(SPRN_PTCR,
+                             __pa(partition_tb) | (PATB_SIZE_SHIFT - 12));
+       }
+       /* Initialize SLB */
+       slb_initialize();
+
+       if (cpu_has_feature(CPU_FTR_ARCH_206)
+                       && cpu_has_feature(CPU_FTR_HVMODE))
+               tlbiel_all();
+}
+#endif /* CONFIG_SMP */
+
+/*
+ * Called by asm code in hashtable.S to do a lazy icache flush.
+ */
+unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap)
+{
+       struct page *page;
+
+       if (!pfn_valid(pte_pfn(pte)))
+               return pp;
+
+       page = pte_page(pte);
+
+       /* page is dirty */
+       if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
+               if (trap == 0x400) {
+                       flush_dcache_icache_page(page);
+                       set_bit(PG_arch_1, &page->flags);
+               } else
+                       pp |= HPTE_R_N;
+       }
+       return pp;
+}
+
+#ifdef CONFIG_PPC_MM_SLICES
+static unsigned int get_paca_psize(unsigned long addr)
+{
+       unsigned char *psizes;
+       unsigned long index, mask_index;
+
+       if (addr < SLICE_LOW_TOP) {
+               psizes = get_paca()->mm_ctx_low_slices_psize;
+               index = GET_LOW_SLICE_INDEX(addr);
+       } else {
+               psizes = get_paca()->mm_ctx_high_slices_psize;
+               index = GET_HIGH_SLICE_INDEX(addr);
+       }
+       mask_index = index & 0x1;
+       return (psizes[index >> 1] >> (mask_index * 4)) & 0xF;
+}
+
+#else
+unsigned int get_paca_psize(unsigned long addr)
+{
+       return get_paca()->mm_ctx_user_psize;
+}
+#endif
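+
+/*
+ * Worked example for the slice lookup above (illustrative, assuming 256MB
+ * low slices): for ea = 0x30000000 the low slice index is 3, so the page
+ * size index is the upper nibble of mm_ctx_low_slices_psize[1], i.e.
+ * (psizes[3 >> 1] >> 4) & 0xF.  Each byte packs two 4-bit MMU_PAGE_*
+ * indices, one per slice.
+ */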
+
+/*
+ * Demote a segment to using 4k pages.
+ * For now this makes the whole process use 4k pages.
+ */
+#ifdef CONFIG_PPC_64K_PAGES
+void demote_segment_4k(struct mm_struct *mm, unsigned long addr)
+{
+       if (get_slice_psize(mm, addr) == MMU_PAGE_4K)
+               return;
+       slice_set_range_psize(mm, addr, 1, MMU_PAGE_4K);
+       copro_flush_all_slbs(mm);
+       if ((get_paca_psize(addr) != MMU_PAGE_4K) && (current->mm == mm)) {
+
+               copy_mm_to_paca(mm);
+               slb_flush_and_restore_bolted();
+       }
+}
+#endif /* CONFIG_PPC_64K_PAGES */
+
+#ifdef CONFIG_PPC_SUBPAGE_PROT
+/*
+ * This looks up a 2-bit protection code for a 4k subpage of a 64k page.
+ * Userspace sets the subpage permissions using the subpage_prot system call.
+ *
+ * Result is 0: full permissions, _PAGE_WRITE: read-only,
+ * _PAGE_RWX: no access.
+ */
+static int subpage_protection(struct mm_struct *mm, unsigned long ea)
+{
+       struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context);
+       u32 spp = 0;
+       u32 **sbpm, *sbpp;
+
+       if (!spt)
+               return 0;
+
+       if (ea >= spt->maxaddr)
+               return 0;
+       if (ea < 0x100000000UL) {
+               /* addresses below 4GB use spt->low_prot */
+               sbpm = spt->low_prot;
+       } else {
+               sbpm = spt->protptrs[ea >> SBP_L3_SHIFT];
+               if (!sbpm)
+                       return 0;
+       }
+       sbpp = sbpm[(ea >> SBP_L2_SHIFT) & (SBP_L2_COUNT - 1)];
+       if (!sbpp)
+               return 0;
+       spp = sbpp[(ea >> PAGE_SHIFT) & (SBP_L1_COUNT - 1)];
+
+       /* extract 2-bit bitfield for this 4k subpage */
+       spp >>= 30 - 2 * ((ea >> 12) & 0xf);
+
+       /*
+        * 0 -> full permission
+        * 1 -> read only
+        * 2 -> no access.
+        * We return the flags that need to be cleared.
+        */
+       spp = ((spp & 2) ? _PAGE_RWX : 0) | ((spp & 1) ? _PAGE_WRITE : 0);
+       return spp;
+}
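+
+/*
+ * Worked example (illustrative): each 32-bit word of the subpage_prot tables
+ * packs sixteen 2-bit fields, one per 4K subpage of a 64K page, with subpage
+ * 0 in bits 31:30.  For an ea with (ea >> 12) & 0xf == 2 the shift is
+ * 30 - 2 * 2 = 26; a field value of 1 makes us return _PAGE_WRITE (demote
+ * to read-only) and a value of 2 makes us return _PAGE_RWX (no access).
+ */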
+
+#else /* CONFIG_PPC_SUBPAGE_PROT */
+static inline int subpage_protection(struct mm_struct *mm, unsigned long ea)
+{
+       return 0;
+}
+#endif
+
+void hash_failure_debug(unsigned long ea, unsigned long access,
+                       unsigned long vsid, unsigned long trap,
+                       int ssize, int psize, int lpsize, unsigned long pte)
+{
+       if (!printk_ratelimit())
+               return;
+       pr_info("mm: Hashing failure ! EA=0x%lx access=0x%lx current=%s\n",
+               ea, access, current->comm);
+       pr_info("    trap=0x%lx vsid=0x%lx ssize=%d base psize=%d psize %d pte=0x%lx\n",
+               trap, vsid, ssize, psize, lpsize, pte);
+}
+
+static void check_paca_psize(unsigned long ea, struct mm_struct *mm,
+                            int psize, bool user_region)
+{
+       if (user_region) {
+               if (psize != get_paca_psize(ea)) {
+                       copy_mm_to_paca(mm);
+                       slb_flush_and_restore_bolted();
+               }
+       } else if (get_paca()->vmalloc_sllp !=
+                  mmu_psize_defs[mmu_vmalloc_psize].sllp) {
+               get_paca()->vmalloc_sllp =
+                       mmu_psize_defs[mmu_vmalloc_psize].sllp;
+               slb_vmalloc_update();
+       }
+}
+
+/*
+ * Result code is:
+ *  0 - handled
+ *  1 - normal page fault
+ * -1 - critical hash insertion error
+ * -2 - access not permitted by subpage protection mechanism
+ */
+int hash_page_mm(struct mm_struct *mm, unsigned long ea,
+                unsigned long access, unsigned long trap,
+                unsigned long flags)
+{
+       bool is_thp;
+       enum ctx_state prev_state = exception_enter();
+       pgd_t *pgdir;
+       unsigned long vsid;
+       pte_t *ptep;
+       unsigned hugeshift;
+       int rc, user_region = 0;
+       int psize, ssize;
+
+       DBG_LOW("hash_page(ea=%016lx, access=%lx, trap=%lx\n",
+               ea, access, trap);
+       trace_hash_fault(ea, access, trap);
+
+       /* Get region & vsid */
+       switch (get_region_id(ea)) {
+       case USER_REGION_ID:
+               user_region = 1;
+               if (!mm) {
+                       DBG_LOW(" user region with no mm !\n");
+                       rc = 1;
+                       goto bail;
+               }
+               psize = get_slice_psize(mm, ea);
+               ssize = user_segment_size(ea);
+               vsid = get_user_vsid(&mm->context, ea, ssize);
+               break;
+       case VMALLOC_REGION_ID:
+               vsid = get_kernel_vsid(ea, mmu_kernel_ssize);
+               psize = mmu_vmalloc_psize;
+               ssize = mmu_kernel_ssize;
+               break;
+
+       case IO_REGION_ID:
+               vsid = get_kernel_vsid(ea, mmu_kernel_ssize);
+               psize = mmu_io_psize;
+               ssize = mmu_kernel_ssize;
+               break;
+       default:
+               /*
+                * Not a valid range
+                * Send the problem up to do_page_fault()
+                */
+               rc = 1;
+               goto bail;
+       }
+       DBG_LOW(" mm=%p, mm->pgdir=%p, vsid=%016lx\n", mm, mm->pgd, vsid);
+
+       /* Bad address. */
+       if (!vsid) {
+               DBG_LOW("Bad address!\n");
+               rc = 1;
+               goto bail;
+       }
+       /* Get pgdir */
+       pgdir = mm->pgd;
+       if (pgdir == NULL) {
+               rc = 1;
+               goto bail;
+       }
+
+       /* Check CPU locality */
+       if (user_region && mm_is_thread_local(mm))
+               flags |= HPTE_LOCAL_UPDATE;
+
+#ifndef CONFIG_PPC_64K_PAGES
+       /*
+        * If we use 4K pages and our psize is not 4K, then we might
+        * be hitting a special driver mapping, and need to align the
+        * address before we fetch the PTE.
+        *
+        * It could also be a hugepage mapping, in which case this is
+        * not necessary, but it's not harmful, either.
+        */
+       if (psize != MMU_PAGE_4K)
+               ea &= ~((1ul << mmu_psize_defs[psize].shift) - 1);
+#endif /* CONFIG_PPC_64K_PAGES */
+
+       /* Get PTE and page size from page tables */
+       ptep = find_linux_pte(pgdir, ea, &is_thp, &hugeshift);
+       if (ptep == NULL || !pte_present(*ptep)) {
+               DBG_LOW(" no PTE !\n");
+               rc = 1;
+               goto bail;
+       }
+
+       /* Add _PAGE_PRESENT to the required access perm */
+       access |= _PAGE_PRESENT;
+
+       /*
+        * Pre-check access permissions (will be re-checked atomically
+        * in __hash_page_XX but this pre-check is a fast path)
+        */
+       if (!check_pte_access(access, pte_val(*ptep))) {
+               DBG_LOW(" no access !\n");
+               rc = 1;
+               goto bail;
+       }
+
+       if (hugeshift) {
+               if (is_thp)
+                       rc = __hash_page_thp(ea, access, vsid, (pmd_t *)ptep,
+                                            trap, flags, ssize, psize);
+#ifdef CONFIG_HUGETLB_PAGE
+               else
+                       rc = __hash_page_huge(ea, access, vsid, ptep, trap,
+                                             flags, ssize, hugeshift, psize);
+#else
+               else {
+                       /*
+                        * If we get a hugepage shift here that is not a
+                        * transparent hugepage while hugetlb is disabled,
+                        * something is really wrong.
+                        */
+                       rc = 1;
+                       WARN_ON(1);
+               }
+#endif
+               if (current->mm == mm)
+                       check_paca_psize(ea, mm, psize, user_region);
+
+               goto bail;
+       }
+
+#ifndef CONFIG_PPC_64K_PAGES
+       DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep));
+#else
+       DBG_LOW(" i-pte: %016lx %016lx\n", pte_val(*ptep),
+               pte_val(*(ptep + PTRS_PER_PTE)));
+#endif
+       /* Do actual hashing */
+#ifdef CONFIG_PPC_64K_PAGES
+       /* If H_PAGE_4K_PFN is set, make sure this is a 4k segment */
+       if ((pte_val(*ptep) & H_PAGE_4K_PFN) && psize == MMU_PAGE_64K) {
+               demote_segment_4k(mm, ea);
+               psize = MMU_PAGE_4K;
+       }
+
+       /*
+        * If this PTE is non-cacheable and we have restrictions on
+        * using non cacheable large pages, then we switch to 4k
+        */
+       if (mmu_ci_restrictions && psize == MMU_PAGE_64K && pte_ci(*ptep)) {
+               if (user_region) {
+                       demote_segment_4k(mm, ea);
+                       psize = MMU_PAGE_4K;
+               } else if (ea < VMALLOC_END) {
+                       /*
+                        * some driver did a non-cacheable mapping
+                        * in vmalloc space, so switch vmalloc
+                        * to 4k pages
+                        */
+                       printk(KERN_ALERT "Reducing vmalloc segment "
+                              "to 4kB pages because of "
+                              "non-cacheable mapping\n");
+                       psize = mmu_vmalloc_psize = MMU_PAGE_4K;
+                       copro_flush_all_slbs(mm);
+               }
+       }
+
+#endif /* CONFIG_PPC_64K_PAGES */
+
+       if (current->mm == mm)
+               check_paca_psize(ea, mm, psize, user_region);
+
+#ifdef CONFIG_PPC_64K_PAGES
+       if (psize == MMU_PAGE_64K)
+               rc = __hash_page_64K(ea, access, vsid, ptep, trap,
+                                    flags, ssize);
+       else
+#endif /* CONFIG_PPC_64K_PAGES */
+       {
+               int spp = subpage_protection(mm, ea);
+               if (access & spp)
+                       rc = -2;
+               else
+                       rc = __hash_page_4K(ea, access, vsid, ptep, trap,
+                                           flags, ssize, spp);
+       }
+
+       /*
+        * Dump some info in case of hash insertion failure; such failures
+        * should never happen, so it is really useful to know if/when they do.
+        */
+       if (rc == -1)
+               hash_failure_debug(ea, access, vsid, trap, ssize, psize,
+                                  psize, pte_val(*ptep));
+#ifndef CONFIG_PPC_64K_PAGES
+       DBG_LOW(" o-pte: %016lx\n", pte_val(*ptep));
+#else
+       DBG_LOW(" o-pte: %016lx %016lx\n", pte_val(*ptep),
+               pte_val(*(ptep + PTRS_PER_PTE)));
+#endif
+       DBG_LOW(" -> rc=%d\n", rc);
+
+bail:
+       exception_exit(prev_state);
+       return rc;
+}
+EXPORT_SYMBOL_GPL(hash_page_mm);
+
+int hash_page(unsigned long ea, unsigned long access, unsigned long trap,
+             unsigned long dsisr)
+{
+       unsigned long flags = 0;
+       struct mm_struct *mm = current->mm;
+
+       if ((get_region_id(ea) == VMALLOC_REGION_ID) ||
+           (get_region_id(ea) == IO_REGION_ID))
+               mm = &init_mm;
+
+       if (dsisr & DSISR_NOHPTE)
+               flags |= HPTE_NOHPTE_UPDATE;
+
+       return hash_page_mm(mm, ea, access, trap, flags);
+}
+EXPORT_SYMBOL_GPL(hash_page);
+
+int __hash_page(unsigned long ea, unsigned long msr, unsigned long trap,
+               unsigned long dsisr)
+{
+       unsigned long access = _PAGE_PRESENT | _PAGE_READ;
+       unsigned long flags = 0;
+       struct mm_struct *mm = current->mm;
+       unsigned int region_id = get_region_id(ea);
+
+       if ((region_id == VMALLOC_REGION_ID) || (region_id == IO_REGION_ID))
+               mm = &init_mm;
+
+       if (dsisr & DSISR_NOHPTE)
+               flags |= HPTE_NOHPTE_UPDATE;
+
+       if (dsisr & DSISR_ISSTORE)
+               access |= _PAGE_WRITE;
+       /*
+        * We set _PAGE_PRIVILEGED only when
+        * kernel mode accesses kernel space.
+        *
+        * _PAGE_PRIVILEGED is NOT set
+        * 1) when kernel mode accesses user space
+        * 2) when user space accesses kernel space.
+        */
+       access |= _PAGE_PRIVILEGED;
+       if ((msr & MSR_PR) || (region_id == USER_REGION_ID))
+               access &= ~_PAGE_PRIVILEGED;
+
+       if (trap == 0x400)
+               access |= _PAGE_EXEC;
+
+       return hash_page_mm(mm, ea, access, trap, flags);
+}
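+
+/*
+ * Worked example of the access derivation above (illustrative): a store
+ * fault from userspace (MSR_PR set, DSISR_ISSTORE set, no DSISR_NOHPTE)
+ * ends up with access = _PAGE_PRESENT | _PAGE_READ | _PAGE_WRITE and
+ * flags = 0, while a kernel instruction fetch fault in the vmalloc region
+ * (trap == 0x400, MSR_PR clear) keeps _PAGE_PRIVILEGED and adds _PAGE_EXEC.
+ */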
+
+#ifdef CONFIG_PPC_MM_SLICES
+static bool should_hash_preload(struct mm_struct *mm, unsigned long ea)
+{
+       int psize = get_slice_psize(mm, ea);
+
+       /* We only prefault standard pages for now */
+       if (unlikely(psize != mm_ctx_user_psize(&mm->context)))
+               return false;
+
+       /*
+        * Don't prefault if subpage protection is enabled for the EA.
+        */
+       if (unlikely((psize == MMU_PAGE_4K) && subpage_protection(mm, ea)))
+               return false;
+
+       return true;
+}
+#else
+static bool should_hash_preload(struct mm_struct *mm, unsigned long ea)
+{
+       return true;
+}
+#endif
+
+void hash_preload(struct mm_struct *mm, unsigned long ea,
+                 bool is_exec, unsigned long trap)
+{
+       int hugepage_shift;
+       unsigned long vsid;
+       pgd_t *pgdir;
+       pte_t *ptep;
+       unsigned long flags;
+       int rc, ssize, update_flags = 0;
+       unsigned long access = _PAGE_PRESENT | _PAGE_READ | (is_exec ? _PAGE_EXEC : 0);
+
+       BUG_ON(get_region_id(ea) != USER_REGION_ID);
+
+       if (!should_hash_preload(mm, ea))
+               return;
+
+       DBG_LOW("hash_preload(mm=%p, mm->pgdir=%p, ea=%016lx, access=%lx,"
+               " trap=%lx\n", mm, mm->pgd, ea, access, trap);
+
+       /* Get Linux PTE if available */
+       pgdir = mm->pgd;
+       if (pgdir == NULL)
+               return;
+
+       /* Get VSID */
+       ssize = user_segment_size(ea);
+       vsid = get_user_vsid(&mm->context, ea, ssize);
+       if (!vsid)
+               return;
+       /*
+        * Hash doesn't like irqs. Walking the Linux page table with irqs
+        * disabled saves us from holding multiple locks.
+        */
+       local_irq_save(flags);
+
+       /*
+        * THP pages use update_mmu_cache_pmd. We don't do
+        * hash preload there. Hence we can ignore THP here.
+        */
+       ptep = find_current_mm_pte(pgdir, ea, NULL, &hugepage_shift);
+       if (!ptep)
+               goto out_exit;
+
+       WARN_ON(hugepage_shift);
+#ifdef CONFIG_PPC_64K_PAGES
+       /*
+        * If either H_PAGE_4K_PFN or cache inhibited is set (and we are on
+        * a 64K kernel), then we don't preload; hash_page() will take
+        * care of it once we actually try to access the page.
+        * That way we don't have to duplicate all of the logic for segment
+        * page size demotion here.
+        */
+       if ((pte_val(*ptep) & H_PAGE_4K_PFN) || pte_ci(*ptep))
+               goto out_exit;
+#endif /* CONFIG_PPC_64K_PAGES */
+
+       /* Is that local to this CPU ? */
+       if (mm_is_thread_local(mm))
+               update_flags |= HPTE_LOCAL_UPDATE;
+
+       /* Hash it in */
+#ifdef CONFIG_PPC_64K_PAGES
+       if (mm_ctx_user_psize(&mm->context) == MMU_PAGE_64K)
+               rc = __hash_page_64K(ea, access, vsid, ptep, trap,
+                                    update_flags, ssize);
+       else
+#endif /* CONFIG_PPC_64K_PAGES */
+               rc = __hash_page_4K(ea, access, vsid, ptep, trap, update_flags,
+                                   ssize, subpage_protection(mm, ea));
+
+       /*
+        * Dump some info in case of hash insertion failure; such failures
+        * should never happen, so it is really useful to know if/when they do.
+        */
+       if (rc == -1)
+               hash_failure_debug(ea, access, vsid, trap, ssize,
+                                  mm_ctx_user_psize(&mm->context),
+                                  mm_ctx_user_psize(&mm->context),
+                                  pte_val(*ptep));
+out_exit:
+       local_irq_restore(flags);
+}
+
+#ifdef CONFIG_PPC_MEM_KEYS
+/*
+ * Return the protection key associated with the given address and the
+ * mm_struct.
+ */
+u16 get_mm_addr_key(struct mm_struct *mm, unsigned long address)
+{
+       pte_t *ptep;
+       u16 pkey = 0;
+       unsigned long flags;
+
+       if (!mm || !mm->pgd)
+               return 0;
+
+       local_irq_save(flags);
+       ptep = find_linux_pte(mm->pgd, address, NULL, NULL);
+       if (ptep)
+               pkey = pte_to_pkey_bits(pte_val(READ_ONCE(*ptep)));
+       local_irq_restore(flags);
+
+       return pkey;
+}
+#endif /* CONFIG_PPC_MEM_KEYS */
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+static inline void tm_flush_hash_page(int local)
+{
+       /*
+        * Transactions are not aborted by tlbiel, only tlbie. Without an abort,
+        * syncing a page back to a block device with PIO could pick up
+        * transactional data
+        * (bad!) so we force an abort here. Before the sync the page will be
+        * made read-only, which will flush_hash_page. BIG ISSUE here: if the
+        * kernel uses a page from userspace without unmapping it first, it may
+        * see the speculated version.
+        */
+       if (local && cpu_has_feature(CPU_FTR_TM) && current->thread.regs &&
+           MSR_TM_ACTIVE(current->thread.regs->msr)) {
+               tm_enable();
+               tm_abort(TM_CAUSE_TLBI);
+       }
+}
+#else
+static inline void tm_flush_hash_page(int local)
+{
+}
+#endif
+
+/*
+ * Return the global hash slot, corresponding to the given PTE, which contains
+ * the HPTE.
+ */
+unsigned long pte_get_hash_gslot(unsigned long vpn, unsigned long shift,
+               int ssize, real_pte_t rpte, unsigned int subpg_index)
+{
+       unsigned long hash, gslot, hidx;
+
+       hash = hpt_hash(vpn, shift, ssize);
+       hidx = __rpte_to_hidx(rpte, subpg_index);
+       if (hidx & _PTEIDX_SECONDARY)
+               hash = ~hash;
+       gslot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+       gslot += hidx & _PTEIDX_GROUP_IX;
+       return gslot;
+}
+
+/*
+ * WARNING: This is called from hash_low_64.S, if you change this prototype,
+ *          do not forget to update the assembly call site !
+ */
+void flush_hash_page(unsigned long vpn, real_pte_t pte, int psize, int ssize,
+                    unsigned long flags)
+{
+       unsigned long index, shift, gslot;
+       int local = flags & HPTE_LOCAL_UPDATE;
+
+       DBG_LOW("flush_hash_page(vpn=%016lx)\n", vpn);
+       pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) {
+               gslot = pte_get_hash_gslot(vpn, shift, ssize, pte, index);
+               DBG_LOW(" sub %ld: gslot=%lx\n", index, gslot);
+               /*
+                * We use same base page size and actual psize, because we don't
+                * use these functions for hugepage
+                */
+               mmu_hash_ops.hpte_invalidate(gslot, vpn, psize, psize,
+                                            ssize, local);
+       } pte_iterate_hashed_end();
+
+       tm_flush_hash_page(local);
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+void flush_hash_hugepage(unsigned long vsid, unsigned long addr,
+                        pmd_t *pmdp, unsigned int psize, int ssize,
+                        unsigned long flags)
+{
+       int i, max_hpte_count, valid;
+       unsigned long s_addr;
+       unsigned char *hpte_slot_array;
+       unsigned long hidx, shift, vpn, hash, slot;
+       int local = flags & HPTE_LOCAL_UPDATE;
+
+       s_addr = addr & HPAGE_PMD_MASK;
+       hpte_slot_array = get_hpte_slot_array(pmdp);
+       /*
+        * If we try to do a HUGE PTE update after a withdraw is done,
+        * we will find the slot array below to be NULL. This happens when
+        * we do split_huge_page_pmd().
+        */
+       if (!hpte_slot_array)
+               return;
+
+       if (mmu_hash_ops.hugepage_invalidate) {
+               mmu_hash_ops.hugepage_invalidate(vsid, s_addr, hpte_slot_array,
+                                                psize, ssize, local);
+               goto tm_abort;
+       }
+       /*
+        * No bulk HPTE removal support, invalidate each entry.
+        */
+       shift = mmu_psize_defs[psize].shift;
+       max_hpte_count = HPAGE_PMD_SIZE >> shift;
+       for (i = 0; i < max_hpte_count; i++) {
+               /*
+                * 8 bits per hpte entry:
+                * 000| [ secondary group (one bit) | hidx (3 bits) | valid bit]
+                */
+               valid = hpte_valid(hpte_slot_array, i);
+               if (!valid)
+                       continue;
+               hidx =  hpte_hash_index(hpte_slot_array, i);
+
+               /* get the vpn */
+               addr = s_addr + (i * (1ul << shift));
+               vpn = hpt_vpn(addr, vsid, ssize);
+               hash = hpt_hash(vpn, shift, ssize);
+               if (hidx & _PTEIDX_SECONDARY)
+                       hash = ~hash;
+
+               slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+               slot += hidx & _PTEIDX_GROUP_IX;
+               mmu_hash_ops.hpte_invalidate(slot, vpn, psize,
+                                            MMU_PAGE_16M, ssize, local);
+       }
+tm_abort:
+       tm_flush_hash_page(local);
+}
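+
+/*
+ * Worked example for the fallback loop above (illustrative, assuming a 16MB
+ * PMD-sized hugepage and a 64K base page size): max_hpte_count is
+ * 16M >> 16 = 256, so up to 256 slot-array bytes are examined; each valid
+ * one yields a vpn at s_addr + i * 64K and a hidx that selects the primary
+ * or secondary group and the slot within it to invalidate.
+ */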
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+void flush_hash_range(unsigned long number, int local)
+{
+       if (mmu_hash_ops.flush_hash_range)
+               mmu_hash_ops.flush_hash_range(number, local);
+       else {
+               int i;
+               struct ppc64_tlb_batch *batch =
+                       this_cpu_ptr(&ppc64_tlb_batch);
+
+               for (i = 0; i < number; i++)
+                       flush_hash_page(batch->vpn[i], batch->pte[i],
+                                       batch->psize, batch->ssize, local);
+       }
+}
+
+/*
+ * low_hash_fault is called when the low level hash code fails
+ * to insert a PTE due to a hypervisor error.
+ */
+void low_hash_fault(struct pt_regs *regs, unsigned long address, int rc)
+{
+       enum ctx_state prev_state = exception_enter();
+
+       if (user_mode(regs)) {
+#ifdef CONFIG_PPC_SUBPAGE_PROT
+               if (rc == -2)
+                       _exception(SIGSEGV, regs, SEGV_ACCERR, address);
+               else
+#endif
+                       _exception(SIGBUS, regs, BUS_ADRERR, address);
+       } else
+               bad_page_fault(regs, address, SIGBUS);
+
+       exception_exit(prev_state);
+}
+
+long hpte_insert_repeating(unsigned long hash, unsigned long vpn,
+                          unsigned long pa, unsigned long rflags,
+                          unsigned long vflags, int psize, int ssize)
+{
+       unsigned long hpte_group;
+       long slot;
+
+repeat:
+       hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+
+       /* Insert into the hash table, primary slot */
+       slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, vflags,
+                                       psize, psize, ssize);
+
+       /* Primary is full, try the secondary */
+       if (unlikely(slot == -1)) {
+               hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP;
+               slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags,
+                                               vflags | HPTE_V_SECONDARY,
+                                               psize, psize, ssize);
+               if (slot == -1) {
+                       if (mftb() & 0x1)
+                               hpte_group = (hash & htab_hash_mask) *
+                                               HPTES_PER_GROUP;
+
+                       mmu_hash_ops.hpte_remove(hpte_group);
+                       goto repeat;
+               }
+       }
+
+       return slot;
+}
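+
+/*
+ * Worked example (illustrative): with HPTES_PER_GROUP = 8, a hash of 0x1234
+ * maps to the primary group starting at slot 0x1234 * 8; if all eight slots
+ * are taken, the secondary group at (~0x1234 & htab_hash_mask) * 8 is tried,
+ * and if that is also full one of the two groups (picked by the timebase
+ * low bit) has an entry evicted before retrying.
+ */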
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi)
+{
+       unsigned long hash;
+       unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize);
+       unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize);
+       unsigned long mode = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL));
+       long ret;
+
+       hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize);
+
+       /* Don't create HPTE entries for bad address */
+       if (!vsid)
+               return;
+
+       ret = hpte_insert_repeating(hash, vpn, __pa(vaddr), mode,
+                                   HPTE_V_BOLTED,
+                                   mmu_linear_psize, mmu_kernel_ssize);
+
+       BUG_ON(ret < 0);
+       spin_lock(&linear_map_hash_lock);
+       BUG_ON(linear_map_hash_slots[lmi] & 0x80);
+       linear_map_hash_slots[lmi] = ret | 0x80;
+       spin_unlock(&linear_map_hash_lock);
+}
+
+static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi)
+{
+       unsigned long hash, hidx, slot;
+       unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize);
+       unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize);
+
+       hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize);
+       spin_lock(&linear_map_hash_lock);
+       BUG_ON(!(linear_map_hash_slots[lmi] & 0x80));
+       hidx = linear_map_hash_slots[lmi] & 0x7f;
+       linear_map_hash_slots[lmi] = 0;
+       spin_unlock(&linear_map_hash_lock);
+       if (hidx & _PTEIDX_SECONDARY)
+               hash = ~hash;
+       slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+       slot += hidx & _PTEIDX_GROUP_IX;
+       mmu_hash_ops.hpte_invalidate(slot, vpn, mmu_linear_psize,
+                                    mmu_linear_psize,
+                                    mmu_kernel_ssize, 0);
+}
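+
+/*
+ * Worked example of the slot bookkeeping above (illustrative): a successful
+ * insert returning slot 0x0b is stored as 0x8b in linear_map_hash_slots[],
+ * with the 0x80 bit marking the page as currently mapped; on unmap the low
+ * seven bits give back the hidx used to locate and invalidate the HPTE.
+ */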
+
+void __kernel_map_pages(struct page *page, int numpages, int enable)
+{
+       unsigned long flags, vaddr, lmi;
+       int i;
+
+       local_irq_save(flags);
+       for (i = 0; i < numpages; i++, page++) {
+               vaddr = (unsigned long)page_address(page);
+               lmi = __pa(vaddr) >> PAGE_SHIFT;
+               if (lmi >= linear_map_hash_count)
+                       continue;
+               if (enable)
+                       kernel_map_linear_page(vaddr, lmi);
+               else
+                       kernel_unmap_linear_page(vaddr, lmi);
+       }
+       local_irq_restore(flags);
+}
+#endif /* CONFIG_DEBUG_PAGEALLOC */
+
+void hash__setup_initial_memory_limit(phys_addr_t first_memblock_base,
+                               phys_addr_t first_memblock_size)
+{
+       /*
+        * We don't currently support the first MEMBLOCK not mapping 0
+        * physical on those processors
+        */
+       BUG_ON(first_memblock_base != 0);
+
+       /*
+        * On virtualized systems the first entry is our RMA region aka VRMA;
+        * non-virtualized 64-bit hash MMU systems don't have a limitation
+        * on real mode access.
+        *
+        * For guests on platforms before POWER9, we clamp the limit to 1G
+        * to avoid some funky things such as RTAS bugs.
+        */
+       if (!early_cpu_has_feature(CPU_FTR_HVMODE)) {
+               ppc64_rma_size = first_memblock_size;
+               if (!early_cpu_has_feature(CPU_FTR_ARCH_300))
+                       ppc64_rma_size = min_t(u64, ppc64_rma_size, 0x40000000);
+
+               /* Finally limit subsequent allocations */
+               memblock_set_current_limit(ppc64_rma_size);
+       } else {
+               ppc64_rma_size = ULONG_MAX;
+       }
+}
+
+#ifdef CONFIG_DEBUG_FS
+
+static int hpt_order_get(void *data, u64 *val)
+{
+       *val = ppc64_pft_size;
+       return 0;
+}
+
+static int hpt_order_set(void *data, u64 val)
+{
+       if (!mmu_hash_ops.resize_hpt)
+               return -ENODEV;
+
+       return mmu_hash_ops.resize_hpt(val);
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(fops_hpt_order, hpt_order_get, hpt_order_set, "%llu\n");
+
+static int __init hash64_debugfs(void)
+{
+       if (!debugfs_create_file_unsafe("hpt_order", 0600, powerpc_debugfs_root,
+                                       NULL, &fops_hpt_order)) {
+               pr_err("lpar: unable to create hpt_order debugfs file\n");
+       }
+
+       return 0;
+}
+machine_device_initcall(pseries, hash64_debugfs);
+#endif /* CONFIG_DEBUG_FS */
diff --git a/arch/powerpc/mm/book3s64/iommu_api.c b/arch/powerpc/mm/book3s64/iommu_api.c
new file mode 100644 (file)
index 0000000..e7a9c4f
--- /dev/null
@@ -0,0 +1,482 @@
+/*
+ *  IOMMU helpers in MMU context.
+ *
+ *  Copyright (C) 2015 IBM Corp. <aik@ozlabs.ru>
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/sched/signal.h>
+#include <linux/slab.h>
+#include <linux/rculist.h>
+#include <linux/vmalloc.h>
+#include <linux/mutex.h>
+#include <linux/migrate.h>
+#include <linux/hugetlb.h>
+#include <linux/swap.h>
+#include <linux/sizes.h>
+#include <asm/mmu_context.h>
+#include <asm/pte-walk.h>
+#include <linux/mm_inline.h>
+
+static DEFINE_MUTEX(mem_list_mutex);
+
+#define MM_IOMMU_TABLE_GROUP_PAGE_DIRTY        0x1
+#define MM_IOMMU_TABLE_GROUP_PAGE_MASK ~(SZ_4K - 1)
+
+struct mm_iommu_table_group_mem_t {
+       struct list_head next;
+       struct rcu_head rcu;
+       unsigned long used;
+       atomic64_t mapped;
+       unsigned int pageshift;
+       u64 ua;                 /* userspace address */
+       u64 entries;            /* number of entries in hpas/hpages[] */
+       /*
+        * During pre-registration we temporarily use this to store
+        * the struct page addresses.
+        *
+        * We need to convert ua to hpa in real mode. Make it
+        * simpler by storing the physical address.
+        */
+       union {
+               struct page **hpages;   /* vmalloc'ed */
+               phys_addr_t *hpas;
+       };
+#define MM_IOMMU_TABLE_INVALID_HPA     ((uint64_t)-1)
+       u64 dev_hpa;            /* Device memory base address */
+};
+
+static long mm_iommu_adjust_locked_vm(struct mm_struct *mm,
+               unsigned long npages, bool incr)
+{
+       long ret = 0, locked, lock_limit;
+
+       if (!npages)
+               return 0;
+
+       down_write(&mm->mmap_sem);
+
+       if (incr) {
+               locked = mm->locked_vm + npages;
+               lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+               if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+                       ret = -ENOMEM;
+               else
+                       mm->locked_vm += npages;
+       } else {
+               if (WARN_ON_ONCE(npages > mm->locked_vm))
+                       npages = mm->locked_vm;
+               mm->locked_vm -= npages;
+       }
+
+       pr_debug("[%d] RLIMIT_MEMLOCK HASH64 %c%ld %ld/%ld\n",
+                       current ? current->pid : 0,
+                       incr ? '+' : '-',
+                       npages << PAGE_SHIFT,
+                       mm->locked_vm << PAGE_SHIFT,
+                       rlimit(RLIMIT_MEMLOCK));
+       up_write(&mm->mmap_sem);
+
+       return ret;
+}
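+
+/*
+ * Worked example (illustrative): pre-registering a region of 1024 pages
+ * raises mm->locked_vm by 1024; the call fails with -ENOMEM if that would
+ * push locked_vm past RLIMIT_MEMLOCK (in pages) and the caller lacks
+ * CAP_IPC_LOCK, in which case nothing is accounted.
+ */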
+
+bool mm_iommu_preregistered(struct mm_struct *mm)
+{
+       return !list_empty(&mm->context.iommu_group_mem_list);
+}
+EXPORT_SYMBOL_GPL(mm_iommu_preregistered);
+
+static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua,
+                             unsigned long entries, unsigned long dev_hpa,
+                             struct mm_iommu_table_group_mem_t **pmem)
+{
+       struct mm_iommu_table_group_mem_t *mem;
+       long i, ret, locked_entries = 0;
+       unsigned int pageshift;
+
+       mutex_lock(&mem_list_mutex);
+
+       list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list,
+                       next) {
+               /* Overlap? */
+               if ((mem->ua < (ua + (entries << PAGE_SHIFT))) &&
+                               (ua < (mem->ua +
+                                      (mem->entries << PAGE_SHIFT)))) {
+                       ret = -EINVAL;
+                       goto unlock_exit;
+               }
+
+       }
+
+       if (dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) {
+               ret = mm_iommu_adjust_locked_vm(mm, entries, true);
+               if (ret)
+                       goto unlock_exit;
+
+               locked_entries = entries;
+       }
+
+       mem = kzalloc(sizeof(*mem), GFP_KERNEL);
+       if (!mem) {
+               ret = -ENOMEM;
+               goto unlock_exit;
+       }
+
+       if (dev_hpa != MM_IOMMU_TABLE_INVALID_HPA) {
+               mem->pageshift = __ffs(dev_hpa | (entries << PAGE_SHIFT));
+               mem->dev_hpa = dev_hpa;
+               goto good_exit;
+       }
+       mem->dev_hpa = MM_IOMMU_TABLE_INVALID_HPA;
+
+       /*
+        * As a starting point for the maximum page size calculation,
+        * we use the natural alignment of @ua and @entries to allow IOMMU
+        * pages smaller than huge pages but still bigger than PAGE_SIZE.
+        */
+       mem->pageshift = __ffs(ua | (entries << PAGE_SHIFT));
+       mem->hpas = vzalloc(array_size(entries, sizeof(mem->hpas[0])));
+       if (!mem->hpas) {
+               kfree(mem);
+               ret = -ENOMEM;
+               goto unlock_exit;
+       }
+
+       down_read(&mm->mmap_sem);
+       ret = get_user_pages_longterm(ua, entries, FOLL_WRITE, mem->hpages, NULL);
+       up_read(&mm->mmap_sem);
+       if (ret != entries) {
+               /* free the reference taken */
+               for (i = 0; i < ret; i++)
+                       put_page(mem->hpages[i]);
+
+               vfree(mem->hpas);
+               kfree(mem);
+               ret = -EFAULT;
+               goto unlock_exit;
+       }
+
+       pageshift = PAGE_SHIFT;
+       for (i = 0; i < entries; ++i) {
+               struct page *page = mem->hpages[i];
+
+                /*
+                * Allow using IOMMU pages larger than 64k, but only
+                * if we are backed by hugetlb.
+                */
+               if ((mem->pageshift > PAGE_SHIFT) && PageHuge(page)) {
+                       struct page *head = compound_head(page);
+
+                       pageshift = compound_order(head) + PAGE_SHIFT;
+               }
+               mem->pageshift = min(mem->pageshift, pageshift);
+               /*
+                * We don't need the struct page reference any more,
+                * switch to the physical address.
+                */
+               mem->hpas[i] = page_to_pfn(page) << PAGE_SHIFT;
+       }
+
+good_exit:
+       ret = 0;
+       atomic64_set(&mem->mapped, 1);
+       mem->used = 1;
+       mem->ua = ua;
+       mem->entries = entries;
+       *pmem = mem;
+
+       list_add_rcu(&mem->next, &mm->context.iommu_group_mem_list);
+
+unlock_exit:
+       if (locked_entries && ret)
+               mm_iommu_adjust_locked_vm(mm, locked_entries, false);
+
+       mutex_unlock(&mem_list_mutex);
+
+       return ret;
+}
+
+long mm_iommu_new(struct mm_struct *mm, unsigned long ua, unsigned long entries,
+               struct mm_iommu_table_group_mem_t **pmem)
+{
+       return mm_iommu_do_alloc(mm, ua, entries, MM_IOMMU_TABLE_INVALID_HPA,
+                       pmem);
+}
+EXPORT_SYMBOL_GPL(mm_iommu_new);
+
+long mm_iommu_newdev(struct mm_struct *mm, unsigned long ua,
+               unsigned long entries, unsigned long dev_hpa,
+               struct mm_iommu_table_group_mem_t **pmem)
+{
+       return mm_iommu_do_alloc(mm, ua, entries, dev_hpa, pmem);
+}
+EXPORT_SYMBOL_GPL(mm_iommu_newdev);
+
+static void mm_iommu_unpin(struct mm_iommu_table_group_mem_t *mem)
+{
+       long i;
+       struct page *page = NULL;
+
+       if (!mem->hpas)
+               return;
+
+       for (i = 0; i < mem->entries; ++i) {
+               if (!mem->hpas[i])
+                       continue;
+
+               page = pfn_to_page(mem->hpas[i] >> PAGE_SHIFT);
+               if (!page)
+                       continue;
+
+               if (mem->hpas[i] & MM_IOMMU_TABLE_GROUP_PAGE_DIRTY)
+                       SetPageDirty(page);
+
+               put_page(page);
+               mem->hpas[i] = 0;
+       }
+}
+
+static void mm_iommu_do_free(struct mm_iommu_table_group_mem_t *mem)
+{
+
+       mm_iommu_unpin(mem);
+       vfree(mem->hpas);
+       kfree(mem);
+}
+
+static void mm_iommu_free(struct rcu_head *head)
+{
+       struct mm_iommu_table_group_mem_t *mem = container_of(head,
+                       struct mm_iommu_table_group_mem_t, rcu);
+
+       mm_iommu_do_free(mem);
+}
+
+static void mm_iommu_release(struct mm_iommu_table_group_mem_t *mem)
+{
+       list_del_rcu(&mem->next);
+       call_rcu(&mem->rcu, mm_iommu_free);
+}
+
+long mm_iommu_put(struct mm_struct *mm, struct mm_iommu_table_group_mem_t *mem)
+{
+       long ret = 0;
+       unsigned long entries, dev_hpa;
+
+       mutex_lock(&mem_list_mutex);
+
+       if (mem->used == 0) {
+               ret = -ENOENT;
+               goto unlock_exit;
+       }
+
+       --mem->used;
+       /* There are still users, exit */
+       if (mem->used)
+               goto unlock_exit;
+
+       /* Are there still mappings? */
+       if (atomic_cmpxchg(&mem->mapped, 1, 0) != 1) {
+               ++mem->used;
+               ret = -EBUSY;
+               goto unlock_exit;
+       }
+
+       /* @mapped became 0 so now mappings are disabled, release the region */
+       entries = mem->entries;
+       dev_hpa = mem->dev_hpa;
+       mm_iommu_release(mem);
+
+       if (dev_hpa == MM_IOMMU_TABLE_INVALID_HPA)
+               mm_iommu_adjust_locked_vm(mm, entries, false);
+
+unlock_exit:
+       mutex_unlock(&mem_list_mutex);
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_put);
+
+struct mm_iommu_table_group_mem_t *mm_iommu_lookup(struct mm_struct *mm,
+               unsigned long ua, unsigned long size)
+{
+       struct mm_iommu_table_group_mem_t *mem, *ret = NULL;
+
+       list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, next) {
+               if ((mem->ua <= ua) &&
+                               (ua + size <= mem->ua +
+                                (mem->entries << PAGE_SHIFT))) {
+                       ret = mem;
+                       break;
+               }
+       }
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_lookup);
+
+struct mm_iommu_table_group_mem_t *mm_iommu_lookup_rm(struct mm_struct *mm,
+               unsigned long ua, unsigned long size)
+{
+       struct mm_iommu_table_group_mem_t *mem, *ret = NULL;
+
+       list_for_each_entry_lockless(mem, &mm->context.iommu_group_mem_list,
+                       next) {
+               if ((mem->ua <= ua) &&
+                               (ua + size <= mem->ua +
+                                (mem->entries << PAGE_SHIFT))) {
+                       ret = mem;
+                       break;
+               }
+       }
+
+       return ret;
+}
+
+struct mm_iommu_table_group_mem_t *mm_iommu_get(struct mm_struct *mm,
+               unsigned long ua, unsigned long entries)
+{
+       struct mm_iommu_table_group_mem_t *mem, *ret = NULL;
+
+       mutex_lock(&mem_list_mutex);
+
+       list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, next) {
+               if ((mem->ua == ua) && (mem->entries == entries)) {
+                       ret = mem;
+                       ++mem->used;
+                       break;
+               }
+       }
+
+       mutex_unlock(&mem_list_mutex);
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_get);
+
+long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
+               unsigned long ua, unsigned int pageshift, unsigned long *hpa)
+{
+       const long entry = (ua - mem->ua) >> PAGE_SHIFT;
+       u64 *va;
+
+       if (entry >= mem->entries)
+               return -EFAULT;
+
+       if (pageshift > mem->pageshift)
+               return -EFAULT;
+
+       if (!mem->hpas) {
+               *hpa = mem->dev_hpa + (ua - mem->ua);
+               return 0;
+       }
+
+       va = &mem->hpas[entry];
+       *hpa = (*va & MM_IOMMU_TABLE_GROUP_PAGE_MASK) | (ua & ~PAGE_MASK);
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa);
+
+long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem,
+               unsigned long ua, unsigned int pageshift, unsigned long *hpa)
+{
+       const long entry = (ua - mem->ua) >> PAGE_SHIFT;
+       unsigned long *pa;
+
+       if (entry >= mem->entries)
+               return -EFAULT;
+
+       if (pageshift > mem->pageshift)
+               return -EFAULT;
+
+       if (!mem->hpas) {
+               *hpa = mem->dev_hpa + (ua - mem->ua);
+               return 0;
+       }
+
+       pa = (void *) vmalloc_to_phys(&mem->hpas[entry]);
+       if (!pa)
+               return -EFAULT;
+
+       *hpa = (*pa & MM_IOMMU_TABLE_GROUP_PAGE_MASK) | (ua & ~PAGE_MASK);
+
+       return 0;
+}
+
+extern void mm_iommu_ua_mark_dirty_rm(struct mm_struct *mm, unsigned long ua)
+{
+       struct mm_iommu_table_group_mem_t *mem;
+       long entry;
+       void *va;
+       unsigned long *pa;
+
+       mem = mm_iommu_lookup_rm(mm, ua, PAGE_SIZE);
+       if (!mem)
+               return;
+
+       if (mem->dev_hpa != MM_IOMMU_TABLE_INVALID_HPA)
+               return;
+
+       entry = (ua - mem->ua) >> PAGE_SHIFT;
+       va = &mem->hpas[entry];
+
+       pa = (void *) vmalloc_to_phys(va);
+       if (!pa)
+               return;
+
+       *pa |= MM_IOMMU_TABLE_GROUP_PAGE_DIRTY;
+}
+
+bool mm_iommu_is_devmem(struct mm_struct *mm, unsigned long hpa,
+               unsigned int pageshift, unsigned long *size)
+{
+       struct mm_iommu_table_group_mem_t *mem;
+       unsigned long end;
+
+       list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, next) {
+               if (mem->dev_hpa == MM_IOMMU_TABLE_INVALID_HPA)
+                       continue;
+
+               end = mem->dev_hpa + (mem->entries << PAGE_SHIFT);
+               if ((mem->dev_hpa <= hpa) && (hpa < end)) {
+                       /*
+                        * Since the IOMMU page size might be bigger than
+                        * PAGE_SIZE, the amount of preregistered memory
+                        * starting from @hpa might be smaller than 1<<pageshift
+                        * and the caller needs to distinguish this situation.
+                        */
+                       *size = min(1UL << pageshift, end - hpa);
+                       return true;
+               }
+       }
+
+       return false;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_is_devmem);
+
+long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem)
+{
+       if (atomic64_inc_not_zero(&mem->mapped))
+               return 0;
+
+       /* Last mm_iommu_put() has been called, no more mappings allowed */
+       return -ENXIO;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_mapped_inc);
+
+void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem)
+{
+       atomic64_add_unless(&mem->mapped, -1, 1);
+}
+EXPORT_SYMBOL_GPL(mm_iommu_mapped_dec);
+
+void mm_iommu_init(struct mm_struct *mm)
+{
+       INIT_LIST_HEAD_RCU(&mm->context.iommu_group_mem_list);
+}
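
The file above (iommu_api.c) tracks memory preregistered for the SPAPR TCE IOMMU path. As a rough sketch only, not part of this patch, the snippet below shows the call sequence a kernel-side user such as the VFIO SPAPR TCE driver is expected to follow; the helper name preregister_and_translate() is invented for illustration and error handling is trimmed.

#include <linux/mm.h>
#include <asm/mmu_context.h>

/* Illustrative sketch only, not part of this patch. Assumes process
 * context with a valid mm; error handling trimmed for brevity. */
static long preregister_and_translate(struct mm_struct *mm,
                                      unsigned long ua, unsigned long entries)
{
        struct mm_iommu_table_group_mem_t *mem;
        unsigned long hpa;
        long ret;

        /* Reuse an existing registration of this exact region if present. */
        mem = mm_iommu_get(mm, ua, entries);
        if (!mem) {
                ret = mm_iommu_new(mm, ua, entries, &mem);
                if (ret)
                        return ret;
        }

        /* Block the final release while a TCE mapping exists. */
        ret = mm_iommu_mapped_inc(mem);
        if (ret)
                goto put;

        /* Translate a userspace address inside the region to a host
         * physical address at PAGE_SHIFT granularity. */
        ret = mm_iommu_ua_to_hpa(mem, ua, PAGE_SHIFT, &hpa);

        mm_iommu_mapped_dec(mem);
put:
        /* Drop our usage reference when done. */
        mm_iommu_put(mm, mem);
        return ret;
}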
diff --git a/arch/powerpc/mm/book3s64/mmu_context.c b/arch/powerpc/mm/book3s64/mmu_context.c
new file mode 100644 (file)
index 0000000..cb2b086
--- /dev/null
@@ -0,0 +1,263 @@
+/*
+ *  MMU context allocation for 64-bit kernels.
+ *
+ *  Copyright (C) 2004 Anton Blanchard, IBM Corp. <anton@samba.org>
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/pkeys.h>
+#include <linux/spinlock.h>
+#include <linux/idr.h>
+#include <linux/export.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+
+#include <asm/mmu_context.h>
+#include <asm/pgalloc.h>
+
+static DEFINE_IDA(mmu_context_ida);
+
+static int alloc_context_id(int min_id, int max_id)
+{
+       return ida_alloc_range(&mmu_context_ida, min_id, max_id, GFP_KERNEL);
+}
+
+void hash__reserve_context_id(int id)
+{
+       int result = ida_alloc_range(&mmu_context_ida, id, id, GFP_KERNEL);
+
+       WARN(result != id, "mmu: Failed to reserve context id %d (rc %d)\n", id, result);
+}
+
+int hash__alloc_context_id(void)
+{
+       unsigned long max;
+
+       if (mmu_has_feature(MMU_FTR_68_BIT_VA))
+               max = MAX_USER_CONTEXT;
+       else
+               max = MAX_USER_CONTEXT_65BIT_VA;
+
+       return alloc_context_id(MIN_USER_CONTEXT, max);
+}
+EXPORT_SYMBOL_GPL(hash__alloc_context_id);
+
+void slb_setup_new_exec(void);
+
+static int hash__init_new_context(struct mm_struct *mm)
+{
+       int index;
+
+       index = hash__alloc_context_id();
+       if (index < 0)
+               return index;
+
+       mm->context.hash_context = kmalloc(sizeof(struct hash_mm_context),
+                                          GFP_KERNEL);
+       if (!mm->context.hash_context) {
+               ida_free(&mmu_context_ida, index);
+               return -ENOMEM;
+       }
+
+       /*
+        * The old code would re-promote on fork; we don't do that when using
+        * slices, as it could cause problems promoting slices that have been
+        * forced down to 4K.
+        *
+        * For book3s we have MMU_NO_CONTEXT set to be ~0. Hence check
+        * explicitly against context.id == 0. This ensures that we properly
+        * initialize context slice details for newly allocated mm's (which will
+        * have id == 0) and don't alter context slice inherited via fork (which
+        * will have id != 0).
+        *
+        * We should not be calling init_new_context() on init_mm. Hence a
+        * check against 0 is OK.
+        */
+       if (mm->context.id == 0) {
+               memset(mm->context.hash_context, 0, sizeof(struct hash_mm_context));
+               slice_init_new_context_exec(mm);
+       } else {
+               /* This is fork. Copy hash_context details from current->mm */
+               memcpy(mm->context.hash_context, current->mm->context.hash_context, sizeof(struct hash_mm_context));
+#ifdef CONFIG_PPC_SUBPAGE_PROT
+               /* Inherit subpage prot details if we have them. */
+               if (current->mm->context.hash_context->spt) {
+                       mm->context.hash_context->spt = kmalloc(sizeof(struct subpage_prot_table),
+                                                               GFP_KERNEL);
+                       if (!mm->context.hash_context->spt) {
+                               ida_free(&mmu_context_ida, index);
+                               kfree(mm->context.hash_context);
+                               return -ENOMEM;
+                       }
+               }
+#endif
+
+       }
+
+       pkey_mm_init(mm);
+       return index;
+}
+
+void hash__setup_new_exec(void)
+{
+       slice_setup_new_exec();
+
+       slb_setup_new_exec();
+}
+
+static int radix__init_new_context(struct mm_struct *mm)
+{
+       unsigned long rts_field;
+       int index, max_id;
+
+       max_id = (1 << mmu_pid_bits) - 1;
+       index = alloc_context_id(mmu_base_pid, max_id);
+       if (index < 0)
+               return index;
+
+       /*
+        * Set the process table entry.
+        */
+       rts_field = radix__get_tree_size();
+       process_tb[index].prtb0 = cpu_to_be64(rts_field | __pa(mm->pgd) | RADIX_PGD_INDEX_SIZE);
+
+       /*
+        * Order the above store with subsequent update of the PID
+        * register (at which point HW can start loading/caching
+        * the entry) and the corresponding load by the MMU from
+        * the L2 cache.
+        */
+       asm volatile("ptesync;isync" : : : "memory");
+
+       mm->context.npu_context = NULL;
+       mm->context.hash_context = NULL;
+
+       return index;
+}
+
+int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
+{
+       int index;
+
+       if (radix_enabled())
+               index = radix__init_new_context(mm);
+       else
+               index = hash__init_new_context(mm);
+
+       if (index < 0)
+               return index;
+
+       mm->context.id = index;
+
+       mm->context.pte_frag = NULL;
+       mm->context.pmd_frag = NULL;
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+       mm_iommu_init(mm);
+#endif
+       atomic_set(&mm->context.active_cpus, 0);
+       atomic_set(&mm->context.copros, 0);
+
+       return 0;
+}
+
+void __destroy_context(int context_id)
+{
+       ida_free(&mmu_context_ida, context_id);
+}
+EXPORT_SYMBOL_GPL(__destroy_context);
+
+static void destroy_contexts(mm_context_t *ctx)
+{
+       int index, context_id;
+
+       for (index = 0; index < ARRAY_SIZE(ctx->extended_id); index++) {
+               context_id = ctx->extended_id[index];
+               if (context_id)
+                       ida_free(&mmu_context_ida, context_id);
+       }
+       kfree(ctx->hash_context);
+}
+
+static void pmd_frag_destroy(void *pmd_frag)
+{
+       int count;
+       struct page *page;
+
+       page = virt_to_page(pmd_frag);
+       /* drop all the pending references */
+       count = ((unsigned long)pmd_frag & ~PAGE_MASK) >> PMD_FRAG_SIZE_SHIFT;
+       /* We allow PMD_FRAG_NR fragments from a PMD page */
+       if (atomic_sub_and_test(PMD_FRAG_NR - count, &page->pt_frag_refcount)) {
+               pgtable_pmd_page_dtor(page);
+               __free_page(page);
+       }
+}
+
+static void destroy_pagetable_cache(struct mm_struct *mm)
+{
+       void *frag;
+
+       frag = mm->context.pte_frag;
+       if (frag)
+               pte_frag_destroy(frag);
+
+       frag = mm->context.pmd_frag;
+       if (frag)
+               pmd_frag_destroy(frag);
+       return;
+}
+
+void destroy_context(struct mm_struct *mm)
+{
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+       WARN_ON_ONCE(!list_empty(&mm->context.iommu_group_mem_list));
+#endif
+       if (radix_enabled())
+               WARN_ON(process_tb[mm->context.id].prtb0 != 0);
+       else
+               subpage_prot_free(mm);
+       destroy_contexts(&mm->context);
+       mm->context.id = MMU_NO_CONTEXT;
+}
+
+void arch_exit_mmap(struct mm_struct *mm)
+{
+       destroy_pagetable_cache(mm);
+
+       if (radix_enabled()) {
+               /*
+                * Radix doesn't have a valid bit in the process table
+                * entries. However we know that at least the P9 implementation
+                * will avoid caching an entry with an invalid RTS field,
+                * and 0 is invalid. So this will do.
+                *
+                * This runs before the "fullmm" tlb flush in exit_mmap,
+                * which does a RIC=2 tlbie to clear the process table
+                * entry. See the "fullmm" comments in tlb-radix.c.
+                *
+                * No barrier required here after the store because
+                * this process will do the invalidate, which starts with
+                * ptesync.
+                */
+               process_tb[mm->context.id].prtb0 = 0;
+       }
+}
+
+#ifdef CONFIG_PPC_RADIX_MMU
+void radix__switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
+{
+       mtspr(SPRN_PID, next->context.id);
+       isync();
+}
+#endif
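
For orientation, the hooks in mmu_context.c above are driven by generic mm code in roughly the order sketched below. The wrapper function is purely illustrative and not part of this patch; in reality fork/exec, exit_mmap() and the final mmput() invoke these hooks from separate places.

/* Illustrative sketch only, not part of this patch. Shows the rough
 * order in which the generic mm code exercises the hooks above. */
static void mmu_context_lifecycle(struct task_struct *tsk, struct mm_struct *mm)
{
        /* fork()/exec(): pick a context ID (hash) or PID (radix) and set
         * up the context state; mm->context.id is valid afterwards. */
        if (init_new_context(tsk, mm))
                return;

        /* exec() only, hash MMU: recompute slice and SLB state. */
        if (!radix_enabled())
                hash__setup_new_exec();

        /* exit_mmap(): free cached PTE/PMD fragments and, on radix,
         * clear the process-table entry before the final TLB flush. */
        arch_exit_mmap(mm);

        /* mmput() tail: release the context ID back to mmu_context_ida. */
        destroy_context(mm);
}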
diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
new file mode 100644 (file)
index 0000000..16bda04
--- /dev/null
@@ -0,0 +1,449 @@
+/*
+ * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/sched.h>
+#include <linux/mm_types.h>
+#include <linux/memblock.h>
+#include <misc/cxl-base.h>
+
+#include <asm/pgalloc.h>
+#include <asm/tlb.h>
+#include <asm/trace.h>
+#include <asm/powernv.h>
+
+#include <mm/mmu_decl.h>
+#include <trace/events/thp.h>
+
+unsigned long __pmd_frag_nr;
+EXPORT_SYMBOL(__pmd_frag_nr);
+unsigned long __pmd_frag_size_shift;
+EXPORT_SYMBOL(__pmd_frag_size_shift);
+
+int (*register_process_table)(unsigned long base, unsigned long page_size,
+                             unsigned long tbl_size);
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/*
+ * This is called when relaxing access to a hugepage. It's also called in the
+ * page fault path when we don't hit any of the major fault cases, i.e. a minor
+ * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc. The generic code will have
+ * handled those for us; here we additionally deal with missing execute
+ * permission on some processors.
+ */
+int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
+                         pmd_t *pmdp, pmd_t entry, int dirty)
+{
+       int changed;
+#ifdef CONFIG_DEBUG_VM
+       WARN_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
+       assert_spin_locked(pmd_lockptr(vma->vm_mm, pmdp));
+#endif
+       changed = !pmd_same(*(pmdp), entry);
+       if (changed) {
+               /*
+                * We can use MMU_PAGE_2M here, because only the radix
+                * path looks at the psize.
+                */
+               __ptep_set_access_flags(vma, pmdp_ptep(pmdp),
+                                       pmd_pte(entry), address, MMU_PAGE_2M);
+       }
+       return changed;
+}
+
+int pmdp_test_and_clear_young(struct vm_area_struct *vma,
+                             unsigned long address, pmd_t *pmdp)
+{
+       return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
+}
+/*
+ * Set a new huge pmd. We should not be called for updating
+ * an existing pmd entry. That should go via pmd_hugepage_update.
+ */
+void set_pmd_at(struct mm_struct *mm, unsigned long addr,
+               pmd_t *pmdp, pmd_t pmd)
+{
+#ifdef CONFIG_DEBUG_VM
+       /*
+        * Make sure the hardware valid bit is not set. We don't do
+        * a tlb flush for this update.
+        */
+
+       WARN_ON(pte_hw_valid(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp)));
+       assert_spin_locked(pmd_lockptr(mm, pmdp));
+       WARN_ON(!(pmd_large(pmd) || pmd_devmap(pmd)));
+#endif
+       trace_hugepage_set_pmd(addr, pmd_val(pmd));
+       return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
+}
+
+static void do_nothing(void *unused)
+{
+
+}
+/*
+ * Serialize against find_current_mm_pte which does a lock-less
+ * lookup in page tables with local interrupts disabled. For huge pages
+ * it casts pmd_t to pte_t. Since the format of pte_t differs from that of
+ * pmd_t, we want to prevent a pmd from transitioning from pointing to a page
+ * table to pointing to a huge page (and back) while interrupts are disabled.
+ * We clear the pmd to possibly replace it with a page table pointer in
+ * different code paths. So make sure we wait for the parallel
+ * find_current_mm_pte to finish.
+ */
+void serialize_against_pte_lookup(struct mm_struct *mm)
+{
+       smp_mb();
+       smp_call_function_many(mm_cpumask(mm), do_nothing, NULL, 1);
+}
+
+/*
+ * We use this to invalidate a pmdp entry before switching from a
+ * hugepte to regular pmd entry.
+ */
+pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
+                    pmd_t *pmdp)
+{
+       unsigned long old_pmd;
+
+       old_pmd = pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, _PAGE_INVALID);
+       flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+       /*
+        * This ensures that generic code that relies on IRQ disabling
+        * to prevent a parallel THP split works as expected.
+        */
+       serialize_against_pte_lookup(vma->vm_mm);
+       return __pmd(old_pmd);
+}
+
+static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
+{
+       return __pmd(pmd_val(pmd) | pgprot_val(pgprot));
+}
+
+pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot)
+{
+       unsigned long pmdv;
+
+       pmdv = (pfn << PAGE_SHIFT) & PTE_RPN_MASK;
+       return pmd_set_protbits(__pmd(pmdv), pgprot);
+}
+
+pmd_t mk_pmd(struct page *page, pgprot_t pgprot)
+{
+       return pfn_pmd(page_to_pfn(page), pgprot);
+}
+
+pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
+{
+       unsigned long pmdv;
+
+       pmdv = pmd_val(pmd);
+       pmdv &= _HPAGE_CHG_MASK;
+       return pmd_set_protbits(__pmd(pmdv), newprot);
+}
+
+/*
+ * This is called at the end of handling a user page fault, when the
+ * fault has been handled by updating a HUGE PMD entry in the linux page tables.
+ * We use it to preload an HPTE into the hash table corresponding to
+ * the updated linux HUGE PMD entry.
+ */
+void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
+                         pmd_t *pmd)
+{
+       if (radix_enabled())
+               prefetch((void *)addr);
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+/* For use by kexec */
+void mmu_cleanup_all(void)
+{
+       if (radix_enabled())
+               radix__mmu_cleanup_all();
+       else if (mmu_hash_ops.hpte_clear_all)
+               mmu_hash_ops.hpte_clear_all();
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+int __meminit create_section_mapping(unsigned long start, unsigned long end, int nid)
+{
+       if (radix_enabled())
+               return radix__create_section_mapping(start, end, nid);
+
+       return hash__create_section_mapping(start, end, nid);
+}
+
+int __meminit remove_section_mapping(unsigned long start, unsigned long end)
+{
+       if (radix_enabled())
+               return radix__remove_section_mapping(start, end);
+
+       return hash__remove_section_mapping(start, end);
+}
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
+void __init mmu_partition_table_init(void)
+{
+       unsigned long patb_size = 1UL << PATB_SIZE_SHIFT;
+       unsigned long ptcr;
+
+       BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 36), "Partition table size too large.");
+       /* Initialize the Partition Table with no entries */
+       partition_tb = memblock_alloc(patb_size, patb_size);
+       if (!partition_tb)
+               panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
+                     __func__, patb_size, patb_size);
+
+       /*
+        * Update the partition table control register,
+        * 64K size.
+        */
+       ptcr = __pa(partition_tb) | (PATB_SIZE_SHIFT - 12);
+       mtspr(SPRN_PTCR, ptcr);
+       powernv_set_nmmu_ptcr(ptcr);
+}
+
+void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0,
+                                  unsigned long dw1)
+{
+       unsigned long old = be64_to_cpu(partition_tb[lpid].patb0);
+
+       partition_tb[lpid].patb0 = cpu_to_be64(dw0);
+       partition_tb[lpid].patb1 = cpu_to_be64(dw1);
+
+       /*
+        * Global flush of TLBs and partition table caches for this lpid.
+        * The type of flush (hash or radix) depends on what the previous
+        * use of this partition ID was, not the new use.
+        */
+       asm volatile("ptesync" : : : "memory");
+       if (old & PATB_HR) {
+               asm volatile(PPC_TLBIE_5(%0,%1,2,0,1) : :
+                            "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
+               asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : :
+                            "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
+               trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 1);
+       } else {
+               asm volatile(PPC_TLBIE_5(%0,%1,2,0,0) : :
+                            "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
+               trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 0);
+       }
+       /* Do we need a fixup here? */
+       asm volatile("eieio; tlbsync; ptesync" : : : "memory");
+}
+EXPORT_SYMBOL_GPL(mmu_partition_table_set_entry);
+
+static pmd_t *get_pmd_from_cache(struct mm_struct *mm)
+{
+       void *pmd_frag, *ret;
+
+       if (PMD_FRAG_NR == 1)
+               return NULL;
+
+       spin_lock(&mm->page_table_lock);
+       ret = mm->context.pmd_frag;
+       if (ret) {
+               pmd_frag = ret + PMD_FRAG_SIZE;
+               /*
+                * If we have taken up all the fragments, set pmd_frag to NULL.
+                */
+               if (((unsigned long)pmd_frag & ~PAGE_MASK) == 0)
+                       pmd_frag = NULL;
+               mm->context.pmd_frag = pmd_frag;
+       }
+       spin_unlock(&mm->page_table_lock);
+       return (pmd_t *)ret;
+}
+
+static pmd_t *__alloc_for_pmdcache(struct mm_struct *mm)
+{
+       void *ret = NULL;
+       struct page *page;
+       gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO;
+
+       if (mm == &init_mm)
+               gfp &= ~__GFP_ACCOUNT;
+       page = alloc_page(gfp);
+       if (!page)
+               return NULL;
+       if (!pgtable_pmd_page_ctor(page)) {
+               __free_pages(page, 0);
+               return NULL;
+       }
+
+       atomic_set(&page->pt_frag_refcount, 1);
+
+       ret = page_address(page);
+       /*
+        * if we support only one fragment just return the
+        * allocated page.
+        */
+       if (PMD_FRAG_NR == 1)
+               return ret;
+
+       spin_lock(&mm->page_table_lock);
+       /*
+        * If we find pmd_frag already set (another thread raced us),
+        * we return the allocated page with a single fragment
+        * count.
+        */
+       if (likely(!mm->context.pmd_frag)) {
+               atomic_set(&page->pt_frag_refcount, PMD_FRAG_NR);
+               mm->context.pmd_frag = ret + PMD_FRAG_SIZE;
+       }
+       spin_unlock(&mm->page_table_lock);
+
+       return (pmd_t *)ret;
+}
+
+pmd_t *pmd_fragment_alloc(struct mm_struct *mm, unsigned long vmaddr)
+{
+       pmd_t *pmd;
+
+       pmd = get_pmd_from_cache(mm);
+       if (pmd)
+               return pmd;
+
+       return __alloc_for_pmdcache(mm);
+}
+
+void pmd_fragment_free(unsigned long *pmd)
+{
+       struct page *page = virt_to_page(pmd);
+
+       BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0);
+       if (atomic_dec_and_test(&page->pt_frag_refcount)) {
+               pgtable_pmd_page_dtor(page);
+               __free_page(page);
+       }
+}
+
+static inline void pgtable_free(void *table, int index)
+{
+       switch (index) {
+       case PTE_INDEX:
+               pte_fragment_free(table, 0);
+               break;
+       case PMD_INDEX:
+               pmd_fragment_free(table);
+               break;
+       case PUD_INDEX:
+               kmem_cache_free(PGT_CACHE(PUD_CACHE_INDEX), table);
+               break;
+#if defined(CONFIG_PPC_4K_PAGES) && defined(CONFIG_HUGETLB_PAGE)
+               /* 16M hugepd directory at pud level */
+       case HTLB_16M_INDEX:
+               BUILD_BUG_ON(H_16M_CACHE_INDEX <= 0);
+               kmem_cache_free(PGT_CACHE(H_16M_CACHE_INDEX), table);
+               break;
+               /* 16G hugepd directory at the pgd level */
+       case HTLB_16G_INDEX:
+               BUILD_BUG_ON(H_16G_CACHE_INDEX <= 0);
+               kmem_cache_free(PGT_CACHE(H_16G_CACHE_INDEX), table);
+               break;
+#endif
+               /* We don't free pgd table via RCU callback */
+       default:
+               BUG();
+       }
+}
+
+#ifdef CONFIG_SMP
+void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int index)
+{
+       unsigned long pgf = (unsigned long)table;
+
+       BUG_ON(index > MAX_PGTABLE_INDEX_SIZE);
+       pgf |= index;
+       tlb_remove_table(tlb, (void *)pgf);
+}
+
+void __tlb_remove_table(void *_table)
+{
+       void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE);
+       unsigned int index = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE;
+
+       return pgtable_free(table, index);
+}
+#else
+void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int index)
+{
+       return pgtable_free(table, index);
+}
+#endif
+
+#ifdef CONFIG_PROC_FS
+atomic_long_t direct_pages_count[MMU_PAGE_COUNT];
+
+void arch_report_meminfo(struct seq_file *m)
+{
+       /*
+        * Hash maps the memory with one size mmu_linear_psize.
+        * So don't bother to print these on hash.
+        */
+       if (!radix_enabled())
+               return;
+       seq_printf(m, "DirectMap4k:    %8lu kB\n",
+                  atomic_long_read(&direct_pages_count[MMU_PAGE_4K]) << 2);
+       seq_printf(m, "DirectMap64k:    %8lu kB\n",
+                  atomic_long_read(&direct_pages_count[MMU_PAGE_64K]) << 6);
+       seq_printf(m, "DirectMap2M:    %8lu kB\n",
+                  atomic_long_read(&direct_pages_count[MMU_PAGE_2M]) << 11);
+       seq_printf(m, "DirectMap1G:    %8lu kB\n",
+                  atomic_long_read(&direct_pages_count[MMU_PAGE_1G]) << 20);
+}
+#endif /* CONFIG_PROC_FS */
+
+pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr,
+                            pte_t *ptep)
+{
+       unsigned long pte_val;
+
+       /*
+        * Clear _PAGE_PRESENT so that no parallel hardware update is
+        * possible. Also keep pte_present() true so that we don't take
+        * a wrong fault.
+        */
+       pte_val = pte_update(vma->vm_mm, addr, ptep, _PAGE_PRESENT, _PAGE_INVALID, 0);
+
+       return __pte(pte_val);
+
+}
+
+void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
+                            pte_t *ptep, pte_t old_pte, pte_t pte)
+{
+       if (radix_enabled())
+               return radix__ptep_modify_prot_commit(vma, addr,
+                                                     ptep, old_pte, pte);
+       set_pte_at(vma->vm_mm, addr, ptep, pte);
+}
+
+/*
+ * For hash translation mode, we use the deposited table to store hash slot
+ * information and they are stored at PTRS_PER_PMD offset from related pmd
+ * location. Hence a pmd move requires deposit and withdraw.
+ *
+ * For radix translation with split pmd ptl, we store the deposited table in the
+ * pmd page. Hence if we have different pmd page we need to withdraw during pmd
+ * move.
+ *
+ * With hash we use deposited table always irrespective of anon or not.
+ * With radix we use deposited table only for anonymous mapping.
+ */
+int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl,
+                          struct spinlock *old_pmd_ptl,
+                          struct vm_area_struct *vma)
+{
+       if (radix_enabled())
+               return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma);
+
+       return true;
+}
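
One detail in pgtable.c above is worth spelling out: pgtable_free_tlb() relies on page tables being aligned allocations, so it can stash the table-type index in the low bits of the pointer, and __tlb_remove_table() recovers both later. The stand-alone sketch below reproduces the same pack/unpack idea with an illustrative mask standing in for MAX_PGTABLE_INDEX_SIZE; it is not kernel code.

#include <assert.h>
#include <stdio.h>

/* Illustrative stand-in; the kernel uses MAX_PGTABLE_INDEX_SIZE and the
 * PTE_INDEX/PMD_INDEX/PUD_INDEX enumerators instead. */
#define INDEX_MASK 0x7UL        /* assumes tables are at least 8-byte aligned */

static unsigned long pack(void *table, unsigned int index)
{
        assert(((unsigned long)table & INDEX_MASK) == 0); /* alignment frees the low bits */
        assert(index <= INDEX_MASK);
        return (unsigned long)table | index;
}

static void *unpack(unsigned long pgf, unsigned int *index)
{
        *index = pgf & INDEX_MASK;              /* recover the table type */
        return (void *)(pgf & ~INDEX_MASK);     /* recover the pointer */
}

int main(void)
{
        static unsigned long table[512] __attribute__((aligned(64)));
        unsigned int index;
        unsigned long pgf = pack(table, 2);     /* 2 plays the role of e.g. PUD_INDEX */
        void *p = unpack(pgf, &index);

        printf("table=%p recovered=%p index=%u\n", (void *)table, p, index);
        return 0;
}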
diff --git a/arch/powerpc/mm/book3s64/pkeys.c b/arch/powerpc/mm/book3s64/pkeys.c
new file mode 100644 (file)
index 0000000..ae7fca4
--- /dev/null
@@ -0,0 +1,428 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * PowerPC Memory Protection Keys management
+ *
+ * Copyright 2017, Ram Pai, IBM Corporation.
+ */
+
+#include <asm/mman.h>
+#include <asm/mmu_context.h>
+#include <asm/mmu.h>
+#include <asm/setup.h>
+#include <linux/pkeys.h>
+#include <linux/of_device.h>
+
+DEFINE_STATIC_KEY_TRUE(pkey_disabled);
+int  pkeys_total;              /* Total pkeys as per device tree */
+u32  initial_allocation_mask;   /* Bits set for the initially allocated keys */
+u32  reserved_allocation_mask;  /* Bits set for reserved keys */
+static bool pkey_execute_disable_supported;
+static bool pkeys_devtree_defined;     /* property exported by device tree */
+static u64 pkey_amr_mask;              /* Bits in AMR not to be touched */
+static u64 pkey_iamr_mask;             /* Bits in IAMR not to be touched */
+static u64 pkey_uamor_mask;            /* Bits in UAMOR not to be touched */
+static int execute_only_key = 2;
+
+#define AMR_BITS_PER_PKEY 2
+#define AMR_RD_BIT 0x1UL
+#define AMR_WR_BIT 0x2UL
+#define IAMR_EX_BIT 0x1UL
+#define PKEY_REG_BITS (sizeof(u64)*8)
+#define pkeyshift(pkey) (PKEY_REG_BITS - ((pkey+1) * AMR_BITS_PER_PKEY))
+
+static void scan_pkey_feature(void)
+{
+       u32 vals[2];
+       struct device_node *cpu;
+
+       cpu = of_find_node_by_type(NULL, "cpu");
+       if (!cpu)
+               return;
+
+       if (of_property_read_u32_array(cpu,
+                       "ibm,processor-storage-keys", vals, 2))
+               return;
+
+       /*
+        * Since any pkey can be used for data or execute, we will just treat
+        * all keys as equal and track them as one entity.
+        */
+       pkeys_total = vals[0];
+       pkeys_devtree_defined = true;
+}
+
+static inline bool pkey_mmu_enabled(void)
+{
+       if (firmware_has_feature(FW_FEATURE_LPAR))
+               return pkeys_total;
+       else
+               return cpu_has_feature(CPU_FTR_PKEY);
+}
+
+static int pkey_initialize(void)
+{
+       int os_reserved, i;
+
+       /*
+        * We define PKEY_DISABLE_EXECUTE in addition to the arch-neutral
+        * generic defines for PKEY_DISABLE_ACCESS and PKEY_DISABLE_WRITE.
+        * Ensure that the bits are distinct.
+        */
+       BUILD_BUG_ON(PKEY_DISABLE_EXECUTE &
+                    (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE));
+
+       /*
+        * pkey_to_vmflag_bits() assumes that the pkey bits are contiguous
+        * in the vma flags. Make sure that is really the case.
+        */
+       BUILD_BUG_ON(__builtin_clzl(ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT) +
+                    __builtin_popcountl(ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT)
+                               != (sizeof(u64) * BITS_PER_BYTE));
+
+       /* scan the device tree for pkey feature */
+       scan_pkey_feature();
+
+       /*
+        * Let's assume 32 pkeys on P8 bare metal if it's not defined by the
+        * device tree. We make this exception since skiboot forgot to expose
+        * this property on power8.
+        */
+       if (!pkeys_devtree_defined && !firmware_has_feature(FW_FEATURE_LPAR) &&
+                       cpu_has_feature(CPU_FTRS_POWER8))
+               pkeys_total = 32;
+
+       /*
+        * Adjust the upper limit, based on the number of bits supported by
+        * arch-neutral code.
+        */
+       pkeys_total = min_t(int, pkeys_total,
+                       ((ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT)+1));
+
+       if (!pkey_mmu_enabled() || radix_enabled() || !pkeys_total)
+               static_branch_enable(&pkey_disabled);
+       else
+               static_branch_disable(&pkey_disabled);
+
+       if (static_branch_likely(&pkey_disabled))
+               return 0;
+
+       /*
+        * The device tree cannot be relied on to indicate
+        * execute_disable support. Instead we use a PVR check.
+        */
+       if (pvr_version_is(PVR_POWER7) || pvr_version_is(PVR_POWER7p))
+               pkey_execute_disable_supported = false;
+       else
+               pkey_execute_disable_supported = true;
+
+#ifdef CONFIG_PPC_4K_PAGES
+       /*
+        * The OS can manage only 8 pkeys due to its inability to represent them
+        * in the Linux 4K PTE.
+        */
+       os_reserved = pkeys_total - 8;
+#else
+       os_reserved = 0;
+#endif
+       /* Bits are in LE format. */
+       reserved_allocation_mask = (0x1 << 1) | (0x1 << execute_only_key);
+
+       /* register mask is in BE format */
+       pkey_amr_mask = ~0x0ul;
+       pkey_amr_mask &= ~(0x3ul << pkeyshift(0));
+
+       pkey_iamr_mask = ~0x0ul;
+       pkey_iamr_mask &= ~(0x3ul << pkeyshift(0));
+       pkey_iamr_mask &= ~(0x3ul << pkeyshift(execute_only_key));
+
+       pkey_uamor_mask = ~0x0ul;
+       pkey_uamor_mask &= ~(0x3ul << pkeyshift(0));
+       pkey_uamor_mask &= ~(0x3ul << pkeyshift(execute_only_key));
+
+       /* mark the rest of the keys as reserved and hence unavailable */
+       for (i = (pkeys_total - os_reserved); i < pkeys_total; i++) {
+               reserved_allocation_mask |= (0x1 << i);
+               pkey_uamor_mask &= ~(0x3ul << pkeyshift(i));
+       }
+       initial_allocation_mask = reserved_allocation_mask | (0x1 << 0);
+
+       if (unlikely((pkeys_total - os_reserved) <= execute_only_key)) {
+               /*
+                * Insufficient number of keys to support an
+                * execute-only key. Mark it unavailable. Any
+                * AMR, UAMOR or IAMR bits set for this key
+                * are irrelevant since it can never be
+                * allocated.
+                */
+               execute_only_key = -1;
+       }
+
+       return 0;
+}
+
+arch_initcall(pkey_initialize);
+
+void pkey_mm_init(struct mm_struct *mm)
+{
+       if (static_branch_likely(&pkey_disabled))
+               return;
+       mm_pkey_allocation_map(mm) = initial_allocation_mask;
+       mm->context.execute_only_pkey = execute_only_key;
+}
+
+static inline u64 read_amr(void)
+{
+       return mfspr(SPRN_AMR);
+}
+
+static inline void write_amr(u64 value)
+{
+       mtspr(SPRN_AMR, value);
+}
+
+static inline u64 read_iamr(void)
+{
+       if (!likely(pkey_execute_disable_supported))
+               return 0x0UL;
+
+       return mfspr(SPRN_IAMR);
+}
+
+static inline void write_iamr(u64 value)
+{
+       if (!likely(pkey_execute_disable_supported))
+               return;
+
+       mtspr(SPRN_IAMR, value);
+}
+
+static inline u64 read_uamor(void)
+{
+       return mfspr(SPRN_UAMOR);
+}
+
+static inline void write_uamor(u64 value)
+{
+       mtspr(SPRN_UAMOR, value);
+}
+
+static bool is_pkey_enabled(int pkey)
+{
+       u64 uamor = read_uamor();
+       u64 pkey_bits = 0x3ul << pkeyshift(pkey);
+       u64 uamor_pkey_bits = (uamor & pkey_bits);
+
+       /*
+        * Both the bits in UAMOR corresponding to the key should be set or
+        * reset.
+        */
+       WARN_ON(uamor_pkey_bits && (uamor_pkey_bits != pkey_bits));
+       return !!(uamor_pkey_bits);
+}
+
+static inline void init_amr(int pkey, u8 init_bits)
+{
+       u64 new_amr_bits = (((u64)init_bits & 0x3UL) << pkeyshift(pkey));
+       u64 old_amr = read_amr() & ~((u64)(0x3ul) << pkeyshift(pkey));
+
+       write_amr(old_amr | new_amr_bits);
+}
+
+static inline void init_iamr(int pkey, u8 init_bits)
+{
+       u64 new_iamr_bits = (((u64)init_bits & 0x1UL) << pkeyshift(pkey));
+       u64 old_iamr = read_iamr() & ~((u64)(0x1ul) << pkeyshift(pkey));
+
+       write_iamr(old_iamr | new_iamr_bits);
+}
+
+/*
+ * Set the access rights in the AMR, IAMR and UAMOR registers for @pkey to
+ * those specified in @init_val.
+ */
+int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
+                               unsigned long init_val)
+{
+       u64 new_amr_bits = 0x0ul;
+       u64 new_iamr_bits = 0x0ul;
+
+       if (!is_pkey_enabled(pkey))
+               return -EINVAL;
+
+       if (init_val & PKEY_DISABLE_EXECUTE) {
+               if (!pkey_execute_disable_supported)
+                       return -EINVAL;
+               new_iamr_bits |= IAMR_EX_BIT;
+       }
+       init_iamr(pkey, new_iamr_bits);
+
+       /* Set the bits we need in AMR: */
+       if (init_val & PKEY_DISABLE_ACCESS)
+               new_amr_bits |= AMR_RD_BIT | AMR_WR_BIT;
+       else if (init_val & PKEY_DISABLE_WRITE)
+               new_amr_bits |= AMR_WR_BIT;
+
+       init_amr(pkey, new_amr_bits);
+       return 0;
+}
+
+void thread_pkey_regs_save(struct thread_struct *thread)
+{
+       if (static_branch_likely(&pkey_disabled))
+               return;
+
+       /*
+        * TODO: Skip saving registers if @thread hasn't used any keys yet.
+        */
+       thread->amr = read_amr();
+       thread->iamr = read_iamr();
+       thread->uamor = read_uamor();
+}
+
+void thread_pkey_regs_restore(struct thread_struct *new_thread,
+                             struct thread_struct *old_thread)
+{
+       if (static_branch_likely(&pkey_disabled))
+               return;
+
+       if (old_thread->amr != new_thread->amr)
+               write_amr(new_thread->amr);
+       if (old_thread->iamr != new_thread->iamr)
+               write_iamr(new_thread->iamr);
+       if (old_thread->uamor != new_thread->uamor)
+               write_uamor(new_thread->uamor);
+}
+
+void thread_pkey_regs_init(struct thread_struct *thread)
+{
+       if (static_branch_likely(&pkey_disabled))
+               return;
+
+       thread->amr = pkey_amr_mask;
+       thread->iamr = pkey_iamr_mask;
+       thread->uamor = pkey_uamor_mask;
+
+       write_uamor(pkey_uamor_mask);
+       write_amr(pkey_amr_mask);
+       write_iamr(pkey_iamr_mask);
+}
+
+static inline bool pkey_allows_readwrite(int pkey)
+{
+       int pkey_shift = pkeyshift(pkey);
+
+       if (!is_pkey_enabled(pkey))
+               return true;
+
+       return !(read_amr() & ((AMR_RD_BIT|AMR_WR_BIT) << pkey_shift));
+}
+
+int __execute_only_pkey(struct mm_struct *mm)
+{
+       return mm->context.execute_only_pkey;
+}
+
+static inline bool vma_is_pkey_exec_only(struct vm_area_struct *vma)
+{
+       /* Do this check first since the vm_flags should be hot */
+       if ((vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) != VM_EXEC)
+               return false;
+
+       return (vma_pkey(vma) == vma->vm_mm->context.execute_only_pkey);
+}
+
+/*
+ * This should only be called for *plain* mprotect calls.
+ */
+int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot,
+                                 int pkey)
+{
+       /*
+        * If the currently associated pkey is execute-only, but the requested
+        * protection is not execute-only, move it back to the default pkey.
+        */
+       if (vma_is_pkey_exec_only(vma) && (prot != PROT_EXEC))
+               return 0;
+
+       /*
+        * The requested protection is execute-only. Hence let's use an
+        * execute-only pkey.
+        */
+       if (prot == PROT_EXEC) {
+               pkey = execute_only_pkey(vma->vm_mm);
+               if (pkey > 0)
+                       return pkey;
+       }
+
+       /* Nothing to override. */
+       return vma_pkey(vma);
+}
+
+static bool pkey_access_permitted(int pkey, bool write, bool execute)
+{
+       int pkey_shift;
+       u64 amr;
+
+       if (!is_pkey_enabled(pkey))
+               return true;
+
+       pkey_shift = pkeyshift(pkey);
+       if (execute && !(read_iamr() & (IAMR_EX_BIT << pkey_shift)))
+               return true;
+
+       amr = read_amr(); /* Delay reading amr until absolutely needed */
+       return ((!write && !(amr & (AMR_RD_BIT << pkey_shift))) ||
+               (write &&  !(amr & (AMR_WR_BIT << pkey_shift))));
+}
+
+bool arch_pte_access_permitted(u64 pte, bool write, bool execute)
+{
+       if (static_branch_likely(&pkey_disabled))
+               return true;
+
+       return pkey_access_permitted(pte_to_pkey_bits(pte), write, execute);
+}
+
+/*
+ * We only want to enforce protection keys on the current thread because we
+ * effectively have no access to AMR/IAMR for other threads or any way to tell
+ * which AMR/IAMR in a threaded process we could use.
+ *
+ * So do not enforce things if the VMA is not from the current mm, or if we are
+ * in a kernel thread.
+ */
+static inline bool vma_is_foreign(struct vm_area_struct *vma)
+{
+       if (!current->mm)
+               return true;
+
+       /* if it is not our ->mm, it has to be foreign */
+       if (current->mm != vma->vm_mm)
+               return true;
+
+       return false;
+}
+
+bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write,
+                              bool execute, bool foreign)
+{
+       if (static_branch_likely(&pkey_disabled))
+               return true;
+       /*
+        * Do not enforce our key-permissions on a foreign vma.
+        */
+       if (foreign || vma_is_foreign(vma))
+               return true;
+
+       return pkey_access_permitted(vma_pkey(vma), write, execute);
+}
+
+void arch_dup_pkeys(struct mm_struct *oldmm, struct mm_struct *mm)
+{
+       if (static_branch_likely(&pkey_disabled))
+               return;
+
+       /* Duplicate the oldmm pkey state in mm: */
+       mm_pkey_allocation_map(mm) = mm_pkey_allocation_map(oldmm);
+       mm->context.execute_only_pkey = oldmm->context.execute_only_pkey;
+}
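
The pkeys code above packs two protection bits per key, counted from the most-significant end of the 64-bit AMR/IAMR registers; pkeyshift() computes where a given key lands. The small user-space sketch below mirrors the init_amr() arithmetic for one key, with the special-purpose register replaced by a plain variable; it is illustrative only and not part of this patch.

#include <stdint.h>
#include <stdio.h>

#define AMR_BITS_PER_PKEY 2
#define AMR_RD_BIT 0x1UL
#define AMR_WR_BIT 0x2UL
#define PKEY_REG_BITS (sizeof(uint64_t) * 8)
#define pkeyshift(pkey) (PKEY_REG_BITS - ((pkey + 1) * AMR_BITS_PER_PKEY))

int main(void)
{
        uint64_t amr = 0;                       /* stand-in for the SPRN_AMR register */
        int pkey = 5;
        uint64_t init_bits = AMR_WR_BIT;        /* PKEY_DISABLE_WRITE for this key */

        /* Mirror init_amr(): clear the key's two bits, then OR in the new ones. */
        amr &= ~((uint64_t)0x3 << pkeyshift(pkey));
        amr |= (init_bits & 0x3) << pkeyshift(pkey);

        printf("pkey %d uses shift %d, AMR=0x%016llx\n",
               pkey, (int)pkeyshift(pkey), (unsigned long long)amr);
        return 0;
}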
diff --git a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c
new file mode 100644 (file)
index 0000000..cab0633
--- /dev/null
@@ -0,0 +1,110 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/security.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/cacheflush.h>
+#include <asm/machdep.h>
+#include <asm/mman.h>
+#include <asm/tlb.h>
+
+void radix__flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
+{
+       int psize;
+       struct hstate *hstate = hstate_file(vma->vm_file);
+
+       psize = hstate_get_psize(hstate);
+       radix__flush_tlb_page_psize(vma->vm_mm, vmaddr, psize);
+}
+
+void radix__local_flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
+{
+       int psize;
+       struct hstate *hstate = hstate_file(vma->vm_file);
+
+       psize = hstate_get_psize(hstate);
+       radix__local_flush_tlb_page_psize(vma->vm_mm, vmaddr, psize);
+}
+
+void radix__flush_hugetlb_tlb_range(struct vm_area_struct *vma, unsigned long start,
+                                  unsigned long end)
+{
+       int psize;
+       struct hstate *hstate = hstate_file(vma->vm_file);
+
+       psize = hstate_get_psize(hstate);
+       radix__flush_tlb_range_psize(vma->vm_mm, start, end, psize);
+}
+
+/*
+ * A variant of hugetlb_get_unmapped_area doing a topdown search.
+ * FIXME!! Should we do as x86 does, or as the non-hugetlb area does?
+ * I.e. use topdown or not based on the mmap_is_legacy check?
+ */
+unsigned long
+radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
+                               unsigned long len, unsigned long pgoff,
+                               unsigned long flags)
+{
+       struct mm_struct *mm = current->mm;
+       struct vm_area_struct *vma;
+       struct hstate *h = hstate_file(file);
+       int fixed = (flags & MAP_FIXED);
+       unsigned long high_limit;
+       struct vm_unmapped_area_info info;
+
+       high_limit = DEFAULT_MAP_WINDOW;
+       if (addr >= high_limit || (fixed && (addr + len > high_limit)))
+               high_limit = TASK_SIZE;
+
+       if (len & ~huge_page_mask(h))
+               return -EINVAL;
+       if (len > high_limit)
+               return -ENOMEM;
+
+       if (fixed) {
+               if (addr > high_limit - len)
+                       return -ENOMEM;
+               if (prepare_hugepage_range(file, addr, len))
+                       return -EINVAL;
+               return addr;
+       }
+
+       if (addr) {
+               addr = ALIGN(addr, huge_page_size(h));
+               vma = find_vma(mm, addr);
+               if (high_limit - len >= addr && addr >= mmap_min_addr &&
+                   (!vma || addr + len <= vm_start_gap(vma)))
+                       return addr;
+       }
+       /*
+        * We are always doing a topdown search here. The slice
+        * code does that too.
+        */
+       info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+       info.length = len;
+       info.low_limit = max(PAGE_SIZE, mmap_min_addr);
+       info.high_limit = mm->mmap_base + (high_limit - DEFAULT_MAP_WINDOW);
+       info.align_mask = PAGE_MASK & ~huge_page_mask(h);
+       info.align_offset = 0;
+
+       return vm_unmapped_area(&info);
+}
+
+void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
+                                        unsigned long addr, pte_t *ptep,
+                                        pte_t old_pte, pte_t pte)
+{
+       struct mm_struct *mm = vma->vm_mm;
+
+       /*
+        * To avoid an NMMU hang while relaxing access, we need to flush
+        * the TLB before we set the new value.
+        */
+       if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
+           (atomic_read(&mm->context.copros) > 0))
+               radix__flush_hugetlb_page(vma, addr);
+
+       set_huge_pte_at(vma->vm_mm, addr, ptep, pte);
+}
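
In radix__hugetlb_get_unmapped_area() above, the search limit stays at the default map window unless the caller's hint (or a MAP_FIXED request) reaches above it, in which case the full task size is used instead. The snippet below isolates just that decision; the window and task-size constants are illustrative stand-ins for DEFAULT_MAP_WINDOW and TASK_SIZE, which depend on the kernel configuration.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative values only; the real limits come from the page size and
 * MMU configuration. */
#define DEFAULT_WINDOW (128UL << 40)    /* 128 TiB */
#define FULL_TASK_SIZE (4UL << 50)      /* 4 PiB   */

static unsigned long pick_high_limit(unsigned long addr, unsigned long len, bool fixed)
{
        unsigned long high_limit = DEFAULT_WINDOW;

        /* Only opt in to the large address space when the caller asks for
         * something above the default window. */
        if (addr >= high_limit || (fixed && (addr + len > high_limit)))
                high_limit = FULL_TASK_SIZE;
        return high_limit;
}

int main(void)
{
        printf("hint below window -> %#lx\n", pick_high_limit(1UL << 40, 1 << 24, false));
        printf("hint above window -> %#lx\n", pick_high_limit(200UL << 40, 1 << 24, false));
        return 0;
}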
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
new file mode 100644 (file)
index 0000000..c9bcf42
--- /dev/null
@@ -0,0 +1,1124 @@
+/*
+ * Page table handling routines for radix page table.
+ *
+ * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) "radix-mmu: " fmt
+
+#include <linux/kernel.h>
+#include <linux/sched/mm.h>
+#include <linux/memblock.h>
+#include <linux/of_fdt.h>
+#include <linux/mm.h>
+#include <linux/string_helpers.h>
+#include <linux/stop_machine.h>
+
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/mmu_context.h>
+#include <asm/dma.h>
+#include <asm/machdep.h>
+#include <asm/mmu.h>
+#include <asm/firmware.h>
+#include <asm/powernv.h>
+#include <asm/sections.h>
+#include <asm/trace.h>
+#include <asm/uaccess.h>
+
+#include <trace/events/thp.h>
+
+unsigned int mmu_pid_bits;
+unsigned int mmu_base_pid;
+
+static int native_register_process_table(unsigned long base, unsigned long pg_sz,
+                                        unsigned long table_size)
+{
+       unsigned long patb0, patb1;
+
+       patb0 = be64_to_cpu(partition_tb[0].patb0);
+       patb1 = base | table_size | PATB_GR;
+
+       mmu_partition_table_set_entry(0, patb0, patb1);
+
+       return 0;
+}
+
+static __ref void *early_alloc_pgtable(unsigned long size, int nid,
+                       unsigned long region_start, unsigned long region_end)
+{
+       phys_addr_t min_addr = MEMBLOCK_LOW_LIMIT;
+       phys_addr_t max_addr = MEMBLOCK_ALLOC_ANYWHERE;
+       void *ptr;
+
+       if (region_start)
+               min_addr = region_start;
+       if (region_end)
+               max_addr = region_end;
+
+       ptr = memblock_alloc_try_nid(size, size, min_addr, max_addr, nid);
+
+       if (!ptr)
+               panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa max_addr=%pa\n",
+                     __func__, size, size, nid, &min_addr, &max_addr);
+
+       return ptr;
+}
+
+static int early_map_kernel_page(unsigned long ea, unsigned long pa,
+                         pgprot_t flags,
+                         unsigned int map_page_size,
+                         int nid,
+                         unsigned long region_start, unsigned long region_end)
+{
+       unsigned long pfn = pa >> PAGE_SHIFT;
+       pgd_t *pgdp;
+       pud_t *pudp;
+       pmd_t *pmdp;
+       pte_t *ptep;
+
+       pgdp = pgd_offset_k(ea);
+       if (pgd_none(*pgdp)) {
+               pudp = early_alloc_pgtable(PUD_TABLE_SIZE, nid,
+                                               region_start, region_end);
+               pgd_populate(&init_mm, pgdp, pudp);
+       }
+       pudp = pud_offset(pgdp, ea);
+       if (map_page_size == PUD_SIZE) {
+               ptep = (pte_t *)pudp;
+               goto set_the_pte;
+       }
+       if (pud_none(*pudp)) {
+               pmdp = early_alloc_pgtable(PMD_TABLE_SIZE, nid,
+                                               region_start, region_end);
+               pud_populate(&init_mm, pudp, pmdp);
+       }
+       pmdp = pmd_offset(pudp, ea);
+       if (map_page_size == PMD_SIZE) {
+               ptep = pmdp_ptep(pmdp);
+               goto set_the_pte;
+       }
+       if (!pmd_present(*pmdp)) {
+               ptep = early_alloc_pgtable(PAGE_SIZE, nid,
+                                               region_start, region_end);
+               pmd_populate_kernel(&init_mm, pmdp, ptep);
+       }
+       ptep = pte_offset_kernel(pmdp, ea);
+
+set_the_pte:
+       set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
+       smp_wmb();
+       return 0;
+}
+
+/*
+ * nid, region_start, and region_end are hints to try to place the page
+ * table memory in the same node or region.
+ */
+static int __map_kernel_page(unsigned long ea, unsigned long pa,
+                         pgprot_t flags,
+                         unsigned int map_page_size,
+                         int nid,
+                         unsigned long region_start, unsigned long region_end)
+{
+       unsigned long pfn = pa >> PAGE_SHIFT;
+       pgd_t *pgdp;
+       pud_t *pudp;
+       pmd_t *pmdp;
+       pte_t *ptep;
+       /*
+        * Make sure the task size is correct as per the max addr.
+        */
+       BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE);
+
+#ifdef CONFIG_PPC_64K_PAGES
+       BUILD_BUG_ON(RADIX_KERN_MAP_SIZE != (1UL << MAX_EA_BITS_PER_CONTEXT));
+#endif
+
+       if (unlikely(!slab_is_available()))
+               return early_map_kernel_page(ea, pa, flags, map_page_size,
+                                               nid, region_start, region_end);
+
+       /*
+        * Should make page table allocation functions be able to take a
+        * node, so we can place kernel page tables on the right nodes after
+        * boot.
+        */
+       pgdp = pgd_offset_k(ea);
+       pudp = pud_alloc(&init_mm, pgdp, ea);
+       if (!pudp)
+               return -ENOMEM;
+       if (map_page_size == PUD_SIZE) {
+               ptep = (pte_t *)pudp;
+               goto set_the_pte;
+       }
+       pmdp = pmd_alloc(&init_mm, pudp, ea);
+       if (!pmdp)
+               return -ENOMEM;
+       if (map_page_size == PMD_SIZE) {
+               ptep = pmdp_ptep(pmdp);
+               goto set_the_pte;
+       }
+       ptep = pte_alloc_kernel(pmdp, ea);
+       if (!ptep)
+               return -ENOMEM;
+
+set_the_pte:
+       set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
+       smp_wmb();
+       return 0;
+}
+
+int radix__map_kernel_page(unsigned long ea, unsigned long pa,
+                         pgprot_t flags,
+                         unsigned int map_page_size)
+{
+       return __map_kernel_page(ea, pa, flags, map_page_size, -1, 0, 0);
+}
+
+#ifdef CONFIG_STRICT_KERNEL_RWX
+void radix__change_memory_range(unsigned long start, unsigned long end,
+                               unsigned long clear)
+{
+       unsigned long idx;
+       pgd_t *pgdp;
+       pud_t *pudp;
+       pmd_t *pmdp;
+       pte_t *ptep;
+
+       start = ALIGN_DOWN(start, PAGE_SIZE);
+       end = PAGE_ALIGN(end); // aligns up
+
+       pr_debug("Changing flags on range %lx-%lx removing 0x%lx\n",
+                start, end, clear);
+
+       for (idx = start; idx < end; idx += PAGE_SIZE) {
+               pgdp = pgd_offset_k(idx);
+               pudp = pud_alloc(&init_mm, pgdp, idx);
+               if (!pudp)
+                       continue;
+               if (pud_huge(*pudp)) {
+                       ptep = (pte_t *)pudp;
+                       goto update_the_pte;
+               }
+               pmdp = pmd_alloc(&init_mm, pudp, idx);
+               if (!pmdp)
+                       continue;
+               if (pmd_huge(*pmdp)) {
+                       ptep = pmdp_ptep(pmdp);
+                       goto update_the_pte;
+               }
+               ptep = pte_alloc_kernel(pmdp, idx);
+               if (!ptep)
+                       continue;
+update_the_pte:
+               radix__pte_update(&init_mm, idx, ptep, clear, 0, 0);
+       }
+
+       radix__flush_tlb_kernel_range(start, end);
+}
+
+void radix__mark_rodata_ro(void)
+{
+       unsigned long start, end;
+
+       start = (unsigned long)_stext;
+       end = (unsigned long)__init_begin;
+
+       radix__change_memory_range(start, end, _PAGE_WRITE);
+}
+
+void radix__mark_initmem_nx(void)
+{
+       unsigned long start = (unsigned long)__init_begin;
+       unsigned long end = (unsigned long)__init_end;
+
+       radix__change_memory_range(start, end, _PAGE_EXEC);
+}
+#endif /* CONFIG_STRICT_KERNEL_RWX */
+
+static inline void __meminit
+print_mapping(unsigned long start, unsigned long end, unsigned long size, bool exec)
+{
+       char buf[10];
+
+       if (end <= start)
+               return;
+
+       string_get_size(size, 1, STRING_UNITS_2, buf, sizeof(buf));
+
+       pr_info("Mapped 0x%016lx-0x%016lx with %s pages%s\n", start, end, buf,
+               exec ? " (exec)" : "");
+}
+
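+/*
+ * With STRICT_KERNEL_RWX, do not let a linear mapping cross the
+ * __init_begin boundary, so the text/rodata region below it can later
+ * be given different permissions from what follows.
+ */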
+static unsigned long next_boundary(unsigned long addr, unsigned long end)
+{
+#ifdef CONFIG_STRICT_KERNEL_RWX
+       if (addr < __pa_symbol(__init_begin))
+               return __pa_symbol(__init_begin);
+#endif
+       return end;
+}
+
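+/*
+ * Map a physical range into the kernel linear mapping, using 1G or 2M
+ * pages where alignment, the remaining gap and the supported MMU page
+ * sizes allow, and base pages otherwise. Ranges overlapping kernel text
+ * or the interrupt vectors are mapped executable (PAGE_KERNEL_X).
+ */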
+static int __meminit create_physical_mapping(unsigned long start,
+                                            unsigned long end,
+                                            int nid)
+{
+       unsigned long vaddr, addr, mapping_size = 0;
+       bool prev_exec, exec = false;
+       pgprot_t prot;
+       int psize;
+
+       start = _ALIGN_UP(start, PAGE_SIZE);
+       for (addr = start; addr < end; addr += mapping_size) {
+               unsigned long gap, previous_size;
+               int rc;
+
+               gap = next_boundary(addr, end) - addr;
+               previous_size = mapping_size;
+               prev_exec = exec;
+
+               if (IS_ALIGNED(addr, PUD_SIZE) && gap >= PUD_SIZE &&
+                   mmu_psize_defs[MMU_PAGE_1G].shift) {
+                       mapping_size = PUD_SIZE;
+                       psize = MMU_PAGE_1G;
+               } else if (IS_ALIGNED(addr, PMD_SIZE) && gap >= PMD_SIZE &&
+                          mmu_psize_defs[MMU_PAGE_2M].shift) {
+                       mapping_size = PMD_SIZE;
+                       psize = MMU_PAGE_2M;
+               } else {
+                       mapping_size = PAGE_SIZE;
+                       psize = mmu_virtual_psize;
+               }
+
+               vaddr = (unsigned long)__va(addr);
+
+               if (overlaps_kernel_text(vaddr, vaddr + mapping_size) ||
+                   overlaps_interrupt_vector_text(vaddr, vaddr + mapping_size)) {
+                       prot = PAGE_KERNEL_X;
+                       exec = true;
+               } else {
+                       prot = PAGE_KERNEL;
+                       exec = false;
+               }
+
+               if (mapping_size != previous_size || exec != prev_exec) {
+                       print_mapping(start, addr, previous_size, prev_exec);
+                       start = addr;
+               }
+
+               rc = __map_kernel_page(vaddr, addr, prot, mapping_size, nid, start, end);
+               if (rc)
+                       return rc;
+
+               update_page_count(psize, 1);
+       }
+
+       print_mapping(start, addr, mapping_size, exec);
+       return 0;
+}
+
+void __init radix_init_pgtable(void)
+{
+       unsigned long rts_field;
+       struct memblock_region *reg;
+
+       /* We don't support slb for radix */
+       mmu_slb_size = 0;
+       /*
+        * Create the linear mapping, using standard page size for now
+        */
+       for_each_memblock(memory, reg) {
+               /*
+                * The memblock allocator is up at this point, so the
+                * page tables will be allocated within the range. No
+                * need for a node (which we don't have yet).
+                */
+
+               if ((reg->base + reg->size) >= RADIX_VMALLOC_START) {
+                       pr_warn("Outside the supported range\n");
+                       continue;
+               }
+
+               WARN_ON(create_physical_mapping(reg->base,
+                                               reg->base + reg->size,
+                                               -1));
+       }
+
+       /* Find out how many PID bits are supported */
+       if (cpu_has_feature(CPU_FTR_HVMODE)) {
+               if (!mmu_pid_bits)
+                       mmu_pid_bits = 20;
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+               /*
+                * When KVM is possible, we only use the top half of the
+                * PID space to avoid collisions between host and guest PIDs
+                * which can cause problems due to prefetch when exiting the
+                * guest with AIL=3
+                */
+               mmu_base_pid = 1 << (mmu_pid_bits - 1);
+#else
+               mmu_base_pid = 1;
+#endif
+       } else {
+               /* The guest uses the bottom half of the PID space */
+               if (!mmu_pid_bits)
+                       mmu_pid_bits = 19;
+               mmu_base_pid = 1;
+       }
+
+       /*
+        * Allocate Partition table and process table for the
+        * host.
+        */
+       BUG_ON(PRTB_SIZE_SHIFT > 36);
+       process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT, -1, 0, 0);
+       /*
+        * Fill in the process table.
+        */
+       rts_field = radix__get_tree_size();
+       process_tb->prtb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE);
+       /*
+        * Fill in the partition table. We are supposed to use the effective
+        * address of the process table here, but our linear mapping also
+        * enables us to use the physical address.
+        */
+       register_process_table(__pa(process_tb), 0, PRTB_SIZE_SHIFT - 12);
+       pr_info("Process table %p and radix root for kernel: %p\n", process_tb, init_mm.pgd);
+       asm volatile("ptesync" : : : "memory");
+       asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : :
+                    "r" (TLBIEL_INVAL_SET_LPID), "r" (0));
+       asm volatile("eieio; tlbsync; ptesync" : : : "memory");
+       trace_tlbie(0, 0, TLBIEL_INVAL_SET_LPID, 0, 2, 1, 1);
+
+       /*
+        * The init_mm context is given the first available (non-zero) PID,
+        * which is the "guard PID" and contains no page table. PIDR should
+        * never be set to zero because that duplicates the kernel address
+        * space at the 0x0... offset (quadrant 0)!
+        *
+        * An arbitrary PID that may later be allocated by the PID allocator
+        * for userspace processes must not be used either, because that
+        * would cause stale user mappings for that PID on CPUs outside of
+        * the TLB invalidation scheme (because it won't be in mm_cpumask).
+        *
+        * So permanently carve out one PID for the purpose of a guard PID.
+        */
+       init_mm.context.id = mmu_base_pid;
+       mmu_base_pid++;
+}
+
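+/*
+ * Set up partition table entry 0 for the host: the radix tree size, the
+ * kernel PGD and the host-radix (PATB_HR) bit.
+ */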
+static void __init radix_init_partition_table(void)
+{
+       unsigned long rts_field, dw0;
+
+       mmu_partition_table_init();
+       rts_field = radix__get_tree_size();
+       dw0 = rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR;
+       mmu_partition_table_set_entry(0, dw0, 0);
+
+       pr_info("Initializing Radix MMU\n");
+       pr_info("Partition table %p\n", partition_tb);
+}
+
+void __init radix_init_native(void)
+{
+       register_process_table = native_register_process_table;
+}
+
+static int __init get_idx_from_shift(unsigned int shift)
+{
+       int idx = -1;
+
+       switch (shift) {
+       case 0xc:
+               idx = MMU_PAGE_4K;
+               break;
+       case 0x10:
+               idx = MMU_PAGE_64K;
+               break;
+       case 0x15:
+               idx = MMU_PAGE_2M;
+               break;
+       case 0x1e:
+               idx = MMU_PAGE_1G;
+               break;
+       }
+       return idx;
+}
+
+static int __init radix_dt_scan_page_sizes(unsigned long node,
+                                          const char *uname, int depth,
+                                          void *data)
+{
+       int size = 0;
+       int shift, idx;
+       unsigned int ap;
+       const __be32 *prop;
+       const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
+
+       /* We are scanning "cpu" nodes only */
+       if (type == NULL || strcmp(type, "cpu") != 0)
+               return 0;
+
+       /* Find MMU PID size */
+       prop = of_get_flat_dt_prop(node, "ibm,mmu-pid-bits", &size);
+       if (prop && size == 4)
+               mmu_pid_bits = be32_to_cpup(prop);
+
+       /* Grab page size encodings */
+       prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size);
+       if (!prop)
+               return 0;
+
+       pr_info("Page sizes from device-tree:\n");
+       for (; size >= 4; size -= 4, ++prop) {
+
+               struct mmu_psize_def *def;
+
+               /* top 3 bit is AP encoding */
+               shift = be32_to_cpu(prop[0]) & ~(0xe << 28);
+               ap = be32_to_cpu(prop[0]) >> 29;
+               pr_info("Page size shift = %d AP=0x%x\n", shift, ap);
+
+               idx = get_idx_from_shift(shift);
+               if (idx < 0)
+                       continue;
+
+               def = &mmu_psize_defs[idx];
+               def->shift = shift;
+               def->ap  = ap;
+       }
+
+       /* needed ? */
+       cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B;
+       return 1;
+}
+
+void __init radix__early_init_devtree(void)
+{
+       int rc;
+
+       /*
+        * Try to find the available page sizes in the device-tree
+        */
+       rc = of_scan_flat_dt(radix_dt_scan_page_sizes, NULL);
+       if (rc != 0)  /* Found */
+               goto found;
+       /*
+        * Let's assume we have 4k and 64k page support
+        */
+       mmu_psize_defs[MMU_PAGE_4K].shift = 12;
+       mmu_psize_defs[MMU_PAGE_4K].ap = 0x0;
+
+       mmu_psize_defs[MMU_PAGE_64K].shift = 16;
+       mmu_psize_defs[MMU_PAGE_64K].ap = 0x5;
+found:
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+       if (mmu_psize_defs[MMU_PAGE_2M].shift) {
+               /*
+                * map vmemmap using 2M if available
+                */
+               mmu_vmemmap_psize = MMU_PAGE_2M;
+       }
+#endif /* CONFIG_SPARSEMEM_VMEMMAP */
+       return;
+}
+
+static void radix_init_amor(void)
+{
+       /*
+        * In HV mode, we init AMOR (Authority Mask Override Register) so that
+        * the hypervisor and guest can set up IAMR (Instruction Authority Mask
+        * Register), enable key 0 and set it to 1.
+        *
+        * AMOR = 0b1100 .... 0000 (Mask for key 0 is 11)
+        */
+       mtspr(SPRN_AMOR, (3ul << 62));
+}
+
+#ifdef CONFIG_PPC_KUEP
+void setup_kuep(bool disabled)
+{
+       if (disabled || !early_radix_enabled())
+               return;
+
+       if (smp_processor_id() == boot_cpuid)
+               pr_info("Activating Kernel Userspace Execution Prevention\n");
+
+       /*
+        * Radix always uses key0 of the IAMR to determine if an access is
+        * allowed. We set bit 0 (IBM bit 1) of key0, to prevent instruction
+        * fetch.
+        */
+       mtspr(SPRN_IAMR, (1ul << 62));
+}
+#endif
+
+#ifdef CONFIG_PPC_KUAP
+void setup_kuap(bool disabled)
+{
+       if (disabled || !early_radix_enabled())
+               return;
+
+       if (smp_processor_id() == boot_cpuid) {
+               pr_info("Activating Kernel Userspace Access Prevention\n");
+               cur_cpu_spec->mmu_features |= MMU_FTR_RADIX_KUAP;
+       }
+
+       /* Make sure userspace can't change the AMR */
+       mtspr(SPRN_UAMOR, 0);
+       mtspr(SPRN_AMR, AMR_KUAP_BLOCKED);
+       isync();
+}
+#endif
+
+void __init radix__early_init_mmu(void)
+{
+       unsigned long lpcr;
+
+#ifdef CONFIG_PPC_64K_PAGES
+       /* PAGE_SIZE mappings */
+       mmu_virtual_psize = MMU_PAGE_64K;
+#else
+       mmu_virtual_psize = MMU_PAGE_4K;
+#endif
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+       /* vmemmap mapping */
+       mmu_vmemmap_psize = mmu_virtual_psize;
+#endif
+       /*
+        * initialize page table size
+        */
+       __pte_index_size = RADIX_PTE_INDEX_SIZE;
+       __pmd_index_size = RADIX_PMD_INDEX_SIZE;
+       __pud_index_size = RADIX_PUD_INDEX_SIZE;
+       __pgd_index_size = RADIX_PGD_INDEX_SIZE;
+       __pud_cache_index = RADIX_PUD_INDEX_SIZE;
+       __pte_table_size = RADIX_PTE_TABLE_SIZE;
+       __pmd_table_size = RADIX_PMD_TABLE_SIZE;
+       __pud_table_size = RADIX_PUD_TABLE_SIZE;
+       __pgd_table_size = RADIX_PGD_TABLE_SIZE;
+
+       __pmd_val_bits = RADIX_PMD_VAL_BITS;
+       __pud_val_bits = RADIX_PUD_VAL_BITS;
+       __pgd_val_bits = RADIX_PGD_VAL_BITS;
+
+       __kernel_virt_start = RADIX_KERN_VIRT_START;
+       __vmalloc_start = RADIX_VMALLOC_START;
+       __vmalloc_end = RADIX_VMALLOC_END;
+       __kernel_io_start = RADIX_KERN_IO_START;
+       __kernel_io_end = RADIX_KERN_IO_END;
+       vmemmap = (struct page *)RADIX_VMEMMAP_START;
+       ioremap_bot = IOREMAP_BASE;
+
+#ifdef CONFIG_PCI
+       pci_io_base = ISA_IO_BASE;
+#endif
+       __pte_frag_nr = RADIX_PTE_FRAG_NR;
+       __pte_frag_size_shift = RADIX_PTE_FRAG_SIZE_SHIFT;
+       __pmd_frag_nr = RADIX_PMD_FRAG_NR;
+       __pmd_frag_size_shift = RADIX_PMD_FRAG_SIZE_SHIFT;
+
+       if (!firmware_has_feature(FW_FEATURE_LPAR)) {
+               radix_init_native();
+               lpcr = mfspr(SPRN_LPCR);
+               mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
+               radix_init_partition_table();
+               radix_init_amor();
+       } else {
+               radix_init_pseries();
+       }
+
+       memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);
+
+       radix_init_pgtable();
+       /* Switch to the guard PID before turning on MMU */
+       radix__switch_mmu_context(NULL, &init_mm);
+       if (cpu_has_feature(CPU_FTR_HVMODE))
+               tlbiel_all();
+}
+
+void radix__early_init_mmu_secondary(void)
+{
+       unsigned long lpcr;
+       /*
+        * update partition table control register and UPRT
+        */
+       if (!firmware_has_feature(FW_FEATURE_LPAR)) {
+               lpcr = mfspr(SPRN_LPCR);
+               mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
+
+               mtspr(SPRN_PTCR,
+                     __pa(partition_tb) | (PATB_SIZE_SHIFT - 12));
+               radix_init_amor();
+       }
+
+       radix__switch_mmu_context(NULL, &init_mm);
+       if (cpu_has_feature(CPU_FTR_HVMODE))
+               tlbiel_all();
+}
+
+void radix__mmu_cleanup_all(void)
+{
+       unsigned long lpcr;
+
+       if (!firmware_has_feature(FW_FEATURE_LPAR)) {
+               lpcr = mfspr(SPRN_LPCR);
+               mtspr(SPRN_LPCR, lpcr & ~LPCR_UPRT);
+               mtspr(SPRN_PTCR, 0);
+               powernv_set_nmmu_ptcr(0);
+               radix__flush_tlb_all();
+       }
+}
+
+void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base,
+                               phys_addr_t first_memblock_size)
+{
+       /*
+        * We don't currently support the first MEMBLOCK not mapping
+        * physical address 0
+        */
+       BUG_ON(first_memblock_base != 0);
+
+       /*
+        * Radix mode is not limited by RMA / VRMA addressing.
+        */
+       ppc64_rma_size = ULONG_MAX;
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
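+/*
+ * Free a PTE page once every entry in it is pte_none(), and clear the
+ * PMD entry that pointed to it. free_pmd_table() below does the same
+ * one level up.
+ */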
+static void free_pte_table(pte_t *pte_start, pmd_t *pmd)
+{
+       pte_t *pte;
+       int i;
+
+       for (i = 0; i < PTRS_PER_PTE; i++) {
+               pte = pte_start + i;
+               if (!pte_none(*pte))
+                       return;
+       }
+
+       pte_free_kernel(&init_mm, pte_start);
+       pmd_clear(pmd);
+}
+
+static void free_pmd_table(pmd_t *pmd_start, pud_t *pud)
+{
+       pmd_t *pmd;
+       int i;
+
+       for (i = 0; i < PTRS_PER_PMD; i++) {
+               pmd = pmd_start + i;
+               if (!pmd_none(*pmd))
+                       return;
+       }
+
+       pmd_free(&init_mm, pmd_start);
+       pud_clear(pud);
+}
+
+struct change_mapping_params {
+       pte_t *pte;
+       unsigned long start;
+       unsigned long end;
+       unsigned long aligned_start;
+       unsigned long aligned_end;
+};
+
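+/*
+ * Run under stop_machine() from split_kernel_mapping(): clear the huge
+ * PTE covering the aligned region, then re-create the portions of that
+ * region which lie outside [start, end) with smaller mappings.
+ */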
+static int __meminit stop_machine_change_mapping(void *data)
+{
+       struct change_mapping_params *params =
+                       (struct change_mapping_params *)data;
+
+       if (!data)
+               return -1;
+
+       spin_unlock(&init_mm.page_table_lock);
+       pte_clear(&init_mm, params->aligned_start, params->pte);
+       create_physical_mapping(params->aligned_start, params->start, -1);
+       create_physical_mapping(params->end, params->aligned_end, -1);
+       spin_lock(&init_mm.page_table_lock);
+       return 0;
+}
+
+static void remove_pte_table(pte_t *pte_start, unsigned long addr,
+                            unsigned long end)
+{
+       unsigned long next;
+       pte_t *pte;
+
+       pte = pte_start + pte_index(addr);
+       for (; addr < end; addr = next, pte++) {
+               next = (addr + PAGE_SIZE) & PAGE_MASK;
+               if (next > end)
+                       next = end;
+
+               if (!pte_present(*pte))
+                       continue;
+
+               if (!PAGE_ALIGNED(addr) || !PAGE_ALIGNED(next)) {
+                       /*
+                        * The vmemmap_free() and remove_section_mapping()
+                        * codepaths call us with aligned addresses.
+                        */
+                       WARN_ONCE(1, "%s: unaligned range\n", __func__);
+                       continue;
+               }
+
+               pte_clear(&init_mm, addr, pte);
+       }
+}
+
+/*
+ * clear the pte and potentially split the mapping helper
+ */
+static void __meminit split_kernel_mapping(unsigned long addr, unsigned long end,
+                               unsigned long size, pte_t *pte)
+{
+       unsigned long mask = ~(size - 1);
+       unsigned long aligned_start = addr & mask;
+       unsigned long aligned_end = addr + size;
+       struct change_mapping_params params;
+       bool split_region = false;
+
+       if ((end - addr) < size) {
+               /*
+                * We're going to clear the PTE, but we have not yet
+                * flushed the mapping, so it is time to remap and
+                * flush. If the effects are visible outside the
+                * processor, or if we are running in code close to
+                * the mapping we cleared, we are in trouble.
+                */
+               if (overlaps_kernel_text(aligned_start, addr) ||
+                       overlaps_kernel_text(end, aligned_end)) {
+                       /*
+                        * Hack, just return, don't pte_clear
+                        */
+                       WARN_ONCE(1, "Linear mapping %lx->%lx overlaps kernel "
+                                 "text, not splitting\n", addr, end);
+                       return;
+               }
+               split_region = true;
+       }
+
+       if (split_region) {
+               params.pte = pte;
+               params.start = addr;
+               params.end = end;
+               params.aligned_start = addr & ~(size - 1);
+               params.aligned_end = min_t(unsigned long, aligned_end,
+                               (unsigned long)__va(memblock_end_of_DRAM()));
+               stop_machine(stop_machine_change_mapping, &params, NULL);
+               return;
+       }
+
+       pte_clear(&init_mm, addr, pte);
+}
+
+static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
+                            unsigned long end)
+{
+       unsigned long next;
+       pte_t *pte_base;
+       pmd_t *pmd;
+
+       pmd = pmd_start + pmd_index(addr);
+       for (; addr < end; addr = next, pmd++) {
+               next = pmd_addr_end(addr, end);
+
+               if (!pmd_present(*pmd))
+                       continue;
+
+               if (pmd_huge(*pmd)) {
+                       split_kernel_mapping(addr, end, PMD_SIZE, (pte_t *)pmd);
+                       continue;
+               }
+
+               pte_base = (pte_t *)pmd_page_vaddr(*pmd);
+               remove_pte_table(pte_base, addr, next);
+               free_pte_table(pte_base, pmd);
+       }
+}
+
+static void remove_pud_table(pud_t *pud_start, unsigned long addr,
+                            unsigned long end)
+{
+       unsigned long next;
+       pmd_t *pmd_base;
+       pud_t *pud;
+
+       pud = pud_start + pud_index(addr);
+       for (; addr < end; addr = next, pud++) {
+               next = pud_addr_end(addr, end);
+
+               if (!pud_present(*pud))
+                       continue;
+
+               if (pud_huge(*pud)) {
+                       split_kernel_mapping(addr, end, PUD_SIZE, (pte_t *)pud);
+                       continue;
+               }
+
+               pmd_base = (pmd_t *)pud_page_vaddr(*pud);
+               remove_pmd_table(pmd_base, addr, next);
+               free_pmd_table(pmd_base, pud);
+       }
+}
+
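+/*
+ * Tear down the kernel page tables for [start, end): clear PTEs, free
+ * any page table pages that become empty, and flush the kernel TLB for
+ * the range. Huge mappings that only partially overlap the range are
+ * handled by split_kernel_mapping().
+ */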
+static void __meminit remove_pagetable(unsigned long start, unsigned long end)
+{
+       unsigned long addr, next;
+       pud_t *pud_base;
+       pgd_t *pgd;
+
+       spin_lock(&init_mm.page_table_lock);
+
+       for (addr = start; addr < end; addr = next) {
+               next = pgd_addr_end(addr, end);
+
+               pgd = pgd_offset_k(addr);
+               if (!pgd_present(*pgd))
+                       continue;
+
+               if (pgd_huge(*pgd)) {
+                       split_kernel_mapping(addr, end, PGDIR_SIZE, (pte_t *)pgd);
+                       continue;
+               }
+
+               pud_base = (pud_t *)pgd_page_vaddr(*pgd);
+               remove_pud_table(pud_base, addr, next);
+       }
+
+       spin_unlock(&init_mm.page_table_lock);
+       radix__flush_tlb_kernel_range(start, end);
+}
+
+int __meminit radix__create_section_mapping(unsigned long start, unsigned long end, int nid)
+{
+       if (end >= RADIX_VMALLOC_START) {
+               pr_warn("Outside the supported range\n");
+               return -1;
+       }
+
+       return create_physical_mapping(start, end, nid);
+}
+
+int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end)
+{
+       remove_pagetable(start, end);
+       return 0;
+}
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+static int __map_kernel_page_nid(unsigned long ea, unsigned long pa,
+                                pgprot_t flags, unsigned int map_page_size,
+                                int nid)
+{
+       return __map_kernel_page(ea, pa, flags, map_page_size, nid, 0, 0);
+}
+
+int __meminit radix__vmemmap_create_mapping(unsigned long start,
+                                     unsigned long page_size,
+                                     unsigned long phys)
+{
+       /* Create a PTE encoding */
+       unsigned long flags = _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_KERNEL_RW;
+       int nid = early_pfn_to_nid(phys >> PAGE_SHIFT);
+       int ret;
+
+       if ((start + page_size) >= RADIX_VMEMMAP_END) {
+               pr_warn("Outside the supported range\n");
+               return -1;
+       }
+
+       ret = __map_kernel_page_nid(start, phys, __pgprot(flags), page_size, nid);
+       BUG_ON(ret);
+
+       return 0;
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size)
+{
+       remove_pagetable(start, start + page_size);
+}
+#endif
+#endif
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+
+unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
+                                 pmd_t *pmdp, unsigned long clr,
+                                 unsigned long set)
+{
+       unsigned long old;
+
+#ifdef CONFIG_DEBUG_VM
+       WARN_ON(!radix__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
+       assert_spin_locked(pmd_lockptr(mm, pmdp));
+#endif
+
+       old = radix__pte_update(mm, addr, (pte_t *)pmdp, clr, set, 1);
+       trace_hugepage_update(addr, old, clr, set);
+
+       return old;
+}
+
+pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
+                       pmd_t *pmdp)
+
+{
+       pmd_t pmd;
+
+       VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+       VM_BUG_ON(radix__pmd_trans_huge(*pmdp));
+       VM_BUG_ON(pmd_devmap(*pmdp));
+       /*
+        * khugepaged calls this for a normal pmd
+        */
+       pmd = *pmdp;
+       pmd_clear(pmdp);
+
+       /*FIXME!!  Verify whether we need this kick below */
+       serialize_against_pte_lookup(vma->vm_mm);
+
+       radix__flush_tlb_collapsed_pmd(vma->vm_mm, address);
+
+       return pmd;
+}
+
+/*
+ * For us pgtable_t is pte_t *. In order to save the deposited
+ * page table, we consider the allocated page table as a list
+ * head. On withdraw we need to make sure we zero out the used
+ * list_head memory area.
+ */
+void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
+                                pgtable_t pgtable)
+{
+       struct list_head *lh = (struct list_head *) pgtable;
+
+       assert_spin_locked(pmd_lockptr(mm, pmdp));
+
+       /* FIFO */
+       if (!pmd_huge_pte(mm, pmdp))
+               INIT_LIST_HEAD(lh);
+       else
+               list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
+       pmd_huge_pte(mm, pmdp) = pgtable;
+}
+
+pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
+{
+       pte_t *ptep;
+       pgtable_t pgtable;
+       struct list_head *lh;
+
+       assert_spin_locked(pmd_lockptr(mm, pmdp));
+
+       /* FIFO */
+       pgtable = pmd_huge_pte(mm, pmdp);
+       lh = (struct list_head *) pgtable;
+       if (list_empty(lh))
+               pmd_huge_pte(mm, pmdp) = NULL;
+       else {
+               pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
+               list_del(lh);
+       }
+       ptep = (pte_t *) pgtable;
+       *ptep = __pte(0);
+       ptep++;
+       *ptep = __pte(0);
+       return pgtable;
+}
+
+pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
+                                    unsigned long addr, pmd_t *pmdp)
+{
+       pmd_t old_pmd;
+       unsigned long old;
+
+       old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
+       old_pmd = __pmd(old);
+       /*
+        * Serialize against find_current_mm_pte which does lock-less
+        * lookup in page tables with local interrupts disabled. For huge pages
+        * it casts pmd_t to pte_t. Since format of pte_t is different from
+        * pmd_t we want to prevent transit from pmd pointing to page table
+        * to pmd pointing to huge page (and back) while interrupts are disabled.
+        * We clear pmd to possibly replace it with page table pointer in
+        * different code paths. So make sure we wait for the parallel
+        * find_current_mm_pte to finish.
+        */
+       serialize_against_pte_lookup(mm);
+       return old_pmd;
+}
+
+int radix__has_transparent_hugepage(void)
+{
+       /* For radix 2M at PMD level means thp */
+       if (mmu_psize_defs[MMU_PAGE_2M].shift == PMD_SHIFT)
+               return 1;
+       return 0;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep,
+                                 pte_t entry, unsigned long address, int psize)
+{
+       struct mm_struct *mm = vma->vm_mm;
+       unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_ACCESSED |
+                                             _PAGE_RW | _PAGE_EXEC);
+
+       unsigned long change = pte_val(entry) ^ pte_val(*ptep);
+       /*
+        * To avoid an NMMU hang while relaxing access, we need to mark
+        * the pte invalid in between.
+        */
+       if ((change & _PAGE_RW) && atomic_read(&mm->context.copros) > 0) {
+               unsigned long old_pte, new_pte;
+
+               old_pte = __radix_pte_update(ptep, _PAGE_PRESENT, _PAGE_INVALID);
+               /*
+                * new value of pte
+                */
+               new_pte = old_pte | set;
+               radix__flush_tlb_page_psize(mm, address, psize);
+               __radix_pte_update(ptep, _PAGE_INVALID, new_pte);
+       } else {
+               __radix_pte_update(ptep, 0, set);
+               /*
+                * Book3S does not require a TLB flush when relaxing access
+                * restrictions when the address space is not attached to a
+                * NMMU, because the core MMU will reload the pte after taking
+                * an access fault, which is defined by the architecture.
+                */
+       }
+       /* See ptesync comment in radix__set_pte_at */
+}
+
+void radix__ptep_modify_prot_commit(struct vm_area_struct *vma,
+                                   unsigned long addr, pte_t *ptep,
+                                   pte_t old_pte, pte_t pte)
+{
+       struct mm_struct *mm = vma->vm_mm;
+
+       /*
+        * To avoid an NMMU hang while relaxing access, we need to flush the TLB
+        * before we set the new value. We need to do this only for radix,
+        * because hash translation does a flush when updating the linux pte.
+        */
+       if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
+           (atomic_read(&mm->context.copros) > 0))
+               radix__flush_tlb_page(vma, addr);
+
+       set_pte_at(mm, addr, ptep, pte);
+}
diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
new file mode 100644 (file)
index 0000000..6a23b9e
--- /dev/null
@@ -0,0 +1,1101 @@
+/*
+ * TLB flush routines for radix kernels.
+ *
+ * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/memblock.h>
+#include <linux/mmu_context.h>
+#include <linux/sched/mm.h>
+
+#include <asm/ppc-opcode.h>
+#include <asm/tlb.h>
+#include <asm/tlbflush.h>
+#include <asm/trace.h>
+#include <asm/cputhreads.h>
+
+#define RIC_FLUSH_TLB 0
+#define RIC_FLUSH_PWC 1
+#define RIC_FLUSH_ALL 2
+
+/*
+ * tlbiel instruction for radix, set invalidation
+ * i.e., r=1 and is=01 or is=10 or is=11
+ */
+static inline void tlbiel_radix_set_isa300(unsigned int set, unsigned int is,
+                                       unsigned int pid,
+                                       unsigned int ric, unsigned int prs)
+{
+       unsigned long rb;
+       unsigned long rs;
+
+       rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53));
+       rs = ((unsigned long)pid << PPC_BITLSHIFT(31));
+
+       asm volatile(PPC_TLBIEL(%0, %1, %2, %3, 1)
+                    : : "r"(rb), "r"(rs), "i"(ric), "i"(prs)
+                    : "memory");
+}
+
+static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is)
+{
+       unsigned int set;
+
+       asm volatile("ptesync": : :"memory");
+
+       /*
+        * Flush the first set of the TLB, and the entire Page Walk Cache
+        * and partition table entries. Then flush the remaining sets of the
+        * TLB.
+        */
+       tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 0);
+       for (set = 1; set < num_sets; set++)
+               tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 0);
+
+       /* Do the same for process scoped entries. */
+       tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 1);
+       for (set = 1; set < num_sets; set++)
+               tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 1);
+
+       asm volatile("ptesync": : :"memory");
+}
+
+void radix__tlbiel_all(unsigned int action)
+{
+       unsigned int is;
+
+       switch (action) {
+       case TLB_INVAL_SCOPE_GLOBAL:
+               is = 3;
+               break;
+       case TLB_INVAL_SCOPE_LPID:
+               is = 2;
+               break;
+       default:
+               BUG();
+       }
+
+       if (early_cpu_has_feature(CPU_FTR_ARCH_300))
+               tlbiel_all_isa300(POWER9_TLB_SETS_RADIX, is);
+       else
+               WARN(1, "%s called on pre-POWER9 CPU\n", __func__);
+
+       asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
+}
+
+static inline void __tlbiel_pid(unsigned long pid, int set,
+                               unsigned long ric)
+{
+       unsigned long rb,rs,prs,r;
+
+       rb = PPC_BIT(53); /* IS = 1 */
+       rb |= set << PPC_BITLSHIFT(51);
+       rs = ((unsigned long)pid) << PPC_BITLSHIFT(31);
+       prs = 1; /* process scoped */
+       r = 1;   /* radix format */
+
+       asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
+                    : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+       trace_tlbie(0, 1, rb, rs, ric, prs, r);
+}
+
+static inline void __tlbie_pid(unsigned long pid, unsigned long ric)
+{
+       unsigned long rb,rs,prs,r;
+
+       rb = PPC_BIT(53); /* IS = 1 */
+       rs = pid << PPC_BITLSHIFT(31);
+       prs = 1; /* process scoped */
+       r = 1;   /* radix format */
+
+       asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+                    : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+       trace_tlbie(0, 0, rb, rs, ric, prs, r);
+}
+
+static inline void __tlbiel_lpid(unsigned long lpid, int set,
+                               unsigned long ric)
+{
+       unsigned long rb,rs,prs,r;
+
+       rb = PPC_BIT(52); /* IS = 2 */
+       rb |= set << PPC_BITLSHIFT(51);
+       rs = 0;  /* LPID comes from LPIDR */
+       prs = 0; /* partition scoped */
+       r = 1;   /* radix format */
+
+       asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
+                    : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+       trace_tlbie(lpid, 1, rb, rs, ric, prs, r);
+}
+
+static inline void __tlbie_lpid(unsigned long lpid, unsigned long ric)
+{
+       unsigned long rb,rs,prs,r;
+
+       rb = PPC_BIT(52); /* IS = 2 */
+       rs = lpid;
+       prs = 0; /* partition scoped */
+       r = 1;   /* radix format */
+
+       asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+                    : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+       trace_tlbie(lpid, 0, rb, rs, ric, prs, r);
+}
+
+static inline void __tlbiel_lpid_guest(unsigned long lpid, int set,
+                               unsigned long ric)
+{
+       unsigned long rb,rs,prs,r;
+
+       rb = PPC_BIT(52); /* IS = 2 */
+       rb |= set << PPC_BITLSHIFT(51);
+       rs = 0;  /* LPID comes from LPIDR */
+       prs = 1; /* process scoped */
+       r = 1;   /* radix format */
+
+       asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
+                    : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+       trace_tlbie(lpid, 1, rb, rs, ric, prs, r);
+}
+
+
+static inline void __tlbiel_va(unsigned long va, unsigned long pid,
+                              unsigned long ap, unsigned long ric)
+{
+       unsigned long rb,rs,prs,r;
+
+       rb = va & ~(PPC_BITMASK(52, 63));
+       rb |= ap << PPC_BITLSHIFT(58);
+       rs = pid << PPC_BITLSHIFT(31);
+       prs = 1; /* process scoped */
+       r = 1;   /* radix format */
+
+       asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
+                    : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+       trace_tlbie(0, 1, rb, rs, ric, prs, r);
+}
+
+static inline void __tlbie_va(unsigned long va, unsigned long pid,
+                             unsigned long ap, unsigned long ric)
+{
+       unsigned long rb,rs,prs,r;
+
+       rb = va & ~(PPC_BITMASK(52, 63));
+       rb |= ap << PPC_BITLSHIFT(58);
+       rs = pid << PPC_BITLSHIFT(31);
+       prs = 1; /* process scoped */
+       r = 1;   /* radix format */
+
+       asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+                    : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+       trace_tlbie(0, 0, rb, rs, ric, prs, r);
+}
+
+static inline void __tlbie_lpid_va(unsigned long va, unsigned long lpid,
+                             unsigned long ap, unsigned long ric)
+{
+       unsigned long rb,rs,prs,r;
+
+       rb = va & ~(PPC_BITMASK(52, 63));
+       rb |= ap << PPC_BITLSHIFT(58);
+       rs = lpid;
+       prs = 0; /* partition scoped */
+       r = 1;   /* radix format */
+
+       asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+                    : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+       trace_tlbie(lpid, 0, rb, rs, ric, prs, r);
+}
+
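+/*
+ * Workaround for the POWER9 tlbie erratum (CPU_FTR_P9_TLBIE_BUG): after
+ * the real invalidations, issue one more tlbie to an (assumed unused)
+ * top-of-range address with the 64K page encoding.
+ */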
+static inline void fixup_tlbie(void)
+{
+       unsigned long pid = 0;
+       unsigned long va = ((1UL << 52) - 1);
+
+       if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) {
+               asm volatile("ptesync": : :"memory");
+               __tlbie_va(va, pid, mmu_get_ap(MMU_PAGE_64K), RIC_FLUSH_TLB);
+       }
+}
+
+static inline void fixup_tlbie_lpid(unsigned long lpid)
+{
+       unsigned long va = ((1UL << 52) - 1);
+
+       if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) {
+               asm volatile("ptesync": : :"memory");
+               __tlbie_lpid_va(va, lpid, mmu_get_ap(MMU_PAGE_64K), RIC_FLUSH_TLB);
+       }
+}
+
+/*
+ * We use 128 sets in radix mode and 256 sets in hpt mode.
+ */
+static inline void _tlbiel_pid(unsigned long pid, unsigned long ric)
+{
+       int set;
+
+       asm volatile("ptesync": : :"memory");
+
+       /*
+        * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL,
+        * also flush the entire Page Walk Cache.
+        */
+       __tlbiel_pid(pid, 0, ric);
+
+       /* For PWC, only one flush is needed */
+       if (ric == RIC_FLUSH_PWC) {
+               asm volatile("ptesync": : :"memory");
+               return;
+       }
+
+       /* For the remaining sets, just flush the TLB */
+       for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++)
+               __tlbiel_pid(pid, set, RIC_FLUSH_TLB);
+
+       asm volatile("ptesync": : :"memory");
+       asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
+}
+
+static inline void _tlbie_pid(unsigned long pid, unsigned long ric)
+{
+       asm volatile("ptesync": : :"memory");
+
+       /*
+        * Work around the fact that the "ric" argument to __tlbie_pid
+        * must be a compile-time constraint to match the "i" constraint
+        * in the asm statement.
+        */
+       switch (ric) {
+       case RIC_FLUSH_TLB:
+               __tlbie_pid(pid, RIC_FLUSH_TLB);
+               break;
+       case RIC_FLUSH_PWC:
+               __tlbie_pid(pid, RIC_FLUSH_PWC);
+               break;
+       case RIC_FLUSH_ALL:
+       default:
+               __tlbie_pid(pid, RIC_FLUSH_ALL);
+       }
+       fixup_tlbie();
+       asm volatile("eieio; tlbsync; ptesync": : :"memory");
+}
+
+static inline void _tlbiel_lpid(unsigned long lpid, unsigned long ric)
+{
+       int set;
+
+       VM_BUG_ON(mfspr(SPRN_LPID) != lpid);
+
+       asm volatile("ptesync": : :"memory");
+
+       /*
+        * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL,
+        * also flush the entire Page Walk Cache.
+        */
+       __tlbiel_lpid(lpid, 0, ric);
+
+       /* For PWC, only one flush is needed */
+       if (ric == RIC_FLUSH_PWC) {
+               asm volatile("ptesync": : :"memory");
+               return;
+       }
+
+       /* For the remaining sets, just flush the TLB */
+       for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++)
+               __tlbiel_lpid(lpid, set, RIC_FLUSH_TLB);
+
+       asm volatile("ptesync": : :"memory");
+       asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
+}
+
+static inline void _tlbie_lpid(unsigned long lpid, unsigned long ric)
+{
+       asm volatile("ptesync": : :"memory");
+
+       /*
+        * Work around the fact that the "ric" argument to __tlbie_lpid
+        * must be a compile-time constraint to match the "i" constraint
+        * in the asm statement.
+        */
+       switch (ric) {
+       case RIC_FLUSH_TLB:
+               __tlbie_lpid(lpid, RIC_FLUSH_TLB);
+               break;
+       case RIC_FLUSH_PWC:
+               __tlbie_lpid(lpid, RIC_FLUSH_PWC);
+               break;
+       case RIC_FLUSH_ALL:
+       default:
+               __tlbie_lpid(lpid, RIC_FLUSH_ALL);
+       }
+       fixup_tlbie_lpid(lpid);
+       asm volatile("eieio; tlbsync; ptesync": : :"memory");
+}
+
+static inline void _tlbiel_lpid_guest(unsigned long lpid, unsigned long ric)
+{
+       int set;
+
+       VM_BUG_ON(mfspr(SPRN_LPID) != lpid);
+
+       asm volatile("ptesync": : :"memory");
+
+       /*
+        * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL,
+        * also flush the entire Page Walk Cache.
+        */
+       __tlbiel_lpid_guest(lpid, 0, ric);
+
+       /* For PWC, only one flush is needed */
+       if (ric == RIC_FLUSH_PWC) {
+               asm volatile("ptesync": : :"memory");
+               return;
+       }
+
+       /* For the remaining sets, just flush the TLB */
+       for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++)
+               __tlbiel_lpid_guest(lpid, set, RIC_FLUSH_TLB);
+
+       asm volatile("ptesync": : :"memory");
+       asm volatile(PPC_INVALIDATE_ERAT : : :"memory");
+}
+
+
+static inline void __tlbiel_va_range(unsigned long start, unsigned long end,
+                                   unsigned long pid, unsigned long page_size,
+                                   unsigned long psize)
+{
+       unsigned long addr;
+       unsigned long ap = mmu_get_ap(psize);
+
+       for (addr = start; addr < end; addr += page_size)
+               __tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
+}
+
+static inline void _tlbiel_va(unsigned long va, unsigned long pid,
+                             unsigned long psize, unsigned long ric)
+{
+       unsigned long ap = mmu_get_ap(psize);
+
+       asm volatile("ptesync": : :"memory");
+       __tlbiel_va(va, pid, ap, ric);
+       asm volatile("ptesync": : :"memory");
+}
+
+static inline void _tlbiel_va_range(unsigned long start, unsigned long end,
+                                   unsigned long pid, unsigned long page_size,
+                                   unsigned long psize, bool also_pwc)
+{
+       asm volatile("ptesync": : :"memory");
+       if (also_pwc)
+               __tlbiel_pid(pid, 0, RIC_FLUSH_PWC);
+       __tlbiel_va_range(start, end, pid, page_size, psize);
+       asm volatile("ptesync": : :"memory");
+}
+
+static inline void __tlbie_va_range(unsigned long start, unsigned long end,
+                                   unsigned long pid, unsigned long page_size,
+                                   unsigned long psize)
+{
+       unsigned long addr;
+       unsigned long ap = mmu_get_ap(psize);
+
+       for (addr = start; addr < end; addr += page_size)
+               __tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
+}
+
+static inline void _tlbie_va(unsigned long va, unsigned long pid,
+                             unsigned long psize, unsigned long ric)
+{
+       unsigned long ap = mmu_get_ap(psize);
+
+       asm volatile("ptesync": : :"memory");
+       __tlbie_va(va, pid, ap, ric);
+       fixup_tlbie();
+       asm volatile("eieio; tlbsync; ptesync": : :"memory");
+}
+
+static inline void _tlbie_lpid_va(unsigned long va, unsigned long lpid,
+                             unsigned long psize, unsigned long ric)
+{
+       unsigned long ap = mmu_get_ap(psize);
+
+       asm volatile("ptesync": : :"memory");
+       __tlbie_lpid_va(va, lpid, ap, ric);
+       fixup_tlbie_lpid(lpid);
+       asm volatile("eieio; tlbsync; ptesync": : :"memory");
+}
+
+static inline void _tlbie_va_range(unsigned long start, unsigned long end,
+                                   unsigned long pid, unsigned long page_size,
+                                   unsigned long psize, bool also_pwc)
+{
+       asm volatile("ptesync": : :"memory");
+       if (also_pwc)
+               __tlbie_pid(pid, RIC_FLUSH_PWC);
+       __tlbie_va_range(start, end, pid, page_size, psize);
+       fixup_tlbie();
+       asm volatile("eieio; tlbsync; ptesync": : :"memory");
+}
+
+/*
+ * Base TLB flushing operations:
+ *
+ *  - flush_tlb_mm(mm) flushes the specified mm context TLB's
+ *  - flush_tlb_page(vma, vmaddr) flushes one page
+ *  - flush_tlb_range(vma, start, end) flushes a range of pages
+ *  - flush_tlb_kernel_range(start, end) flushes kernel pages
+ *
+ *  - local_* variants of page and mm only apply to the current
+ *    processor
+ */
+void radix__local_flush_tlb_mm(struct mm_struct *mm)
+{
+       unsigned long pid;
+
+       preempt_disable();
+       pid = mm->context.id;
+       if (pid != MMU_NO_CONTEXT)
+               _tlbiel_pid(pid, RIC_FLUSH_TLB);
+       preempt_enable();
+}
+EXPORT_SYMBOL(radix__local_flush_tlb_mm);
+
+#ifndef CONFIG_SMP
+void radix__local_flush_all_mm(struct mm_struct *mm)
+{
+       unsigned long pid;
+
+       preempt_disable();
+       pid = mm->context.id;
+       if (pid != MMU_NO_CONTEXT)
+               _tlbiel_pid(pid, RIC_FLUSH_ALL);
+       preempt_enable();
+}
+EXPORT_SYMBOL(radix__local_flush_all_mm);
+#endif /* CONFIG_SMP */
+
+void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
+                                      int psize)
+{
+       unsigned long pid;
+
+       preempt_disable();
+       pid = mm->context.id;
+       if (pid != MMU_NO_CONTEXT)
+               _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
+       preempt_enable();
+}
+
+void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
+{
+#ifdef CONFIG_HUGETLB_PAGE
+       /* need the return fix for nohash.c */
+       if (is_vm_hugetlb_page(vma))
+               return radix__local_flush_hugetlb_page(vma, vmaddr);
+#endif
+       radix__local_flush_tlb_page_psize(vma->vm_mm, vmaddr, mmu_virtual_psize);
+}
+EXPORT_SYMBOL(radix__local_flush_tlb_page);
+
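+/*
+ * An mm counts as single threaded when no coprocessors are attached and
+ * the current task is its only user; callers then kick any lazy users
+ * off other CPUs and fall back to a local flush.
+ */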
+static bool mm_is_singlethreaded(struct mm_struct *mm)
+{
+       if (atomic_read(&mm->context.copros) > 0)
+               return false;
+       if (atomic_read(&mm->mm_users) <= 1 && current->mm == mm)
+               return true;
+       return false;
+}
+
+static bool mm_needs_flush_escalation(struct mm_struct *mm)
+{
+       /*
+        * P9 nest MMU has issues with the page walk cache
+        * caching PTEs and not flushing them properly when
+        * RIC = 0 for a PID/LPID invalidate
+        */
+       if (atomic_read(&mm->context.copros) > 0)
+               return true;
+       return false;
+}
+
+#ifdef CONFIG_SMP
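+/*
+ * IPI handler: a remote CPU that only has this mm as a lazy active_mm
+ * (i.e. a kernel thread) switches to init_mm and flushes its local TLB
+ * for the PID, so the mm can be treated as CPU-local again.
+ */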
+static void do_exit_flush_lazy_tlb(void *arg)
+{
+       struct mm_struct *mm = arg;
+       unsigned long pid = mm->context.id;
+
+       if (current->mm == mm)
+               return; /* Local CPU */
+
+       if (current->active_mm == mm) {
+               /*
+                * Must be a kernel thread because sender is single-threaded.
+                */
+               BUG_ON(current->mm);
+               mmgrab(&init_mm);
+               switch_mm(mm, &init_mm, current);
+               current->active_mm = &init_mm;
+               mmdrop(mm);
+       }
+       _tlbiel_pid(pid, RIC_FLUSH_ALL);
+}
+
+static void exit_flush_lazy_tlbs(struct mm_struct *mm)
+{
+       /*
+        * Would be nice if this was async so it could be run in
+        * parallel with our local flush, but generic code does not
+        * give a good API for it. Could extend the generic code or
+        * make a special powerpc IPI for flushing TLBs.
+        * For now it's not too performance critical.
+        */
+       smp_call_function_many(mm_cpumask(mm), do_exit_flush_lazy_tlb,
+                               (void *)mm, 1);
+       mm_reset_thread_local(mm);
+}
+
+void radix__flush_tlb_mm(struct mm_struct *mm)
+{
+       unsigned long pid;
+
+       pid = mm->context.id;
+       if (unlikely(pid == MMU_NO_CONTEXT))
+               return;
+
+       preempt_disable();
+       /*
+        * Order loads of mm_cpumask vs previous stores to clear ptes before
+        * the invalidate. See barrier in switch_mm_irqs_off
+        */
+       smp_mb();
+       if (!mm_is_thread_local(mm)) {
+               if (unlikely(mm_is_singlethreaded(mm))) {
+                       exit_flush_lazy_tlbs(mm);
+                       goto local;
+               }
+
+               if (mm_needs_flush_escalation(mm))
+                       _tlbie_pid(pid, RIC_FLUSH_ALL);
+               else
+                       _tlbie_pid(pid, RIC_FLUSH_TLB);
+       } else {
+local:
+               _tlbiel_pid(pid, RIC_FLUSH_TLB);
+       }
+       preempt_enable();
+}
+EXPORT_SYMBOL(radix__flush_tlb_mm);
+
+static void __flush_all_mm(struct mm_struct *mm, bool fullmm)
+{
+       unsigned long pid;
+
+       pid = mm->context.id;
+       if (unlikely(pid == MMU_NO_CONTEXT))
+               return;
+
+       preempt_disable();
+       smp_mb(); /* see radix__flush_tlb_mm */
+       if (!mm_is_thread_local(mm)) {
+               if (unlikely(mm_is_singlethreaded(mm))) {
+                       if (!fullmm) {
+                               exit_flush_lazy_tlbs(mm);
+                               goto local;
+                       }
+               }
+               _tlbie_pid(pid, RIC_FLUSH_ALL);
+       } else {
+local:
+               _tlbiel_pid(pid, RIC_FLUSH_ALL);
+       }
+       preempt_enable();
+}
+void radix__flush_all_mm(struct mm_struct *mm)
+{
+       __flush_all_mm(mm, false);
+}
+EXPORT_SYMBOL(radix__flush_all_mm);
+
+void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr)
+{
+       tlb->need_flush_all = 1;
+}
+EXPORT_SYMBOL(radix__flush_tlb_pwc);
+
+void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
+                                int psize)
+{
+       unsigned long pid;
+
+       pid = mm->context.id;
+       if (unlikely(pid == MMU_NO_CONTEXT))
+               return;
+
+       preempt_disable();
+       smp_mb(); /* see radix__flush_tlb_mm */
+       if (!mm_is_thread_local(mm)) {
+               if (unlikely(mm_is_singlethreaded(mm))) {
+                       exit_flush_lazy_tlbs(mm);
+                       goto local;
+               }
+               _tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
+       } else {
+local:
+               _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
+       }
+       preempt_enable();
+}
+
+void radix__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
+{
+#ifdef CONFIG_HUGETLB_PAGE
+       if (is_vm_hugetlb_page(vma))
+               return radix__flush_hugetlb_page(vma, vmaddr);
+#endif
+       radix__flush_tlb_page_psize(vma->vm_mm, vmaddr, mmu_virtual_psize);
+}
+EXPORT_SYMBOL(radix__flush_tlb_page);
+
+#else /* CONFIG_SMP */
+#define radix__flush_all_mm radix__local_flush_all_mm
+#endif /* CONFIG_SMP */
+
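+/*
+ * Flush everything under PID 0, which covers the kernel's translations,
+ * rather than flushing the range page by page.
+ */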
+void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end)
+{
+       _tlbie_pid(0, RIC_FLUSH_ALL);
+}
+EXPORT_SYMBOL(radix__flush_tlb_kernel_range);
+
+#define TLB_FLUSH_ALL -1UL
+
+/*
+ * Number of pages above which we invalidate the entire PID rather than
+ * flush individual pages, for local and global flushes respectively.
+ *
+ * tlbie goes out to the interconnect and individual ops are more costly.
+ * It also does not iterate over sets like the local tlbiel variant when
+ * invalidating a full PID, so it has a far lower threshold to change from
+ * individual page flushes to full-pid flushes.
+ */
+static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
+static unsigned long tlb_local_single_page_flush_ceiling __read_mostly = POWER9_TLB_SETS_RADIX * 2;
+
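+/*
+ * Common range flush: flush the whole PID when the range exceeds the
+ * ceilings above, otherwise flush page by page, also issuing 2M-size
+ * flushes when THP is enabled (and 2M/1G when flush_all_sizes is set)
+ * in case the range contains huge mappings.
+ */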
+static inline void __radix__flush_tlb_range(struct mm_struct *mm,
+                                       unsigned long start, unsigned long end,
+                                       bool flush_all_sizes)
+
+{
+       unsigned long pid;
+       unsigned int page_shift = mmu_psize_defs[mmu_virtual_psize].shift;
+       unsigned long page_size = 1UL << page_shift;
+       unsigned long nr_pages = (end - start) >> page_shift;
+       bool local, full;
+
+       pid = mm->context.id;
+       if (unlikely(pid == MMU_NO_CONTEXT))
+               return;
+
+       preempt_disable();
+       smp_mb(); /* see radix__flush_tlb_mm */
+       if (!mm_is_thread_local(mm)) {
+               if (unlikely(mm_is_singlethreaded(mm))) {
+                       if (end != TLB_FLUSH_ALL) {
+                               exit_flush_lazy_tlbs(mm);
+                               goto is_local;
+                       }
+               }
+               local = false;
+               full = (end == TLB_FLUSH_ALL ||
+                               nr_pages > tlb_single_page_flush_ceiling);
+       } else {
+is_local:
+               local = true;
+               full = (end == TLB_FLUSH_ALL ||
+                               nr_pages > tlb_local_single_page_flush_ceiling);
+       }
+
+       if (full) {
+               if (local) {
+                       _tlbiel_pid(pid, RIC_FLUSH_TLB);
+               } else {
+                       if (mm_needs_flush_escalation(mm))
+                               _tlbie_pid(pid, RIC_FLUSH_ALL);
+                       else
+                               _tlbie_pid(pid, RIC_FLUSH_TLB);
+               }
+       } else {
+               bool hflush = flush_all_sizes;
+               bool gflush = flush_all_sizes;
+               unsigned long hstart, hend;
+               unsigned long gstart, gend;
+
+               if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
+                       hflush = true;
+
+               if (hflush) {
+                       hstart = (start + PMD_SIZE - 1) & PMD_MASK;
+                       hend = end & PMD_MASK;
+                       if (hstart == hend)
+                               hflush = false;
+               }
+
+               if (gflush) {
+                       gstart = (start + PUD_SIZE - 1) & PUD_MASK;
+                       gend = end & PUD_MASK;
+                       if (gstart == gend)
+                               gflush = false;
+               }
+
+               asm volatile("ptesync": : :"memory");
+               if (local) {
+                       __tlbiel_va_range(start, end, pid, page_size, mmu_virtual_psize);
+                       if (hflush)
+                               __tlbiel_va_range(hstart, hend, pid,
+                                               PMD_SIZE, MMU_PAGE_2M);
+                       if (gflush)
+                               __tlbiel_va_range(gstart, gend, pid,
+                                               PUD_SIZE, MMU_PAGE_1G);
+                       asm volatile("ptesync": : :"memory");
+               } else {
+                       __tlbie_va_range(start, end, pid, page_size, mmu_virtual_psize);
+                       if (hflush)
+                               __tlbie_va_range(hstart, hend, pid,
+                                               PMD_SIZE, MMU_PAGE_2M);
+                       if (gflush)
+                               __tlbie_va_range(gstart, gend, pid,
+                                               PUD_SIZE, MMU_PAGE_1G);
+                       fixup_tlbie();
+                       asm volatile("eieio; tlbsync; ptesync": : :"memory");
+               }
+       }
+       preempt_enable();
+}
+
+void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
+                    unsigned long end)
+
+{
+#ifdef CONFIG_HUGETLB_PAGE
+       if (is_vm_hugetlb_page(vma))
+               return radix__flush_hugetlb_tlb_range(vma, start, end);
+#endif
+
+       __radix__flush_tlb_range(vma->vm_mm, start, end, false);
+}
+EXPORT_SYMBOL(radix__flush_tlb_range);
+
+static int radix_get_mmu_psize(int page_size)
+{
+       int psize;
+
+       if (page_size == (1UL << mmu_psize_defs[mmu_virtual_psize].shift))
+               psize = mmu_virtual_psize;
+       else if (page_size == (1UL << mmu_psize_defs[MMU_PAGE_2M].shift))
+               psize = MMU_PAGE_2M;
+       else if (page_size == (1UL << mmu_psize_defs[MMU_PAGE_1G].shift))
+               psize = MMU_PAGE_1G;
+       else
+               return -1;
+       return psize;
+}
+
+/*
+ * Flush partition scoped LPID address translation for all CPUs.
+ */
+void radix__flush_tlb_lpid_page(unsigned int lpid,
+                                       unsigned long addr,
+                                       unsigned long page_size)
+{
+       int psize = radix_get_mmu_psize(page_size);
+
+       _tlbie_lpid_va(addr, lpid, psize, RIC_FLUSH_TLB);
+}
+EXPORT_SYMBOL_GPL(radix__flush_tlb_lpid_page);
+
+/*
+ * Flush partition scoped PWC from LPID for all CPUs.
+ */
+void radix__flush_pwc_lpid(unsigned int lpid)
+{
+       _tlbie_lpid(lpid, RIC_FLUSH_PWC);
+}
+EXPORT_SYMBOL_GPL(radix__flush_pwc_lpid);
+
+/*
+ * Flush partition scoped translations from LPID (=LPIDR)
+ */
+void radix__flush_tlb_lpid(unsigned int lpid)
+{
+       _tlbie_lpid(lpid, RIC_FLUSH_ALL);
+}
+EXPORT_SYMBOL_GPL(radix__flush_tlb_lpid);
+
+/*
+ * Flush partition scoped translations from LPID (=LPIDR)
+ */
+void radix__local_flush_tlb_lpid(unsigned int lpid)
+{
+       _tlbiel_lpid(lpid, RIC_FLUSH_ALL);
+}
+EXPORT_SYMBOL_GPL(radix__local_flush_tlb_lpid);
+
+/*
+ * Flush process scoped translations from LPID (=LPIDR).
+ * The important difference: the guest normally manages its own translations,
+ * but some cases, e.g. vCPU migration between CPUs, require KVM to flush.
+ */
+void radix__local_flush_tlb_lpid_guest(unsigned int lpid)
+{
+       _tlbiel_lpid_guest(lpid, RIC_FLUSH_ALL);
+}
+EXPORT_SYMBOL_GPL(radix__local_flush_tlb_lpid_guest);
+
+
+static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start,
+                                 unsigned long end, int psize);
+
+void radix__tlb_flush(struct mmu_gather *tlb)
+{
+       int psize = 0;
+       struct mm_struct *mm = tlb->mm;
+       int page_size = tlb->page_size;
+       unsigned long start = tlb->start;
+       unsigned long end = tlb->end;
+
+       /*
+        * if page size is not something we understand, do a full mm flush
+        *
+        * A "fullmm" flush must always do a flush_all_mm (RIC=2) flush
+        * that flushes the process table entry cache upon process teardown.
+        * See the comment for radix in arch_exit_mmap().
+        */
+       if (tlb->fullmm) {
+               __flush_all_mm(mm, true);
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE)
+       } else if (mm_tlb_flush_nested(mm)) {
+               /*
+                * If there is a concurrent invalidation that is clearing ptes,
+                * then it's possible this invalidation will miss one of those
+                * cleared ptes and miss flushing the TLB. If this invalidate
+                * returns before the other one flushes TLBs, that can result
+                * in it returning while there are still valid TLBs inside the
+                * range to be invalidated.
+                *
+                * See mm/memory.c:tlb_finish_mmu() for more details.
+                *
+                * The solution to this is ensure the entire range is always
+                * flushed here. The problem for powerpc is that the flushes
+                * are page size specific, so this "forced flush" would not
+                * do the right thing if there are a mix of page sizes in
+                * the range to be invalidated. So use __flush_tlb_range
+                * which invalidates all possible page sizes in the range.
+                *
+                * PWC flush is probably not required because the core code
+                * shouldn't free page tables in this path, but accounting
+                * for the possibility makes us a bit more robust.
+                *
+                * need_flush_all is an uncommon case because page table
+                * teardown should be done with exclusive locks held (but
+                * after locks are dropped another invalidate could come
+                * in), so it could be optimized further if necessary.
+                */
+               if (!tlb->need_flush_all)
+                       __radix__flush_tlb_range(mm, start, end, true);
+               else
+                       radix__flush_all_mm(mm);
+#endif
+       } else if ((psize = radix_get_mmu_psize(page_size)) == -1) {
+               if (!tlb->need_flush_all)
+                       radix__flush_tlb_mm(mm);
+               else
+                       radix__flush_all_mm(mm);
+       } else {
+               if (!tlb->need_flush_all)
+                       radix__flush_tlb_range_psize(mm, start, end, psize);
+               else
+                       radix__flush_tlb_pwc_range_psize(mm, start, end, psize);
+       }
+       tlb->need_flush_all = 0;
+}
+
+static inline void __radix__flush_tlb_range_psize(struct mm_struct *mm,
+                               unsigned long start, unsigned long end,
+                               int psize, bool also_pwc)
+{
+       unsigned long pid;
+       unsigned int page_shift = mmu_psize_defs[psize].shift;
+       unsigned long page_size = 1UL << page_shift;
+       unsigned long nr_pages = (end - start) >> page_shift;
+       bool local, full;
+
+       pid = mm->context.id;
+       if (unlikely(pid == MMU_NO_CONTEXT))
+               return;
+
+       preempt_disable();
+       smp_mb(); /* see radix__flush_tlb_mm */
+       if (!mm_is_thread_local(mm)) {
+               if (unlikely(mm_is_singlethreaded(mm))) {
+                       if (end != TLB_FLUSH_ALL) {
+                               exit_flush_lazy_tlbs(mm);
+                               goto is_local;
+                       }
+               }
+               local = false;
+               full = (end == TLB_FLUSH_ALL ||
+                               nr_pages > tlb_single_page_flush_ceiling);
+       } else {
+is_local:
+               local = true;
+               full = (end == TLB_FLUSH_ALL ||
+                               nr_pages > tlb_local_single_page_flush_ceiling);
+       }
+
+       if (full) {
+               if (local) {
+                       _tlbiel_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB);
+               } else {
+                       if (mm_needs_flush_escalation(mm))
+                               also_pwc = true;
+
+                       _tlbie_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB);
+               }
+       } else {
+               if (local)
+                       _tlbiel_va_range(start, end, pid, page_size, psize, also_pwc);
+               else
+                       _tlbie_va_range(start, end, pid, page_size, psize, also_pwc);
+       }
+       preempt_enable();
+}
+
+void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
+                                 unsigned long end, int psize)
+{
+       return __radix__flush_tlb_range_psize(mm, start, end, psize, false);
+}
+
+static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start,
+                                 unsigned long end, int psize)
+{
+       __radix__flush_tlb_range_psize(mm, start, end, psize, true);
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
+{
+       unsigned long pid, end;
+
+       pid = mm->context.id;
+       if (unlikely(pid == MMU_NO_CONTEXT))
+               return;
+
+       /* 4k page size, just blow the world */
+       if (PAGE_SIZE == 0x1000) {
+               radix__flush_all_mm(mm);
+               return;
+       }
+
+       end = addr + HPAGE_PMD_SIZE;
+
+       /* Otherwise first do the PWC, then iterate the pages. */
+       preempt_disable();
+       smp_mb(); /* see radix__flush_tlb_mm */
+       if (!mm_is_thread_local(mm)) {
+               if (unlikely(mm_is_singlethreaded(mm))) {
+                       exit_flush_lazy_tlbs(mm);
+                       goto local;
+               }
+               _tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
+       } else {
+local:
+               _tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
+       }
+
+       preempt_enable();
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+void radix__flush_pmd_tlb_range(struct vm_area_struct *vma,
+                               unsigned long start, unsigned long end)
+{
+       radix__flush_tlb_range_psize(vma->vm_mm, start, end, MMU_PAGE_2M);
+}
+EXPORT_SYMBOL(radix__flush_pmd_tlb_range);
+
+void radix__flush_tlb_all(void)
+{
+       unsigned long rb, prs, r, rs;
+       unsigned long ric = RIC_FLUSH_ALL;
+
+       rb = 0x3 << PPC_BITLSHIFT(53); /* IS = 3 */
+       prs = 0; /* partition scoped */
+       r = 1;   /* radix format */
+       rs = 1 & ((1UL << 32) - 1); /* any LPID value to flush guest mappings */
+
+       asm volatile("ptesync": : :"memory");
+       /*
+        * now flush guest entries by passing PRS = 1 and LPID != 0
+        */
+       asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+                    : : "r"(rb), "i"(r), "i"(1), "i"(ric), "r"(rs) : "memory");
+       /*
+        * now flush host entries by passing PRS = 0 and LPID == 0
+        */
+       asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+                    : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(0) : "memory");
+       asm volatile("eieio; tlbsync; ptesync": : :"memory");
+}
+
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+extern void radix_kvm_prefetch_workaround(struct mm_struct *mm)
+{
+       unsigned long pid = mm->context.id;
+
+       if (unlikely(pid == MMU_NO_CONTEXT))
+               return;
+
+       /*
+        * If this context hasn't run on that CPU before and KVM is
+        * around, there's a slim chance that the guest on another
+        * CPU just brought in obsolete translation into the TLB of
+        * this CPU due to a bad prefetch using the guest PID on
+        * the way into the hypervisor.
+        *
+        * We work around this here. If KVM is possible, we check if
+        * any sibling thread is in KVM. If it is, the window may exist
+        * and thus we flush that PID from the core.
+        *
+        * A potential future improvement would be to mark which PIDs
+        * have never been used on the system and avoid it if the PID
+        * is new and the process has no other cpumask bit set.
+        */
+       if (cpu_has_feature(CPU_FTR_HVMODE) && radix_enabled()) {
+               int cpu = smp_processor_id();
+               int sib = cpu_first_thread_sibling(cpu);
+               bool flush = false;
+
+               for (; sib <= cpu_last_thread_sibling(cpu) && !flush; sib++) {
+                       if (sib == cpu)
+                               continue;
+                       if (!cpu_possible(sib))
+                               continue;
+                       if (paca_ptrs[sib]->kvm_hstate.kvm_vcpu)
+                               flush = true;
+               }
+               if (flush)
+                       _tlbiel_pid(pid, RIC_FLUSH_ALL);
+       }
+}
+EXPORT_SYMBOL_GPL(radix_kvm_prefetch_workaround);
+#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
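Editor's note: the boundary arithmetic in __radix__flush_tlb_range() above is easy to misread, so here is a minimal user-space sketch of just that piece: the 2M and 1G passes only cover the portion of [start, end) that is aligned to the larger page size, and are skipped when no aligned block fits. This is not kernel code; SZ_2M, SZ_1G and the helper name are illustrative stand-ins for PMD_SIZE/PUD_SIZE and the in-kernel masking.

/* Stand-alone sketch of the hugepage-boundary rounding, illustrative only. */
#include <stdio.h>

#define SZ_2M	(2UL * 1024 * 1024)
#define SZ_1G	(1024UL * 1024 * 1024)

static void aligned_window(unsigned long start, unsigned long end,
			   unsigned long size, const char *name)
{
	unsigned long lo = (start + size - 1) & ~(size - 1);	/* round up   */
	unsigned long hi = end & ~(size - 1);			/* round down */

	if (lo >= hi)
		printf("%s pass skipped for [%#lx, %#lx)\n", name, start, end);
	else
		printf("%s pass covers   [%#lx, %#lx)\n", name, lo, hi);
}

int main(void)
{
	aligned_window(0x10001000UL, 0x10a00000UL, SZ_2M, "2M");
	aligned_window(0x10001000UL, 0x10a00000UL, SZ_1G, "1G");
	return 0;
}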
diff --git a/arch/powerpc/mm/book3s64/slb.c b/arch/powerpc/mm/book3s64/slb.c
new file mode 100644 (file)
index 0000000..c227422
--- /dev/null
@@ -0,0 +1,833 @@
+/*
+ * PowerPC64 SLB support.
+ *
+ * Copyright (C) 2004 David Gibson <dwg@au.ibm.com>, IBM
+ * Based on earlier code written by:
+ * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com
+ *    Copyright (c) 2001 Dave Engebretsen
+ * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
+ *
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#include <asm/asm-prototypes.h>
+#include <asm/pgtable.h>
+#include <asm/mmu.h>
+#include <asm/mmu_context.h>
+#include <asm/paca.h>
+#include <asm/ppc-opcode.h>
+#include <asm/cputable.h>
+#include <asm/cacheflush.h>
+#include <asm/smp.h>
+#include <linux/compiler.h>
+#include <linux/context_tracking.h>
+#include <linux/mm_types.h>
+
+#include <asm/udbg.h>
+#include <asm/code-patching.h>
+
+enum slb_index {
+       LINEAR_INDEX    = 0, /* Kernel linear map  (0xc000000000000000) */
+       KSTACK_INDEX    = 1, /* Kernel stack map */
+};
+
+static long slb_allocate_user(struct mm_struct *mm, unsigned long ea);
+
+#define slb_esid_mask(ssize)   \
+       (((ssize) == MMU_SEGSIZE_256M)? ESID_MASK: ESID_MASK_1T)
+
+static inline unsigned long mk_esid_data(unsigned long ea, int ssize,
+                                        enum slb_index index)
+{
+       return (ea & slb_esid_mask(ssize)) | SLB_ESID_V | index;
+}
+
+static inline unsigned long __mk_vsid_data(unsigned long vsid, int ssize,
+                                        unsigned long flags)
+{
+       return (vsid << slb_vsid_shift(ssize)) | flags |
+               ((unsigned long) ssize << SLB_VSID_SSIZE_SHIFT);
+}
+
+static inline unsigned long mk_vsid_data(unsigned long ea, int ssize,
+                                        unsigned long flags)
+{
+       return __mk_vsid_data(get_kernel_vsid(ea, ssize), ssize, flags);
+}
+
+static void assert_slb_presence(bool present, unsigned long ea)
+{
+#ifdef CONFIG_DEBUG_VM
+       unsigned long tmp;
+
+       WARN_ON_ONCE(mfmsr() & MSR_EE);
+
+       if (!cpu_has_feature(CPU_FTR_ARCH_206))
+               return;
+
+       /*
+        * slbfee. requires bit 24 (PPC bit 39) be clear in RB. Hardware
+        * ignores all other bits from 0-27, so just clear them all.
+        */
+       ea &= ~((1UL << 28) - 1);
+       asm volatile(__PPC_SLBFEE_DOT(%0, %1) : "=r"(tmp) : "r"(ea) : "cr0");
+
+       WARN_ON(present == (tmp == 0));
+#endif
+}
+
+static inline void slb_shadow_update(unsigned long ea, int ssize,
+                                    unsigned long flags,
+                                    enum slb_index index)
+{
+       struct slb_shadow *p = get_slb_shadow();
+
+       /*
+        * Clear the ESID first so the entry is not valid while we are
+        * updating it.  No write barriers are needed here, provided
+        * we only update the current CPU's SLB shadow buffer.
+        */
+       WRITE_ONCE(p->save_area[index].esid, 0);
+       WRITE_ONCE(p->save_area[index].vsid, cpu_to_be64(mk_vsid_data(ea, ssize, flags)));
+       WRITE_ONCE(p->save_area[index].esid, cpu_to_be64(mk_esid_data(ea, ssize, index)));
+}
+
+static inline void slb_shadow_clear(enum slb_index index)
+{
+       WRITE_ONCE(get_slb_shadow()->save_area[index].esid, cpu_to_be64(index));
+}
+
+static inline void create_shadowed_slbe(unsigned long ea, int ssize,
+                                       unsigned long flags,
+                                       enum slb_index index)
+{
+       /*
+        * Updating the shadow buffer before writing the SLB ensures
+        * we don't get a stale entry here if we get preempted by PHYP
+        * between these two statements.
+        */
+       slb_shadow_update(ea, ssize, flags, index);
+
+       assert_slb_presence(false, ea);
+       asm volatile("slbmte  %0,%1" :
+                    : "r" (mk_vsid_data(ea, ssize, flags)),
+                      "r" (mk_esid_data(ea, ssize, index))
+                    : "memory" );
+}
+
+/*
+ * Insert bolted entries into SLB (which may not be empty, so don't clear
+ * slb_cache_ptr).
+ */
+void __slb_restore_bolted_realmode(void)
+{
+       struct slb_shadow *p = get_slb_shadow();
+       enum slb_index index;
+
+        /* No isync needed because realmode. */
+       for (index = 0; index < SLB_NUM_BOLTED; index++) {
+               asm volatile("slbmte  %0,%1" :
+                    : "r" (be64_to_cpu(p->save_area[index].vsid)),
+                      "r" (be64_to_cpu(p->save_area[index].esid)));
+       }
+
+       assert_slb_presence(true, local_paca->kstack);
+}
+
+/*
+ * Insert the bolted entries into an empty SLB.
+ */
+void slb_restore_bolted_realmode(void)
+{
+       __slb_restore_bolted_realmode();
+       get_paca()->slb_cache_ptr = 0;
+
+       get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1;
+       get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap;
+}
+
+/*
+ * This flushes all SLB entries including 0, so it must be realmode.
+ */
+void slb_flush_all_realmode(void)
+{
+       asm volatile("slbmte %0,%0; slbia" : : "r" (0));
+}
+
+/*
+ * This flushes non-bolted entries, it can be run in virtual mode. Must
+ * be called with interrupts disabled.
+ */
+void slb_flush_and_restore_bolted(void)
+{
+       struct slb_shadow *p = get_slb_shadow();
+
+       BUILD_BUG_ON(SLB_NUM_BOLTED != 2);
+
+       WARN_ON(!irqs_disabled());
+
+       /*
+        * We can't take a PMU exception in the following code, so hard
+        * disable interrupts.
+        */
+       hard_irq_disable();
+
+       asm volatile("isync\n"
+                    "slbia\n"
+                    "slbmte  %0, %1\n"
+                    "isync\n"
+                    :: "r" (be64_to_cpu(p->save_area[KSTACK_INDEX].vsid)),
+                       "r" (be64_to_cpu(p->save_area[KSTACK_INDEX].esid))
+                    : "memory");
+       assert_slb_presence(true, get_paca()->kstack);
+
+       get_paca()->slb_cache_ptr = 0;
+
+       get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1;
+       get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap;
+}
+
+void slb_save_contents(struct slb_entry *slb_ptr)
+{
+       int i;
+       unsigned long e, v;
+
+       /* Save slb_cache_ptr value. */
+       get_paca()->slb_save_cache_ptr = get_paca()->slb_cache_ptr;
+
+       if (!slb_ptr)
+               return;
+
+       for (i = 0; i < mmu_slb_size; i++) {
+               asm volatile("slbmfee  %0,%1" : "=r" (e) : "r" (i));
+               asm volatile("slbmfev  %0,%1" : "=r" (v) : "r" (i));
+               slb_ptr->esid = e;
+               slb_ptr->vsid = v;
+               slb_ptr++;
+       }
+}
+
+void slb_dump_contents(struct slb_entry *slb_ptr)
+{
+       int i, n;
+       unsigned long e, v;
+       unsigned long llp;
+
+       if (!slb_ptr)
+               return;
+
+       pr_err("SLB contents of cpu 0x%x\n", smp_processor_id());
+       pr_err("Last SLB entry inserted at slot %d\n", get_paca()->stab_rr);
+
+       for (i = 0; i < mmu_slb_size; i++) {
+               e = slb_ptr->esid;
+               v = slb_ptr->vsid;
+               slb_ptr++;
+
+               if (!e && !v)
+                       continue;
+
+               pr_err("%02d %016lx %016lx\n", i, e, v);
+
+               if (!(e & SLB_ESID_V)) {
+                       pr_err("\n");
+                       continue;
+               }
+               llp = v & SLB_VSID_LLP;
+               if (v & SLB_VSID_B_1T) {
+                       pr_err("  1T  ESID=%9lx  VSID=%13lx LLP:%3lx\n",
+                              GET_ESID_1T(e),
+                              (v & ~SLB_VSID_B) >> SLB_VSID_SHIFT_1T, llp);
+               } else {
+                       pr_err(" 256M ESID=%9lx  VSID=%13lx LLP:%3lx\n",
+                              GET_ESID(e),
+                              (v & ~SLB_VSID_B) >> SLB_VSID_SHIFT, llp);
+               }
+       }
+       pr_err("----------------------------------\n");
+
+       /* Dump slb cache entries as well. */
+       pr_err("SLB cache ptr value = %d\n", get_paca()->slb_save_cache_ptr);
+       pr_err("Valid SLB cache entries:\n");
+       n = min_t(int, get_paca()->slb_save_cache_ptr, SLB_CACHE_ENTRIES);
+       for (i = 0; i < n; i++)
+               pr_err("%02d EA[0-35]=%9x\n", i, get_paca()->slb_cache[i]);
+       pr_err("Rest of SLB cache entries:\n");
+       for (i = n; i < SLB_CACHE_ENTRIES; i++)
+               pr_err("%02d EA[0-35]=%9x\n", i, get_paca()->slb_cache[i]);
+}
+
+void slb_vmalloc_update(void)
+{
+       /*
+        * vmalloc is not bolted, so just have to flush non-bolted.
+        */
+       slb_flush_and_restore_bolted();
+}
+
+static bool preload_hit(struct thread_info *ti, unsigned long esid)
+{
+       unsigned char i;
+
+       for (i = 0; i < ti->slb_preload_nr; i++) {
+               unsigned char idx;
+
+               idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR;
+               if (esid == ti->slb_preload_esid[idx])
+                       return true;
+       }
+       return false;
+}
+
+static bool preload_add(struct thread_info *ti, unsigned long ea)
+{
+       unsigned char idx;
+       unsigned long esid;
+
+       if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) {
+               /* EAs are stored >> 28 so 256MB segments don't need clearing */
+               if (ea & ESID_MASK_1T)
+                       ea &= ESID_MASK_1T;
+       }
+
+       esid = ea >> SID_SHIFT;
+
+       if (preload_hit(ti, esid))
+               return false;
+
+       idx = (ti->slb_preload_tail + ti->slb_preload_nr) % SLB_PRELOAD_NR;
+       ti->slb_preload_esid[idx] = esid;
+       if (ti->slb_preload_nr == SLB_PRELOAD_NR)
+               ti->slb_preload_tail = (ti->slb_preload_tail + 1) % SLB_PRELOAD_NR;
+       else
+               ti->slb_preload_nr++;
+
+       return true;
+}
+
+static void preload_age(struct thread_info *ti)
+{
+       if (!ti->slb_preload_nr)
+               return;
+       ti->slb_preload_nr--;
+       ti->slb_preload_tail = (ti->slb_preload_tail + 1) % SLB_PRELOAD_NR;
+}
+
+void slb_setup_new_exec(void)
+{
+       struct thread_info *ti = current_thread_info();
+       struct mm_struct *mm = current->mm;
+       unsigned long exec = 0x10000000;
+
+       WARN_ON(irqs_disabled());
+
+       /*
+        * The preload cache can only be used to determine whether an SLB
+        * entry exists if it does not start to overflow.
+        */
+       if (ti->slb_preload_nr + 2 > SLB_PRELOAD_NR)
+               return;
+
+       hard_irq_disable();
+
+       /*
+        * We have no good place to clear the slb preload cache on exec,
+        * flush_thread is about the earliest arch hook but that happens
+        * after we switch to the mm and have already preloaded the SLBEs.
+        *
+        * For the most part it's probably okay to use entries from the
+        * previous exec; they will age out if unused. It may turn out to
+        * be an advantage to clear the cache before switching to it,
+        * however.
+        */
+
+       /*
+        * preload some userspace segments into the SLB.
+        * Almost all 32-bit and 64-bit PowerPC executables are linked at
+        * 0x10000000 so it makes sense to preload this segment.
+        */
+       if (!is_kernel_addr(exec)) {
+               if (preload_add(ti, exec))
+                       slb_allocate_user(mm, exec);
+       }
+
+       /* Libraries and mmaps. */
+       if (!is_kernel_addr(mm->mmap_base)) {
+               if (preload_add(ti, mm->mmap_base))
+                       slb_allocate_user(mm, mm->mmap_base);
+       }
+
+       /* see switch_slb */
+       asm volatile("isync" : : : "memory");
+
+       local_irq_enable();
+}
+
+void preload_new_slb_context(unsigned long start, unsigned long sp)
+{
+       struct thread_info *ti = current_thread_info();
+       struct mm_struct *mm = current->mm;
+       unsigned long heap = mm->start_brk;
+
+       WARN_ON(irqs_disabled());
+
+       /* see above */
+       if (ti->slb_preload_nr + 3 > SLB_PRELOAD_NR)
+               return;
+
+       hard_irq_disable();
+
+       /* Userspace entry address. */
+       if (!is_kernel_addr(start)) {
+               if (preload_add(ti, start))
+                       slb_allocate_user(mm, start);
+       }
+
+       /* Top of stack, grows down. */
+       if (!is_kernel_addr(sp)) {
+               if (preload_add(ti, sp))
+                       slb_allocate_user(mm, sp);
+       }
+
+       /* Bottom of heap, grows up. */
+       if (heap && !is_kernel_addr(heap)) {
+               if (preload_add(ti, heap))
+                       slb_allocate_user(mm, heap);
+       }
+
+       /* see switch_slb */
+       asm volatile("isync" : : : "memory");
+
+       local_irq_enable();
+}
+
+
+/* Flush all user entries from the segment table of the current processor. */
+void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
+{
+       struct thread_info *ti = task_thread_info(tsk);
+       unsigned char i;
+
+       /*
+        * We need interrupts hard-disabled here, not just soft-disabled,
+        * so that a PMU interrupt can't occur, which might try to access
+        * user memory (to get a stack trace) and possibly cause an SLB miss
+        * which would update the slb_cache/slb_cache_ptr fields in the PACA.
+        */
+       hard_irq_disable();
+       asm volatile("isync" : : : "memory");
+       if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+               /*
+                * SLBIA IH=3 invalidates all Class=1 SLBEs and their
+                * associated lookaside structures, which matches what
+                * switch_slb wants. So ARCH_300 does not use the slb
+                * cache.
+                */
+               asm volatile(PPC_SLBIA(3));
+       } else {
+               unsigned long offset = get_paca()->slb_cache_ptr;
+
+               if (!mmu_has_feature(MMU_FTR_NO_SLBIE_B) &&
+                   offset <= SLB_CACHE_ENTRIES) {
+                       unsigned long slbie_data = 0;
+
+                       for (i = 0; i < offset; i++) {
+                               unsigned long ea;
+
+                               ea = (unsigned long)
+                                       get_paca()->slb_cache[i] << SID_SHIFT;
+                               /*
+                                * Could assert_slb_presence(true) here, but
+                                * hypervisor or machine check could have come
+                                * in and removed the entry at this point.
+                                */
+
+                               slbie_data = ea;
+                               slbie_data |= user_segment_size(slbie_data)
+                                               << SLBIE_SSIZE_SHIFT;
+                               slbie_data |= SLBIE_C; /* user slbs have C=1 */
+                               asm volatile("slbie %0" : : "r" (slbie_data));
+                       }
+
+                       /* Workaround POWER5 < DD2.1 issue */
+                       if (!cpu_has_feature(CPU_FTR_ARCH_207S) && offset == 1)
+                               asm volatile("slbie %0" : : "r" (slbie_data));
+
+               } else {
+                       struct slb_shadow *p = get_slb_shadow();
+                       unsigned long ksp_esid_data =
+                               be64_to_cpu(p->save_area[KSTACK_INDEX].esid);
+                       unsigned long ksp_vsid_data =
+                               be64_to_cpu(p->save_area[KSTACK_INDEX].vsid);
+
+                       asm volatile(PPC_SLBIA(1) "\n"
+                                    "slbmte    %0,%1\n"
+                                    "isync"
+                                    :: "r"(ksp_vsid_data),
+                                       "r"(ksp_esid_data));
+
+                       get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1;
+               }
+
+               get_paca()->slb_cache_ptr = 0;
+       }
+       get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap;
+
+       copy_mm_to_paca(mm);
+
+       /*
+        * We gradually age out SLBs after a number of context switches to
+        * reduce reload overhead of unused entries (like we do with FP/VEC
+        * reload). Each time we wrap 256 switches, take an entry out of the
+        * SLB preload cache.
+        */
+       tsk->thread.load_slb++;
+       if (!tsk->thread.load_slb) {
+               unsigned long pc = KSTK_EIP(tsk);
+
+               preload_age(ti);
+               preload_add(ti, pc);
+       }
+
+       for (i = 0; i < ti->slb_preload_nr; i++) {
+               unsigned char idx;
+               unsigned long ea;
+
+               idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR;
+               ea = (unsigned long)ti->slb_preload_esid[idx] << SID_SHIFT;
+
+               slb_allocate_user(mm, ea);
+       }
+
+       /*
+        * Synchronize slbmte preloads with possible subsequent user memory
+        * address accesses by the kernel (user mode won't happen until
+        * rfid, which is safe).
+        */
+       asm volatile("isync" : : : "memory");
+}
+
+void slb_set_size(u16 size)
+{
+       mmu_slb_size = size;
+}
+
+void slb_initialize(void)
+{
+       unsigned long linear_llp, vmalloc_llp, io_llp;
+       unsigned long lflags;
+       static int slb_encoding_inited;
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+       unsigned long vmemmap_llp;
+#endif
+
+       /* Prepare our SLB miss handler based on our page size */
+       linear_llp = mmu_psize_defs[mmu_linear_psize].sllp;
+       io_llp = mmu_psize_defs[mmu_io_psize].sllp;
+       vmalloc_llp = mmu_psize_defs[mmu_vmalloc_psize].sllp;
+       get_paca()->vmalloc_sllp = SLB_VSID_KERNEL | vmalloc_llp;
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+       vmemmap_llp = mmu_psize_defs[mmu_vmemmap_psize].sllp;
+#endif
+       if (!slb_encoding_inited) {
+               slb_encoding_inited = 1;
+               pr_devel("SLB: linear  LLP = %04lx\n", linear_llp);
+               pr_devel("SLB: io      LLP = %04lx\n", io_llp);
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+               pr_devel("SLB: vmemmap LLP = %04lx\n", vmemmap_llp);
+#endif
+       }
+
+       get_paca()->stab_rr = SLB_NUM_BOLTED - 1;
+       get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1;
+       get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap;
+
+       lflags = SLB_VSID_KERNEL | linear_llp;
+
+       /* Invalidate the entire SLB (even entry 0) & all the ERATS */
+       asm volatile("isync":::"memory");
+       asm volatile("slbmte  %0,%0"::"r" (0) : "memory");
+       asm volatile("isync; slbia; isync":::"memory");
+       create_shadowed_slbe(PAGE_OFFSET, mmu_kernel_ssize, lflags, LINEAR_INDEX);
+
+       /*
+        * For the boot cpu, we're running on the stack in init_thread_union,
+        * which is in the first segment of the linear mapping, and also
+        * get_paca()->kstack hasn't been initialized yet.
+        * For secondary cpus, we need to bolt the kernel stack entry now.
+        */
+       slb_shadow_clear(KSTACK_INDEX);
+       if (raw_smp_processor_id() != boot_cpuid &&
+           (get_paca()->kstack & slb_esid_mask(mmu_kernel_ssize)) > PAGE_OFFSET)
+               create_shadowed_slbe(get_paca()->kstack,
+                                    mmu_kernel_ssize, lflags, KSTACK_INDEX);
+
+       asm volatile("isync":::"memory");
+}
+
+static void slb_cache_update(unsigned long esid_data)
+{
+       int slb_cache_index;
+
+       if (cpu_has_feature(CPU_FTR_ARCH_300))
+               return; /* ISAv3.0B and later does not use slb_cache */
+
+       /*
+        * Now update slb cache entries
+        */
+       slb_cache_index = local_paca->slb_cache_ptr;
+       if (slb_cache_index < SLB_CACHE_ENTRIES) {
+               /*
+                * We have space in slb cache for optimized switch_slb().
+                * Top 36 bits from esid_data as per ISA
+                */
+               local_paca->slb_cache[slb_cache_index++] = esid_data >> 28;
+               local_paca->slb_cache_ptr++;
+       } else {
+               /*
+                * Our cache is full and the current cache contents no longer
+                * accurately reflect the active SLB entries. Bump the ptr
+                * so that switch_slb() will ignore the cache.
+                */
+               local_paca->slb_cache_ptr = SLB_CACHE_ENTRIES + 1;
+       }
+}
+
+static enum slb_index alloc_slb_index(bool kernel)
+{
+       enum slb_index index;
+
+       /*
+        * The allocation bitmaps can become out of sync with the SLB
+        * when the _switch code does slbie when bolting a new stack
+        * segment and it must not be anywhere else in the SLB. This leaves
+        * a kernel allocated entry that is unused in the SLB. With very
+        * large systems or small segment sizes, the bitmaps could slowly
+        * fill with these entries. They will eventually be cleared out
+        * by the round robin allocator in that case, so it's probably not
+        * worth accounting for.
+        */
+
+       /*
+        * SLBs beyond 32 entries are allocated with stab_rr only.
+        * POWER7/8/9 have 32 SLB entries; this could be expanded if a
+        * future CPU has more.
+        */
+       if (local_paca->slb_used_bitmap != U32_MAX) {
+               index = ffz(local_paca->slb_used_bitmap);
+               local_paca->slb_used_bitmap |= 1U << index;
+               if (kernel)
+                       local_paca->slb_kern_bitmap |= 1U << index;
+       } else {
+               /* round-robin replacement of slb starting at SLB_NUM_BOLTED. */
+               index = local_paca->stab_rr;
+               if (index < (mmu_slb_size - 1))
+                       index++;
+               else
+                       index = SLB_NUM_BOLTED;
+               local_paca->stab_rr = index;
+               if (index < 32) {
+                       if (kernel)
+                               local_paca->slb_kern_bitmap |= 1U << index;
+                       else
+                               local_paca->slb_kern_bitmap &= ~(1U << index);
+               }
+       }
+       BUG_ON(index < SLB_NUM_BOLTED);
+
+       return index;
+}
+
+static long slb_insert_entry(unsigned long ea, unsigned long context,
+                               unsigned long flags, int ssize, bool kernel)
+{
+       unsigned long vsid;
+       unsigned long vsid_data, esid_data;
+       enum slb_index index;
+
+       vsid = get_vsid(context, ea, ssize);
+       if (!vsid)
+               return -EFAULT;
+
+       /*
+        * There must not be a kernel SLB fault in alloc_slb_index or before
+        * slbmte here or the allocation bitmaps could get out of whack with
+        * the SLB.
+        *
+        * User SLB faults or preloads take this path which might get inlined
+        * into the caller, so add compiler barriers here to ensure unsafe
+        * memory accesses do not come between.
+        */
+       barrier();
+
+       index = alloc_slb_index(kernel);
+
+       vsid_data = __mk_vsid_data(vsid, ssize, flags);
+       esid_data = mk_esid_data(ea, ssize, index);
+
+       /*
+        * No need for an isync before or after this slbmte. The exception
+        * we enter with and the rfid we exit with are context synchronizing.
+        * User preloads should add isync afterwards in case the kernel
+        * accesses user memory before it returns to userspace with rfid.
+        */
+       assert_slb_presence(false, ea);
+       asm volatile("slbmte %0, %1" : : "r" (vsid_data), "r" (esid_data));
+
+       barrier();
+
+       if (!kernel)
+               slb_cache_update(esid_data);
+
+       return 0;
+}
+
+static long slb_allocate_kernel(unsigned long ea, unsigned long id)
+{
+       unsigned long context;
+       unsigned long flags;
+       int ssize;
+
+       if (id == LINEAR_MAP_REGION_ID) {
+
+               /* We only support up to MAX_PHYSMEM_BITS */
+               if ((ea & EA_MASK) > (1UL << MAX_PHYSMEM_BITS))
+                       return -EFAULT;
+
+               flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_linear_psize].sllp;
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+       } else if (id == VMEMMAP_REGION_ID) {
+
+               if (ea >= H_VMEMMAP_END)
+                       return -EFAULT;
+
+               flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_vmemmap_psize].sllp;
+#endif
+       } else if (id == VMALLOC_REGION_ID) {
+
+               if (ea >= H_VMALLOC_END)
+                       return -EFAULT;
+
+               flags = local_paca->vmalloc_sllp;
+
+       } else if (id == IO_REGION_ID) {
+
+               if (ea >= H_KERN_IO_END)
+                       return -EFAULT;
+
+               flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_io_psize].sllp;
+
+       } else {
+               return -EFAULT;
+       }
+
+       ssize = MMU_SEGSIZE_1T;
+       if (!mmu_has_feature(MMU_FTR_1T_SEGMENT))
+               ssize = MMU_SEGSIZE_256M;
+
+       context = get_kernel_context(ea);
+
+       return slb_insert_entry(ea, context, flags, ssize, true);
+}
+
+static long slb_allocate_user(struct mm_struct *mm, unsigned long ea)
+{
+       unsigned long context;
+       unsigned long flags;
+       int bpsize;
+       int ssize;
+
+       /*
+        * Consider this a bad access if we take an SLB miss
+        * on an address above the addr limit.
+        */
+       if (ea >= mm_ctx_slb_addr_limit(&mm->context))
+               return -EFAULT;
+
+       context = get_user_context(&mm->context, ea);
+       if (!context)
+               return -EFAULT;
+
+       if (unlikely(ea >= H_PGTABLE_RANGE)) {
+               WARN_ON(1);
+               return -EFAULT;
+       }
+
+       ssize = user_segment_size(ea);
+
+       bpsize = get_slice_psize(mm, ea);
+       flags = SLB_VSID_USER | mmu_psize_defs[bpsize].sllp;
+
+       return slb_insert_entry(ea, context, flags, ssize, false);
+}
+
+long do_slb_fault(struct pt_regs *regs, unsigned long ea)
+{
+       unsigned long id = get_region_id(ea);
+
+       /* IRQs are not reconciled here, so can't check irqs_disabled */
+       VM_WARN_ON(mfmsr() & MSR_EE);
+
+       if (unlikely(!(regs->msr & MSR_RI)))
+               return -EINVAL;
+
+       /*
+        * SLB kernel faults must be very careful not to touch anything
+        * that is not bolted. E.g., PACA and global variables are okay,
+        * mm->context stuff is not.
+        *
+        * SLB user faults can access all of kernel memory, but must be
+        * careful not to touch things like IRQ state because it is not
+        * "reconciled" here. The difficulty is that we must use
+        * fast_exception_return to return from kernel SLB faults without
+        * looking at possible non-bolted memory. We could test user vs
+        * kernel faults in the interrupt handler asm and do a full fault,
+        * reconcile, ret_from_except for user faults which would make them
+        * first class kernel code. But for performance it's probably nicer
+        * if they go via fast_exception_return too.
+        */
+       if (id >= LINEAR_MAP_REGION_ID) {
+               long err;
+#ifdef CONFIG_DEBUG_VM
+               /* Catch recursive kernel SLB faults. */
+               BUG_ON(local_paca->in_kernel_slb_handler);
+               local_paca->in_kernel_slb_handler = 1;
+#endif
+               err = slb_allocate_kernel(ea, id);
+#ifdef CONFIG_DEBUG_VM
+               local_paca->in_kernel_slb_handler = 0;
+#endif
+               return err;
+       } else {
+               struct mm_struct *mm = current->mm;
+               long err;
+
+               if (unlikely(!mm))
+                       return -EFAULT;
+
+               err = slb_allocate_user(mm, ea);
+               if (!err)
+                       preload_add(current_thread_info(), ea);
+
+               return err;
+       }
+}
+
+void do_bad_slb_fault(struct pt_regs *regs, unsigned long ea, long err)
+{
+       if (err == -EFAULT) {
+               if (user_mode(regs))
+                       _exception(SIGSEGV, regs, SEGV_BNDERR, ea);
+               else
+                       bad_page_fault(regs, ea, SIGSEGV);
+       } else if (err == -EINVAL) {
+               unrecoverable_exception(regs);
+       } else {
+               BUG();
+       }
+}
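Editor's note: the preload bookkeeping in slb.c above (preload_hit/preload_add/preload_age) is a small ring buffer of ESIDs indexed by (tail + i) % SLB_PRELOAD_NR. A stand-alone model of just that ring, with illustrative values for SLB_PRELOAD_NR and SID_SHIFT and none of the 1T-segment handling, might look like this:

#include <stdio.h>
#include <stdbool.h>

#define SLB_PRELOAD_NR	16U	/* illustrative ring size */
#define SID_SHIFT	28	/* 256MB segment shift */

struct preload {
	unsigned char tail;
	unsigned char nr;
	unsigned long esid[SLB_PRELOAD_NR];
};

static bool preload_hit(struct preload *p, unsigned long esid)
{
	for (unsigned char i = 0; i < p->nr; i++)
		if (p->esid[(p->tail + i) % SLB_PRELOAD_NR] == esid)
			return true;
	return false;
}

static bool preload_add(struct preload *p, unsigned long ea)
{
	unsigned long esid = ea >> SID_SHIFT;

	if (preload_hit(p, esid))
		return false;

	p->esid[(p->tail + p->nr) % SLB_PRELOAD_NR] = esid;
	if (p->nr == SLB_PRELOAD_NR)
		p->tail = (p->tail + 1) % SLB_PRELOAD_NR;	/* overwrite oldest */
	else
		p->nr++;
	return true;
}

static void preload_age(struct preload *p)
{
	if (!p->nr)
		return;
	p->nr--;
	p->tail = (p->tail + 1) % SLB_PRELOAD_NR;	/* drop oldest */
}

int main(void)
{
	struct preload p = { 0 };

	preload_add(&p, 0x10000000UL);		/* typical executable base */
	preload_add(&p, 0x7fff00000000UL);	/* hypothetical stack address */
	preload_age(&p);			/* age out the oldest entry */
	printf("entries cached: %u\n", (unsigned int)p.nr);
	return 0;
}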
diff --git a/arch/powerpc/mm/book3s64/subpage_prot.c b/arch/powerpc/mm/book3s64/subpage_prot.c
new file mode 100644 (file)
index 0000000..473dd43
--- /dev/null
@@ -0,0 +1,289 @@
+/*
+ * Copyright 2007-2008 Paul Mackerras, IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/gfp.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/syscalls.h>
+
+#include <asm/pgtable.h>
+#include <linux/uaccess.h>
+
+/*
+ * Free all pages allocated for subpage protection maps and pointers.
+ * Also makes sure that the subpage_prot_table structure is
+ * reinitialized for the next user.
+ */
+void subpage_prot_free(struct mm_struct *mm)
+{
+       struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context);
+       unsigned long i, j, addr;
+       u32 **p;
+
+       if (!spt)
+               return;
+
+       for (i = 0; i < 4; ++i) {
+               if (spt->low_prot[i]) {
+                       free_page((unsigned long)spt->low_prot[i]);
+                       spt->low_prot[i] = NULL;
+               }
+       }
+       addr = 0;
+       for (i = 0; i < (TASK_SIZE_USER64 >> 43); ++i) {
+               p = spt->protptrs[i];
+               if (!p)
+                       continue;
+               spt->protptrs[i] = NULL;
+               for (j = 0; j < SBP_L2_COUNT && addr < spt->maxaddr;
+                    ++j, addr += PAGE_SIZE)
+                       if (p[j])
+                               free_page((unsigned long)p[j]);
+               free_page((unsigned long)p);
+       }
+       spt->maxaddr = 0;
+       kfree(spt);
+}
+
+static void hpte_flush_range(struct mm_struct *mm, unsigned long addr,
+                            int npages)
+{
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+       spinlock_t *ptl;
+
+       pgd = pgd_offset(mm, addr);
+       if (pgd_none(*pgd))
+               return;
+       pud = pud_offset(pgd, addr);
+       if (pud_none(*pud))
+               return;
+       pmd = pmd_offset(pud, addr);
+       if (pmd_none(*pmd))
+               return;
+       pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+       arch_enter_lazy_mmu_mode();
+       for (; npages > 0; --npages) {
+               pte_update(mm, addr, pte, 0, 0, 0);
+               addr += PAGE_SIZE;
+               ++pte;
+       }
+       arch_leave_lazy_mmu_mode();
+       pte_unmap_unlock(pte - 1, ptl);
+}
+
+/*
+ * Clear the subpage protection map for an address range, allowing
+ * all accesses that are allowed by the pte permissions.
+ */
+static void subpage_prot_clear(unsigned long addr, unsigned long len)
+{
+       struct mm_struct *mm = current->mm;
+       struct subpage_prot_table *spt;
+       u32 **spm, *spp;
+       unsigned long i;
+       size_t nw;
+       unsigned long next, limit;
+
+       down_write(&mm->mmap_sem);
+
+       spt = mm_ctx_subpage_prot(&mm->context);
+       if (!spt)
+               goto err_out;
+
+       limit = addr + len;
+       if (limit > spt->maxaddr)
+               limit = spt->maxaddr;
+       for (; addr < limit; addr = next) {
+               next = pmd_addr_end(addr, limit);
+               if (addr < 0x100000000UL) {
+                       spm = spt->low_prot;
+               } else {
+                       spm = spt->protptrs[addr >> SBP_L3_SHIFT];
+                       if (!spm)
+                               continue;
+               }
+               spp = spm[(addr >> SBP_L2_SHIFT) & (SBP_L2_COUNT - 1)];
+               if (!spp)
+                       continue;
+               spp += (addr >> PAGE_SHIFT) & (SBP_L1_COUNT - 1);
+
+               i = (addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
+               nw = PTRS_PER_PTE - i;
+               if (addr + (nw << PAGE_SHIFT) > next)
+                       nw = (next - addr) >> PAGE_SHIFT;
+
+               memset(spp, 0, nw * sizeof(u32));
+
+               /* now flush any existing HPTEs for the range */
+               hpte_flush_range(mm, addr, nw);
+       }
+
+err_out:
+       up_write(&mm->mmap_sem);
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr,
+                                 unsigned long end, struct mm_walk *walk)
+{
+       struct vm_area_struct *vma = walk->vma;
+       split_huge_pmd(vma, pmd, addr);
+       return 0;
+}
+
+static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
+                                   unsigned long len)
+{
+       struct vm_area_struct *vma;
+       struct mm_walk subpage_proto_walk = {
+               .mm = mm,
+               .pmd_entry = subpage_walk_pmd_entry,
+       };
+
+       /*
+        * We don't try too hard; we just mark all the VMAs in that range
+        * VM_NOHUGEPAGE and split them.
+        */
+       vma = find_vma(mm, addr);
+       /*
+        * If the whole range is unmapped, just return.
+        */
+       if (vma && ((addr + len) <= vma->vm_start))
+               return;
+
+       while (vma) {
+               if (vma->vm_start >= (addr + len))
+                       break;
+               vma->vm_flags |= VM_NOHUGEPAGE;
+               walk_page_vma(vma, &subpage_proto_walk);
+               vma = vma->vm_next;
+       }
+}
+#else
+static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
+                                   unsigned long len)
+{
+       return;
+}
+#endif
+
+/*
+ * Copy in a subpage protection map for an address range.
+ * The map has 2 bits per 4k subpage, so 32 bits per 64k page.
+ * Each 2-bit field is 0 to allow any access, 1 to prevent writes,
+ * 2 or 3 to prevent all accesses.
+ * Note that the normal page protections also apply; the subpage
+ * protection mechanism is an additional constraint, so putting 0
+ * in a 2-bit field won't allow writes to a page that is otherwise
+ * write-protected.
+ */
+SYSCALL_DEFINE3(subpage_prot, unsigned long, addr,
+               unsigned long, len, u32 __user *, map)
+{
+       struct mm_struct *mm = current->mm;
+       struct subpage_prot_table *spt;
+       u32 **spm, *spp;
+       unsigned long i;
+       size_t nw;
+       unsigned long next, limit;
+       int err;
+
+       if (radix_enabled())
+               return -ENOENT;
+
+       /* Check parameters */
+       if ((addr & ~PAGE_MASK) || (len & ~PAGE_MASK) ||
+           addr >= mm->task_size || len >= mm->task_size ||
+           addr + len > mm->task_size)
+               return -EINVAL;
+
+       if (is_hugepage_only_range(mm, addr, len))
+               return -EINVAL;
+
+       if (!map) {
+               /* Clear out the protection map for the address range */
+               subpage_prot_clear(addr, len);
+               return 0;
+       }
+
+       if (!access_ok(map, (len >> PAGE_SHIFT) * sizeof(u32)))
+               return -EFAULT;
+
+       down_write(&mm->mmap_sem);
+
+       spt = mm_ctx_subpage_prot(&mm->context);
+       if (!spt) {
+               /*
+                * Allocate subpage prot table if not already done.
+                * Do this with mmap_sem held
+                * Do this with mmap_sem held.
+               spt = kzalloc(sizeof(struct subpage_prot_table), GFP_KERNEL);
+               if (!spt) {
+                       err = -ENOMEM;
+                       goto out;
+               }
+               mm->context.hash_context->spt = spt;
+       }
+
+       subpage_mark_vma_nohuge(mm, addr, len);
+       for (limit = addr + len; addr < limit; addr = next) {
+               next = pmd_addr_end(addr, limit);
+               err = -ENOMEM;
+               if (addr < 0x100000000UL) {
+                       spm = spt->low_prot;
+               } else {
+                       spm = spt->protptrs[addr >> SBP_L3_SHIFT];
+                       if (!spm) {
+                               spm = (u32 **)get_zeroed_page(GFP_KERNEL);
+                               if (!spm)
+                                       goto out;
+                               spt->protptrs[addr >> SBP_L3_SHIFT] = spm;
+                       }
+               }
+               spm += (addr >> SBP_L2_SHIFT) & (SBP_L2_COUNT - 1);
+               spp = *spm;
+               if (!spp) {
+                       spp = (u32 *)get_zeroed_page(GFP_KERNEL);
+                       if (!spp)
+                               goto out;
+                       *spm = spp;
+               }
+               spp += (addr >> PAGE_SHIFT) & (SBP_L1_COUNT - 1);
+
+               local_irq_disable();
+               demote_segment_4k(mm, addr);
+               local_irq_enable();
+
+               i = (addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
+               nw = PTRS_PER_PTE - i;
+               if (addr + (nw << PAGE_SHIFT) > next)
+                       nw = (next - addr) >> PAGE_SHIFT;
+
+               up_write(&mm->mmap_sem);
+               if (__copy_from_user(spp, map, nw * sizeof(u32)))
+                       return -EFAULT;
+               map += nw;
+               down_write(&mm->mmap_sem);
+
+               /* now flush any existing HPTEs for the range */
+               hpte_flush_range(mm, addr, nw);
+       }
+       if (limit > spt->maxaddr)
+               spt->maxaddr = limit;
+       err = 0;
+ out:
+       up_write(&mm->mmap_sem);
+       return err;
+}
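Editor's note: as the comment above the subpage_prot syscall describes, each 64K page gets a 32-bit map word with a 2-bit field per 4K subpage (0 allows everything, 1 blocks writes, 2 or 3 block all access). A tiny stand-alone illustration of that packing follows; which subpage maps to which pair of bits is an assumption made for illustration, not taken from the kernel sources.

#include <stdio.h>
#include <stdint.h>

/* Extract the 2-bit protection field for one of the 16 subpages. */
static unsigned int subpage_perm(uint32_t map_word, unsigned int subpage)
{
	return (map_word >> (2 * (subpage & 15))) & 0x3;
}

int main(void)
{
	uint32_t word = 0;

	word |= 1u << (2 * 3);	/* subpage 3: read-only  */
	word |= 3u << (2 * 7);	/* subpage 7: no access  */

	for (unsigned int i = 0; i < 16; i++)
		printf("subpage %2u -> %u\n", i, subpage_perm(word, i));
	return 0;
}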
diff --git a/arch/powerpc/mm/book3s64/vphn.c b/arch/powerpc/mm/book3s64/vphn.c
new file mode 100644 (file)
index 0000000..0ee7734
--- /dev/null
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <asm/byteorder.h>
+#include "vphn.h"
+
+/*
+ * The associativity domain numbers are returned from the hypervisor as a
+ * stream of mixed 16-bit and 32-bit fields. The stream is terminated by the
+ * special value of "all ones" (aka. 0xffff) and its size may not exceed 48
+ * bytes.
+ *
+ *    --- 16-bit fields -->
+ *  _________________________
+ *  |  0  |  1  |  2  |  3  |   be_packed[0]
+ *  ------+-----+-----+------
+ *  _________________________
+ *  |  4  |  5  |  6  |  7  |   be_packed[1]
+ *  -------------------------
+ *            ...
+ *  _________________________
+ *  | 20  | 21  | 22  | 23  |   be_packed[5]
+ *  -------------------------
+ *
+ * Convert to the sequence in which they would appear in the ibm,associativity property.
+ */
+int vphn_unpack_associativity(const long *packed, __be32 *unpacked)
+{
+       __be64 be_packed[VPHN_REGISTER_COUNT];
+       int i, nr_assoc_doms = 0;
+       const __be16 *field = (const __be16 *) be_packed;
+       u16 last = 0;
+       bool is_32bit = false;
+
+#define VPHN_FIELD_UNUSED      (0xffff)
+#define VPHN_FIELD_MSB         (0x8000)
+#define VPHN_FIELD_MASK                (~VPHN_FIELD_MSB)
+
+       /* Let's fix the values returned by plpar_hcall9() */
+       for (i = 0; i < VPHN_REGISTER_COUNT; i++)
+               be_packed[i] = cpu_to_be64(packed[i]);
+
+       for (i = 1; i < VPHN_ASSOC_BUFSIZE; i++) {
+               u16 new = be16_to_cpup(field++);
+
+               if (is_32bit) {
+                       /*
+                        * Let's concatenate the 16 bits of this field to the
+                        * 15 lower bits of the previous field
+                        */
+                       unpacked[++nr_assoc_doms] =
+                               cpu_to_be32(last << 16 | new);
+                       is_32bit = false;
+               } else if (new == VPHN_FIELD_UNUSED)
+                       /* This is the list terminator */
+                       break;
+               else if (new & VPHN_FIELD_MSB) {
+                       /* Data is in the lower 15 bits of this field */
+                       unpacked[++nr_assoc_doms] =
+                               cpu_to_be32(new & VPHN_FIELD_MASK);
+               } else {
+                       /*
+                        * Data is in the lower 15 bits of this field
+                        * concatenated with the next 16 bit field
+                        */
+                       last = new;
+                       is_32bit = true;
+               }
+       }
+
+       /* The first cell contains the length of the property */
+       unpacked[0] = cpu_to_be32(nr_assoc_doms);
+
+       return nr_assoc_doms;
+}
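Editor's note: the decode rules documented above (MSB set: a 15-bit domain number; MSB clear: the upper half of a 32-bit number completed by the following field; 0xffff terminates the stream) can be exercised on a hand-built stream of 16-bit fields. This is an illustrative user-space reimplementation, not the kernel function:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

int main(void)
{
	const uint16_t stream[] = { 0x8002, 0x0001, 0x2345, 0x8007, 0xffff };
	uint32_t doms[8];
	int n = 0;
	bool pending = false;
	uint16_t last = 0;

	for (unsigned int i = 0; i < sizeof(stream) / sizeof(stream[0]); i++) {
		uint16_t f = stream[i];

		if (pending) {
			doms[n++] = (uint32_t)last << 16 | f;	/* complete 32-bit field */
			pending = false;
		} else if (f == 0xffff) {
			break;				/* list terminator */
		} else if (f & 0x8000) {
			doms[n++] = f & 0x7fff;		/* 15-bit field */
		} else {
			last = f;			/* first half of a 32-bit field */
			pending = true;
		}
	}

	for (int i = 0; i < n; i++)
		printf("domain %d = 0x%x\n", i, (unsigned int)doms[i]);
	return 0;	/* prints 0x2, 0x12345, 0x7 */
}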
diff --git a/arch/powerpc/mm/book3s64/vphn.h b/arch/powerpc/mm/book3s64/vphn.h
new file mode 100644 (file)
index 0000000..f0b93c2
--- /dev/null
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ARCH_POWERPC_MM_VPHN_H_
+#define _ARCH_POWERPC_MM_VPHN_H_
+
+/* The H_HOME_NODE_ASSOCIATIVITY h_call returns 6 64-bit registers. */
+#define VPHN_REGISTER_COUNT 6
+
+/*
+ * 6 64-bit registers unpacked into up to 24 be32 associativity values. To
+ * form the complete property we have to add the length in the first cell.
+ */
+#define VPHN_ASSOC_BUFSIZE (VPHN_REGISTER_COUNT*sizeof(u64)/sizeof(u16) + 1)
+
+extern int vphn_unpack_associativity(const long *packed, __be32 *unpacked);
+
+#endif
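Editor's note: the VPHN_ASSOC_BUFSIZE arithmetic is worth spelling out: six 64-bit registers carry 24 16-bit fields, plus one extra cell for the property length, giving 25. A minimal stand-alone check, using standard C types in place of the kernel's u64/u16:

#include <stdint.h>

#define VPHN_REGISTER_COUNT 6
#define VPHN_ASSOC_BUFSIZE (VPHN_REGISTER_COUNT * sizeof(uint64_t) / sizeof(uint16_t) + 1)

_Static_assert(VPHN_ASSOC_BUFSIZE == 25, "6 * 8 / 2 + 1 == 25 buffer cells");

int main(void) { return 0; }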
diff --git a/arch/powerpc/mm/hash64_4k.c b/arch/powerpc/mm/hash64_4k.c
deleted file mode 100644 (file)
index 6fa6765..0000000
+++ /dev/null
@@ -1,124 +0,0 @@
-/*
- * Copyright IBM Corporation, 2015
- * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2 of the GNU Lesser General Public License
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- */
-
-#include <linux/mm.h>
-#include <asm/machdep.h>
-#include <asm/mmu.h>
-
-int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
-                  pte_t *ptep, unsigned long trap, unsigned long flags,
-                  int ssize, int subpg_prot)
-{
-       real_pte_t rpte;
-       unsigned long hpte_group;
-       unsigned long rflags, pa;
-       unsigned long old_pte, new_pte;
-       unsigned long vpn, hash, slot;
-       unsigned long shift = mmu_psize_defs[MMU_PAGE_4K].shift;
-
-       /*
-        * atomically mark the linux large page PTE busy and dirty
-        */
-       do {
-               pte_t pte = READ_ONCE(*ptep);
-
-               old_pte = pte_val(pte);
-               /* If PTE busy, retry the access */
-               if (unlikely(old_pte & H_PAGE_BUSY))
-                       return 0;
-               /* If PTE permissions don't match, take page fault */
-               if (unlikely(!check_pte_access(access, old_pte)))
-                       return 1;
-               /*
-                * Try to lock the PTE, add ACCESSED and DIRTY if it was
-                * a write access. Since this is 4K insert of 64K page size
-                * also add H_PAGE_COMBO
-                */
-               new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED;
-               if (access & _PAGE_WRITE)
-                       new_pte |= _PAGE_DIRTY;
-       } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
-
-       /*
-        * PP bits. _PAGE_USER is already PP bit 0x2, so we only
-        * need to add in 0x1 if it's a read-only user page
-        */
-       rflags = htab_convert_pte_flags(new_pte);
-       rpte = __real_pte(__pte(old_pte), ptep, PTRS_PER_PTE);
-
-       if (cpu_has_feature(CPU_FTR_NOEXECUTE) &&
-           !cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
-               rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
-
-       vpn  = hpt_vpn(ea, vsid, ssize);
-       if (unlikely(old_pte & H_PAGE_HASHPTE)) {
-               /*
-                * There MIGHT be an HPTE for this pte
-                */
-               unsigned long gslot = pte_get_hash_gslot(vpn, shift, ssize,
-                                                        rpte, 0);
-
-               if (mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn, MMU_PAGE_4K,
-                                              MMU_PAGE_4K, ssize, flags) == -1)
-                       old_pte &= ~_PAGE_HPTEFLAGS;
-       }
-
-       if (likely(!(old_pte & H_PAGE_HASHPTE))) {
-
-               pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
-               hash = hpt_hash(vpn, shift, ssize);
-
-repeat:
-               hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-
-               /* Insert into the hash table, primary slot */
-               slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0,
-                                               MMU_PAGE_4K, MMU_PAGE_4K, ssize);
-               /*
-                * Primary is full, try the secondary
-                */
-               if (unlikely(slot == -1)) {
-                       hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP;
-                       slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa,
-                                                       rflags,
-                                                       HPTE_V_SECONDARY,
-                                                       MMU_PAGE_4K,
-                                                       MMU_PAGE_4K, ssize);
-                       if (slot == -1) {
-                               if (mftb() & 0x1)
-                                       hpte_group = (hash & htab_hash_mask) *
-                                                       HPTES_PER_GROUP;
-                               mmu_hash_ops.hpte_remove(hpte_group);
-                               /*
-                                * FIXME!! Should we try the group from which we removed?
-                                */
-                               goto repeat;
-                       }
-               }
-               /*
-                * Hypervisor failure. Restore old pte and return -1
-                * similar to __hash_page_*
-                */
-               if (unlikely(slot == -2)) {
-                       *ptep = __pte(old_pte);
-                       hash_failure_debug(ea, access, vsid, trap, ssize,
-                                          MMU_PAGE_4K, MMU_PAGE_4K, old_pte);
-                       return -1;
-               }
-               new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE;
-               new_pte |= pte_set_hidx(ptep, rpte, 0, slot, PTRS_PER_PTE);
-       }
-       *ptep = __pte(new_pte & ~H_PAGE_BUSY);
-       return 0;
-}
diff --git a/arch/powerpc/mm/hash64_64k.c b/arch/powerpc/mm/hash64_64k.c
deleted file mode 100644 (file)
index 3afa253..0000000
+++ /dev/null
@@ -1,333 +0,0 @@
-/*
- * Copyright IBM Corporation, 2015
- * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2 of the GNU Lesser General Public License
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- */
-
-#include <linux/mm.h>
-#include <asm/machdep.h>
-#include <asm/mmu.h>
-
-/*
- * Return true if the entry has a slot value that
- * the software considers invalid.
- */
-static inline bool hpte_soft_invalid(unsigned long hidx)
-{
-       return ((hidx & 0xfUL) == 0xfUL);
-}
-
-/*
- * index is the 4K subpage index, from 0 to 15
- */
-bool __rpte_sub_valid(real_pte_t rpte, unsigned long index)
-{
-       return !(hpte_soft_invalid(__rpte_to_hidx(rpte, index)));
-}
-
-int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
-                  pte_t *ptep, unsigned long trap, unsigned long flags,
-                  int ssize, int subpg_prot)
-{
-       real_pte_t rpte;
-       unsigned long hpte_group;
-       unsigned int subpg_index;
-       unsigned long rflags, pa;
-       unsigned long old_pte, new_pte, subpg_pte;
-       unsigned long vpn, hash, slot, gslot;
-       unsigned long shift = mmu_psize_defs[MMU_PAGE_4K].shift;
-
-       /*
-        * atomically mark the linux large page PTE busy and dirty
-        */
-       do {
-               pte_t pte = READ_ONCE(*ptep);
-
-               old_pte = pte_val(pte);
-               /* If PTE busy, retry the access */
-               if (unlikely(old_pte & H_PAGE_BUSY))
-                       return 0;
-               /* If PTE permissions don't match, take page fault */
-               if (unlikely(!check_pte_access(access, old_pte)))
-                       return 1;
-               /*
-                * Try to lock the PTE, add ACCESSED and DIRTY if it was
-                * a write access. Since this is a 4K insert of a 64K base
-                * page, also add H_PAGE_COMBO.
-                */
-               new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED | H_PAGE_COMBO;
-               if (access & _PAGE_WRITE)
-                       new_pte |= _PAGE_DIRTY;
-       } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
-
-       /*
-        * Handle the subpage protection bits
-        */
-       subpg_pte = new_pte & ~subpg_prot;
-       rflags = htab_convert_pte_flags(subpg_pte);
-
-       if (cpu_has_feature(CPU_FTR_NOEXECUTE) &&
-           !cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) {
-
-               /*
-                * No CPU has hugepages but lacks no-execute, so we
-                * don't need to worry about that case.
-                */
-               rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
-       }
-
-       subpg_index = (ea & (PAGE_SIZE - 1)) >> shift;
-       vpn  = hpt_vpn(ea, vsid, ssize);
-       rpte = __real_pte(__pte(old_pte), ptep, PTRS_PER_PTE);
-       /*
-        * None of the sub-4K pages are hashed
-        */
-       if (!(old_pte & H_PAGE_HASHPTE))
-               goto htab_insert_hpte;
-       /*
-        * Check if the pte was already inserted into the hash table
-        * as a 64k HW page, and invalidate the 64k HPTE if so.
-        */
-       if (!(old_pte & H_PAGE_COMBO)) {
-               flush_hash_page(vpn, rpte, MMU_PAGE_64K, ssize, flags);
-               /*
-                * Clear the old slot details from the old and new PTEs.
-                * On hash insert failure we use the old PTE value, and we
-                * don't want stale slot information there if the insert fails.
-                */
-               old_pte &= ~H_PAGE_HASHPTE;
-               new_pte &= ~H_PAGE_HASHPTE;
-               goto htab_insert_hpte;
-       }
-       /*
-        * Check whether this subpage already has a valid HPTE, and update it
-        */
-       if (__rpte_sub_valid(rpte, subpg_index)) {
-               int ret;
-
-               gslot = pte_get_hash_gslot(vpn, shift, ssize, rpte,
-                                          subpg_index);
-               ret = mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn,
-                                                MMU_PAGE_4K, MMU_PAGE_4K,
-                                                ssize, flags);
-
-               /*
-                * If we failed (typically because the HPTE wasn't really
-                * here), try an insertion.
-                */
-               if (ret == -1)
-                       goto htab_insert_hpte;
-
-               *ptep = __pte(new_pte & ~H_PAGE_BUSY);
-               return 0;
-       }
-
-htab_insert_hpte:
-
-       /*
-        * Initialize all hidx entries to an invalid value the first time
-        * the PTE is about to allocate a 4K HPTE.
-        */
-       if (!(old_pte & H_PAGE_COMBO))
-               rpte.hidx = INVALID_RPTE_HIDX;
-
-       /*
-        * handle H_PAGE_4K_PFN case
-        */
-       if (old_pte & H_PAGE_4K_PFN) {
-               /*
-                * All the sub-4K pages have the same
-                * physical address.
-                */
-               pa = pte_pfn(__pte(old_pte)) << HW_PAGE_SHIFT;
-       } else {
-               pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
-               pa += (subpg_index << shift);
-       }
-       hash = hpt_hash(vpn, shift, ssize);
-repeat:
-       hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-
-       /* Insert into the hash table, primary slot */
-       slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0,
-                                       MMU_PAGE_4K, MMU_PAGE_4K, ssize);
-       /*
-        * Primary is full, try the secondary
-        */
-       if (unlikely(slot == -1)) {
-               bool soft_invalid;
-
-               hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP;
-               slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa,
-                                               rflags, HPTE_V_SECONDARY,
-                                               MMU_PAGE_4K, MMU_PAGE_4K,
-                                               ssize);
-
-               soft_invalid = hpte_soft_invalid(slot);
-               if (unlikely(soft_invalid)) {
-                       /*
-                        * We got a valid slot from a hardware point of view,
-                        * but we cannot use it, because that special value,
-                        * as defined by hpte_soft_invalid(), is used to track
-                        * invalid slots. So invalidate the entry instead.
-                        */
-                       gslot = slot & _PTEIDX_GROUP_IX;
-                       mmu_hash_ops.hpte_invalidate(hpte_group + gslot, vpn,
-                                                    MMU_PAGE_4K, MMU_PAGE_4K,
-                                                    ssize, 0);
-               }
-
-               if (unlikely(slot == -1 || soft_invalid)) {
-                       /*
-                        * For a soft-invalid slot, make sure we release a
-                        * slot from the primary group, in the hope that we
-                        * will acquire that slot next time we try. This
-                        * ensures we do not get the same soft-invalid slot.
-                        */
-                       if (soft_invalid || (mftb() & 0x1))
-                               hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-
-                       mmu_hash_ops.hpte_remove(hpte_group);
-                       /*
-                        * FIXME!! Should we try the group from which we removed?
-                        */
-                       goto repeat;
-               }
-       }
-       /*
-        * Hypervisor failure. Restore old pte and return -1
-        * similar to __hash_page_*
-        */
-       if (unlikely(slot == -2)) {
-               *ptep = __pte(old_pte);
-               hash_failure_debug(ea, access, vsid, trap, ssize,
-                                  MMU_PAGE_4K, MMU_PAGE_4K, old_pte);
-               return -1;
-       }
-
-       new_pte |= pte_set_hidx(ptep, rpte, subpg_index, slot, PTRS_PER_PTE);
-       new_pte |= H_PAGE_HASHPTE;
-
-       *ptep = __pte(new_pte & ~H_PAGE_BUSY);
-       return 0;
-}
-
-int __hash_page_64K(unsigned long ea, unsigned long access,
-                   unsigned long vsid, pte_t *ptep, unsigned long trap,
-                   unsigned long flags, int ssize)
-{
-       real_pte_t rpte;
-       unsigned long hpte_group;
-       unsigned long rflags, pa;
-       unsigned long old_pte, new_pte;
-       unsigned long vpn, hash, slot;
-       unsigned long shift = mmu_psize_defs[MMU_PAGE_64K].shift;
-
-       /*
-        * atomically mark the linux large page PTE busy and dirty
-        */
-       do {
-               pte_t pte = READ_ONCE(*ptep);
-
-               old_pte = pte_val(pte);
-               /* If PTE busy, retry the access */
-               if (unlikely(old_pte & H_PAGE_BUSY))
-                       return 0;
-               /* If PTE permissions don't match, take page fault */
-               if (unlikely(!check_pte_access(access, old_pte)))
-                       return 1;
-               /*
-                * Check if the PTE has the cache-inhibit bit set.
-                * If so, bail out and refault as a 4K page.
-                */
-               if (!mmu_has_feature(MMU_FTR_CI_LARGE_PAGE) &&
-                   unlikely(pte_ci(pte)))
-                       return 0;
-               /*
-                * Try to lock the PTE, add ACCESSED and DIRTY if it was
-                * a write access.
-                */
-               new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED;
-               if (access & _PAGE_WRITE)
-                       new_pte |= _PAGE_DIRTY;
-       } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
-
-       rflags = htab_convert_pte_flags(new_pte);
-       rpte = __real_pte(__pte(old_pte), ptep, PTRS_PER_PTE);
-
-       if (cpu_has_feature(CPU_FTR_NOEXECUTE) &&
-           !cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
-               rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
-
-       vpn  = hpt_vpn(ea, vsid, ssize);
-       if (unlikely(old_pte & H_PAGE_HASHPTE)) {
-               unsigned long gslot;
-
-               /*
-                * There MIGHT be an HPTE for this pte
-                */
-               gslot = pte_get_hash_gslot(vpn, shift, ssize, rpte, 0);
-               if (mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn, MMU_PAGE_64K,
-                                              MMU_PAGE_64K, ssize,
-                                              flags) == -1)
-                       old_pte &= ~_PAGE_HPTEFLAGS;
-       }
-
-       if (likely(!(old_pte & H_PAGE_HASHPTE))) {
-
-               pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
-               hash = hpt_hash(vpn, shift, ssize);
-
-repeat:
-               hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-
-               /* Insert into the hash table, primary slot */
-               slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0,
-                                               MMU_PAGE_64K, MMU_PAGE_64K,
-                                               ssize);
-               /*
-                * Primary is full, try the secondary
-                */
-               if (unlikely(slot == -1)) {
-                       hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP;
-                       slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa,
-                                                       rflags,
-                                                       HPTE_V_SECONDARY,
-                                                       MMU_PAGE_64K,
-                                                       MMU_PAGE_64K, ssize);
-                       if (slot == -1) {
-                               if (mftb() & 0x1)
-                                       hpte_group = (hash & htab_hash_mask) *
-                                                       HPTES_PER_GROUP;
-                               mmu_hash_ops.hpte_remove(hpte_group);
-                               /*
-                                * FIXME!! Should we try the group from which we removed?
-                                */
-                               goto repeat;
-                       }
-               }
-               /*
-                * Hypervisor failure. Restore old pte and return -1
-                * similar to __hash_page_*
-                */
-               if (unlikely(slot == -2)) {
-                       *ptep = __pte(old_pte);
-                       hash_failure_debug(ea, access, vsid, trap, ssize,
-                                          MMU_PAGE_64K, MMU_PAGE_64K, old_pte);
-                       return -1;
-               }
-
-               new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE;
-               new_pte |= pte_set_hidx(ptep, rpte, 0, slot, PTRS_PER_PTE);
-       }
-       *ptep = __pte(new_pte & ~H_PAGE_BUSY);
-       return 0;
-}
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
deleted file mode 100644 (file)
index aaa28fd..0000000
+++ /dev/null
@@ -1,884 +0,0 @@
-/*
- * native hashtable management.
- *
- * SMP scalability work:
- *    Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
- * 
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#undef DEBUG_LOW
-
-#include <linux/spinlock.h>
-#include <linux/bitops.h>
-#include <linux/of.h>
-#include <linux/processor.h>
-#include <linux/threads.h>
-#include <linux/smp.h>
-
-#include <asm/machdep.h>
-#include <asm/mmu.h>
-#include <asm/mmu_context.h>
-#include <asm/pgtable.h>
-#include <asm/trace.h>
-#include <asm/tlb.h>
-#include <asm/cputable.h>
-#include <asm/udbg.h>
-#include <asm/kexec.h>
-#include <asm/ppc-opcode.h>
-#include <asm/feature-fixups.h>
-
-#include <misc/cxl-base.h>
-
-#ifdef DEBUG_LOW
-#define DBG_LOW(fmt...) udbg_printf(fmt)
-#else
-#define DBG_LOW(fmt...)
-#endif
-
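-/*
- * Software lock bit in the first doubleword of the HPTE. The bit number
- * differs with endianness because the generic bitops act on a native-endian
- * unsigned long while the HPTE is stored big-endian.
- */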
-#ifdef __BIG_ENDIAN__
-#define HPTE_LOCK_BIT 3
-#else
-#define HPTE_LOCK_BIT (56+3)
-#endif
-
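-/* Serializes global tlbie on CPUs without MMU_FTR_LOCKLESS_TLBIE */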
-DEFINE_RAW_SPINLOCK(native_tlbie_lock);
-
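-/*
- * tlbiel instruction for hash, set invalidation: the ISA 2.06 form only
- * encodes the set number and IS field in RB.
- */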
-static inline void tlbiel_hash_set_isa206(unsigned int set, unsigned int is)
-{
-       unsigned long rb;
-
-       rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53));
-
-       asm volatile("tlbiel %0" : : "r" (rb));
-}
-
-/*
- * tlbiel instruction for hash, set invalidation
- * i.e., r=0 (HPT format) and is=01 or is=10 or is=11
- */
-static inline void tlbiel_hash_set_isa300(unsigned int set, unsigned int is,
-                                       unsigned int pid,
-                                       unsigned int ric, unsigned int prs)
-{
-       unsigned long rb;
-       unsigned long rs;
-       unsigned int r = 0; /* hash format */
-
-       rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53));
-       rs = ((unsigned long)pid << PPC_BITLSHIFT(31));
-
-       asm volatile(PPC_TLBIEL(%0, %1, %2, %3, %4)
-                    : : "r"(rb), "r"(rs), "i"(ric), "i"(prs), "r"(r)
-                    : "memory");
-}
-
-
-static void tlbiel_all_isa206(unsigned int num_sets, unsigned int is)
-{
-       unsigned int set;
-
-       asm volatile("ptesync": : :"memory");
-
-       for (set = 0; set < num_sets; set++)
-               tlbiel_hash_set_isa206(set, is);
-
-       asm volatile("ptesync": : :"memory");
-}
-
-static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is)
-{
-       unsigned int set;
-
-       asm volatile("ptesync": : :"memory");
-
-       /*
-        * Flush the first set of the TLB, and any caching of partition table
-        * entries. Then flush the remaining sets of the TLB. Hash mode uses
-        * partition scoped TLB translations.
-        */
-       tlbiel_hash_set_isa300(0, is, 0, 2, 0);
-       for (set = 1; set < num_sets; set++)
-               tlbiel_hash_set_isa300(set, is, 0, 0, 0);
-
-       /*
-        * Now invalidate the process table cache.
-        *
-        * From ISA v3.0B p. 1078:
-        *     The following forms are invalid.
-        *      * PRS=1, R=0, and RIC!=2 (The only process-scoped
-        *        HPT caching is of the Process Table.)
-        */
-       tlbiel_hash_set_isa300(0, is, 0, 2, 1);
-
-       asm volatile("ptesync": : :"memory");
-
-       asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
-}
-
-void hash__tlbiel_all(unsigned int action)
-{
-       unsigned int is;
-
-       switch (action) {
-       case TLB_INVAL_SCOPE_GLOBAL:
-               is = 3;
-               break;
-       case TLB_INVAL_SCOPE_LPID:
-               is = 2;
-               break;
-       default:
-               BUG();
-       }
-
-       if (early_cpu_has_feature(CPU_FTR_ARCH_300))
-               tlbiel_all_isa300(POWER9_TLB_SETS_HASH, is);
-       else if (early_cpu_has_feature(CPU_FTR_ARCH_207S))
-               tlbiel_all_isa206(POWER8_TLB_SETS, is);
-       else if (early_cpu_has_feature(CPU_FTR_ARCH_206))
-               tlbiel_all_isa206(POWER7_TLB_SETS, is);
-       else
-               WARN(1, "%s called on pre-POWER7 CPU\n", __func__);
-}
-
-static inline unsigned long  ___tlbie(unsigned long vpn, int psize,
-                                               int apsize, int ssize)
-{
-       unsigned long va;
-       unsigned int penc;
-       unsigned long sllp;
-
-       /*
-        * We need bits 14 to 65 of the va for a tlbie of a 4K page.
-        * With vpn we already ignore the lower VPN_SHIFT bits.
-        * The top two bits are also ignored because we can only
-        * accommodate 76 bits in a 64-bit vpn with a VPN_SHIFT
-        * of 12.
-        */
-       va = vpn << VPN_SHIFT;
-       /*
-        * Clear the top 16 bits of the 64-bit va, non SLS segment.
-        * Older versions of the architecture (2.02 and earlier) require
-        * masking of the top 16 bits.
-        */
-       if (mmu_has_feature(MMU_FTR_TLBIE_CROP_VA))
-               va &= ~(0xffffULL << 48);
-
-       switch (psize) {
-       case MMU_PAGE_4K:
-               /* clear out bits after (52) [0....52.....63] */
-               va &= ~((1ul << (64 - 52)) - 1);
-               va |= ssize << 8;
-               sllp = get_sllp_encoding(apsize);
-               va |= sllp << 5;
-               asm volatile(ASM_FTR_IFCLR("tlbie %0,0", PPC_TLBIE(%1,%0), %2)
-                            : : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206)
-                            : "memory");
-               break;
-       default:
-               /* We need 14 to 14 + i bits of va */
-               penc = mmu_psize_defs[psize].penc[apsize];
-               va &= ~((1ul << mmu_psize_defs[apsize].shift) - 1);
-               va |= penc << 12;
-               va |= ssize << 8;
-               /*
-                * AVAL bits:
-                * We don't need all the bits, but the rest of the bits
-                * must be ignored by the processor.
-                * vpn covers up to 65 bits of va (0...65) and we need
-                * bits 58..64 of va.
-                */
-               va |= (vpn & 0xfe); /* AVAL */
-               va |= 1; /* L */
-               asm volatile(ASM_FTR_IFCLR("tlbie %0,1", PPC_TLBIE(%1,%0), %2)
-                            : : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206)
-                            : "memory");
-               break;
-       }
-       return va;
-}
-
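-/*
- * Work around the POWER9 tlbie erratum (CPU_FTR_P9_TLBIE_BUG) by issuing
- * an extra ptesync followed by a repeated tlbie.
- */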
-static inline void fixup_tlbie(unsigned long vpn, int psize, int apsize, int ssize)
-{
-       if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) {
-               /* Need the extra ptesync to ensure we don't reorder tlbie */
-               asm volatile("ptesync": : :"memory");
-               ___tlbie(vpn, psize, apsize, ssize);
-       }
-}
-
-static inline void __tlbie(unsigned long vpn, int psize, int apsize, int ssize)
-{
-       unsigned long rb;
-
-       rb = ___tlbie(vpn, psize, apsize, ssize);
-       trace_tlbie(0, 0, rb, 0, 0, 0, 0);
-}
-
-static inline void __tlbiel(unsigned long vpn, int psize, int apsize, int ssize)
-{
-       unsigned long va;
-       unsigned int penc;
-       unsigned long sllp;
-
-       /* VPN_SHIFT can be at most 12 */
-       va = vpn << VPN_SHIFT;
-       /*
-        * Clear the top 16 bits of the 64-bit va, non SLS segment.
-        * Older versions of the architecture (2.02 and earlier) require
-        * masking of the top 16 bits.
-        */
-       if (mmu_has_feature(MMU_FTR_TLBIE_CROP_VA))
-               va &= ~(0xffffULL << 48);
-
-       switch (psize) {
-       case MMU_PAGE_4K:
-               /* clear out bits after (52) [0....52.....63] */
-               va &= ~((1ul << (64 - 52)) - 1);
-               va |= ssize << 8;
-               sllp = get_sllp_encoding(apsize);
-               va |= sllp << 5;
-               asm volatile(ASM_FTR_IFSET("tlbiel %0", "tlbiel %0,0", %1)
-                            : : "r" (va), "i" (CPU_FTR_ARCH_206)
-                            : "memory");
-               break;
-       default:
-               /* We need 14 to 14 + i bits of va */
-               penc = mmu_psize_defs[psize].penc[apsize];
-               va &= ~((1ul << mmu_psize_defs[apsize].shift) - 1);
-               va |= penc << 12;
-               va |= ssize << 8;
-               /*
-                * AVAL bits:
-                * We don't need all the bits, but the rest of the bits
-                * must be ignored by the processor.
-                * vpn covers up to 65 bits of va (0...65) and we need
-                * bits 58..64 of va.
-                */
-               va |= (vpn & 0xfe);
-               va |= 1; /* L */
-               asm volatile(ASM_FTR_IFSET("tlbiel %0", "tlbiel %0,1", %1)
-                            : : "r" (va), "i" (CPU_FTR_ARCH_206)
-                            : "memory");
-               break;
-       }
-       trace_tlbie(0, 1, va, 0, 0, 0, 0);
-
-}
-
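-/*
- * Invalidate one VPN from the TLB: use tlbiel when a local flush is
- * possible for this page size, otherwise issue a global tlbie (taking
- * native_tlbie_lock on CPUs without lockless tlbie).
- */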
-static inline void tlbie(unsigned long vpn, int psize, int apsize,
-                        int ssize, int local)
-{
-       unsigned int use_local;
-       int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
-
-       use_local = local && mmu_has_feature(MMU_FTR_TLBIEL) && !cxl_ctx_in_use();
-
-       if (use_local)
-               use_local = mmu_psize_defs[psize].tlbiel;
-       if (lock_tlbie && !use_local)
-               raw_spin_lock(&native_tlbie_lock);
-       asm volatile("ptesync": : :"memory");
-       if (use_local) {
-               __tlbiel(vpn, psize, apsize, ssize);
-               asm volatile("ptesync": : :"memory");
-       } else {
-               __tlbie(vpn, psize, apsize, ssize);
-               fixup_tlbie(vpn, psize, apsize, ssize);
-               asm volatile("eieio; tlbsync; ptesync": : :"memory");
-       }
-       if (lock_tlbie && !use_local)
-               raw_spin_unlock(&native_tlbie_lock);
-}
-
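-/* Take the per-HPTE software lock by spinning on HPTE_LOCK_BIT */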
-static inline void native_lock_hpte(struct hash_pte *hptep)
-{
-       unsigned long *word = (unsigned long *)&hptep->v;
-
-       while (1) {
-               if (!test_and_set_bit_lock(HPTE_LOCK_BIT, word))
-                       break;
-               spin_begin();
-               while(test_bit(HPTE_LOCK_BIT, word))
-                       spin_cpu_relax();
-               spin_end();
-       }
-}
-
-static inline void native_unlock_hpte(struct hash_pte *hptep)
-{
-       unsigned long *word = (unsigned long *)&hptep->v;
-
-       clear_bit_unlock(HPTE_LOCK_BIT, word);
-}
-
-static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn,
-                       unsigned long pa, unsigned long rflags,
-                       unsigned long vflags, int psize, int apsize, int ssize)
-{
-       struct hash_pte *hptep = htab_address + hpte_group;
-       unsigned long hpte_v, hpte_r;
-       int i;
-
-       if (!(vflags & HPTE_V_BOLTED)) {
-               DBG_LOW("    insert(group=%lx, vpn=%016lx, pa=%016lx,"
-                       " rflags=%lx, vflags=%lx, psize=%d)\n",
-                       hpte_group, vpn, pa, rflags, vflags, psize);
-       }
-
-       for (i = 0; i < HPTES_PER_GROUP; i++) {
-               if (! (be64_to_cpu(hptep->v) & HPTE_V_VALID)) {
-                       /* retry with lock held */
-                       native_lock_hpte(hptep);
-                       if (! (be64_to_cpu(hptep->v) & HPTE_V_VALID))
-                               break;
-                       native_unlock_hpte(hptep);
-               }
-
-               hptep++;
-       }
-
-       if (i == HPTES_PER_GROUP)
-               return -1;
-
-       hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID;
-       hpte_r = hpte_encode_r(pa, psize, apsize) | rflags;
-
-       if (!(vflags & HPTE_V_BOLTED)) {
-               DBG_LOW(" i=%x hpte_v=%016lx, hpte_r=%016lx\n",
-                       i, hpte_v, hpte_r);
-       }
-
-       if (cpu_has_feature(CPU_FTR_ARCH_300)) {
-               hpte_r = hpte_old_to_new_r(hpte_v, hpte_r);
-               hpte_v = hpte_old_to_new_v(hpte_v);
-       }
-
-       hptep->r = cpu_to_be64(hpte_r);
-       /* Guarantee the second dword is visible before the valid bit */
-       eieio();
-       /*
-        * Now set the first dword including the valid bit
-        * NOTE: this also unlocks the hpte
-        */
-       hptep->v = cpu_to_be64(hpte_v);
-
-       __asm__ __volatile__ ("ptesync" : : : "memory");
-
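-       /*
-        * Return the slot number within the group; bit 3 indicates the
-        * entry was inserted in the secondary group.
-        */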
-       return i | (!!(vflags & HPTE_V_SECONDARY) << 3);
-}
-
-static long native_hpte_remove(unsigned long hpte_group)
-{
-       struct hash_pte *hptep;
-       int i;
-       int slot_offset;
-       unsigned long hpte_v;
-
-       DBG_LOW("    remove(group=%lx)\n", hpte_group);
-
-       /* pick a random entry to start at */
-       slot_offset = mftb() & 0x7;
-
-       for (i = 0; i < HPTES_PER_GROUP; i++) {
-               hptep = htab_address + hpte_group + slot_offset;
-               hpte_v = be64_to_cpu(hptep->v);
-
-               if ((hpte_v & HPTE_V_VALID) && !(hpte_v & HPTE_V_BOLTED)) {
-                       /* retry with lock held */
-                       native_lock_hpte(hptep);
-                       hpte_v = be64_to_cpu(hptep->v);
-                       if ((hpte_v & HPTE_V_VALID)
-                           && !(hpte_v & HPTE_V_BOLTED))
-                               break;
-                       native_unlock_hpte(hptep);
-               }
-
-               slot_offset++;
-               slot_offset &= 0x7;
-       }
-
-       if (i == HPTES_PER_GROUP)
-               return -1;
-
-       /* Invalidate the hpte. NOTE: this also unlocks it */
-       hptep->v = 0;
-
-       return i;
-}
-
-static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
-                                unsigned long vpn, int bpsize,
-                                int apsize, int ssize, unsigned long flags)
-{
-       struct hash_pte *hptep = htab_address + slot;
-       unsigned long hpte_v, want_v;
-       int ret = 0, local = 0;
-
-       want_v = hpte_encode_avpn(vpn, bpsize, ssize);
-
-       DBG_LOW("    update(vpn=%016lx, avpnv=%016lx, group=%lx, newpp=%lx)",
-               vpn, want_v & HPTE_V_AVPN, slot, newpp);
-
-       hpte_v = hpte_get_old_v(hptep);
-       /*
-        * We need to invalidate the TLB always because hpte_remove doesn't do
-        * a tlb invalidate. If a hash bucket gets full, we "evict" a more/less
-        * random entry from it. When we do that we don't invalidate the TLB
-        * (hpte_remove) because we assume the old translation is still
-        * technically "valid".
-        */
-       if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) {
-               DBG_LOW(" -> miss\n");
-               ret = -1;
-       } else {
-               native_lock_hpte(hptep);
-               /* recheck with locks held */
-               hpte_v = hpte_get_old_v(hptep);
-               if (unlikely(!HPTE_V_COMPARE(hpte_v, want_v) ||
-                            !(hpte_v & HPTE_V_VALID))) {
-                       ret = -1;
-               } else {
-                       DBG_LOW(" -> hit\n");
-                       /* Update the HPTE */
-                       hptep->r = cpu_to_be64((be64_to_cpu(hptep->r) &
-                                               ~(HPTE_R_PPP | HPTE_R_N)) |
-                                              (newpp & (HPTE_R_PPP | HPTE_R_N |
-                                                        HPTE_R_C)));
-               }
-               native_unlock_hpte(hptep);
-       }
-
-       if (flags & HPTE_LOCAL_UPDATE)
-               local = 1;
-       /*
-        * Ensure it is out of the tlb too if it is not a nohpte fault
-        */
-       if (!(flags & HPTE_NOHPTE_UPDATE))
-               tlbie(vpn, bpsize, apsize, ssize, local);
-
-       return ret;
-}
-
-static long native_hpte_find(unsigned long vpn, int psize, int ssize)
-{
-       struct hash_pte *hptep;
-       unsigned long hash;
-       unsigned long i;
-       long slot;
-       unsigned long want_v, hpte_v;
-
-       hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize);
-       want_v = hpte_encode_avpn(vpn, psize, ssize);
-
-       /* Bolted mappings are only ever in the primary group */
-       slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-       for (i = 0; i < HPTES_PER_GROUP; i++) {
-
-               hptep = htab_address + slot;
-               hpte_v = hpte_get_old_v(hptep);
-               if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID))
-                       /* HPTE matches */
-                       return slot;
-               ++slot;
-       }
-
-       return -1;
-}
-
-/*
- * Update the page protection bits. Intended to be used to create
- * guard pages for kernel data structures on pages which are bolted
- * in the HPT. Assumes pages being operated on will not be stolen.
- *
- * No need to lock here because we should be the only user.
- */
-static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea,
-                                      int psize, int ssize)
-{
-       unsigned long vpn;
-       unsigned long vsid;
-       long slot;
-       struct hash_pte *hptep;
-
-       vsid = get_kernel_vsid(ea, ssize);
-       vpn = hpt_vpn(ea, vsid, ssize);
-
-       slot = native_hpte_find(vpn, psize, ssize);
-       if (slot == -1)
-               panic("could not find page to bolt\n");
-       hptep = htab_address + slot;
-
-       /* Update the HPTE */
-       hptep->r = cpu_to_be64((be64_to_cpu(hptep->r) &
-                               ~(HPTE_R_PPP | HPTE_R_N)) |
-                              (newpp & (HPTE_R_PPP | HPTE_R_N)));
-       /*
-        * Ensure it is out of the TLB too. For bolted entries the base
-        * and actual page sizes are the same.
-        */
-       tlbie(vpn, psize, psize, ssize, 0);
-}
-
-/*
- * Remove a bolted kernel entry. Memory hotplug uses this.
- *
- * No need to lock here because we should be the only user.
- */
-static int native_hpte_removebolted(unsigned long ea, int psize, int ssize)
-{
-       unsigned long vpn;
-       unsigned long vsid;
-       long slot;
-       struct hash_pte *hptep;
-
-       vsid = get_kernel_vsid(ea, ssize);
-       vpn = hpt_vpn(ea, vsid, ssize);
-
-       slot = native_hpte_find(vpn, psize, ssize);
-       if (slot == -1)
-               return -ENOENT;
-
-       hptep = htab_address + slot;
-
-       VM_WARN_ON(!(be64_to_cpu(hptep->v) & HPTE_V_BOLTED));
-
-       /* Invalidate the hpte */
-       hptep->v = 0;
-
-       /* Invalidate the TLB */
-       tlbie(vpn, psize, psize, ssize, 0);
-       return 0;
-}
-
-
-static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
-                                  int bpsize, int apsize, int ssize, int local)
-{
-       struct hash_pte *hptep = htab_address + slot;
-       unsigned long hpte_v;
-       unsigned long want_v;
-       unsigned long flags;
-
-       local_irq_save(flags);
-
-       DBG_LOW("    invalidate(vpn=%016lx, hash: %lx)\n", vpn, slot);
-
-       want_v = hpte_encode_avpn(vpn, bpsize, ssize);
-       hpte_v = hpte_get_old_v(hptep);
-
-       if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) {
-               native_lock_hpte(hptep);
-               /* recheck with locks held */
-               hpte_v = hpte_get_old_v(hptep);
-
-               if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID))
-                       /* Invalidate the hpte. NOTE: this also unlocks it */
-                       hptep->v = 0;
-               else
-                       native_unlock_hpte(hptep);
-       }
-       /*
-        * We need to invalidate the TLB always because hpte_remove doesn't do
-        * a tlb invalidate. If a hash bucket gets full, we "evict" a more/less
-        * random entry from it. When we do that we don't invalidate the TLB
-        * (hpte_remove) because we assume the old translation is still
-        * technically "valid".
-        */
-       tlbie(vpn, bpsize, apsize, ssize, local);
-
-       local_irq_restore(flags);
-}
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static void native_hugepage_invalidate(unsigned long vsid,
-                                      unsigned long addr,
-                                      unsigned char *hpte_slot_array,
-                                      int psize, int ssize, int local)
-{
-       int i;
-       struct hash_pte *hptep;
-       int actual_psize = MMU_PAGE_16M;
-       unsigned int max_hpte_count, valid;
-       unsigned long flags, s_addr = addr;
-       unsigned long hpte_v, want_v, shift;
-       unsigned long hidx, vpn = 0, hash, slot;
-
-       shift = mmu_psize_defs[psize].shift;
-       max_hpte_count = 1U << (PMD_SHIFT - shift);
-
-       local_irq_save(flags);
-       for (i = 0; i < max_hpte_count; i++) {
-               valid = hpte_valid(hpte_slot_array, i);
-               if (!valid)
-                       continue;
-               hidx =  hpte_hash_index(hpte_slot_array, i);
-
-               /* get the vpn */
-               addr = s_addr + (i * (1ul << shift));
-               vpn = hpt_vpn(addr, vsid, ssize);
-               hash = hpt_hash(vpn, shift, ssize);
-               if (hidx & _PTEIDX_SECONDARY)
-                       hash = ~hash;
-
-               slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-               slot += hidx & _PTEIDX_GROUP_IX;
-
-               hptep = htab_address + slot;
-               want_v = hpte_encode_avpn(vpn, psize, ssize);
-               hpte_v = hpte_get_old_v(hptep);
-
-               /* Even if we miss, we need to invalidate the TLB */
-               if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) {
-                       /* recheck with locks held */
-                       native_lock_hpte(hptep);
-                       hpte_v = hpte_get_old_v(hptep);
-
-                       if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) {
-                               /*
-                                * Invalidate the hpte. NOTE: this also unlocks it
-                                */
-
-                               hptep->v = 0;
-                       } else
-                               native_unlock_hpte(hptep);
-               }
-               /*
-                * We need to do a TLB invalidate for every address; the tlbie
-                * instruction compares the entry's VA in the TLB with the VA
-                * specified here.
-                */
-               tlbie(vpn, psize, actual_psize, ssize, local);
-       }
-       local_irq_restore(flags);
-}
-#else
-static void native_hugepage_invalidate(unsigned long vsid,
-                                      unsigned long addr,
-                                      unsigned char *hpte_slot_array,
-                                      int psize, int ssize, int local)
-{
-       WARN(1, "%s called without THP support\n", __func__);
-}
-#endif
-
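-/*
- * Decode an HPTE back into its base and actual page sizes, segment size
- * and VPN. Used by native_hpte_clear() to build the tlbie arguments for
- * each valid entry.
- */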
-static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
-                       int *psize, int *apsize, int *ssize, unsigned long *vpn)
-{
-       unsigned long avpn, pteg, vpi;
-       unsigned long hpte_v = be64_to_cpu(hpte->v);
-       unsigned long hpte_r = be64_to_cpu(hpte->r);
-       unsigned long vsid, seg_off;
-       int size, a_size, shift;
-       /* Look at the 8 bit LP value */
-       unsigned int lp = (hpte_r >> LP_SHIFT) & ((1 << LP_BITS) - 1);
-
-       if (cpu_has_feature(CPU_FTR_ARCH_300)) {
-               hpte_v = hpte_new_to_old_v(hpte_v, hpte_r);
-               hpte_r = hpte_new_to_old_r(hpte_r);
-       }
-       if (!(hpte_v & HPTE_V_LARGE)) {
-               size   = MMU_PAGE_4K;
-               a_size = MMU_PAGE_4K;
-       } else {
-               size = hpte_page_sizes[lp] & 0xf;
-               a_size = hpte_page_sizes[lp] >> 4;
-       }
-       /* This works for all page sizes, and for 256M and 1T segments */
-       *ssize = hpte_v >> HPTE_V_SSIZE_SHIFT;
-       shift = mmu_psize_defs[size].shift;
-
-       avpn = (HPTE_V_AVPN_VAL(hpte_v) & ~mmu_psize_defs[size].avpnm);
-       pteg = slot / HPTES_PER_GROUP;
-       if (hpte_v & HPTE_V_SECONDARY)
-               pteg = ~pteg;
-
-       switch (*ssize) {
-       case MMU_SEGSIZE_256M:
-               /* We only have 28 - 23 bits of seg_off in avpn */
-               seg_off = (avpn & 0x1f) << 23;
-               vsid    =  avpn >> 5;
-               /* We can find more bits from the pteg value */
-               if (shift < 23) {
-                       vpi = (vsid ^ pteg) & htab_hash_mask;
-                       seg_off |= vpi << shift;
-               }
-               *vpn = vsid << (SID_SHIFT - VPN_SHIFT) | seg_off >> VPN_SHIFT;
-               break;
-       case MMU_SEGSIZE_1T:
-               /* We only have 40 - 23 bits of seg_off in avpn */
-               seg_off = (avpn & 0x1ffff) << 23;
-               vsid    = avpn >> 17;
-               if (shift < 23) {
-                       vpi = (vsid ^ (vsid << 25) ^ pteg) & htab_hash_mask;
-                       seg_off |= vpi << shift;
-               }
-               *vpn = vsid << (SID_SHIFT_1T - VPN_SHIFT) | seg_off >> VPN_SHIFT;
-               break;
-       default:
-               *vpn = size = 0;
-       }
-       *psize  = size;
-       *apsize = a_size;
-}
-
-/*
- * clear all mappings on kexec.  All cpus are in real mode (or they will
- * be when they isi), and we are the only one left.  We rely on our kernel
- * mapping being 0xC0's and the hardware ignoring those two real bits.
- *
- * This must be called with interrupts disabled.
- *
- * Taking the native_tlbie_lock is unsafe here due to the possibility of
- * lockdep being on. On pre-POWER5 hardware, not taking the lock could
- * cause deadlock; on POWER5 and newer, not taking the lock is fine. This only
- * gets called during boot before secondary CPUs have come up, and during
- * crashdump, where all bets are off anyway.
- *
- * TODO: add batching support when enabled.  remember, no dynamic memory here,
- * although there is the control page available...
- */
-static void native_hpte_clear(void)
-{
-       unsigned long vpn = 0;
-       unsigned long slot, slots;
-       struct hash_pte *hptep = htab_address;
-       unsigned long hpte_v;
-       unsigned long pteg_count;
-       int psize, apsize, ssize;
-
-       pteg_count = htab_hash_mask + 1;
-
-       slots = pteg_count * HPTES_PER_GROUP;
-
-       for (slot = 0; slot < slots; slot++, hptep++) {
-               /*
-                * We could lock the pte here, but we are the only cpu
-                * running, right? And for a crash dump, we probably
-                * don't want to wait on a possibly bad cpu.
-                */
-               hpte_v = be64_to_cpu(hptep->v);
-
-               /*
-                * Call ___tlbie() here rather than tlbie() since we can't take
-                * the native_tlbie_lock.
-                */
-               if (hpte_v & HPTE_V_VALID) {
-                       hpte_decode(hptep, slot, &psize, &apsize, &ssize, &vpn);
-                       hptep->v = 0;
-                       ___tlbie(vpn, psize, apsize, ssize);
-               }
-       }
-
-       asm volatile("eieio; tlbsync; ptesync":::"memory");
-}
-
-/*
- * Batched hash table flush, we batch the tlbie's to avoid taking/releasing
- * the lock all the time
- */
-static void native_flush_hash_range(unsigned long number, int local)
-{
-       unsigned long vpn = 0;
-       unsigned long hash, index, hidx, shift, slot;
-       struct hash_pte *hptep;
-       unsigned long hpte_v;
-       unsigned long want_v;
-       unsigned long flags;
-       real_pte_t pte;
-       struct ppc64_tlb_batch *batch = this_cpu_ptr(&ppc64_tlb_batch);
-       unsigned long psize = batch->psize;
-       int ssize = batch->ssize;
-       int i;
-       unsigned int use_local;
-
-       use_local = local && mmu_has_feature(MMU_FTR_TLBIEL) &&
-               mmu_psize_defs[psize].tlbiel && !cxl_ctx_in_use();
-
-       local_irq_save(flags);
-
-       for (i = 0; i < number; i++) {
-               vpn = batch->vpn[i];
-               pte = batch->pte[i];
-
-               pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) {
-                       hash = hpt_hash(vpn, shift, ssize);
-                       hidx = __rpte_to_hidx(pte, index);
-                       if (hidx & _PTEIDX_SECONDARY)
-                               hash = ~hash;
-                       slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-                       slot += hidx & _PTEIDX_GROUP_IX;
-                       hptep = htab_address + slot;
-                       want_v = hpte_encode_avpn(vpn, psize, ssize);
-                       hpte_v = hpte_get_old_v(hptep);
-
-                       if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))
-                               continue;
-                       /* lock and try again */
-                       native_lock_hpte(hptep);
-                       hpte_v = hpte_get_old_v(hptep);
-
-                       if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))
-                               native_unlock_hpte(hptep);
-                       else
-                               hptep->v = 0;
-
-               } pte_iterate_hashed_end();
-       }
-
-       if (use_local) {
-               asm volatile("ptesync":::"memory");
-               for (i = 0; i < number; i++) {
-                       vpn = batch->vpn[i];
-                       pte = batch->pte[i];
-
-                       pte_iterate_hashed_subpages(pte, psize,
-                                                   vpn, index, shift) {
-                               __tlbiel(vpn, psize, psize, ssize);
-                       } pte_iterate_hashed_end();
-               }
-               asm volatile("ptesync":::"memory");
-       } else {
-               int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
-
-               if (lock_tlbie)
-                       raw_spin_lock(&native_tlbie_lock);
-
-               asm volatile("ptesync":::"memory");
-               for (i = 0; i < number; i++) {
-                       vpn = batch->vpn[i];
-                       pte = batch->pte[i];
-
-                       pte_iterate_hashed_subpages(pte, psize,
-                                                   vpn, index, shift) {
-                               __tlbie(vpn, psize, psize, ssize);
-                       } pte_iterate_hashed_end();
-               }
-               /*
-                * Just do one more with the last used values.
-                */
-               fixup_tlbie(vpn, psize, psize, ssize);
-               asm volatile("eieio; tlbsync; ptesync":::"memory");
-
-               if (lock_tlbie)
-                       raw_spin_unlock(&native_tlbie_lock);
-       }
-
-       local_irq_restore(flags);
-}
-
-void __init hpte_init_native(void)
-{
-       mmu_hash_ops.hpte_invalidate    = native_hpte_invalidate;
-       mmu_hash_ops.hpte_updatepp      = native_hpte_updatepp;
-       mmu_hash_ops.hpte_updateboltedpp = native_hpte_updateboltedpp;
-       mmu_hash_ops.hpte_removebolted = native_hpte_removebolted;
-       mmu_hash_ops.hpte_insert        = native_hpte_insert;
-       mmu_hash_ops.hpte_remove        = native_hpte_remove;
-       mmu_hash_ops.hpte_clear_all     = native_hpte_clear;
-       mmu_hash_ops.flush_hash_range = native_flush_hash_range;
-       mmu_hash_ops.hugepage_invalidate   = native_hugepage_invalidate;
-}
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
deleted file mode 100644 (file)
index 6eb8964..0000000
+++ /dev/null
@@ -1,1930 +0,0 @@
-/*
- * PowerPC64 port by Mike Corrigan and Dave Engebretsen
- *   {mikejc|engebret}@us.ibm.com
- *
- *    Copyright (c) 2000 Mike Corrigan <mikejc@us.ibm.com>
- *
- * SMP scalability work:
- *    Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
- * 
- *    Module name: htab.c
- *
- *    Description:
- *      PowerPC Hashed Page Table functions
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#undef DEBUG
-#undef DEBUG_LOW
-
-#define pr_fmt(fmt) "hash-mmu: " fmt
-#include <linux/spinlock.h>
-#include <linux/errno.h>
-#include <linux/sched/mm.h>
-#include <linux/proc_fs.h>
-#include <linux/stat.h>
-#include <linux/sysctl.h>
-#include <linux/export.h>
-#include <linux/ctype.h>
-#include <linux/cache.h>
-#include <linux/init.h>
-#include <linux/signal.h>
-#include <linux/memblock.h>
-#include <linux/context_tracking.h>
-#include <linux/libfdt.h>
-#include <linux/pkeys.h>
-
-#include <asm/debugfs.h>
-#include <asm/processor.h>
-#include <asm/pgtable.h>
-#include <asm/mmu.h>
-#include <asm/mmu_context.h>
-#include <asm/page.h>
-#include <asm/types.h>
-#include <linux/uaccess.h>
-#include <asm/machdep.h>
-#include <asm/prom.h>
-#include <asm/io.h>
-#include <asm/eeh.h>
-#include <asm/tlb.h>
-#include <asm/cacheflush.h>
-#include <asm/cputable.h>
-#include <asm/sections.h>
-#include <asm/copro.h>
-#include <asm/udbg.h>
-#include <asm/code-patching.h>
-#include <asm/fadump.h>
-#include <asm/firmware.h>
-#include <asm/tm.h>
-#include <asm/trace.h>
-#include <asm/ps3.h>
-#include <asm/pte-walk.h>
-#include <asm/asm-prototypes.h>
-
-#ifdef DEBUG
-#define DBG(fmt...) udbg_printf(fmt)
-#else
-#define DBG(fmt...)
-#endif
-
-#ifdef DEBUG_LOW
-#define DBG_LOW(fmt...) udbg_printf(fmt)
-#else
-#define DBG_LOW(fmt...)
-#endif
-
-#define KB (1024)
-#define MB (1024*KB)
-#define GB (1024L*MB)
-
-/*
- * Note:  pte   --> Linux PTE
- *        HPTE  --> PowerPC Hashed Page Table Entry
- *
- * Execution context:
- *   htab_initialize is called with the MMU off (of course), but
- *   the kernel has been copied down to zero so it can directly
- *   reference global data.  At this point it is very difficult
- *   to print debug info.
- *
- */
-
-static unsigned long _SDR1;
-struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT];
-EXPORT_SYMBOL_GPL(mmu_psize_defs);
-
-u8 hpte_page_sizes[1 << LP_BITS];
-EXPORT_SYMBOL_GPL(hpte_page_sizes);
-
-struct hash_pte *htab_address;
-unsigned long htab_size_bytes;
-unsigned long htab_hash_mask;
-EXPORT_SYMBOL_GPL(htab_hash_mask);
-int mmu_linear_psize = MMU_PAGE_4K;
-EXPORT_SYMBOL_GPL(mmu_linear_psize);
-int mmu_virtual_psize = MMU_PAGE_4K;
-int mmu_vmalloc_psize = MMU_PAGE_4K;
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
-int mmu_vmemmap_psize = MMU_PAGE_4K;
-#endif
-int mmu_io_psize = MMU_PAGE_4K;
-int mmu_kernel_ssize = MMU_SEGSIZE_256M;
-EXPORT_SYMBOL_GPL(mmu_kernel_ssize);
-int mmu_highuser_ssize = MMU_SEGSIZE_256M;
-u16 mmu_slb_size = 64;
-EXPORT_SYMBOL_GPL(mmu_slb_size);
-#ifdef CONFIG_PPC_64K_PAGES
-int mmu_ci_restrictions;
-#endif
-#ifdef CONFIG_DEBUG_PAGEALLOC
-static u8 *linear_map_hash_slots;
-static unsigned long linear_map_hash_count;
-static DEFINE_SPINLOCK(linear_map_hash_lock);
-#endif /* CONFIG_DEBUG_PAGEALLOC */
-struct mmu_hash_ops mmu_hash_ops;
-EXPORT_SYMBOL(mmu_hash_ops);
-
-/* These are definitions of page size arrays to be used when none
- * is provided by the firmware.
- */
-
-/*
- * Fallback (4k pages only)
- */
-static struct mmu_psize_def mmu_psize_defaults[] = {
-       [MMU_PAGE_4K] = {
-               .shift  = 12,
-               .sllp   = 0,
-               .penc   = {[MMU_PAGE_4K] = 0, [1 ... MMU_PAGE_COUNT - 1] = -1},
-               .avpnm  = 0,
-               .tlbiel = 0,
-       },
-};
-
-/* POWER4, GPUL, POWER5
- *
- * Support for 16MB large pages
- */
-static struct mmu_psize_def mmu_psize_defaults_gp[] = {
-       [MMU_PAGE_4K] = {
-               .shift  = 12,
-               .sllp   = 0,
-               .penc   = {[MMU_PAGE_4K] = 0, [1 ... MMU_PAGE_COUNT - 1] = -1},
-               .avpnm  = 0,
-               .tlbiel = 1,
-       },
-       [MMU_PAGE_16M] = {
-               .shift  = 24,
-               .sllp   = SLB_VSID_L,
-               .penc   = {[0 ... MMU_PAGE_16M - 1] = -1, [MMU_PAGE_16M] = 0,
-                           [MMU_PAGE_16M + 1 ... MMU_PAGE_COUNT - 1] = -1 },
-               .avpnm  = 0x1UL,
-               .tlbiel = 0,
-       },
-};
-
-/*
- * 'R' and 'C' update notes:
- *  - Under pHyp or KVM, the updatepp path will not set C, thus it *will*
- *     create writeable HPTEs without C set, because the hcall H_PROTECT
- *     that we use in that case will not update C
- *  - The above is however not a problem, because we also don't do that
- *     fancy "no flush" variant of eviction and we use H_REMOVE which will
- *     do the right thing and thus we don't have the race I described earlier
- *
- *    - Under bare metal,  we do have the race, so we need R and C set
- *    - We make sure R is always set and never lost
- *    - C is _PAGE_DIRTY, and *should* always be set for a writeable mapping
- */
-unsigned long htab_convert_pte_flags(unsigned long pteflags)
-{
-       unsigned long rflags = 0;
-
-       /* _PAGE_EXEC -> NOEXEC */
-       if ((pteflags & _PAGE_EXEC) == 0)
-               rflags |= HPTE_R_N;
-       /*
-        * PPP bits:
-        * Linux uses slb key 0 for kernel and 1 for user.
-        * kernel RW areas are mapped with PPP=0b000
-        * User area is mapped with PPP=0b010 for read/write
-        * or PPP=0b011 for read-only (including writeable but clean pages).
-        */
-       if (pteflags & _PAGE_PRIVILEGED) {
-               /*
-                * Kernel read only mapped with ppp bits 0b110
-                */
-               if (!(pteflags & _PAGE_WRITE)) {
-                       if (mmu_has_feature(MMU_FTR_KERNEL_RO))
-                               rflags |= (HPTE_R_PP0 | 0x2);
-                       else
-                               rflags |= 0x3;
-               }
-       } else {
-               if (pteflags & _PAGE_RWX)
-                       rflags |= 0x2;
-               if (!((pteflags & _PAGE_WRITE) && (pteflags & _PAGE_DIRTY)))
-                       rflags |= 0x1;
-       }
-       /*
-        * We can't allow hardware to update hpte bits. Hence always
-        * set the 'R' bit, and set 'C' if it is a write fault.
-        */
-       rflags |=  HPTE_R_R;
-
-       if (pteflags & _PAGE_DIRTY)
-               rflags |= HPTE_R_C;
-       /*
-        * Add in WIMG bits
-        */
-
-       if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_TOLERANT)
-               rflags |= HPTE_R_I;
-       else if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_NON_IDEMPOTENT)
-               rflags |= (HPTE_R_I | HPTE_R_G);
-       else if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_SAO)
-               rflags |= (HPTE_R_W | HPTE_R_I | HPTE_R_M);
-       else
-               /*
-                * Add memory coherence if cache inhibited is not set
-                */
-               rflags |= HPTE_R_M;
-
-       rflags |= pte_to_hpte_pkey_bits(pteflags);
-       return rflags;
-}
-
-int htab_bolt_mapping(unsigned long vstart, unsigned long vend,
-                     unsigned long pstart, unsigned long prot,
-                     int psize, int ssize)
-{
-       unsigned long vaddr, paddr;
-       unsigned int step, shift;
-       int ret = 0;
-
-       shift = mmu_psize_defs[psize].shift;
-       step = 1 << shift;
-
-       prot = htab_convert_pte_flags(prot);
-
-       DBG("htab_bolt_mapping(%lx..%lx -> %lx (%lx,%d,%d)\n",
-           vstart, vend, pstart, prot, psize, ssize);
-
-       for (vaddr = vstart, paddr = pstart; vaddr < vend;
-            vaddr += step, paddr += step) {
-               unsigned long hash, hpteg;
-               unsigned long vsid = get_kernel_vsid(vaddr, ssize);
-               unsigned long vpn  = hpt_vpn(vaddr, vsid, ssize);
-               unsigned long tprot = prot;
-
-               /*
-                * If we hit a bad address return error.
-                */
-               if (!vsid)
-                       return -1;
-               /* Make kernel text executable */
-               if (overlaps_kernel_text(vaddr, vaddr + step))
-                       tprot &= ~HPTE_R_N;
-
-               /* Make kvm guest trampolines executable */
-               if (overlaps_kvm_tmp(vaddr, vaddr + step))
-                       tprot &= ~HPTE_R_N;
-
-               /*
-                * If relocatable, check if it overlaps interrupt vectors that
-                * are copied down to real 0. For a relocatable kernel
-                * (e.g. the kdump case) we copy the interrupt vectors down to
-                * real address 0 and mark that region as executable. This is
-                * because on a P8 system with the relocation-on-exception
-                * feature enabled, exceptions are raised with the MMU on
-                * (IR=DR=1). Hence, in order to execute the interrupt handlers
-                * in virtual mode, the vector region needs to be marked
-                * executable.
-                */
-               if ((PHYSICAL_START > MEMORY_START) &&
-                       overlaps_interrupt_vector_text(vaddr, vaddr + step))
-                               tprot &= ~HPTE_R_N;
-
-               hash = hpt_hash(vpn, shift, ssize);
-               hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP);
-
-               BUG_ON(!mmu_hash_ops.hpte_insert);
-               ret = mmu_hash_ops.hpte_insert(hpteg, vpn, paddr, tprot,
-                                              HPTE_V_BOLTED, psize, psize,
-                                              ssize);
-
-               if (ret < 0)
-                       break;
-
-#ifdef CONFIG_DEBUG_PAGEALLOC
-               if (debug_pagealloc_enabled() &&
-                       (paddr >> PAGE_SHIFT) < linear_map_hash_count)
-                       linear_map_hash_slots[paddr >> PAGE_SHIFT] = ret | 0x80;
-#endif /* CONFIG_DEBUG_PAGEALLOC */
-       }
-       return ret < 0 ? ret : 0;
-}
-
-int htab_remove_mapping(unsigned long vstart, unsigned long vend,
-                     int psize, int ssize)
-{
-       unsigned long vaddr;
-       unsigned int step, shift;
-       int rc;
-       int ret = 0;
-
-       shift = mmu_psize_defs[psize].shift;
-       step = 1 << shift;
-
-       if (!mmu_hash_ops.hpte_removebolted)
-               return -ENODEV;
-
-       for (vaddr = vstart; vaddr < vend; vaddr += step) {
-               rc = mmu_hash_ops.hpte_removebolted(vaddr, psize, ssize);
-               if (rc == -ENOENT) {
-                       ret = -ENOENT;
-                       continue;
-               }
-               if (rc < 0)
-                       return rc;
-       }
-
-       return ret;
-}
-
-static bool disable_1tb_segments = false;
-
-static int __init parse_disable_1tb_segments(char *p)
-{
-       disable_1tb_segments = true;
-       return 0;
-}
-early_param("disable_1tb_segments", parse_disable_1tb_segments);
-
-static int __init htab_dt_scan_seg_sizes(unsigned long node,
-                                        const char *uname, int depth,
-                                        void *data)
-{
-       const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
-       const __be32 *prop;
-       int size = 0;
-
-       /* We are scanning "cpu" nodes only */
-       if (type == NULL || strcmp(type, "cpu") != 0)
-               return 0;
-
-       prop = of_get_flat_dt_prop(node, "ibm,processor-segment-sizes", &size);
-       if (prop == NULL)
-               return 0;
-       for (; size >= 4; size -= 4, ++prop) {
-               if (be32_to_cpu(prop[0]) == 40) {
-                       DBG("1T segment support detected\n");
-
-                       if (disable_1tb_segments) {
-                               DBG("1T segments disabled by command line\n");
-                               break;
-                       }
-
-                       cur_cpu_spec->mmu_features |= MMU_FTR_1T_SEGMENT;
-                       return 1;
-               }
-       }
-       cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B;
-       return 0;
-}
-
-static int __init get_idx_from_shift(unsigned int shift)
-{
-       int idx = -1;
-
-       switch (shift) {
-       case 0xc:
-               idx = MMU_PAGE_4K;
-               break;
-       case 0x10:
-               idx = MMU_PAGE_64K;
-               break;
-       case 0x14:
-               idx = MMU_PAGE_1M;
-               break;
-       case 0x18:
-               idx = MMU_PAGE_16M;
-               break;
-       case 0x22:
-               idx = MMU_PAGE_16G;
-               break;
-       }
-       return idx;
-}
-
-static int __init htab_dt_scan_page_sizes(unsigned long node,
-                                         const char *uname, int depth,
-                                         void *data)
-{
-       const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
-       const __be32 *prop;
-       int size = 0;
-
-       /* We are scanning "cpu" nodes only */
-       if (type == NULL || strcmp(type, "cpu") != 0)
-               return 0;
-
-       prop = of_get_flat_dt_prop(node, "ibm,segment-page-sizes", &size);
-       if (!prop)
-               return 0;
-
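-       /*
-        * The property is a flat list of cells: for each base page size a
-        * (base shift, slb encoding, count) triplet, followed by 'count'
-        * (actual shift, penc) pairs.
-        */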
-       pr_info("Page sizes from device-tree:\n");
-       size /= 4;
-       cur_cpu_spec->mmu_features &= ~(MMU_FTR_16M_PAGE);
-       while (size > 0) {
-               unsigned int base_shift = be32_to_cpu(prop[0]);
-               unsigned int slbenc = be32_to_cpu(prop[1]);
-               unsigned int lpnum = be32_to_cpu(prop[2]);
-               struct mmu_psize_def *def;
-               int idx, base_idx;
-
-               size -= 3; prop += 3;
-               base_idx = get_idx_from_shift(base_shift);
-               if (base_idx < 0) {
-                       /* skip the pte encoding also */
-                       prop += lpnum * 2; size -= lpnum * 2;
-                       continue;
-               }
-               def = &mmu_psize_defs[base_idx];
-               if (base_idx == MMU_PAGE_16M)
-                       cur_cpu_spec->mmu_features |= MMU_FTR_16M_PAGE;
-
-               def->shift = base_shift;
-               if (base_shift <= 23)
-                       def->avpnm = 0;
-               else
-                       def->avpnm = (1 << (base_shift - 23)) - 1;
-               def->sllp = slbenc;
-               /*
-                * We don't know for sure what's up with tlbiel, so
-                * for now we only set it for 4K and 64K pages
-                */
-               if (base_idx == MMU_PAGE_4K || base_idx == MMU_PAGE_64K)
-                       def->tlbiel = 1;
-               else
-                       def->tlbiel = 0;
-
-               while (size > 0 && lpnum) {
-                       unsigned int shift = be32_to_cpu(prop[0]);
-                       int penc  = be32_to_cpu(prop[1]);
-
-                       prop += 2; size -= 2;
-                       lpnum--;
-
-                       idx = get_idx_from_shift(shift);
-                       if (idx < 0)
-                               continue;
-
-                       if (penc == -1)
-                               pr_err("Invalid penc for base_shift=%d "
-                                      "shift=%d\n", base_shift, shift);
-
-                       def->penc[idx] = penc;
-                       pr_info("base_shift=%d: shift=%d, sllp=0x%04lx,"
-                               " avpnm=0x%08lx, tlbiel=%d, penc=%d\n",
-                               base_shift, shift, def->sllp,
-                               def->avpnm, def->tlbiel, def->penc[idx]);
-               }
-       }
-
-       return 1;
-}
-
-#ifdef CONFIG_HUGETLB_PAGE
-/* Scan for 16G memory blocks that have been set aside for huge pages
- * and reserve those blocks for 16G huge pages.
- */
-static int __init htab_dt_scan_hugepage_blocks(unsigned long node,
-                                       const char *uname, int depth,
-                                       void *data) {
-       const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
-       const __be64 *addr_prop;
-       const __be32 *page_count_prop;
-       unsigned int expected_pages;
-       long unsigned int phys_addr;
-       long unsigned int block_size;
-
-       /* We are scanning "memory" nodes only */
-       if (type == NULL || strcmp(type, "memory") != 0)
-               return 0;
-
-       /* This property is the log base 2 of the number of virtual pages that
-        * will represent this memory block. */
-       page_count_prop = of_get_flat_dt_prop(node, "ibm,expected#pages", NULL);
-       if (page_count_prop == NULL)
-               return 0;
-       expected_pages = (1 << be32_to_cpu(page_count_prop[0]));
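-       /* The "reg" property gives the block base address followed by its size. */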
-       addr_prop = of_get_flat_dt_prop(node, "reg", NULL);
-       if (addr_prop == NULL)
-               return 0;
-       phys_addr = be64_to_cpu(addr_prop[0]);
-       block_size = be64_to_cpu(addr_prop[1]);
-       if (block_size != (16 * GB))
-               return 0;
-       printk(KERN_INFO "Huge page(16GB) memory: "
-                       "addr = 0x%lX size = 0x%lX pages = %d\n",
-                       phys_addr, block_size, expected_pages);
-       if (phys_addr + block_size * expected_pages <= memblock_end_of_DRAM()) {
-               memblock_reserve(phys_addr, block_size * expected_pages);
-               pseries_add_gpage(phys_addr, block_size, expected_pages);
-       }
-       return 0;
-}
-#endif /* CONFIG_HUGETLB_PAGE */
-
-static void mmu_psize_set_default_penc(void)
-{
-       int bpsize, apsize;
-       for (bpsize = 0; bpsize < MMU_PAGE_COUNT; bpsize++)
-               for (apsize = 0; apsize < MMU_PAGE_COUNT; apsize++)
-                       mmu_psize_defs[bpsize].penc[apsize] = -1;
-}
-
-#ifdef CONFIG_PPC_64K_PAGES
-
-static bool might_have_hea(void)
-{
-       /*
-        * The HEA ethernet adapter requires awareness of the
-        * GX bus. Without that awareness we can easily assume
-        * we will never see an HEA ethernet device.
-        */
-#ifdef CONFIG_IBMEBUS
-       return !cpu_has_feature(CPU_FTR_ARCH_207S) &&
-               firmware_has_feature(FW_FEATURE_SPLPAR);
-#else
-       return false;
-#endif
-}
-
-#endif /* #ifdef CONFIG_PPC_64K_PAGES */
-
-static void __init htab_scan_page_sizes(void)
-{
-       int rc;
-
-       /* set the invalid penc to -1 */
-       mmu_psize_set_default_penc();
-
-       /* Default to 4K pages only */
-       memcpy(mmu_psize_defs, mmu_psize_defaults,
-              sizeof(mmu_psize_defaults));
-
-       /*
-        * Try to find the available page sizes in the device-tree
-        */
-       rc = of_scan_flat_dt(htab_dt_scan_page_sizes, NULL);
-       if (rc == 0 && early_mmu_has_feature(MMU_FTR_16M_PAGE)) {
-               /*
-                * Nothing in the device-tree, but the CPU supports 16M pages,
-                * so let's fall back on a known size list for 16M capable CPUs.
-                */
-               memcpy(mmu_psize_defs, mmu_psize_defaults_gp,
-                      sizeof(mmu_psize_defaults_gp));
-       }
-
-#ifdef CONFIG_HUGETLB_PAGE
-       if (!hugetlb_disabled) {
-               /* Reserve 16G huge page memory sections for huge pages */
-               of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL);
-       }
-#endif /* CONFIG_HUGETLB_PAGE */
-}
-
-/*
- * Fill in the hpte_page_sizes[] array.
- * We go through the mmu_psize_defs[] array looking for all the
- * supported base/actual page size combinations.  Each combination
- * has a unique pagesize encoding (penc) value in the low bits of
- * the LP field of the HPTE.  For actual page sizes less than 1MB,
- * some of the upper LP bits are used for RPN bits, meaning that
- * we need to fill in several entries in hpte_page_sizes[].
- *
- * In diagrammatic form, with r = RPN bits and z = page size bits:
- *        PTE LP     actual page size
- *    rrrr rrrz                >=8KB
- *    rrrr rrzz                >=16KB
- *    rrrr rzzz                >=32KB
- *    rrrr zzzz                >=64KB
- *    ...
- *
- * The zzzz bits are implementation-specific but are chosen so that
- * no encoding for a larger page size uses the same value in its
- * low-order N bits as the encoding for the 2^(12+N) byte page size
- * (if it exists).
- */
-static void init_hpte_page_sizes(void)
-{
-       long int ap, bp;
-       long int shift, penc;
-
-       for (bp = 0; bp < MMU_PAGE_COUNT; ++bp) {
-               if (!mmu_psize_defs[bp].shift)
-                       continue;       /* not a supported page size */
-               for (ap = bp; ap < MMU_PAGE_COUNT; ++ap) {
-                       penc = mmu_psize_defs[bp].penc[ap];
-                       if (penc == -1 || !mmu_psize_defs[ap].shift)
-                               continue;
-                       shift = mmu_psize_defs[ap].shift - LP_SHIFT;
-                       if (shift <= 0)
-                               continue;       /* should never happen */
-                       /*
-                        * For page sizes less than 1MB, this loop
-                        * replicates the entry for all possible values
-                        * of the rrrr bits.
-                        */
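-                       /* e.g. with LP_SHIFT = 12, a 64K actual page (shift 16) repeats every 2^4 = 16 slots. */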
-                       while (penc < (1 << LP_BITS)) {
-                               hpte_page_sizes[penc] = (ap << 4) | bp;
-                               penc += 1 << shift;
-                       }
-               }
-       }
-}
-
-static void __init htab_init_page_sizes(void)
-{
-       init_hpte_page_sizes();
-
-       if (!debug_pagealloc_enabled()) {
-               /*
-                * Pick a size for the linear mapping. Currently, we only
-                * support 16M, 1M and 4K which is the default
-                */
-               if (mmu_psize_defs[MMU_PAGE_16M].shift)
-                       mmu_linear_psize = MMU_PAGE_16M;
-               else if (mmu_psize_defs[MMU_PAGE_1M].shift)
-                       mmu_linear_psize = MMU_PAGE_1M;
-       }
-
-#ifdef CONFIG_PPC_64K_PAGES
-       /*
-        * Pick a size for the ordinary pages. Default is 4K, we support
-        * 64K for user mappings and vmalloc if supported by the processor.
-        * We only use 64k for ioremap if the processor
-        * (and firmware) support cache-inhibited large pages.
-        * If not, we use 4k and set mmu_ci_restrictions so that
-        * hash_page knows to switch processes that use cache-inhibited
-        * mappings to 4k pages.
-        */
-       if (mmu_psize_defs[MMU_PAGE_64K].shift) {
-               mmu_virtual_psize = MMU_PAGE_64K;
-               mmu_vmalloc_psize = MMU_PAGE_64K;
-               if (mmu_linear_psize == MMU_PAGE_4K)
-                       mmu_linear_psize = MMU_PAGE_64K;
-               if (mmu_has_feature(MMU_FTR_CI_LARGE_PAGE)) {
-                       /*
-                        * When running on pSeries using 64k pages for ioremap
-                        * would stop us accessing the HEA ethernet. So if we
-                        * have the chance of ever seeing one, stay at 4k.
-                        */
-                       if (!might_have_hea())
-                               mmu_io_psize = MMU_PAGE_64K;
-               } else
-                       mmu_ci_restrictions = 1;
-       }
-#endif /* CONFIG_PPC_64K_PAGES */
-
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
-       /* We try to use 16M pages for vmemmap if that is supported
-        * and we have at least 1G of RAM at boot
-        */
-       if (mmu_psize_defs[MMU_PAGE_16M].shift &&
-           memblock_phys_mem_size() >= 0x40000000)
-               mmu_vmemmap_psize = MMU_PAGE_16M;
-       else if (mmu_psize_defs[MMU_PAGE_64K].shift)
-               mmu_vmemmap_psize = MMU_PAGE_64K;
-       else
-               mmu_vmemmap_psize = MMU_PAGE_4K;
-#endif /* CONFIG_SPARSEMEM_VMEMMAP */
-
-       printk(KERN_DEBUG "Page orders: linear mapping = %d, "
-              "virtual = %d, io = %d"
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
-              ", vmemmap = %d"
-#endif
-              "\n",
-              mmu_psize_defs[mmu_linear_psize].shift,
-              mmu_psize_defs[mmu_virtual_psize].shift,
-              mmu_psize_defs[mmu_io_psize].shift
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
-              ,mmu_psize_defs[mmu_vmemmap_psize].shift
-#endif
-              );
-}
-
-static int __init htab_dt_scan_pftsize(unsigned long node,
-                                      const char *uname, int depth,
-                                      void *data)
-{
-       const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
-       const __be32 *prop;
-
-       /* We are scanning "cpu" nodes only */
-       if (type == NULL || strcmp(type, "cpu") != 0)
-               return 0;
-
-       prop = of_get_flat_dt_prop(node, "ibm,pft-size", NULL);
-       if (prop != NULL) {
-               /* pft_size[0] is the NUMA CEC cookie */
-               ppc64_pft_size = be32_to_cpu(prop[1]);
-               return 1;
-       }
-       return 0;
-}
-
-unsigned htab_shift_for_mem_size(unsigned long mem_size)
-{
-       unsigned memshift = __ilog2(mem_size);
-       unsigned pshift = mmu_psize_defs[mmu_virtual_psize].shift;
-       unsigned pteg_shift;
-
-       /* round mem_size up to next power of 2 */
-       if ((1UL << memshift) < mem_size)
-               memshift += 1;
-
-       /* aim for 2 pages / pteg */
-       pteg_shift = memshift - (pshift + 1);
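-       /*
-        * e.g. 8GB of RAM with 64K pages: memshift = 33, pteg_shift = 16,
-        * so the result is max(23, 18) = 23, i.e. an 8MB hash table.
-        */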
-
-       /*
-        * 2^11 PTEGS of 128 bytes each, ie. 2^18 bytes is the minimum htab
-        * size permitted by the architecture.
-        */
-       return max(pteg_shift + 7, 18U);
-}
-
-static unsigned long __init htab_get_table_size(void)
-{
-       /* If the hash table size isn't already provided by the platform, we
-        * try to retrieve it from the device-tree. If it's not there either,
-        * we calculate it now based on the total RAM size.
-        */
-       if (ppc64_pft_size == 0)
-               of_scan_flat_dt(htab_dt_scan_pftsize, NULL);
-       if (ppc64_pft_size)
-               return 1UL << ppc64_pft_size;
-
-       return 1UL << htab_shift_for_mem_size(memblock_phys_mem_size());
-}
-
-#ifdef CONFIG_MEMORY_HOTPLUG
-int resize_hpt_for_hotplug(unsigned long new_mem_size)
-{
-       unsigned target_hpt_shift;
-
-       if (!mmu_hash_ops.resize_hpt)
-               return 0;
-
-       target_hpt_shift = htab_shift_for_mem_size(new_mem_size);
-
-       /*
-        * To avoid lots of HPT resizes if memory size is fluctuating
-        * across a boundary, we deliberately have some hysteresis
-        * here: we immediately increase the HPT size if the target
-        * shift exceeds the current shift, but we won't attempt to
-        * reduce unless the target shift is at least 2 below the
-        * current shift
-        */
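-       /*
-        * e.g. with a current shift of 26 we grow as soon as the target
-        * reaches 27, but only shrink once it drops to 24 or below.
-        */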
-       if (target_hpt_shift > ppc64_pft_size ||
-           target_hpt_shift < ppc64_pft_size - 1)
-               return mmu_hash_ops.resize_hpt(target_hpt_shift);
-
-       return 0;
-}
-
-int hash__create_section_mapping(unsigned long start, unsigned long end, int nid)
-{
-       int rc;
-
-       if (end >= H_VMALLOC_START) {
-               pr_warn("Outside the supported range\n");
-               return -1;
-       }
-
-       rc = htab_bolt_mapping(start, end, __pa(start),
-                              pgprot_val(PAGE_KERNEL), mmu_linear_psize,
-                              mmu_kernel_ssize);
-
-       if (rc < 0) {
-               int rc2 = htab_remove_mapping(start, end, mmu_linear_psize,
-                                             mmu_kernel_ssize);
-               BUG_ON(rc2 && (rc2 != -ENOENT));
-       }
-       return rc;
-}
-
-int hash__remove_section_mapping(unsigned long start, unsigned long end)
-{
-       int rc = htab_remove_mapping(start, end, mmu_linear_psize,
-                                    mmu_kernel_ssize);
-       WARN_ON(rc < 0);
-       return rc;
-}
-#endif /* CONFIG_MEMORY_HOTPLUG */
-
-static void __init hash_init_partition_table(phys_addr_t hash_table,
-                                            unsigned long htab_size)
-{
-       mmu_partition_table_init();
-
-       /*
-        * PS field (VRMA page size) is not used for LPID 0, hence set to 0.
-        * For now, UPRT is 0 and we have no segment table.
-        */
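-       /* The HTABSIZE field encodes log2(table size in bytes) - 18 (2^18 bytes minimum). */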
-       htab_size =  __ilog2(htab_size) - 18;
-       mmu_partition_table_set_entry(0, hash_table | htab_size, 0);
-       pr_info("Partition table %p\n", partition_tb);
-}
-
-static void __init htab_initialize(void)
-{
-       unsigned long table;
-       unsigned long pteg_count;
-       unsigned long prot;
-       unsigned long base = 0, size = 0;
-       struct memblock_region *reg;
-
-       DBG(" -> htab_initialize()\n");
-
-       if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) {
-               mmu_kernel_ssize = MMU_SEGSIZE_1T;
-               mmu_highuser_ssize = MMU_SEGSIZE_1T;
-               printk(KERN_INFO "Using 1TB segments\n");
-       }
-
-       /*
-        * Calculate the required size of the htab.  We want the number of
-        * PTEGs to equal one half the number of real pages.
-        */ 
-       htab_size_bytes = htab_get_table_size();
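-       /* Each PTEG is 128 bytes (8 HPTEs of 16 bytes each). */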
-       pteg_count = htab_size_bytes >> 7;
-
-       htab_hash_mask = pteg_count - 1;
-
-       if (firmware_has_feature(FW_FEATURE_LPAR) ||
-           firmware_has_feature(FW_FEATURE_PS3_LV1)) {
-               /* Using a hypervisor which owns the htab */
-               htab_address = NULL;
-               _SDR1 = 0; 
-               /*
-                * On POWER9, we need to do a H_REGISTER_PROC_TBL hcall
-                * to inform the hypervisor that we wish to use the HPT.
-                */
-               if (cpu_has_feature(CPU_FTR_ARCH_300))
-                       register_process_table(0, 0, 0);
-#ifdef CONFIG_FA_DUMP
-               /*
-                * If firmware assisted dump is active, firmware preserves
-                * the contents of the htab along with the entire partition
-                * memory. Clear the htab if firmware assisted dump is active
-                * so that we don't end up using old mappings.
-                */
-               if (is_fadump_active() && mmu_hash_ops.hpte_clear_all)
-                       mmu_hash_ops.hpte_clear_all();
-#endif
-       } else {
-               unsigned long limit = MEMBLOCK_ALLOC_ANYWHERE;
-
-#ifdef CONFIG_PPC_CELL
-               /*
-                * Cell may require the hash table down low when using the
-                * Axon IOMMU in order to fit the dynamic region over it, see
-                * comments in cell/iommu.c
-                */
-               if (fdt_subnode_offset(initial_boot_params, 0, "axon") > 0) {
-                       limit = 0x80000000;
-                       pr_info("Hash table forced below 2G for Axon IOMMU\n");
-               }
-#endif /* CONFIG_PPC_CELL */
-
-               table = memblock_phys_alloc_range(htab_size_bytes,
-                                                 htab_size_bytes,
-                                                 0, limit);
-               if (!table)
-                       panic("ERROR: Failed to allocate %pa bytes below %pa\n",
-                             &htab_size_bytes, &limit);
-
-               DBG("Hash table allocated at %lx, size: %lx\n", table,
-                   htab_size_bytes);
-
-               htab_address = __va(table);
-
-               /* htab absolute addr + encoded htabsize */
-               _SDR1 = table + __ilog2(htab_size_bytes) - 18;
-
-               /* Initialize the HPT with no entries */
-               memset((void *)table, 0, htab_size_bytes);
-
-               if (!cpu_has_feature(CPU_FTR_ARCH_300))
-                       /* Set SDR1 */
-                       mtspr(SPRN_SDR1, _SDR1);
-               else
-                       hash_init_partition_table(table, htab_size_bytes);
-       }
-
-       prot = pgprot_val(PAGE_KERNEL);
-
-#ifdef CONFIG_DEBUG_PAGEALLOC
-       if (debug_pagealloc_enabled()) {
-               linear_map_hash_count = memblock_end_of_DRAM() >> PAGE_SHIFT;
-               linear_map_hash_slots = memblock_alloc_try_nid(
-                               linear_map_hash_count, 1, MEMBLOCK_LOW_LIMIT,
-                               ppc64_rma_size, NUMA_NO_NODE);
-               if (!linear_map_hash_slots)
-                       panic("%s: Failed to allocate %lu bytes max_addr=%pa\n",
-                             __func__, linear_map_hash_count, &ppc64_rma_size);
-       }
-#endif /* CONFIG_DEBUG_PAGEALLOC */
-
-       /* Create the bolted linear mapping in the hash table */
-       for_each_memblock(memory, reg) {
-               base = (unsigned long)__va(reg->base);
-               size = reg->size;
-
-               DBG("creating mapping for region: %lx..%lx (prot: %lx)\n",
-                   base, size, prot);
-
-               if ((base + size) >= H_VMALLOC_START) {
-                       pr_warn("Outside the supported range\n");
-                       continue;
-               }
-
-               BUG_ON(htab_bolt_mapping(base, base + size, __pa(base),
-                               prot, mmu_linear_psize, mmu_kernel_ssize));
-       }
-       memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);
-
-       /*
-        * If we have a memory_limit and we've allocated TCEs then we need to
-        * explicitly map the TCE area at the top of RAM. We also cope with the
-        * case that the TCEs start below memory_limit.
-        * tce_alloc_start/end are 16MB aligned so the mapping should work
-        * for either 4K or 16MB pages.
-        */
-       if (tce_alloc_start) {
-               tce_alloc_start = (unsigned long)__va(tce_alloc_start);
-               tce_alloc_end = (unsigned long)__va(tce_alloc_end);
-
-               if (base + size >= tce_alloc_start)
-                       tce_alloc_start = base + size + 1;
-
-               BUG_ON(htab_bolt_mapping(tce_alloc_start, tce_alloc_end,
-                                        __pa(tce_alloc_start), prot,
-                                        mmu_linear_psize, mmu_kernel_ssize));
-       }
-
-
-       DBG(" <- htab_initialize()\n");
-}
-#undef KB
-#undef MB
-
-void __init hash__early_init_devtree(void)
-{
-       /* Initialize segment sizes */
-       of_scan_flat_dt(htab_dt_scan_seg_sizes, NULL);
-
-       /* Initialize page sizes */
-       htab_scan_page_sizes();
-}
-
-struct hash_mm_context init_hash_mm_context;
-void __init hash__early_init_mmu(void)
-{
-#ifndef CONFIG_PPC_64K_PAGES
-       /*
-        * We have code in __hash_page_4K() and elsewhere, which assumes it can
-        * do the following:
-        *   new_pte |= (slot << H_PAGE_F_GIX_SHIFT) & (H_PAGE_F_SECOND | H_PAGE_F_GIX);
-        *
-        * Where the slot number is between 0-15, and values of 8-15 indicate
-        * the secondary bucket. For that code to work H_PAGE_F_SECOND and
-        * H_PAGE_F_GIX must occupy four contiguous bits in the PTE, and
-        * H_PAGE_F_SECOND must be placed above H_PAGE_F_GIX. Assert that here
-        * with a BUILD_BUG_ON().
-        */
-       BUILD_BUG_ON(H_PAGE_F_SECOND != (1ul  << (H_PAGE_F_GIX_SHIFT + 3)));
-#endif /* CONFIG_PPC_64K_PAGES */
-
-       htab_init_page_sizes();
-
-       /*
-        * initialize page table size
-        */
-       __pte_frag_nr = H_PTE_FRAG_NR;
-       __pte_frag_size_shift = H_PTE_FRAG_SIZE_SHIFT;
-       __pmd_frag_nr = H_PMD_FRAG_NR;
-       __pmd_frag_size_shift = H_PMD_FRAG_SIZE_SHIFT;
-
-       __pte_index_size = H_PTE_INDEX_SIZE;
-       __pmd_index_size = H_PMD_INDEX_SIZE;
-       __pud_index_size = H_PUD_INDEX_SIZE;
-       __pgd_index_size = H_PGD_INDEX_SIZE;
-       __pud_cache_index = H_PUD_CACHE_INDEX;
-       __pte_table_size = H_PTE_TABLE_SIZE;
-       __pmd_table_size = H_PMD_TABLE_SIZE;
-       __pud_table_size = H_PUD_TABLE_SIZE;
-       __pgd_table_size = H_PGD_TABLE_SIZE;
-       /*
-        * 4k uses hugepd format, so for hash set them to
-        * zero
-        */
-       __pmd_val_bits = HASH_PMD_VAL_BITS;
-       __pud_val_bits = HASH_PUD_VAL_BITS;
-       __pgd_val_bits = HASH_PGD_VAL_BITS;
-
-       __kernel_virt_start = H_KERN_VIRT_START;
-       __vmalloc_start = H_VMALLOC_START;
-       __vmalloc_end = H_VMALLOC_END;
-       __kernel_io_start = H_KERN_IO_START;
-       __kernel_io_end = H_KERN_IO_END;
-       vmemmap = (struct page *)H_VMEMMAP_START;
-       ioremap_bot = IOREMAP_BASE;
-
-#ifdef CONFIG_PCI
-       pci_io_base = ISA_IO_BASE;
-#endif
-
-       /* Select appropriate backend */
-       if (firmware_has_feature(FW_FEATURE_PS3_LV1))
-               ps3_early_mm_init();
-       else if (firmware_has_feature(FW_FEATURE_LPAR))
-               hpte_init_pseries();
-       else if (IS_ENABLED(CONFIG_PPC_NATIVE))
-               hpte_init_native();
-
-       if (!mmu_hash_ops.hpte_insert)
-               panic("hash__early_init_mmu: No MMU hash ops defined!\n");
-
-       /* Initialize the MMU Hash table and create the linear mapping
-        * of memory. Has to be done before SLB initialization as this is
-        * currently where the page size encoding is obtained.
-        */
-       htab_initialize();
-
-       init_mm.context.hash_context = &init_hash_mm_context;
-       init_mm.context.hash_context->slb_addr_limit = DEFAULT_MAP_WINDOW_USER64;
-
-       pr_info("Initializing hash mmu with SLB\n");
-       /* Initialize SLB management */
-       slb_initialize();
-
-       if (cpu_has_feature(CPU_FTR_ARCH_206)
-                       && cpu_has_feature(CPU_FTR_HVMODE))
-               tlbiel_all();
-}
-
-#ifdef CONFIG_SMP
-void hash__early_init_mmu_secondary(void)
-{
-       /* Initialize hash table for that CPU */
-       if (!firmware_has_feature(FW_FEATURE_LPAR)) {
-
-               if (!cpu_has_feature(CPU_FTR_ARCH_300))
-                       mtspr(SPRN_SDR1, _SDR1);
-               else
-                       mtspr(SPRN_PTCR,
-                             __pa(partition_tb) | (PATB_SIZE_SHIFT - 12));
-       }
-       /* Initialize SLB */
-       slb_initialize();
-
-       if (cpu_has_feature(CPU_FTR_ARCH_206)
-                       && cpu_has_feature(CPU_FTR_HVMODE))
-               tlbiel_all();
-}
-#endif /* CONFIG_SMP */
-
-/*
- * Called by asm hashtable.S for doing lazy icache flush
- */
-unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap)
-{
-       struct page *page;
-
-       if (!pfn_valid(pte_pfn(pte)))
-               return pp;
-
-       page = pte_page(pte);
-
-       /* page is dirty */
-       if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
-               if (trap == 0x400) {
-                       flush_dcache_icache_page(page);
-                       set_bit(PG_arch_1, &page->flags);
-               } else
-                       pp |= HPTE_R_N;
-       }
-       return pp;
-}
-
-#ifdef CONFIG_PPC_MM_SLICES
-static unsigned int get_paca_psize(unsigned long addr)
-{
-       unsigned char *psizes;
-       unsigned long index, mask_index;
-
-       if (addr < SLICE_LOW_TOP) {
-               psizes = get_paca()->mm_ctx_low_slices_psize;
-               index = GET_LOW_SLICE_INDEX(addr);
-       } else {
-               psizes = get_paca()->mm_ctx_high_slices_psize;
-               index = GET_HIGH_SLICE_INDEX(addr);
-       }
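-       /* Each byte of the psize array packs two 4-bit page-size indices. */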
-       mask_index = index & 0x1;
-       return (psizes[index >> 1] >> (mask_index * 4)) & 0xF;
-}
-
-#else
-unsigned int get_paca_psize(unsigned long addr)
-{
-       return get_paca()->mm_ctx_user_psize;
-}
-#endif
-
-/*
- * Demote a segment to using 4k pages.
- * For now this makes the whole process use 4k pages.
- */
-#ifdef CONFIG_PPC_64K_PAGES
-void demote_segment_4k(struct mm_struct *mm, unsigned long addr)
-{
-       if (get_slice_psize(mm, addr) == MMU_PAGE_4K)
-               return;
-       slice_set_range_psize(mm, addr, 1, MMU_PAGE_4K);
-       copro_flush_all_slbs(mm);
-       if ((get_paca_psize(addr) != MMU_PAGE_4K) && (current->mm == mm)) {
-
-               copy_mm_to_paca(mm);
-               slb_flush_and_restore_bolted();
-       }
-}
-#endif /* CONFIG_PPC_64K_PAGES */
-
-#ifdef CONFIG_PPC_SUBPAGE_PROT
-/*
- * This looks up a 2-bit protection code for a 4k subpage of a 64k page.
- * Userspace sets the subpage permissions using the subpage_prot system call.
- *
- * Result is 0: full permissions, _PAGE_RW: read-only,
- * _PAGE_RWX: no access.
- */
-static int subpage_protection(struct mm_struct *mm, unsigned long ea)
-{
-       struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context);
-       u32 spp = 0;
-       u32 **sbpm, *sbpp;
-
-       if (!spt)
-               return 0;
-
-       if (ea >= spt->maxaddr)
-               return 0;
-       if (ea < 0x100000000UL) {
-               /* addresses below 4GB use spt->low_prot */
-               sbpm = spt->low_prot;
-       } else {
-               sbpm = spt->protptrs[ea >> SBP_L3_SHIFT];
-               if (!sbpm)
-                       return 0;
-       }
-       sbpp = sbpm[(ea >> SBP_L2_SHIFT) & (SBP_L2_COUNT - 1)];
-       if (!sbpp)
-               return 0;
-       spp = sbpp[(ea >> PAGE_SHIFT) & (SBP_L1_COUNT - 1)];
-
-       /* extract 2-bit bitfield for this 4k subpage */
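-       /* Subpage 0 sits in bits 31:30 of the word, subpage 15 in bits 1:0. */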
-       spp >>= 30 - 2 * ((ea >> 12) & 0xf);
-
-       /*
-        * 0 -> full permission
-        * 1 -> read only
-        * 2 -> no access.
-        * We return the flags that need to be cleared.
-        */
-       spp = ((spp & 2) ? _PAGE_RWX : 0) | ((spp & 1) ? _PAGE_WRITE : 0);
-       return spp;
-}
-
-#else /* CONFIG_PPC_SUBPAGE_PROT */
-static inline int subpage_protection(struct mm_struct *mm, unsigned long ea)
-{
-       return 0;
-}
-#endif
-
-void hash_failure_debug(unsigned long ea, unsigned long access,
-                       unsigned long vsid, unsigned long trap,
-                       int ssize, int psize, int lpsize, unsigned long pte)
-{
-       if (!printk_ratelimit())
-               return;
-       pr_info("mm: Hashing failure ! EA=0x%lx access=0x%lx current=%s\n",
-               ea, access, current->comm);
-       pr_info("    trap=0x%lx vsid=0x%lx ssize=%d base psize=%d psize %d pte=0x%lx\n",
-               trap, vsid, ssize, psize, lpsize, pte);
-}
-
-static void check_paca_psize(unsigned long ea, struct mm_struct *mm,
-                            int psize, bool user_region)
-{
-       if (user_region) {
-               if (psize != get_paca_psize(ea)) {
-                       copy_mm_to_paca(mm);
-                       slb_flush_and_restore_bolted();
-               }
-       } else if (get_paca()->vmalloc_sllp !=
-                  mmu_psize_defs[mmu_vmalloc_psize].sllp) {
-               get_paca()->vmalloc_sllp =
-                       mmu_psize_defs[mmu_vmalloc_psize].sllp;
-               slb_vmalloc_update();
-       }
-}
-
-/* Result code is:
- *  0 - handled
- *  1 - normal page fault
- * -1 - critical hash insertion error
- * -2 - access not permitted by subpage protection mechanism
- */
-int hash_page_mm(struct mm_struct *mm, unsigned long ea,
-                unsigned long access, unsigned long trap,
-                unsigned long flags)
-{
-       bool is_thp;
-       enum ctx_state prev_state = exception_enter();
-       pgd_t *pgdir;
-       unsigned long vsid;
-       pte_t *ptep;
-       unsigned hugeshift;
-       int rc, user_region = 0;
-       int psize, ssize;
-
-       DBG_LOW("hash_page(ea=%016lx, access=%lx, trap=%lx\n",
-               ea, access, trap);
-       trace_hash_fault(ea, access, trap);
-
-       /* Get region & vsid */
-       switch (get_region_id(ea)) {
-       case USER_REGION_ID:
-               user_region = 1;
-               if (! mm) {
-                       DBG_LOW(" user region with no mm !\n");
-                       rc = 1;
-                       goto bail;
-               }
-               psize = get_slice_psize(mm, ea);
-               ssize = user_segment_size(ea);
-               vsid = get_user_vsid(&mm->context, ea, ssize);
-               break;
-       case VMALLOC_REGION_ID:
-               vsid = get_kernel_vsid(ea, mmu_kernel_ssize);
-               psize = mmu_vmalloc_psize;
-               ssize = mmu_kernel_ssize;
-               break;
-
-       case IO_REGION_ID:
-               vsid = get_kernel_vsid(ea, mmu_kernel_ssize);
-               psize = mmu_io_psize;
-               ssize = mmu_kernel_ssize;
-               break;
-       default:
-               /* Not a valid range.
-                * Send the problem up to do_page_fault.
-                */
-               rc = 1;
-               goto bail;
-       }
-       DBG_LOW(" mm=%p, mm->pgdir=%p, vsid=%016lx\n", mm, mm->pgd, vsid);
-
-       /* Bad address. */
-       if (!vsid) {
-               DBG_LOW("Bad address!\n");
-               rc = 1;
-               goto bail;
-       }
-       /* Get pgdir */
-       pgdir = mm->pgd;
-       if (pgdir == NULL) {
-               rc = 1;
-               goto bail;
-       }
-
-       /* Check CPU locality */
-       if (user_region && mm_is_thread_local(mm))
-               flags |= HPTE_LOCAL_UPDATE;
-
-#ifndef CONFIG_PPC_64K_PAGES
-       /* If we use 4K pages and our psize is not 4K, then we might
-        * be hitting a special driver mapping, and need to align the
-        * address before we fetch the PTE.
-        *
-        * It could also be a hugepage mapping, in which case this is
-        * not necessary, but it's not harmful, either.
-        */
-       if (psize != MMU_PAGE_4K)
-               ea &= ~((1ul << mmu_psize_defs[psize].shift) - 1);
-#endif /* CONFIG_PPC_64K_PAGES */
-
-       /* Get PTE and page size from page tables */
-       ptep = find_linux_pte(pgdir, ea, &is_thp, &hugeshift);
-       if (ptep == NULL || !pte_present(*ptep)) {
-               DBG_LOW(" no PTE !\n");
-               rc = 1;
-               goto bail;
-       }
-
-       /* Add _PAGE_PRESENT to the required access perm */
-       access |= _PAGE_PRESENT;
-
-       /* Pre-check access permissions (will be re-checked atomically
-        * in __hash_page_XX but this pre-check is a fast path)
-        */
-       if (!check_pte_access(access, pte_val(*ptep))) {
-               DBG_LOW(" no access !\n");
-               rc = 1;
-               goto bail;
-       }
-
-       if (hugeshift) {
-               if (is_thp)
-                       rc = __hash_page_thp(ea, access, vsid, (pmd_t *)ptep,
-                                            trap, flags, ssize, psize);
-#ifdef CONFIG_HUGETLB_PAGE
-               else
-                       rc = __hash_page_huge(ea, access, vsid, ptep, trap,
-                                             flags, ssize, hugeshift, psize);
-#else
-               else {
-                       /*
-                        * If we have a hugeshift, and it is not transparent
-                        * huge with hugetlb disabled, something is really wrong.
-                        */
-                       rc = 1;
-                       WARN_ON(1);
-               }
-#endif
-               if (current->mm == mm)
-                       check_paca_psize(ea, mm, psize, user_region);
-
-               goto bail;
-       }
-
-#ifndef CONFIG_PPC_64K_PAGES
-       DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep));
-#else
-       DBG_LOW(" i-pte: %016lx %016lx\n", pte_val(*ptep),
-               pte_val(*(ptep + PTRS_PER_PTE)));
-#endif
-       /* Do actual hashing */
-#ifdef CONFIG_PPC_64K_PAGES
-       /* If H_PAGE_4K_PFN is set, make sure this is a 4k segment */
-       if ((pte_val(*ptep) & H_PAGE_4K_PFN) && psize == MMU_PAGE_64K) {
-               demote_segment_4k(mm, ea);
-               psize = MMU_PAGE_4K;
-       }
-
-       /* If this PTE is non-cacheable and we have restrictions on
-        * using non cacheable large pages, then we switch to 4k
-        */
-       if (mmu_ci_restrictions && psize == MMU_PAGE_64K && pte_ci(*ptep)) {
-               if (user_region) {
-                       demote_segment_4k(mm, ea);
-                       psize = MMU_PAGE_4K;
-               } else if (ea < VMALLOC_END) {
-                       /*
-                        * some driver did a non-cacheable mapping
-                        * in vmalloc space, so switch vmalloc
-                        * to 4k pages
-                        */
-                       printk(KERN_ALERT "Reducing vmalloc segment "
-                              "to 4kB pages because of "
-                              "non-cacheable mapping\n");
-                       psize = mmu_vmalloc_psize = MMU_PAGE_4K;
-                       copro_flush_all_slbs(mm);
-               }
-       }
-
-#endif /* CONFIG_PPC_64K_PAGES */
-
-       if (current->mm == mm)
-               check_paca_psize(ea, mm, psize, user_region);
-
-#ifdef CONFIG_PPC_64K_PAGES
-       if (psize == MMU_PAGE_64K)
-               rc = __hash_page_64K(ea, access, vsid, ptep, trap,
-                                    flags, ssize);
-       else
-#endif /* CONFIG_PPC_64K_PAGES */
-       {
-               int spp = subpage_protection(mm, ea);
-               if (access & spp)
-                       rc = -2;
-               else
-                       rc = __hash_page_4K(ea, access, vsid, ptep, trap,
-                                           flags, ssize, spp);
-       }
-
-       /* Dump some info in case of hash insertion failure, such failures
-        * should never happen so it is really useful to know if/when they do
-        */
-       if (rc == -1)
-               hash_failure_debug(ea, access, vsid, trap, ssize, psize,
-                                  psize, pte_val(*ptep));
-#ifndef CONFIG_PPC_64K_PAGES
-       DBG_LOW(" o-pte: %016lx\n", pte_val(*ptep));
-#else
-       DBG_LOW(" o-pte: %016lx %016lx\n", pte_val(*ptep),
-               pte_val(*(ptep + PTRS_PER_PTE)));
-#endif
-       DBG_LOW(" -> rc=%d\n", rc);
-
-bail:
-       exception_exit(prev_state);
-       return rc;
-}
-EXPORT_SYMBOL_GPL(hash_page_mm);
-
-int hash_page(unsigned long ea, unsigned long access, unsigned long trap,
-             unsigned long dsisr)
-{
-       unsigned long flags = 0;
-       struct mm_struct *mm = current->mm;
-
-       if ((get_region_id(ea) == VMALLOC_REGION_ID) ||
-           (get_region_id(ea) == IO_REGION_ID))
-               mm = &init_mm;
-
-       if (dsisr & DSISR_NOHPTE)
-               flags |= HPTE_NOHPTE_UPDATE;
-
-       return hash_page_mm(mm, ea, access, trap, flags);
-}
-EXPORT_SYMBOL_GPL(hash_page);
-
-int __hash_page(unsigned long ea, unsigned long msr, unsigned long trap,
-               unsigned long dsisr)
-{
-       unsigned long access = _PAGE_PRESENT | _PAGE_READ;
-       unsigned long flags = 0;
-       struct mm_struct *mm = current->mm;
-       unsigned int region_id = get_region_id(ea);
-
-       if ((region_id == VMALLOC_REGION_ID) || (region_id == IO_REGION_ID))
-               mm = &init_mm;
-
-       if (dsisr & DSISR_NOHPTE)
-               flags |= HPTE_NOHPTE_UPDATE;
-
-       if (dsisr & DSISR_ISSTORE)
-               access |= _PAGE_WRITE;
-       /*
-        * We set _PAGE_PRIVILEGED only when
-        * kernel mode accesses kernel space.
-        *
-        * _PAGE_PRIVILEGED is NOT set
-        * 1) when kernel mode accesses user space
-        * 2) when user space accesses kernel space.
-        */
-       access |= _PAGE_PRIVILEGED;
-       if ((msr & MSR_PR) || (region_id == USER_REGION_ID))
-               access &= ~_PAGE_PRIVILEGED;
-
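-       /* 0x400 is an instruction storage interrupt, so executable access is required. */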
-       if (trap == 0x400)
-               access |= _PAGE_EXEC;
-
-       return hash_page_mm(mm, ea, access, trap, flags);
-}
-
-#ifdef CONFIG_PPC_MM_SLICES
-static bool should_hash_preload(struct mm_struct *mm, unsigned long ea)
-{
-       int psize = get_slice_psize(mm, ea);
-
-       /* We only prefault standard pages for now */
-       if (unlikely(psize != mm_ctx_user_psize(&mm->context)))
-               return false;
-
-       /*
-        * Don't prefault if subpage protection is enabled for the EA.
-        */
-       if (unlikely((psize == MMU_PAGE_4K) && subpage_protection(mm, ea)))
-               return false;
-
-       return true;
-}
-#else
-static bool should_hash_preload(struct mm_struct *mm, unsigned long ea)
-{
-       return true;
-}
-#endif
-
-void hash_preload(struct mm_struct *mm, unsigned long ea,
-                 bool is_exec, unsigned long trap)
-{
-       int hugepage_shift;
-       unsigned long vsid;
-       pgd_t *pgdir;
-       pte_t *ptep;
-       unsigned long flags;
-       int rc, ssize, update_flags = 0;
-       unsigned long access = _PAGE_PRESENT | _PAGE_READ | (is_exec ? _PAGE_EXEC : 0);
-
-       BUG_ON(get_region_id(ea) != USER_REGION_ID);
-
-       if (!should_hash_preload(mm, ea))
-               return;
-
-       DBG_LOW("hash_preload(mm=%p, mm->pgdir=%p, ea=%016lx, access=%lx,"
-               " trap=%lx\n", mm, mm->pgd, ea, access, trap);
-
-       /* Get Linux PTE if available */
-       pgdir = mm->pgd;
-       if (pgdir == NULL)
-               return;
-
-       /* Get VSID */
-       ssize = user_segment_size(ea);
-       vsid = get_user_vsid(&mm->context, ea, ssize);
-       if (!vsid)
-               return;
-       /*
-        * Hash doesn't like irqs. Walking linux page table with irq disabled
-        * saves us from holding multiple locks.
-        */
-       local_irq_save(flags);
-
-       /*
-        * THP pages use update_mmu_cache_pmd. We don't do
-        * hash preload there. Hence we can ignore THP here.
-        */
-       ptep = find_current_mm_pte(pgdir, ea, NULL, &hugepage_shift);
-       if (!ptep)
-               goto out_exit;
-
-       WARN_ON(hugepage_shift);
-#ifdef CONFIG_PPC_64K_PAGES
-       /* If either H_PAGE_4K_PFN or cache inhibited is set (and we are on
-        * a 64K kernel), then we don't preload, hash_page() will take
-        * care of it once we actually try to access the page.
-        * That way we don't have to duplicate all of the logic for segment
-        * page size demotion here
-        */
-       if ((pte_val(*ptep) & H_PAGE_4K_PFN) || pte_ci(*ptep))
-               goto out_exit;
-#endif /* CONFIG_PPC_64K_PAGES */
-
-       /* Is that local to this CPU ? */
-       if (mm_is_thread_local(mm))
-               update_flags |= HPTE_LOCAL_UPDATE;
-
-       /* Hash it in */
-#ifdef CONFIG_PPC_64K_PAGES
-       if (mm_ctx_user_psize(&mm->context) == MMU_PAGE_64K)
-               rc = __hash_page_64K(ea, access, vsid, ptep, trap,
-                                    update_flags, ssize);
-       else
-#endif /* CONFIG_PPC_64K_PAGES */
-               rc = __hash_page_4K(ea, access, vsid, ptep, trap, update_flags,
-                                   ssize, subpage_protection(mm, ea));
-
-       /* Dump some info in case of hash insertion failure, such failures
-        * should never happen so it is really useful to know if/when they do
-        */
-       if (rc == -1)
-               hash_failure_debug(ea, access, vsid, trap, ssize,
-                                  mm_ctx_user_psize(&mm->context),
-                                  mm_ctx_user_psize(&mm->context),
-                                  pte_val(*ptep));
-out_exit:
-       local_irq_restore(flags);
-}
-
-#ifdef CONFIG_PPC_MEM_KEYS
-/*
- * Return the protection key associated with the given address and the
- * mm_struct.
- */
-u16 get_mm_addr_key(struct mm_struct *mm, unsigned long address)
-{
-       pte_t *ptep;
-       u16 pkey = 0;
-       unsigned long flags;
-
-       if (!mm || !mm->pgd)
-               return 0;
-
-       local_irq_save(flags);
-       ptep = find_linux_pte(mm->pgd, address, NULL, NULL);
-       if (ptep)
-               pkey = pte_to_pkey_bits(pte_val(READ_ONCE(*ptep)));
-       local_irq_restore(flags);
-
-       return pkey;
-}
-#endif /* CONFIG_PPC_MEM_KEYS */
-
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-static inline void tm_flush_hash_page(int local)
-{
-       /*
-        * Transactions are not aborted by tlbiel, only tlbie. Without an abort,
-        * syncing a page back to a block device w/PIO could pick up transactional data
-        * (bad!) so we force an abort here. Before the sync the page will be
-        * made read-only, which will flush_hash_page. BIG ISSUE here: if the
-        * kernel uses a page from userspace without unmapping it first, it may
-        * see the speculated version.
-        */
-       if (local && cpu_has_feature(CPU_FTR_TM) && current->thread.regs &&
-           MSR_TM_ACTIVE(current->thread.regs->msr)) {
-               tm_enable();
-               tm_abort(TM_CAUSE_TLBI);
-       }
-}
-#else
-static inline void tm_flush_hash_page(int local)
-{
-}
-#endif
-
-/*
- * Return the global hash slot, corresponding to the given PTE, which contains
- * the HPTE.
- */
-unsigned long pte_get_hash_gslot(unsigned long vpn, unsigned long shift,
-               int ssize, real_pte_t rpte, unsigned int subpg_index)
-{
-       unsigned long hash, gslot, hidx;
-
-       hash = hpt_hash(vpn, shift, ssize);
-       hidx = __rpte_to_hidx(rpte, subpg_index);
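-       /* An entry in the secondary group is found via the inverted hash. */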
-       if (hidx & _PTEIDX_SECONDARY)
-               hash = ~hash;
-       gslot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-       gslot += hidx & _PTEIDX_GROUP_IX;
-       return gslot;
-}
-
-/* WARNING: This is called from hash_low_64.S, if you change this prototype,
- *          do not forget to update the assembly call site!
- */
-void flush_hash_page(unsigned long vpn, real_pte_t pte, int psize, int ssize,
-                    unsigned long flags)
-{
-       unsigned long index, shift, gslot;
-       int local = flags & HPTE_LOCAL_UPDATE;
-
-       DBG_LOW("flush_hash_page(vpn=%016lx)\n", vpn);
-       pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) {
-               gslot = pte_get_hash_gslot(vpn, shift, ssize, pte, index);
-               DBG_LOW(" sub %ld: gslot=%lx\n", index, gslot);
-               /*
-                * We use the same base page size and actual psize, because we don't
-                * use these functions for hugepage
-                */
-               mmu_hash_ops.hpte_invalidate(gslot, vpn, psize, psize,
-                                            ssize, local);
-       } pte_iterate_hashed_end();
-
-       tm_flush_hash_page(local);
-}
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-void flush_hash_hugepage(unsigned long vsid, unsigned long addr,
-                        pmd_t *pmdp, unsigned int psize, int ssize,
-                        unsigned long flags)
-{
-       int i, max_hpte_count, valid;
-       unsigned long s_addr;
-       unsigned char *hpte_slot_array;
-       unsigned long hidx, shift, vpn, hash, slot;
-       int local = flags & HPTE_LOCAL_UPDATE;
-
-       s_addr = addr & HPAGE_PMD_MASK;
-       hpte_slot_array = get_hpte_slot_array(pmdp);
-       /*
-        * If we try to do a huge PTE update after a withdraw is done,
-        * we will find the below NULL. This happens when we do
-        * split_huge_page_pmd.
-        */
-       if (!hpte_slot_array)
-               return;
-
-       if (mmu_hash_ops.hugepage_invalidate) {
-               mmu_hash_ops.hugepage_invalidate(vsid, s_addr, hpte_slot_array,
-                                                psize, ssize, local);
-               goto tm_abort;
-       }
-       /*
-        * No bulk hpte removal support, invalidate each entry
-        */
-       shift = mmu_psize_defs[psize].shift;
-       max_hpte_count = HPAGE_PMD_SIZE >> shift;
-       for (i = 0; i < max_hpte_count; i++) {
-               /*
-                * 8 bits per hpte entry:
-                * 000| [ secondary group (one bit) | hidx (3 bits) | valid bit]
-                */
-               valid = hpte_valid(hpte_slot_array, i);
-               if (!valid)
-                       continue;
-               hidx =  hpte_hash_index(hpte_slot_array, i);
-
-               /* get the vpn */
-               addr = s_addr + (i * (1ul << shift));
-               vpn = hpt_vpn(addr, vsid, ssize);
-               hash = hpt_hash(vpn, shift, ssize);
-               if (hidx & _PTEIDX_SECONDARY)
-                       hash = ~hash;
-
-               slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-               slot += hidx & _PTEIDX_GROUP_IX;
-               mmu_hash_ops.hpte_invalidate(slot, vpn, psize,
-                                            MMU_PAGE_16M, ssize, local);
-       }
-tm_abort:
-       tm_flush_hash_page(local);
-}
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-
-void flush_hash_range(unsigned long number, int local)
-{
-       if (mmu_hash_ops.flush_hash_range)
-               mmu_hash_ops.flush_hash_range(number, local);
-       else {
-               int i;
-               struct ppc64_tlb_batch *batch =
-                       this_cpu_ptr(&ppc64_tlb_batch);
-
-               for (i = 0; i < number; i++)
-                       flush_hash_page(batch->vpn[i], batch->pte[i],
-                                       batch->psize, batch->ssize, local);
-       }
-}
-
-/*
- * low_hash_fault is called when the low level hash code failed
- * to insert a PTE due to a hypervisor error
- */
-void low_hash_fault(struct pt_regs *regs, unsigned long address, int rc)
-{
-       enum ctx_state prev_state = exception_enter();
-
-       if (user_mode(regs)) {
-#ifdef CONFIG_PPC_SUBPAGE_PROT
-               if (rc == -2)
-                       _exception(SIGSEGV, regs, SEGV_ACCERR, address);
-               else
-#endif
-                       _exception(SIGBUS, regs, BUS_ADRERR, address);
-       } else
-               bad_page_fault(regs, address, SIGBUS);
-
-       exception_exit(prev_state);
-}
-
-long hpte_insert_repeating(unsigned long hash, unsigned long vpn,
-                          unsigned long pa, unsigned long rflags,
-                          unsigned long vflags, int psize, int ssize)
-{
-       unsigned long hpte_group;
-       long slot;
-
-repeat:
-       hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-
-       /* Insert into the hash table, primary slot */
-       slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, vflags,
-                                       psize, psize, ssize);
-
-       /* Primary is full, try the secondary */
-       if (unlikely(slot == -1)) {
-               hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP;
-               slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags,
-                                               vflags | HPTE_V_SECONDARY,
-                                               psize, psize, ssize);
-               if (slot == -1) {
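-                       /*
-                        * Both groups are full: pick one pseudo-randomly
-                        * (timebase LSB), evict an entry and try again.
-                        */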
-                       if (mftb() & 0x1)
-                               hpte_group = (hash & htab_hash_mask) *
-                                               HPTES_PER_GROUP;
-
-                       mmu_hash_ops.hpte_remove(hpte_group);
-                       goto repeat;
-               }
-       }
-
-       return slot;
-}
-
-#ifdef CONFIG_DEBUG_PAGEALLOC
-static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi)
-{
-       unsigned long hash;
-       unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize);
-       unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize);
-       unsigned long mode = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL));
-       long ret;
-
-       hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize);
-
-       /* Don't create HPTE entries for bad address */
-       if (!vsid)
-               return;
-
-       ret = hpte_insert_repeating(hash, vpn, __pa(vaddr), mode,
-                                   HPTE_V_BOLTED,
-                                   mmu_linear_psize, mmu_kernel_ssize);
-
-       BUG_ON(ret < 0);
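-       /*
-        * Remember the slot for kernel_unmap_linear_page(): the low 7 bits
-        * hold the hidx (secondary bit + slot within the group), 0x80 marks
-        * the entry as mapped.
-        */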
-       spin_lock(&linear_map_hash_lock);
-       BUG_ON(linear_map_hash_slots[lmi] & 0x80);
-       linear_map_hash_slots[lmi] = ret | 0x80;
-       spin_unlock(&linear_map_hash_lock);
-}
-
-static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi)
-{
-       unsigned long hash, hidx, slot;
-       unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize);
-       unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize);
-
-       hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize);
-       spin_lock(&linear_map_hash_lock);
-       BUG_ON(!(linear_map_hash_slots[lmi] & 0x80));
-       hidx = linear_map_hash_slots[lmi] & 0x7f;
-       linear_map_hash_slots[lmi] = 0;
-       spin_unlock(&linear_map_hash_lock);
-       if (hidx & _PTEIDX_SECONDARY)
-               hash = ~hash;
-       slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-       slot += hidx & _PTEIDX_GROUP_IX;
-       mmu_hash_ops.hpte_invalidate(slot, vpn, mmu_linear_psize,
-                                    mmu_linear_psize,
-                                    mmu_kernel_ssize, 0);
-}
-
-void __kernel_map_pages(struct page *page, int numpages, int enable)
-{
-       unsigned long flags, vaddr, lmi;
-       int i;
-
-       local_irq_save(flags);
-       for (i = 0; i < numpages; i++, page++) {
-               vaddr = (unsigned long)page_address(page);
-               lmi = __pa(vaddr) >> PAGE_SHIFT;
-               if (lmi >= linear_map_hash_count)
-                       continue;
-               if (enable)
-                       kernel_map_linear_page(vaddr, lmi);
-               else
-                       kernel_unmap_linear_page(vaddr, lmi);
-       }
-       local_irq_restore(flags);
-}
-#endif /* CONFIG_DEBUG_PAGEALLOC */
-
-void hash__setup_initial_memory_limit(phys_addr_t first_memblock_base,
-                               phys_addr_t first_memblock_size)
-{
-       /* We don't currently support the first MEMBLOCK not mapping 0
-        * physical on those processors
-        */
-       BUG_ON(first_memblock_base != 0);
-
-       /*
-        * On virtualized systems the first entry is our RMA region aka VRMA,
-        * non-virtualized 64-bit hash MMU systems don't have a limitation
-        * on real mode access.
-        *
-        * For guests on platforms before POWER9, we clamp the RMA size to 1G
-        * to avoid some funky things such as RTAS bugs etc...
-        */
-       if (!early_cpu_has_feature(CPU_FTR_HVMODE)) {
-               ppc64_rma_size = first_memblock_size;
-               if (!early_cpu_has_feature(CPU_FTR_ARCH_300))
-                       ppc64_rma_size = min_t(u64, ppc64_rma_size, 0x40000000);
-
-               /* Finally limit subsequent allocations */
-               memblock_set_current_limit(ppc64_rma_size);
-       } else {
-               ppc64_rma_size = ULONG_MAX;
-       }
-}
-
-#ifdef CONFIG_DEBUG_FS
-
-static int hpt_order_get(void *data, u64 *val)
-{
-       *val = ppc64_pft_size;
-       return 0;
-}
-
-static int hpt_order_set(void *data, u64 val)
-{
-       if (!mmu_hash_ops.resize_hpt)
-               return -ENODEV;
-
-       return mmu_hash_ops.resize_hpt(val);
-}
-
-DEFINE_DEBUGFS_ATTRIBUTE(fops_hpt_order, hpt_order_get, hpt_order_set, "%llu\n");
-
-static int __init hash64_debugfs(void)
-{
-       if (!debugfs_create_file_unsafe("hpt_order", 0600, powerpc_debugfs_root,
-                                       NULL, &fops_hpt_order)) {
-               pr_err("lpar: unable to create hpt_order debugfs file\n");
-       }
-
-       return 0;
-}
-machine_device_initcall(pseries, hash64_debugfs);
-#endif /* CONFIG_DEBUG_FS */
diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c
deleted file mode 100644 (file)
index dfbc3b3..0000000
+++ /dev/null
@@ -1,191 +0,0 @@
-/*
- * Copyright IBM Corporation, 2013
- * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2.1 of the GNU Lesser General Public License
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- */
-
-/*
- * PPC64 THP Support for hash based MMUs
- */
-#include <linux/mm.h>
-#include <asm/machdep.h>
-
-int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
-                   pmd_t *pmdp, unsigned long trap, unsigned long flags,
-                   int ssize, unsigned int psize)
-{
-       unsigned int index, valid;
-       unsigned char *hpte_slot_array;
-       unsigned long rflags, pa, hidx;
-       unsigned long old_pmd, new_pmd;
-       int ret, lpsize = MMU_PAGE_16M;
-       unsigned long vpn, hash, shift, slot;
-
-       /*
-        * atomically mark the linux large page PMD busy and dirty
-        */
-       do {
-               pmd_t pmd = READ_ONCE(*pmdp);
-
-               old_pmd = pmd_val(pmd);
-               /* If PMD busy, retry the access */
-               if (unlikely(old_pmd & H_PAGE_BUSY))
-                       return 0;
-               /* If PMD permissions don't match, take page fault */
-               if (unlikely(!check_pte_access(access, old_pmd)))
-                       return 1;
-               /*
-                * Try to lock the PTE, add ACCESSED and DIRTY if it was
-                * a write access
-                */
-               new_pmd = old_pmd | H_PAGE_BUSY | _PAGE_ACCESSED;
-               if (access & _PAGE_WRITE)
-                       new_pmd |= _PAGE_DIRTY;
-       } while (!pmd_xchg(pmdp, __pmd(old_pmd), __pmd(new_pmd)));
-
-       /*
-        * Make sure this is thp or devmap entry
-        */
-       if (!(old_pmd & (H_PAGE_THP_HUGE | _PAGE_DEVMAP)))
-               return 0;
-
-       rflags = htab_convert_pte_flags(new_pmd);
-
-#if 0
-       if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) {
-
-               /*
-                * No CPU that has hugepages lacks no-execute, so we
-                * don't need to worry about that case.
-                */
-               rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
-       }
-#endif
-       /*
-        * Find the slot index details for this ea, using base page size.
-        */
-       shift = mmu_psize_defs[psize].shift;
-       index = (ea & ~HPAGE_PMD_MASK) >> shift;
-       BUG_ON(index >= PTE_FRAG_SIZE);
-
-       vpn = hpt_vpn(ea, vsid, ssize);
-       hpte_slot_array = get_hpte_slot_array(pmdp);
-       if (psize == MMU_PAGE_4K) {
-               /*
-                * invalidate the old hpte entry if we have that mapped via 64K
-                * base page size. This is because demote_segment won't flush
-                * hash page table entries.
-                */
-               if ((old_pmd & H_PAGE_HASHPTE) && !(old_pmd & H_PAGE_COMBO)) {
-                       flush_hash_hugepage(vsid, ea, pmdp, MMU_PAGE_64K,
-                                           ssize, flags);
-                       /*
-                        * With THP, we also clear the slot information with
-                        * respect to all the 64K hash pte mapping the 16MB
-                        * page. They are all invalid now. This makes sure we
-                        * don't find the slot valid when we fault with 4k
-                        * base page size.
-                        *
-                        */
-                       memset(hpte_slot_array, 0, PTE_FRAG_SIZE);
-               }
-       }
-
-       valid = hpte_valid(hpte_slot_array, index);
-       if (valid) {
-               /* update the hpte bits */
-               hash = hpt_hash(vpn, shift, ssize);
-               hidx =  hpte_hash_index(hpte_slot_array, index);
-               if (hidx & _PTEIDX_SECONDARY)
-                       hash = ~hash;
-               slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-               slot += hidx & _PTEIDX_GROUP_IX;
-
-               ret = mmu_hash_ops.hpte_updatepp(slot, rflags, vpn,
-                                                psize, lpsize, ssize, flags);
-               /*
-                * We failed to update, try to insert a new entry.
-                */
-               if (ret == -1) {
-                       /*
-                        * The large PTE is marked busy, so we can be sure
-                        * nobody is looking at hpte_slot_array. Hence we can
-                        * safely update this here.
-                        */
-                       valid = 0;
-                       hpte_slot_array[index] = 0;
-               }
-       }
-
-       if (!valid) {
-               unsigned long hpte_group;
-
-               hash = hpt_hash(vpn, shift, ssize);
-               /* insert new entry */
-               pa = pmd_pfn(__pmd(old_pmd)) << PAGE_SHIFT;
-               new_pmd |= H_PAGE_HASHPTE;
-
-repeat:
-               hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-
-               /* Insert into the hash table, primary slot */
-               slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0,
-                                               psize, lpsize, ssize);
-               /*
-                * Primary is full, try the secondary
-                */
-               if (unlikely(slot == -1)) {
-                       hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP;
-                       slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa,
-                                                       rflags,
-                                                       HPTE_V_SECONDARY,
-                                                       psize, lpsize, ssize);
-                       if (slot == -1) {
-                               if (mftb() & 0x1)
-                                       hpte_group = (hash & htab_hash_mask) *
-                                                       HPTES_PER_GROUP;
-
-                               mmu_hash_ops.hpte_remove(hpte_group);
-                               goto repeat;
-                       }
-               }
-               /*
-                * Hypervisor failure. Restore old pmd and return -1
-                * similar to __hash_page_*
-                */
-               if (unlikely(slot == -2)) {
-                       *pmdp = __pmd(old_pmd);
-                       hash_failure_debug(ea, access, vsid, trap, ssize,
-                                          psize, lpsize, old_pmd);
-                       return -1;
-               }
-               /*
-                * The large PTE is marked busy, so we can be sure
-                * nobody is looking at hpte_slot_array. Hence we can
-                * safely update this here.
-                */
-               mark_hpte_slot_valid(hpte_slot_array, index, slot);
-       }
-       /*
-        * Mark the pte with H_PAGE_COMBO, if we are trying to hash it with
-        * base page size 4k.
-        */
-       if (psize == MMU_PAGE_4K)
-               new_pmd |= H_PAGE_COMBO;
-       /*
-        * The hpte valid bit is stored in the pgtable whose address is in the
-        * second half of the PMD. Order this against clearing of the busy bit
-        * in the huge pmd.
-        */
-       smp_wmb();
-       *pmdp = __pmd(new_pmd & ~H_PAGE_BUSY);
-       return 0;
-}
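
As an illustration of the slot arithmetic used throughout __hash_page_thp() above (and in the linear-map code earlier in this patch), here is a minimal userspace sketch of how a hash value and the per-sub-page hidx combine into a global HPTE slot, with the secondary hash taken as the complement of the primary. The group size matches the hash MMU's 8-entry groups, but the mask and flag values are hard-coded here as illustrative assumptions rather than taken from kernel headers.

#include <stdint.h>
#include <stdio.h>

#define HPTES_PER_GROUP   8      /* hash MMU group size */
#define PTEIDX_SECONDARY  0x8    /* illustrative: "secondary hash" flag in hidx */
#define PTEIDX_GROUP_IX   0x7    /* illustrative: slot-within-group bits in hidx */

/* Compute a global HPTE slot from a hash value and a stored hidx,
 * mirroring the "hash = ~hash on secondary" pattern used above. */
static unsigned long hpte_global_slot(unsigned long hash, unsigned long hidx,
                                      unsigned long htab_hash_mask)
{
        unsigned long slot;

        if (hidx & PTEIDX_SECONDARY)
                hash = ~hash;                    /* secondary hash is the complement */
        slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
        slot += hidx & PTEIDX_GROUP_IX;          /* slot within the 8-entry group */
        return slot;
}

int main(void)
{
        unsigned long htab_hash_mask = 0xffff;   /* assumed hash table with 64K groups */
        unsigned long hash = 0x123456789abcdefUL;

        printf("primary   slot: %lu\n", hpte_global_slot(hash, 0x3, htab_hash_mask));
        printf("secondary slot: %lu\n", hpte_global_slot(hash, 0x8 | 0x3, htab_hash_mask));
        return 0;
}
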
diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c
deleted file mode 100644 (file)
index b0d9209..0000000
+++ /dev/null
@@ -1,147 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * PPC64 Huge TLB Page Support for hash based MMUs (POWER4 and later)
- *
- * Copyright (C) 2003 David Gibson, IBM Corporation.
- *
- * Based on the IA-32 version:
- * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
- */
-
-#include <linux/mm.h>
-#include <linux/hugetlb.h>
-#include <asm/pgtable.h>
-#include <asm/pgalloc.h>
-#include <asm/cacheflush.h>
-#include <asm/machdep.h>
-
-extern long hpte_insert_repeating(unsigned long hash, unsigned long vpn,
-                                 unsigned long pa, unsigned long rflags,
-                                 unsigned long vflags, int psize, int ssize);
-
-int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
-                    pte_t *ptep, unsigned long trap, unsigned long flags,
-                    int ssize, unsigned int shift, unsigned int mmu_psize)
-{
-       real_pte_t rpte;
-       unsigned long vpn;
-       unsigned long old_pte, new_pte;
-       unsigned long rflags, pa;
-       long slot, offset;
-
-       BUG_ON(shift != mmu_psize_defs[mmu_psize].shift);
-
-       /* Search the Linux page table for a match with va */
-       vpn = hpt_vpn(ea, vsid, ssize);
-
-       /* At this point, we have a pte (old_pte) which can be used to build
-        * or update an HPTE. There are 2 cases:
-        *
-        * 1. There is a valid (present) pte with no associated HPTE (this is
-        *      the most common case)
-        * 2. There is a valid (present) pte with an associated HPTE. The
-        *      current values of the pp bits in the HPTE prevent access
-        *      because we are doing software DIRTY bit management and the
-        *      page is currently not DIRTY.
-        */
-
-
-       do {
-               old_pte = pte_val(*ptep);
-               /* If PTE busy, retry the access */
-               if (unlikely(old_pte & H_PAGE_BUSY))
-                       return 0;
-               /* If PTE permissions don't match, take page fault */
-               if (unlikely(!check_pte_access(access, old_pte)))
-                       return 1;
-
-               /* Try to lock the PTE, add ACCESSED and DIRTY if it was
-                * a write access */
-               new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED;
-               if (access & _PAGE_WRITE)
-                       new_pte |= _PAGE_DIRTY;
-       } while(!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
-
-       /* Make sure this is a hugetlb entry */
-       if (old_pte & (H_PAGE_THP_HUGE | _PAGE_DEVMAP))
-               return 0;
-
-       rflags = htab_convert_pte_flags(new_pte);
-       if (unlikely(mmu_psize == MMU_PAGE_16G))
-               offset = PTRS_PER_PUD;
-       else
-               offset = PTRS_PER_PMD;
-       rpte = __real_pte(__pte(old_pte), ptep, offset);
-
-       if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
-               /* No CPU that has hugepages lacks no-execute, so we
-                * don't need to worry about that case */
-               rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
-
-       /* Check if pte already has an hpte (case 2) */
-       if (unlikely(old_pte & H_PAGE_HASHPTE)) {
-               /* There MIGHT be an HPTE for this pte */
-               unsigned long gslot;
-
-               gslot = pte_get_hash_gslot(vpn, shift, ssize, rpte, 0);
-               if (mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn, mmu_psize,
-                                              mmu_psize, ssize, flags) == -1)
-                       old_pte &= ~_PAGE_HPTEFLAGS;
-       }
-
-       if (likely(!(old_pte & H_PAGE_HASHPTE))) {
-               unsigned long hash = hpt_hash(vpn, shift, ssize);
-
-               pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
-
-               /* clear HPTE slot information in the new PTE */
-               new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE;
-
-               slot = hpte_insert_repeating(hash, vpn, pa, rflags, 0,
-                                            mmu_psize, ssize);
-
-               /*
-                * Hypervisor failure. Restore old pte and return -1
-                * similar to __hash_page_*
-                */
-               if (unlikely(slot == -2)) {
-                       *ptep = __pte(old_pte);
-                       hash_failure_debug(ea, access, vsid, trap, ssize,
-                                          mmu_psize, mmu_psize, old_pte);
-                       return -1;
-               }
-
-               new_pte |= pte_set_hidx(ptep, rpte, 0, slot, offset);
-       }
-
-       /*
-        * No need to use ldarx/stdcx here
-        */
-       *ptep = __pte(new_pte & ~H_PAGE_BUSY);
-       return 0;
-}
-
-pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma,
-                                 unsigned long addr, pte_t *ptep)
-{
-       unsigned long pte_val;
-       /*
-        * Clear the _PAGE_PRESENT so that no hardware parallel update is
-        * possible. Also keep the pte_present true so that we don't take a
-        * wrong fault.
-        */
-       pte_val = pte_update(vma->vm_mm, addr, ptep,
-                            _PAGE_PRESENT, _PAGE_INVALID, 1);
-
-       return __pte(pte_val);
-}
-
-void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
-                                 pte_t *ptep, pte_t old_pte, pte_t pte)
-{
-
-       if (radix_enabled())
-               return radix__huge_ptep_modify_prot_commit(vma, addr, ptep,
-                                                          old_pte, pte);
-       set_huge_pte_at(vma->vm_mm, addr, ptep, pte);
-}
diff --git a/arch/powerpc/mm/hugetlbpage-radix.c b/arch/powerpc/mm/hugetlbpage-radix.c
deleted file mode 100644 (file)
index cab0633..0000000
+++ /dev/null
@@ -1,110 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/mm.h>
-#include <linux/hugetlb.h>
-#include <linux/security.h>
-#include <asm/pgtable.h>
-#include <asm/pgalloc.h>
-#include <asm/cacheflush.h>
-#include <asm/machdep.h>
-#include <asm/mman.h>
-#include <asm/tlb.h>
-
-void radix__flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
-{
-       int psize;
-       struct hstate *hstate = hstate_file(vma->vm_file);
-
-       psize = hstate_get_psize(hstate);
-       radix__flush_tlb_page_psize(vma->vm_mm, vmaddr, psize);
-}
-
-void radix__local_flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
-{
-       int psize;
-       struct hstate *hstate = hstate_file(vma->vm_file);
-
-       psize = hstate_get_psize(hstate);
-       radix__local_flush_tlb_page_psize(vma->vm_mm, vmaddr, psize);
-}
-
-void radix__flush_hugetlb_tlb_range(struct vm_area_struct *vma, unsigned long start,
-                                  unsigned long end)
-{
-       int psize;
-       struct hstate *hstate = hstate_file(vma->vm_file);
-
-       psize = hstate_get_psize(hstate);
-       radix__flush_tlb_range_psize(vma->vm_mm, start, end, psize);
-}
-
-/*
- * A variant of hugetlb_get_unmapped_area doing a topdown search.
- * FIXME!! should we do as x86 does, or as the non-hugetlb area does?
- * i.e. use topdown or not based on the mmap_is_legacy check?
- */
-unsigned long
-radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
-                               unsigned long len, unsigned long pgoff,
-                               unsigned long flags)
-{
-       struct mm_struct *mm = current->mm;
-       struct vm_area_struct *vma;
-       struct hstate *h = hstate_file(file);
-       int fixed = (flags & MAP_FIXED);
-       unsigned long high_limit;
-       struct vm_unmapped_area_info info;
-
-       high_limit = DEFAULT_MAP_WINDOW;
-       if (addr >= high_limit || (fixed && (addr + len > high_limit)))
-               high_limit = TASK_SIZE;
-
-       if (len & ~huge_page_mask(h))
-               return -EINVAL;
-       if (len > high_limit)
-               return -ENOMEM;
-
-       if (fixed) {
-               if (addr > high_limit - len)
-                       return -ENOMEM;
-               if (prepare_hugepage_range(file, addr, len))
-                       return -EINVAL;
-               return addr;
-       }
-
-       if (addr) {
-               addr = ALIGN(addr, huge_page_size(h));
-               vma = find_vma(mm, addr);
-               if (high_limit - len >= addr && addr >= mmap_min_addr &&
-                   (!vma || addr + len <= vm_start_gap(vma)))
-                       return addr;
-       }
-       /*
-        * We are always doing an topdown search here. Slice code
-        * We are always doing a topdown search here. The slice code
-        */
-       info.flags = VM_UNMAPPED_AREA_TOPDOWN;
-       info.length = len;
-       info.low_limit = max(PAGE_SIZE, mmap_min_addr);
-       info.high_limit = mm->mmap_base + (high_limit - DEFAULT_MAP_WINDOW);
-       info.align_mask = PAGE_MASK & ~huge_page_mask(h);
-       info.align_offset = 0;
-
-       return vm_unmapped_area(&info);
-}
-
-void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
-                                        unsigned long addr, pte_t *ptep,
-                                        pte_t old_pte, pte_t pte)
-{
-       struct mm_struct *mm = vma->vm_mm;
-
-       /*
-        * To avoid an NMMU hang while relaxing access, we need to flush the TLB
-        * before we set the new value.
-        */
-       if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
-           (atomic_read(&mm->context.copros) > 0))
-               radix__flush_hugetlb_page(vma, addr);
-
-       set_huge_pte_at(vma->vm_mm, addr, ptep, pte);
-}
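
A small userspace sketch of the address-window selection at the top of radix__hugetlb_get_unmapped_area() above: a hint at or beyond the default user window, or a fixed mapping that would cross it, switches the search limit to the full task size. The window sizes below are illustrative assumptions for a 64K-page configuration, not values taken from kernel headers.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define DEFAULT_MAP_WINDOW_ASSUMED (1ULL << 47)   /* assumed 128TB default user window */
#define TASK_SIZE_FULL_ASSUMED     (1ULL << 52)   /* assumed full 4PB address space */

/* Pick the high limit for the unmapped-area search, as the code above does. */
static uint64_t pick_high_limit(uint64_t addr, uint64_t len, bool fixed)
{
        uint64_t high_limit = DEFAULT_MAP_WINDOW_ASSUMED;

        if (addr >= high_limit || (fixed && addr + len > high_limit))
                high_limit = TASK_SIZE_FULL_ASSUMED;
        return high_limit;
}

int main(void)
{
        /* A hint below the default window keeps the 128TB limit ... */
        printf("0x%llx\n", (unsigned long long)
               pick_high_limit(0x7fff00000000ULL, 1 << 24, false));
        /* ... while a hint above it opens up the full task size. */
        printf("0x%llx\n", (unsigned long long)
               pick_high_limit(1ULL << 48, 1 << 24, false));
        return 0;
}
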
diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c
deleted file mode 100644 (file)
index cb2b086..0000000
+++ /dev/null
@@ -1,263 +0,0 @@
-/*
- *  MMU context allocation for 64-bit kernels.
- *
- *  Copyright (C) 2004 Anton Blanchard, IBM Corp. <anton@samba.org>
- *
- *  This program is free software; you can redistribute it and/or
- *  modify it under the terms of the GNU General Public License
- *  as published by the Free Software Foundation; either version
- *  2 of the License, or (at your option) any later version.
- *
- */
-
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/mm.h>
-#include <linux/pkeys.h>
-#include <linux/spinlock.h>
-#include <linux/idr.h>
-#include <linux/export.h>
-#include <linux/gfp.h>
-#include <linux/slab.h>
-
-#include <asm/mmu_context.h>
-#include <asm/pgalloc.h>
-
-static DEFINE_IDA(mmu_context_ida);
-
-static int alloc_context_id(int min_id, int max_id)
-{
-       return ida_alloc_range(&mmu_context_ida, min_id, max_id, GFP_KERNEL);
-}
-
-void hash__reserve_context_id(int id)
-{
-       int result = ida_alloc_range(&mmu_context_ida, id, id, GFP_KERNEL);
-
-       WARN(result != id, "mmu: Failed to reserve context id %d (rc %d)\n", id, result);
-}
-
-int hash__alloc_context_id(void)
-{
-       unsigned long max;
-
-       if (mmu_has_feature(MMU_FTR_68_BIT_VA))
-               max = MAX_USER_CONTEXT;
-       else
-               max = MAX_USER_CONTEXT_65BIT_VA;
-
-       return alloc_context_id(MIN_USER_CONTEXT, max);
-}
-EXPORT_SYMBOL_GPL(hash__alloc_context_id);
-
-void slb_setup_new_exec(void);
-
-static int hash__init_new_context(struct mm_struct *mm)
-{
-       int index;
-
-       index = hash__alloc_context_id();
-       if (index < 0)
-               return index;
-
-       mm->context.hash_context = kmalloc(sizeof(struct hash_mm_context),
-                                          GFP_KERNEL);
-       if (!mm->context.hash_context) {
-               ida_free(&mmu_context_ida, index);
-               return -ENOMEM;
-       }
-
-       /*
-        * The old code would re-promote on fork; we don't do that when using
-        * slices as it could cause problems promoting slices that have been
-        * forced down to 4K.
-        *
-        * For book3s we have MMU_NO_CONTEXT set to be ~0. Hence check
-        * explicitly against context.id == 0. This ensures that we properly
-        * initialize context slice details for newly allocated mm's (which will
-        * have id == 0) and don't alter context slice inherited via fork (which
-        * will have id != 0).
-        *
-        * We should not be calling init_new_context() on init_mm. Hence a
-        * check against 0 is OK.
-        */
-       if (mm->context.id == 0) {
-               memset(mm->context.hash_context, 0, sizeof(struct hash_mm_context));
-               slice_init_new_context_exec(mm);
-       } else {
-               /* This is fork. Copy hash_context details from current->mm */
-               memcpy(mm->context.hash_context, current->mm->context.hash_context, sizeof(struct hash_mm_context));
-#ifdef CONFIG_PPC_SUBPAGE_PROT
-               /* Inherit subpage prot details if we have them. */
-               if (current->mm->context.hash_context->spt) {
-                       mm->context.hash_context->spt = kmalloc(sizeof(struct subpage_prot_table),
-                                                               GFP_KERNEL);
-                       if (!mm->context.hash_context->spt) {
-                               ida_free(&mmu_context_ida, index);
-                               kfree(mm->context.hash_context);
-                               return -ENOMEM;
-                       }
-               }
-#endif
-
-       }
-
-       pkey_mm_init(mm);
-       return index;
-}
-
-void hash__setup_new_exec(void)
-{
-       slice_setup_new_exec();
-
-       slb_setup_new_exec();
-}
-
-static int radix__init_new_context(struct mm_struct *mm)
-{
-       unsigned long rts_field;
-       int index, max_id;
-
-       max_id = (1 << mmu_pid_bits) - 1;
-       index = alloc_context_id(mmu_base_pid, max_id);
-       if (index < 0)
-               return index;
-
-       /*
-        * Set the process table entry.
-        */
-       rts_field = radix__get_tree_size();
-       process_tb[index].prtb0 = cpu_to_be64(rts_field | __pa(mm->pgd) | RADIX_PGD_INDEX_SIZE);
-
-       /*
-        * Order the above store with subsequent update of the PID
-        * register (at which point HW can start loading/caching
-        * the entry) and the corresponding load by the MMU from
-        * the L2 cache.
-        */
-       asm volatile("ptesync;isync" : : : "memory");
-
-       mm->context.npu_context = NULL;
-       mm->context.hash_context = NULL;
-
-       return index;
-}
-
-int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
-{
-       int index;
-
-       if (radix_enabled())
-               index = radix__init_new_context(mm);
-       else
-               index = hash__init_new_context(mm);
-
-       if (index < 0)
-               return index;
-
-       mm->context.id = index;
-
-       mm->context.pte_frag = NULL;
-       mm->context.pmd_frag = NULL;
-#ifdef CONFIG_SPAPR_TCE_IOMMU
-       mm_iommu_init(mm);
-#endif
-       atomic_set(&mm->context.active_cpus, 0);
-       atomic_set(&mm->context.copros, 0);
-
-       return 0;
-}
-
-void __destroy_context(int context_id)
-{
-       ida_free(&mmu_context_ida, context_id);
-}
-EXPORT_SYMBOL_GPL(__destroy_context);
-
-static void destroy_contexts(mm_context_t *ctx)
-{
-       int index, context_id;
-
-       for (index = 0; index < ARRAY_SIZE(ctx->extended_id); index++) {
-               context_id = ctx->extended_id[index];
-               if (context_id)
-                       ida_free(&mmu_context_ida, context_id);
-       }
-       kfree(ctx->hash_context);
-}
-
-static void pmd_frag_destroy(void *pmd_frag)
-{
-       int count;
-       struct page *page;
-
-       page = virt_to_page(pmd_frag);
-       /* drop all the pending references */
-       count = ((unsigned long)pmd_frag & ~PAGE_MASK) >> PMD_FRAG_SIZE_SHIFT;
-       /* We allow PMD_FRAG_NR fragments from a PMD page */
-       if (atomic_sub_and_test(PMD_FRAG_NR - count, &page->pt_frag_refcount)) {
-               pgtable_pmd_page_dtor(page);
-               __free_page(page);
-       }
-}
-
-static void destroy_pagetable_cache(struct mm_struct *mm)
-{
-       void *frag;
-
-       frag = mm->context.pte_frag;
-       if (frag)
-               pte_frag_destroy(frag);
-
-       frag = mm->context.pmd_frag;
-       if (frag)
-               pmd_frag_destroy(frag);
-       return;
-}
-
-void destroy_context(struct mm_struct *mm)
-{
-#ifdef CONFIG_SPAPR_TCE_IOMMU
-       WARN_ON_ONCE(!list_empty(&mm->context.iommu_group_mem_list));
-#endif
-       if (radix_enabled())
-               WARN_ON(process_tb[mm->context.id].prtb0 != 0);
-       else
-               subpage_prot_free(mm);
-       destroy_contexts(&mm->context);
-       mm->context.id = MMU_NO_CONTEXT;
-}
-
-void arch_exit_mmap(struct mm_struct *mm)
-{
-       destroy_pagetable_cache(mm);
-
-       if (radix_enabled()) {
-               /*
-                * Radix doesn't have a valid bit in the process table
-                * entries. However we know that at least the P9 implementation
-                * will avoid caching an entry with an invalid RTS field,
-                * and 0 is invalid. So this will do.
-                *
-                * This runs before the "fullmm" tlb flush in exit_mmap,
-                * which does a RIC=2 tlbie to clear the process table
-                * entry. See the "fullmm" comments in tlb-radix.c.
-                *
-                * No barrier required here after the store because
-                * this process will do the invalidate, which starts with
-                * ptesync.
-                */
-               process_tb[mm->context.id].prtb0 = 0;
-       }
-}
-
-#ifdef CONFIG_PPC_RADIX_MMU
-void radix__switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
-{
-       mtspr(SPRN_PID, next->context.id);
-       isync();
-}
-#endif
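
The context-id handling in hash__alloc_context_id() and radix__init_new_context() above leans on ida_alloc_range(), which hands out the lowest free id within a range and lets it be reused after ida_free(). Below is a minimal userspace stand-in for that behaviour, with made-up bounds in place of MIN_USER_CONTEXT/MAX_USER_CONTEXT (the real values depend on the VA width).

#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-in for ida_alloc_range(): lowest free id in [min, max]
 * from a small bitmap-backed id space. */
#define ID_SPACE 64

static bool used[ID_SPACE];

static int alloc_id_range(int min, int max)
{
        int id;

        for (id = min; id <= max && id < ID_SPACE; id++) {
                if (!used[id]) {
                        used[id] = true;
                        return id;
                }
        }
        return -1;              /* the kernel version returns -ENOSPC */
}

static void free_id(int id)
{
        used[id] = false;       /* id becomes available for the next caller */
}

int main(void)
{
        const int min_ctx = 2, max_ctx = 15;    /* hypothetical context-id bounds */
        int a = alloc_id_range(min_ctx, max_ctx);
        int b = alloc_id_range(min_ctx, max_ctx);

        printf("allocated context ids %d and %d\n", a, b);
        free_id(a);
        printf("reallocated id %d\n", alloc_id_range(min_ctx, max_ctx));
        return 0;
}
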
diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c
deleted file mode 100644 (file)
index e7a9c4f..0000000
+++ /dev/null
@@ -1,482 +0,0 @@
-/*
- *  IOMMU helpers in MMU context.
- *
- *  Copyright (C) 2015 IBM Corp. <aik@ozlabs.ru>
- *
- *  This program is free software; you can redistribute it and/or
- *  modify it under the terms of the GNU General Public License
- *  as published by the Free Software Foundation; either version
- *  2 of the License, or (at your option) any later version.
- *
- */
-
-#include <linux/sched/signal.h>
-#include <linux/slab.h>
-#include <linux/rculist.h>
-#include <linux/vmalloc.h>
-#include <linux/mutex.h>
-#include <linux/migrate.h>
-#include <linux/hugetlb.h>
-#include <linux/swap.h>
-#include <linux/sizes.h>
-#include <asm/mmu_context.h>
-#include <asm/pte-walk.h>
-#include <linux/mm_inline.h>
-
-static DEFINE_MUTEX(mem_list_mutex);
-
-#define MM_IOMMU_TABLE_GROUP_PAGE_DIRTY        0x1
-#define MM_IOMMU_TABLE_GROUP_PAGE_MASK ~(SZ_4K - 1)
-
-struct mm_iommu_table_group_mem_t {
-       struct list_head next;
-       struct rcu_head rcu;
-       unsigned long used;
-       atomic64_t mapped;
-       unsigned int pageshift;
-       u64 ua;                 /* userspace address */
-       u64 entries;            /* number of entries in hpas/hpages[] */
-       /*
-        * in mm_iommu_get we temporarily use this to store
-        * struct page address.
-        *
-        * We need to convert ua to hpa in real mode. Make it
-        * simpler by storing physical address.
-        */
-       union {
-               struct page **hpages;   /* vmalloc'ed */
-               phys_addr_t *hpas;
-       };
-#define MM_IOMMU_TABLE_INVALID_HPA     ((uint64_t)-1)
-       u64 dev_hpa;            /* Device memory base address */
-};
-
-static long mm_iommu_adjust_locked_vm(struct mm_struct *mm,
-               unsigned long npages, bool incr)
-{
-       long ret = 0, locked, lock_limit;
-
-       if (!npages)
-               return 0;
-
-       down_write(&mm->mmap_sem);
-
-       if (incr) {
-               locked = mm->locked_vm + npages;
-               lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-               if (locked > lock_limit && !capable(CAP_IPC_LOCK))
-                       ret = -ENOMEM;
-               else
-                       mm->locked_vm += npages;
-       } else {
-               if (WARN_ON_ONCE(npages > mm->locked_vm))
-                       npages = mm->locked_vm;
-               mm->locked_vm -= npages;
-       }
-
-       pr_debug("[%d] RLIMIT_MEMLOCK HASH64 %c%ld %ld/%ld\n",
-                       current ? current->pid : 0,
-                       incr ? '+' : '-',
-                       npages << PAGE_SHIFT,
-                       mm->locked_vm << PAGE_SHIFT,
-                       rlimit(RLIMIT_MEMLOCK));
-       up_write(&mm->mmap_sem);
-
-       return ret;
-}
-
-bool mm_iommu_preregistered(struct mm_struct *mm)
-{
-       return !list_empty(&mm->context.iommu_group_mem_list);
-}
-EXPORT_SYMBOL_GPL(mm_iommu_preregistered);
-
-static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua,
-                             unsigned long entries, unsigned long dev_hpa,
-                             struct mm_iommu_table_group_mem_t **pmem)
-{
-       struct mm_iommu_table_group_mem_t *mem;
-       long i, ret, locked_entries = 0;
-       unsigned int pageshift;
-
-       mutex_lock(&mem_list_mutex);
-
-       list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list,
-                       next) {
-               /* Overlap? */
-               if ((mem->ua < (ua + (entries << PAGE_SHIFT))) &&
-                               (ua < (mem->ua +
-                                      (mem->entries << PAGE_SHIFT)))) {
-                       ret = -EINVAL;
-                       goto unlock_exit;
-               }
-
-       }
-
-       if (dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) {
-               ret = mm_iommu_adjust_locked_vm(mm, entries, true);
-               if (ret)
-                       goto unlock_exit;
-
-               locked_entries = entries;
-       }
-
-       mem = kzalloc(sizeof(*mem), GFP_KERNEL);
-       if (!mem) {
-               ret = -ENOMEM;
-               goto unlock_exit;
-       }
-
-       if (dev_hpa != MM_IOMMU_TABLE_INVALID_HPA) {
-               mem->pageshift = __ffs(dev_hpa | (entries << PAGE_SHIFT));
-               mem->dev_hpa = dev_hpa;
-               goto good_exit;
-       }
-       mem->dev_hpa = MM_IOMMU_TABLE_INVALID_HPA;
-
-       /*
-        * As a starting point for the maximum page size calculation,
-        * we use the natural alignment of @ua and @entries to allow IOMMU pages
-        * smaller than huge pages but still bigger than PAGE_SIZE.
-        */
-       mem->pageshift = __ffs(ua | (entries << PAGE_SHIFT));
-       mem->hpas = vzalloc(array_size(entries, sizeof(mem->hpas[0])));
-       if (!mem->hpas) {
-               kfree(mem);
-               ret = -ENOMEM;
-               goto unlock_exit;
-       }
-
-       down_read(&mm->mmap_sem);
-       ret = get_user_pages_longterm(ua, entries, FOLL_WRITE, mem->hpages, NULL);
-       up_read(&mm->mmap_sem);
-       if (ret != entries) {
-               /* free the reference taken */
-               for (i = 0; i < ret; i++)
-                       put_page(mem->hpages[i]);
-
-               vfree(mem->hpas);
-               kfree(mem);
-               ret = -EFAULT;
-               goto unlock_exit;
-       }
-
-       pageshift = PAGE_SHIFT;
-       for (i = 0; i < entries; ++i) {
-               struct page *page = mem->hpages[i];
-
-               /*
-                * Allow using IOMMU pages larger than 64k. Only do that
-                * if we are backed by hugetlb.
-                */
-               if ((mem->pageshift > PAGE_SHIFT) && PageHuge(page)) {
-                       struct page *head = compound_head(page);
-
-                       pageshift = compound_order(head) + PAGE_SHIFT;
-               }
-               mem->pageshift = min(mem->pageshift, pageshift);
-               /*
-                * We don't need struct page reference any more, switch
-                * to physical address.
-                */
-               mem->hpas[i] = page_to_pfn(page) << PAGE_SHIFT;
-       }
-
-good_exit:
-       ret = 0;
-       atomic64_set(&mem->mapped, 1);
-       mem->used = 1;
-       mem->ua = ua;
-       mem->entries = entries;
-       *pmem = mem;
-
-       list_add_rcu(&mem->next, &mm->context.iommu_group_mem_list);
-
-unlock_exit:
-       if (locked_entries && ret)
-               mm_iommu_adjust_locked_vm(mm, locked_entries, false);
-
-       mutex_unlock(&mem_list_mutex);
-
-       return ret;
-}
-
-long mm_iommu_new(struct mm_struct *mm, unsigned long ua, unsigned long entries,
-               struct mm_iommu_table_group_mem_t **pmem)
-{
-       return mm_iommu_do_alloc(mm, ua, entries, MM_IOMMU_TABLE_INVALID_HPA,
-                       pmem);
-}
-EXPORT_SYMBOL_GPL(mm_iommu_new);
-
-long mm_iommu_newdev(struct mm_struct *mm, unsigned long ua,
-               unsigned long entries, unsigned long dev_hpa,
-               struct mm_iommu_table_group_mem_t **pmem)
-{
-       return mm_iommu_do_alloc(mm, ua, entries, dev_hpa, pmem);
-}
-EXPORT_SYMBOL_GPL(mm_iommu_newdev);
-
-static void mm_iommu_unpin(struct mm_iommu_table_group_mem_t *mem)
-{
-       long i;
-       struct page *page = NULL;
-
-       if (!mem->hpas)
-               return;
-
-       for (i = 0; i < mem->entries; ++i) {
-               if (!mem->hpas[i])
-                       continue;
-
-               page = pfn_to_page(mem->hpas[i] >> PAGE_SHIFT);
-               if (!page)
-                       continue;
-
-               if (mem->hpas[i] & MM_IOMMU_TABLE_GROUP_PAGE_DIRTY)
-                       SetPageDirty(page);
-
-               put_page(page);
-               mem->hpas[i] = 0;
-       }
-}
-
-static void mm_iommu_do_free(struct mm_iommu_table_group_mem_t *mem)
-{
-
-       mm_iommu_unpin(mem);
-       vfree(mem->hpas);
-       kfree(mem);
-}
-
-static void mm_iommu_free(struct rcu_head *head)
-{
-       struct mm_iommu_table_group_mem_t *mem = container_of(head,
-                       struct mm_iommu_table_group_mem_t, rcu);
-
-       mm_iommu_do_free(mem);
-}
-
-static void mm_iommu_release(struct mm_iommu_table_group_mem_t *mem)
-{
-       list_del_rcu(&mem->next);
-       call_rcu(&mem->rcu, mm_iommu_free);
-}
-
-long mm_iommu_put(struct mm_struct *mm, struct mm_iommu_table_group_mem_t *mem)
-{
-       long ret = 0;
-       unsigned long entries, dev_hpa;
-
-       mutex_lock(&mem_list_mutex);
-
-       if (mem->used == 0) {
-               ret = -ENOENT;
-               goto unlock_exit;
-       }
-
-       --mem->used;
-       /* There are still users, exit */
-       if (mem->used)
-               goto unlock_exit;
-
-       /* Are there still mappings? */
-       if (atomic_cmpxchg(&mem->mapped, 1, 0) != 1) {
-               ++mem->used;
-               ret = -EBUSY;
-               goto unlock_exit;
-       }
-
-       /* @mapped became 0 so now mappings are disabled, release the region */
-       entries = mem->entries;
-       dev_hpa = mem->dev_hpa;
-       mm_iommu_release(mem);
-
-       if (dev_hpa == MM_IOMMU_TABLE_INVALID_HPA)
-               mm_iommu_adjust_locked_vm(mm, entries, false);
-
-unlock_exit:
-       mutex_unlock(&mem_list_mutex);
-
-       return ret;
-}
-EXPORT_SYMBOL_GPL(mm_iommu_put);
-
-struct mm_iommu_table_group_mem_t *mm_iommu_lookup(struct mm_struct *mm,
-               unsigned long ua, unsigned long size)
-{
-       struct mm_iommu_table_group_mem_t *mem, *ret = NULL;
-
-       list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, next) {
-               if ((mem->ua <= ua) &&
-                               (ua + size <= mem->ua +
-                                (mem->entries << PAGE_SHIFT))) {
-                       ret = mem;
-                       break;
-               }
-       }
-
-       return ret;
-}
-EXPORT_SYMBOL_GPL(mm_iommu_lookup);
-
-struct mm_iommu_table_group_mem_t *mm_iommu_lookup_rm(struct mm_struct *mm,
-               unsigned long ua, unsigned long size)
-{
-       struct mm_iommu_table_group_mem_t *mem, *ret = NULL;
-
-       list_for_each_entry_lockless(mem, &mm->context.iommu_group_mem_list,
-                       next) {
-               if ((mem->ua <= ua) &&
-                               (ua + size <= mem->ua +
-                                (mem->entries << PAGE_SHIFT))) {
-                       ret = mem;
-                       break;
-               }
-       }
-
-       return ret;
-}
-
-struct mm_iommu_table_group_mem_t *mm_iommu_get(struct mm_struct *mm,
-               unsigned long ua, unsigned long entries)
-{
-       struct mm_iommu_table_group_mem_t *mem, *ret = NULL;
-
-       mutex_lock(&mem_list_mutex);
-
-       list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, next) {
-               if ((mem->ua == ua) && (mem->entries == entries)) {
-                       ret = mem;
-                       ++mem->used;
-                       break;
-               }
-       }
-
-       mutex_unlock(&mem_list_mutex);
-
-       return ret;
-}
-EXPORT_SYMBOL_GPL(mm_iommu_get);
-
-long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
-               unsigned long ua, unsigned int pageshift, unsigned long *hpa)
-{
-       const long entry = (ua - mem->ua) >> PAGE_SHIFT;
-       u64 *va;
-
-       if (entry >= mem->entries)
-               return -EFAULT;
-
-       if (pageshift > mem->pageshift)
-               return -EFAULT;
-
-       if (!mem->hpas) {
-               *hpa = mem->dev_hpa + (ua - mem->ua);
-               return 0;
-       }
-
-       va = &mem->hpas[entry];
-       *hpa = (*va & MM_IOMMU_TABLE_GROUP_PAGE_MASK) | (ua & ~PAGE_MASK);
-
-       return 0;
-}
-EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa);
-
-long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem,
-               unsigned long ua, unsigned int pageshift, unsigned long *hpa)
-{
-       const long entry = (ua - mem->ua) >> PAGE_SHIFT;
-       unsigned long *pa;
-
-       if (entry >= mem->entries)
-               return -EFAULT;
-
-       if (pageshift > mem->pageshift)
-               return -EFAULT;
-
-       if (!mem->hpas) {
-               *hpa = mem->dev_hpa + (ua - mem->ua);
-               return 0;
-       }
-
-       pa = (void *) vmalloc_to_phys(&mem->hpas[entry]);
-       if (!pa)
-               return -EFAULT;
-
-       *hpa = (*pa & MM_IOMMU_TABLE_GROUP_PAGE_MASK) | (ua & ~PAGE_MASK);
-
-       return 0;
-}
-
-extern void mm_iommu_ua_mark_dirty_rm(struct mm_struct *mm, unsigned long ua)
-{
-       struct mm_iommu_table_group_mem_t *mem;
-       long entry;
-       void *va;
-       unsigned long *pa;
-
-       mem = mm_iommu_lookup_rm(mm, ua, PAGE_SIZE);
-       if (!mem)
-               return;
-
-       if (mem->dev_hpa != MM_IOMMU_TABLE_INVALID_HPA)
-               return;
-
-       entry = (ua - mem->ua) >> PAGE_SHIFT;
-       va = &mem->hpas[entry];
-
-       pa = (void *) vmalloc_to_phys(va);
-       if (!pa)
-               return;
-
-       *pa |= MM_IOMMU_TABLE_GROUP_PAGE_DIRTY;
-}
-
-bool mm_iommu_is_devmem(struct mm_struct *mm, unsigned long hpa,
-               unsigned int pageshift, unsigned long *size)
-{
-       struct mm_iommu_table_group_mem_t *mem;
-       unsigned long end;
-
-       list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, next) {
-               if (mem->dev_hpa == MM_IOMMU_TABLE_INVALID_HPA)
-                       continue;
-
-               end = mem->dev_hpa + (mem->entries << PAGE_SHIFT);
-               if ((mem->dev_hpa <= hpa) && (hpa < end)) {
-                       /*
-                        * Since the IOMMU page size might be bigger than
-                        * PAGE_SIZE, the amount of preregistered memory
-                        * starting from @hpa might be smaller than 1<<pageshift
-                        * and the caller needs to distinguish this situation.
-                        */
-                       *size = min(1UL << pageshift, end - hpa);
-                       return true;
-               }
-       }
-
-       return false;
-}
-EXPORT_SYMBOL_GPL(mm_iommu_is_devmem);
-
-long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem)
-{
-       if (atomic64_inc_not_zero(&mem->mapped))
-               return 0;
-
-       /* Last mm_iommu_put() has been called, no more mappings allowed */
-       return -ENXIO;
-}
-EXPORT_SYMBOL_GPL(mm_iommu_mapped_inc);
-
-void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem)
-{
-       atomic64_add_unless(&mem->mapped, -1, 1);
-}
-EXPORT_SYMBOL_GPL(mm_iommu_mapped_dec);
-
-void mm_iommu_init(struct mm_struct *mm)
-{
-       INIT_LIST_HEAD_RCU(&mm->context.iommu_group_mem_list);
-}
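
As a side note on mm_iommu_do_alloc() above: the maximum IOMMU page shift starts from the natural alignment of the user address and the region size, and is then clamped by the backing page size of each pinned page. The sketch below reproduces that calculation in userspace, with __builtin_ctzll standing in for the kernel's __ffs and a made-up list of backing page shifts.

#include <stdint.h>
#include <stdio.h>

/* Natural-alignment starting point: the lowest set bit of (ua | size)
 * bounds how large an IOMMU page can be without crossing the region. */
static unsigned int initial_pageshift(uint64_t ua, uint64_t entries,
                                      unsigned int page_shift)
{
        uint64_t size = entries << page_shift;

        return (unsigned int)__builtin_ctzll(ua | size);
}

int main(void)
{
        const unsigned int PAGE_SHIFT = 16;      /* assumed 64K base pages */
        uint64_t ua = 0x7f0000000000ULL;         /* hypothetical, 16M-aligned address */
        uint64_t entries = 256;                  /* 16M worth of 64K pages */
        /* Hypothetical backing page shifts reported per pinned page:
         * two hugetlb-backed 16M pages and one ordinary 64K page. */
        unsigned int backing[] = { 24, 24, 16 };
        unsigned int pageshift = initial_pageshift(ua, entries, PAGE_SHIFT);
        unsigned int i;

        for (i = 0; i < sizeof(backing) / sizeof(backing[0]); i++)
                if (backing[i] < pageshift)
                        pageshift = backing[i];  /* clamp, as the pinning loop does */

        printf("max IOMMU page shift: %u (%u KiB pages)\n",
               pageshift, 1u << (pageshift - 10));
        return 0;
}
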
index 6ef36d553cdee4e5af6ded2df808b0e7c9e882af..57e64273cb33b31899cd43608627fa715ccd5a56 100644 (file)
@@ -1068,7 +1068,7 @@ u64 memory_hotplug_max(void)
 /* Virtual Processor Home Node (VPHN) support */
 #ifdef CONFIG_PPC_SPLPAR
 
-#include "vphn.h"
+#include "book3s64/vphn.h"
 
 struct topology_update_data {
        struct topology_update_data *next;
diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/pgtable-book3s64.c
deleted file mode 100644 (file)
index 16bda04..0000000
+++ /dev/null
@@ -1,449 +0,0 @@
-/*
- * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <linux/sched.h>
-#include <linux/mm_types.h>
-#include <linux/memblock.h>
-#include <misc/cxl-base.h>
-
-#include <asm/pgalloc.h>
-#include <asm/tlb.h>
-#include <asm/trace.h>
-#include <asm/powernv.h>
-
-#include <mm/mmu_decl.h>
-#include <trace/events/thp.h>
-
-unsigned long __pmd_frag_nr;
-EXPORT_SYMBOL(__pmd_frag_nr);
-unsigned long __pmd_frag_size_shift;
-EXPORT_SYMBOL(__pmd_frag_size_shift);
-
-int (*register_process_table)(unsigned long base, unsigned long page_size,
-                             unsigned long tbl_size);
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-/*
- * This is called when relaxing access to a hugepage. It's also called in the
- * page fault path when we don't hit any of the major fault cases, i.e. a minor
- * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic code will have
- * handled those two for us; we additionally deal with missing execute
- * permission here on some processors.
- */
-int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
-                         pmd_t *pmdp, pmd_t entry, int dirty)
-{
-       int changed;
-#ifdef CONFIG_DEBUG_VM
-       WARN_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
-       assert_spin_locked(pmd_lockptr(vma->vm_mm, pmdp));
-#endif
-       changed = !pmd_same(*(pmdp), entry);
-       if (changed) {
-               /*
-                * We can use MMU_PAGE_2M here, because only the radix
-                * path looks at the psize.
-                */
-               __ptep_set_access_flags(vma, pmdp_ptep(pmdp),
-                                       pmd_pte(entry), address, MMU_PAGE_2M);
-       }
-       return changed;
-}
-
-int pmdp_test_and_clear_young(struct vm_area_struct *vma,
-                             unsigned long address, pmd_t *pmdp)
-{
-       return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
-}
-/*
- * set a new huge pmd. We should not be called for updating
- * an existing pmd entry. That should go via pmd_hugepage_update.
- */
-void set_pmd_at(struct mm_struct *mm, unsigned long addr,
-               pmd_t *pmdp, pmd_t pmd)
-{
-#ifdef CONFIG_DEBUG_VM
-       /*
-        * Make sure hardware valid bit is not set. We don't do
-        * tlb flush for this update.
-        */
-
-       WARN_ON(pte_hw_valid(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp)));
-       assert_spin_locked(pmd_lockptr(mm, pmdp));
-       WARN_ON(!(pmd_large(pmd) || pmd_devmap(pmd)));
-#endif
-       trace_hugepage_set_pmd(addr, pmd_val(pmd));
-       return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
-}
-
-static void do_nothing(void *unused)
-{
-
-}
-/*
- * Serialize against find_current_mm_pte which does lock-less
- * lookup in page tables with local interrupts disabled. For huge pages
- * it casts pmd_t to pte_t. Since format of pte_t is different from
- * it casts pmd_t to pte_t. Since the format of pte_t is different from
- * pmd_t, we want to prevent a transition from a pmd pointing to a page table
- * to a pmd pointing to a huge page (and back) while interrupts are disabled.
- * different code paths. So make sure we wait for the parallel
- * find_current_mm_pte to finish.
- */
-void serialize_against_pte_lookup(struct mm_struct *mm)
-{
-       smp_mb();
-       smp_call_function_many(mm_cpumask(mm), do_nothing, NULL, 1);
-}
-
-/*
- * We use this to invalidate a pmdp entry before switching from a
- * hugepte to regular pmd entry.
- */
-pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
-                    pmd_t *pmdp)
-{
-       unsigned long old_pmd;
-
-       old_pmd = pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, _PAGE_INVALID);
-       flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
-       /*
-        * This ensures that generic code that relies on IRQ disabling
-        * to prevent a parallel THP split works as expected.
-        */
-       serialize_against_pte_lookup(vma->vm_mm);
-       return __pmd(old_pmd);
-}
-
-static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
-{
-       return __pmd(pmd_val(pmd) | pgprot_val(pgprot));
-}
-
-pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot)
-{
-       unsigned long pmdv;
-
-       pmdv = (pfn << PAGE_SHIFT) & PTE_RPN_MASK;
-       return pmd_set_protbits(__pmd(pmdv), pgprot);
-}
-
-pmd_t mk_pmd(struct page *page, pgprot_t pgprot)
-{
-       return pfn_pmd(page_to_pfn(page), pgprot);
-}
-
-pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
-{
-       unsigned long pmdv;
-
-       pmdv = pmd_val(pmd);
-       pmdv &= _HPAGE_CHG_MASK;
-       return pmd_set_protbits(__pmd(pmdv), newprot);
-}
-
-/*
- * This is called at the end of handling a user page fault, when the
- * fault has been handled by updating a HUGE PMD entry in the linux page tables.
- * We use it to preload an HPTE into the hash table corresponding to
- * the updated linux HUGE PMD entry.
- */
-void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
-                         pmd_t *pmd)
-{
-       if (radix_enabled())
-               prefetch((void *)addr);
-}
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-
-/* For use by kexec */
-void mmu_cleanup_all(void)
-{
-       if (radix_enabled())
-               radix__mmu_cleanup_all();
-       else if (mmu_hash_ops.hpte_clear_all)
-               mmu_hash_ops.hpte_clear_all();
-}
-
-#ifdef CONFIG_MEMORY_HOTPLUG
-int __meminit create_section_mapping(unsigned long start, unsigned long end, int nid)
-{
-       if (radix_enabled())
-               return radix__create_section_mapping(start, end, nid);
-
-       return hash__create_section_mapping(start, end, nid);
-}
-
-int __meminit remove_section_mapping(unsigned long start, unsigned long end)
-{
-       if (radix_enabled())
-               return radix__remove_section_mapping(start, end);
-
-       return hash__remove_section_mapping(start, end);
-}
-#endif /* CONFIG_MEMORY_HOTPLUG */
-
-void __init mmu_partition_table_init(void)
-{
-       unsigned long patb_size = 1UL << PATB_SIZE_SHIFT;
-       unsigned long ptcr;
-
-       BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 36), "Partition table size too large.");
-       /* Initialize the Partition Table with no entries */
-       partition_tb = memblock_alloc(patb_size, patb_size);
-       if (!partition_tb)
-               panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
-                     __func__, patb_size, patb_size);
-
-       /*
-        * update partition table control register,
-        * 64 K size.
-        */
-       ptcr = __pa(partition_tb) | (PATB_SIZE_SHIFT - 12);
-       mtspr(SPRN_PTCR, ptcr);
-       powernv_set_nmmu_ptcr(ptcr);
-}
-
-void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0,
-                                  unsigned long dw1)
-{
-       unsigned long old = be64_to_cpu(partition_tb[lpid].patb0);
-
-       partition_tb[lpid].patb0 = cpu_to_be64(dw0);
-       partition_tb[lpid].patb1 = cpu_to_be64(dw1);
-
-       /*
-        * Global flush of TLBs and partition table caches for this lpid.
-        * The type of flush (hash or radix) depends on what the previous
-        * use of this partition ID was, not the new use.
-        */
-       asm volatile("ptesync" : : : "memory");
-       if (old & PATB_HR) {
-               asm volatile(PPC_TLBIE_5(%0,%1,2,0,1) : :
-                            "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
-               asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : :
-                            "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
-               trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 1);
-       } else {
-               asm volatile(PPC_TLBIE_5(%0,%1,2,0,0) : :
-                            "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
-               trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 0);
-       }
-       /* Do we need a fixup here? */
-       asm volatile("eieio; tlbsync; ptesync" : : : "memory");
-}
-EXPORT_SYMBOL_GPL(mmu_partition_table_set_entry);
-
-static pmd_t *get_pmd_from_cache(struct mm_struct *mm)
-{
-       void *pmd_frag, *ret;
-
-       if (PMD_FRAG_NR == 1)
-               return NULL;
-
-       spin_lock(&mm->page_table_lock);
-       ret = mm->context.pmd_frag;
-       if (ret) {
-               pmd_frag = ret + PMD_FRAG_SIZE;
-               /*
-                * If we have taken up all the fragments, mark the PMD page NULL
-                */
-               if (((unsigned long)pmd_frag & ~PAGE_MASK) == 0)
-                       pmd_frag = NULL;
-               mm->context.pmd_frag = pmd_frag;
-       }
-       spin_unlock(&mm->page_table_lock);
-       return (pmd_t *)ret;
-}
-
-static pmd_t *__alloc_for_pmdcache(struct mm_struct *mm)
-{
-       void *ret = NULL;
-       struct page *page;
-       gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO;
-
-       if (mm == &init_mm)
-               gfp &= ~__GFP_ACCOUNT;
-       page = alloc_page(gfp);
-       if (!page)
-               return NULL;
-       if (!pgtable_pmd_page_ctor(page)) {
-               __free_pages(page, 0);
-               return NULL;
-       }
-
-       atomic_set(&page->pt_frag_refcount, 1);
-
-       ret = page_address(page);
-       /*
-        * if we support only one fragment just return the
-        * allocated page.
-        */
-       if (PMD_FRAG_NR == 1)
-               return ret;
-
-       spin_lock(&mm->page_table_lock);
-       /*
-        * If we find the pmd_frag already set, we return
-        * the allocated page with a single fragment
-        * count.
-        */
-       if (likely(!mm->context.pmd_frag)) {
-               atomic_set(&page->pt_frag_refcount, PMD_FRAG_NR);
-               mm->context.pmd_frag = ret + PMD_FRAG_SIZE;
-       }
-       spin_unlock(&mm->page_table_lock);
-
-       return (pmd_t *)ret;
-}
-
-pmd_t *pmd_fragment_alloc(struct mm_struct *mm, unsigned long vmaddr)
-{
-       pmd_t *pmd;
-
-       pmd = get_pmd_from_cache(mm);
-       if (pmd)
-               return pmd;
-
-       return __alloc_for_pmdcache(mm);
-}
-
-void pmd_fragment_free(unsigned long *pmd)
-{
-       struct page *page = virt_to_page(pmd);
-
-       BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0);
-       if (atomic_dec_and_test(&page->pt_frag_refcount)) {
-               pgtable_pmd_page_dtor(page);
-               __free_page(page);
-       }
-}
-
-static inline void pgtable_free(void *table, int index)
-{
-       switch (index) {
-       case PTE_INDEX:
-               pte_fragment_free(table, 0);
-               break;
-       case PMD_INDEX:
-               pmd_fragment_free(table);
-               break;
-       case PUD_INDEX:
-               kmem_cache_free(PGT_CACHE(PUD_CACHE_INDEX), table);
-               break;
-#if defined(CONFIG_PPC_4K_PAGES) && defined(CONFIG_HUGETLB_PAGE)
-               /* 16M hugepd directory at pud level */
-       case HTLB_16M_INDEX:
-               BUILD_BUG_ON(H_16M_CACHE_INDEX <= 0);
-               kmem_cache_free(PGT_CACHE(H_16M_CACHE_INDEX), table);
-               break;
-               /* 16G hugepd directory at the pgd level */
-       case HTLB_16G_INDEX:
-               BUILD_BUG_ON(H_16G_CACHE_INDEX <= 0);
-               kmem_cache_free(PGT_CACHE(H_16G_CACHE_INDEX), table);
-               break;
-#endif
-               /* We don't free pgd table via RCU callback */
-       default:
-               BUG();
-       }
-}
-
-#ifdef CONFIG_SMP
-void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int index)
-{
-       unsigned long pgf = (unsigned long)table;
-
-       BUG_ON(index > MAX_PGTABLE_INDEX_SIZE);
-       pgf |= index;
-       tlb_remove_table(tlb, (void *)pgf);
-}
-
-void __tlb_remove_table(void *_table)
-{
-       void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE);
-       unsigned int index = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE;
-
-       return pgtable_free(table, index);
-}
-#else
-void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int index)
-{
-       return pgtable_free(table, index);
-}
-#endif
-
-#ifdef CONFIG_PROC_FS
-atomic_long_t direct_pages_count[MMU_PAGE_COUNT];
-
-void arch_report_meminfo(struct seq_file *m)
-{
-       /*
-        * Hash maps the memory with one size mmu_linear_psize.
-        * So don't bother to print these on hash
-        */
-       if (!radix_enabled())
-               return;
-       seq_printf(m, "DirectMap4k:    %8lu kB\n",
-                  atomic_long_read(&direct_pages_count[MMU_PAGE_4K]) << 2);
-       seq_printf(m, "DirectMap64k:    %8lu kB\n",
-                  atomic_long_read(&direct_pages_count[MMU_PAGE_64K]) << 6);
-       seq_printf(m, "DirectMap2M:    %8lu kB\n",
-                  atomic_long_read(&direct_pages_count[MMU_PAGE_2M]) << 11);
-       seq_printf(m, "DirectMap1G:    %8lu kB\n",
-                  atomic_long_read(&direct_pages_count[MMU_PAGE_1G]) << 20);
-}
-#endif /* CONFIG_PROC_FS */
-
-pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr,
-                            pte_t *ptep)
-{
-       unsigned long pte_val;
-
-       /*
-        * Clear the _PAGE_PRESENT so that no hardware parallel update is
-        * possible. Also keep the pte_present true so that we don't take a
-        * wrong fault.
-        */
-       pte_val = pte_update(vma->vm_mm, addr, ptep, _PAGE_PRESENT, _PAGE_INVALID, 0);
-
-       return __pte(pte_val);
-
-}
-
-void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
-                            pte_t *ptep, pte_t old_pte, pte_t pte)
-{
-       if (radix_enabled())
-               return radix__ptep_modify_prot_commit(vma, addr,
-                                                     ptep, old_pte, pte);
-       set_pte_at(vma->vm_mm, addr, ptep, pte);
-}
-
-/*
- * For hash translation mode, we use the deposited table to store hash slot
- * information, and it is stored at a PTRS_PER_PMD offset from the related pmd
- * location. Hence a pmd move requires deposit and withdraw.
- *
- * For radix translation with split pmd ptl, we store the deposited table in the
- * pmd page. Hence if we have different pmd page we need to withdraw during pmd
- * move.
- *
- * With hash we use deposited table always irrespective of anon or not.
- * With radix we use deposited table only for anonymous mapping.
- */
-int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl,
-                          struct spinlock *old_pmd_ptl,
-                          struct vm_area_struct *vma)
-{
-       if (radix_enabled())
-               return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma);
-
-       return true;
-}
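
The fragment allocator in pmd_fragment_alloc()/pmd_fragment_free() above carves one page into PMD_FRAG_NR tables and only frees the backing page when the last fragment is dropped, tracked through page->pt_frag_refcount. Below is a compressed userspace sketch of that refcount lifecycle with illustrative sizes; it leaves out the per-mm caching of the unused remainder that get_pmd_from_cache() and pmd_frag_destroy() handle.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

#define FRAG_NR    4                           /* illustrative: fragments per page */
#define FRAG_SIZE  (64 * 1024 / FRAG_NR)       /* illustrative 64K "page", 16K tables */

struct frag_page {
        atomic_int refcount;    /* outstanding fragments, like pt_frag_refcount */
        char *mem;              /* backing "page" */
        int next;               /* next not-yet-handed-out fragment index */
};

static char *frag_alloc(struct frag_page *p)
{
        if (p->next == FRAG_NR)
                return NULL;                          /* page exhausted */
        if (p->next == 0)
                atomic_store(&p->refcount, FRAG_NR);  /* simplification: all refs up front */
        return p->mem + FRAG_SIZE * p->next++;
}

static void frag_free(struct frag_page *p, char *frag)
{
        (void)frag;     /* only the page-level refcount matters here */
        /* Free the backing page only when the last fragment is returned. */
        if (atomic_fetch_sub(&p->refcount, 1) == 1) {
                free(p->mem);
                p->mem = NULL;
                printf("backing page freed\n");
        }
}

int main(void)
{
        struct frag_page page = { .mem = malloc(64 * 1024) };
        char *frags[FRAG_NR];
        int i;

        for (i = 0; i < FRAG_NR; i++)
                frags[i] = frag_alloc(&page);   /* carve out all four tables */
        for (i = 0; i < FRAG_NR; i++)
                frag_free(&page, frags[i]);     /* the last free releases the page */
        return 0;
}
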
diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/pgtable-hash64.c
deleted file mode 100644 (file)
index 1fd025d..0000000
+++ /dev/null
@@ -1,463 +0,0 @@
-/*
- * Copyright 2005, Paul Mackerras, IBM Corporation.
- * Copyright 2009, Benjamin Herrenschmidt, IBM Corporation.
- * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <linux/sched.h>
-#include <linux/mm_types.h>
-#include <linux/mm.h>
-
-#include <asm/pgalloc.h>
-#include <asm/pgtable.h>
-#include <asm/sections.h>
-#include <asm/mmu.h>
-#include <asm/tlb.h>
-
-#include <mm/mmu_decl.h>
-
-#define CREATE_TRACE_POINTS
-#include <trace/events/thp.h>
-
-#if H_PGTABLE_RANGE > (USER_VSID_RANGE * (TASK_SIZE_USER64 / TASK_CONTEXT_SIZE))
-#warning Limited user VSID range means pagetable space is wasted
-#endif
-
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
-/*
- * vmemmap is the starting address of the virtual address space where
- * struct pages are allocated for all possible PFNs present on the system
- * including holes and bad memory (hence sparse). These virtual struct
- * pages are stored in sequence in this virtual address space irrespective
- * of whether the corresponding PFN is valid or not. This achieves a
- * constant relationship between the address of a struct page and its PFN.
- *
- * During boot or memory hotplug operation when a new memory section is
- * added, physical memory allocation (including hash table bolting) will
- * be performed for the set of struct pages which are part of the memory
- * section. This saves memory by not allocating struct pages for PFNs
- * which are not valid.
- *
- *             ----------------------------------------------
- *             | PHYSICAL ALLOCATION OF VIRTUAL STRUCT PAGES|
- *             ----------------------------------------------
- *
- *        f000000000000000                  c000000000000000
- * vmemmap +--------------+                  +--------------+
- *  +      |  page struct | +--------------> |  page struct |
- *  |      +--------------+                  +--------------+
- *  |      |  page struct | +--------------> |  page struct |
- *  |      +--------------+ |                +--------------+
- *  |      |  page struct | +       +------> |  page struct |
- *  |      +--------------+         |        +--------------+
- *  |      |  page struct |         |   +--> |  page struct |
- *  |      +--------------+         |   |    +--------------+
- *  |      |  page struct |         |   |
- *  |      +--------------+         |   |
- *  |      |  page struct |         |   |
- *  |      +--------------+         |   |
- *  |      |  page struct |         |   |
- *  |      +--------------+         |   |
- *  |      |  page struct |         |   |
- *  |      +--------------+         |   |
- *  |      |  page struct | +-------+   |
- *  |      +--------------+             |
- *  |      |  page struct | +-----------+
- *  |      +--------------+
- *  |      |  page struct | No mapping
- *  |      +--------------+
- *  |      |  page struct | No mapping
- *  v      +--------------+
- *
- *             -----------------------------------------
- *             | RELATION BETWEEN STRUCT PAGES AND PFNS|
- *             -----------------------------------------
- *
- * vmemmap +--------------+                 +---------------+
- *  +      |  page struct | +-------------> |      PFN      |
- *  |      +--------------+                 +---------------+
- *  |      |  page struct | +-------------> |      PFN      |
- *  |      +--------------+                 +---------------+
- *  |      |  page struct | +-------------> |      PFN      |
- *  |      +--------------+                 +---------------+
- *  |      |  page struct | +-------------> |      PFN      |
- *  |      +--------------+                 +---------------+
- *  |      |              |
- *  |      +--------------+
- *  |      |              |
- *  |      +--------------+
- *  |      |              |
- *  |      +--------------+                 +---------------+
- *  |      |  page struct | +-------------> |      PFN      |
- *  |      +--------------+                 +---------------+
- *  |      |              |
- *  |      +--------------+
- *  |      |              |
- *  |      +--------------+                 +---------------+
- *  |      |  page struct | +-------------> |      PFN      |
- *  |      +--------------+                 +---------------+
- *  |      |  page struct | +-------------> |      PFN      |
- *  v      +--------------+                 +---------------+
- */
-/*
- * On hash-based CPUs, the vmemmap is bolted in the hash table.
- */
-int __meminit hash__vmemmap_create_mapping(unsigned long start,
-                                      unsigned long page_size,
-                                      unsigned long phys)
-{
-       int rc;
-
-       if ((start + page_size) >= H_VMEMMAP_END) {
-               pr_warn("Outside the supported range\n");
-               return -1;
-       }
-
-       rc = htab_bolt_mapping(start, start + page_size, phys,
-                              pgprot_val(PAGE_KERNEL),
-                              mmu_vmemmap_psize, mmu_kernel_ssize);
-       if (rc < 0) {
-               int rc2 = htab_remove_mapping(start, start + page_size,
-                                             mmu_vmemmap_psize,
-                                             mmu_kernel_ssize);
-               BUG_ON(rc2 && (rc2 != -ENOENT));
-       }
-       return rc;
-}
-
-#ifdef CONFIG_MEMORY_HOTPLUG
-void hash__vmemmap_remove_mapping(unsigned long start,
-                             unsigned long page_size)
-{
-       int rc = htab_remove_mapping(start, start + page_size,
-                                    mmu_vmemmap_psize,
-                                    mmu_kernel_ssize);
-       BUG_ON((rc < 0) && (rc != -ENOENT));
-       WARN_ON(rc == -ENOENT);
-}
-#endif
-#endif /* CONFIG_SPARSEMEM_VMEMMAP */
-
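The constant address/PFN relationship described in the comment above is plain pointer arithmetic on a virtually contiguous array of struct pages. A userspace sketch with a made-up vmemmap base and a toy struct page (the real base is the region shown in the diagram, and the real struct page is much larger):

#include <stdio.h>

/* Toy struct page: only its size matters for the arithmetic. */
struct page {
	unsigned long flags;
	void *mapping;
};

int main(void)
{
	/* Made-up base address standing in for the vmemmap start. */
	struct page *vmemmap = (struct page *)0x100000000000UL;
	unsigned long pfn = 0x12345;

	/* pfn_to_page() is array indexing off the vmemmap base ... */
	struct page *pg = vmemmap + pfn;
	/* ... and page_to_pfn() is the inverse subtraction. */
	unsigned long back = (unsigned long)(pg - vmemmap);

	printf("pfn 0x%lx -> struct page at %p -> pfn 0x%lx\n",
	       pfn, (void *)pg, back);
	return 0;
}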
-/*
- * map_kernel_page is currently only called by __ioremap. It adds an entry
- * to the ioremap page table and an entry to the HPT, possibly bolting it.
- */
-int hash__map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot)
-{
-       pgd_t *pgdp;
-       pud_t *pudp;
-       pmd_t *pmdp;
-       pte_t *ptep;
-
-       BUILD_BUG_ON(TASK_SIZE_USER64 > H_PGTABLE_RANGE);
-       if (slab_is_available()) {
-               pgdp = pgd_offset_k(ea);
-               pudp = pud_alloc(&init_mm, pgdp, ea);
-               if (!pudp)
-                       return -ENOMEM;
-               pmdp = pmd_alloc(&init_mm, pudp, ea);
-               if (!pmdp)
-                       return -ENOMEM;
-               ptep = pte_alloc_kernel(pmdp, ea);
-               if (!ptep)
-                       return -ENOMEM;
-               set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, prot));
-       } else {
-               /*
-                * If the mm subsystem is not fully up, we cannot create a
-                * Linux page table entry for this mapping. Simply bolt an
-                * entry in the hardware page table.
-                */
-               if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, pgprot_val(prot),
-                                     mmu_io_psize, mmu_kernel_ssize)) {
-                       printk(KERN_ERR "Failed to do bolted mapping IO "
-                              "memory at %016lx !\n", pa);
-                       return -ENOMEM;
-               }
-       }
-
-       smp_wmb();
-       return 0;
-}
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-
-unsigned long hash__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
-                                   pmd_t *pmdp, unsigned long clr,
-                                   unsigned long set)
-{
-       __be64 old_be, tmp;
-       unsigned long old;
-
-#ifdef CONFIG_DEBUG_VM
-       WARN_ON(!hash__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
-       assert_spin_locked(pmd_lockptr(mm, pmdp));
-#endif
-
-       __asm__ __volatile__(
-       "1:     ldarx   %0,0,%3\n\
-               and.    %1,%0,%6\n\
-               bne-    1b \n\
-               andc    %1,%0,%4 \n\
-               or      %1,%1,%7\n\
-               stdcx.  %1,0,%3 \n\
-               bne-    1b"
-       : "=&r" (old_be), "=&r" (tmp), "=m" (*pmdp)
-       : "r" (pmdp), "r" (cpu_to_be64(clr)), "m" (*pmdp),
-         "r" (cpu_to_be64(H_PAGE_BUSY)), "r" (cpu_to_be64(set))
-       : "cc" );
-
-       old = be64_to_cpu(old_be);
-
-       trace_hugepage_update(addr, old, clr, set);
-       if (old & H_PAGE_HASHPTE)
-               hpte_do_hugepage_flush(mm, addr, pmdp, old);
-       return old;
-}
-
-pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
-                           pmd_t *pmdp)
-{
-       pmd_t pmd;
-
-       VM_BUG_ON(address & ~HPAGE_PMD_MASK);
-       VM_BUG_ON(pmd_trans_huge(*pmdp));
-       VM_BUG_ON(pmd_devmap(*pmdp));
-
-       pmd = *pmdp;
-       pmd_clear(pmdp);
-       /*
-        * Wait for all pending hash_page to finish. This is needed
-        * in case of subpage collapse. When we collapse normal pages
-        * to hugepage, we first clear the pmd, then invalidate all
-        * the PTE entries. The assumption here is that any low level
-        * page fault will see a none pmd and take the slow path that
-        * will wait on mmap_sem. But we could very well be in a
-        * hash_page with local ptep pointer value. Such a hash page
-        * can result in adding new HPTE entries for normal subpages.
-        * That means we could be modifying the page content as we
-        * copy them to a huge page. So wait for parallel hash_page
-        * to finish before invalidating HPTE entries. We can do this
-        * by sending an IPI to all the cpus and executing a dummy
-        * function there.
-        */
-       serialize_against_pte_lookup(vma->vm_mm);
-       /*
-        * Now invalidate the hpte entries in the range
-        * covered by pmd. This make sure we take a
-        * fault and will find the pmd as none, which will
-        * result in a major fault which takes mmap_sem and
-        * hence wait for collapse to complete. Without this
-        * the __collapse_huge_page_copy can result in copying
-        * the old content.
-        */
-       flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
-       return pmd;
-}
-
-/*
- * We want to put the pgtable in pmd and use pgtable for tracking
- * the base page size hptes
- */
-void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
-                                 pgtable_t pgtable)
-{
-       pgtable_t *pgtable_slot;
-
-       assert_spin_locked(pmd_lockptr(mm, pmdp));
-       /*
-        * We store the pgtable in the second half of the PMD page.
-        */
-       pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
-       *pgtable_slot = pgtable;
-       /*
-        * Expose the deposited pgtable to other CPUs before we set the
-        * hugepage PTE at the pmd level. The hash fault code looks at the
-        * deposited pgtable to store hash index values.
-        */
-       smp_wmb();
-}
-
-pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
-{
-       pgtable_t pgtable;
-       pgtable_t *pgtable_slot;
-
-       assert_spin_locked(pmd_lockptr(mm, pmdp));
-
-       pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
-       pgtable = *pgtable_slot;
-       /*
-        * Once we withdraw, mark the entry NULL.
-        */
-       *pgtable_slot = NULL;
-       /*
-        * We store HPTE information in the deposited PTE fragment.
-        * zero out the content on withdraw.
-        */
-       memset(pgtable, 0, PTE_FRAG_SIZE);
-       return pgtable;
-}
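The deposit/withdraw slot arithmetic above places the saved pgtable PTRS_PER_PMD pointers beyond the pmd entry, i.e. in the second half of the PMD page. A toy userspace model of that layout, with an illustrative PTRS_PER_PMD rather than the real config-dependent value:

#include <stdio.h>

/* Illustrative geometry; the real PTRS_PER_PMD depends on the config. */
#define PTRS_PER_PMD 1024

typedef unsigned long demo_pmd_t;	/* stand-in for the kernel pmd_t    */
typedef void *demo_pgtable_t;		/* stand-in for the deposited table */

/*
 * Model of a hash PMD page: the first half holds the pmd entries, the
 * second half holds one deposited-pgtable slot per pmd entry.
 */
static union {
	demo_pmd_t pmds[PTRS_PER_PMD];
	char bytes[2 * PTRS_PER_PMD * sizeof(demo_pmd_t)];
} pmd_page;

/* Same arithmetic as the deposit/withdraw code above. */
static demo_pgtable_t *pgtable_slot_of(demo_pmd_t *pmdp)
{
	return (demo_pgtable_t *)pmdp + PTRS_PER_PMD;
}

int main(void)
{
	demo_pmd_t *pmdp = &pmd_page.pmds[7];
	demo_pgtable_t deposited = &pmd_page;	/* any pointer will do */

	*pgtable_slot_of(pmdp) = deposited;	/* deposit  */
	printf("slot for pmd[7] holds %p\n", *pgtable_slot_of(pmdp));
	*pgtable_slot_of(pmdp) = NULL;		/* withdraw */
	return 0;
}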
-
-/*
- * A Linux hugepage PMD was changed and the corresponding hash table entries
- * need to be flushed.
- */
-void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
-                           pmd_t *pmdp, unsigned long old_pmd)
-{
-       int ssize;
-       unsigned int psize;
-       unsigned long vsid;
-       unsigned long flags = 0;
-
-       /* get the base page size, vsid and segment size */
-#ifdef CONFIG_DEBUG_VM
-       psize = get_slice_psize(mm, addr);
-       BUG_ON(psize == MMU_PAGE_16M);
-#endif
-       if (old_pmd & H_PAGE_COMBO)
-               psize = MMU_PAGE_4K;
-       else
-               psize = MMU_PAGE_64K;
-
-       if (!is_kernel_addr(addr)) {
-               ssize = user_segment_size(addr);
-               vsid = get_user_vsid(&mm->context, addr, ssize);
-               WARN_ON(vsid == 0);
-       } else {
-               vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
-               ssize = mmu_kernel_ssize;
-       }
-
-       if (mm_is_thread_local(mm))
-               flags |= HPTE_LOCAL_UPDATE;
-
-       return flush_hash_hugepage(vsid, addr, pmdp, psize, ssize, flags);
-}
-
-pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm,
-                               unsigned long addr, pmd_t *pmdp)
-{
-       pmd_t old_pmd;
-       pgtable_t pgtable;
-       unsigned long old;
-       pgtable_t *pgtable_slot;
-
-       old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
-       old_pmd = __pmd(old);
-       /*
-        * We have pmd == none and we are holding page_table_lock.
-        * So we can safely go and clear the pgtable hash
-        * index info.
-        */
-       pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
-       pgtable = *pgtable_slot;
-       /*
-        * Zero out the old valid and hash index details so that a parallel
-        * hash fault does not look at stale values.
-        */
-       memset(pgtable, 0, PTE_FRAG_SIZE);
-       /*
-        * Serialize against find_current_mm_pte variants, which do lock-less
-        * lookup in page tables with local interrupts disabled. For huge pages
-        * it casts pmd_t to pte_t. Since format of pte_t is different from
-        * pmd_t we want to prevent transit from pmd pointing to page table
-        * to pmd pointing to huge page (and back) while interrupts are disabled.
-        * We clear pmd to possibly replace it with page table pointer in
-        * different code paths. So make sure we wait for the parallel
-        * find_current_mm_pte to finish.
-        */
-       serialize_against_pte_lookup(mm);
-       return old_pmd;
-}
-
-int hash__has_transparent_hugepage(void)
-{
-
-       if (!mmu_has_feature(MMU_FTR_16M_PAGE))
-               return 0;
-       /*
-        * We support THP only if PMD_SIZE is 16MB.
-        */
-       if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT)
-               return 0;
-       /*
-        * We need to make sure that we support 16MB hugepages in a segment
-        * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE
-        * of 64K.
-        */
-       /*
-        * If we have 64K HPTE, we will be using that by default
-        */
-       if (mmu_psize_defs[MMU_PAGE_64K].shift &&
-           (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1))
-               return 0;
-       /*
-        * OK, we only have 4K HPTEs
-        */
-       if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1)
-               return 0;
-
-       return 1;
-}
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-
-#ifdef CONFIG_STRICT_KERNEL_RWX
-static bool hash__change_memory_range(unsigned long start, unsigned long end,
-                                     unsigned long newpp)
-{
-       unsigned long idx;
-       unsigned int step, shift;
-
-       shift = mmu_psize_defs[mmu_linear_psize].shift;
-       step = 1 << shift;
-
-       start = ALIGN_DOWN(start, step);
-       end = ALIGN(end, step); // aligns up
-
-       if (start >= end)
-               return false;
-
-       pr_debug("Changing page protection on range 0x%lx-0x%lx, to 0x%lx, step 0x%x\n",
-                start, end, newpp, step);
-
-       for (idx = start; idx < end; idx += step)
-               /* Not sure if we can do much with the return value */
-               mmu_hash_ops.hpte_updateboltedpp(newpp, idx, mmu_linear_psize,
-                                                       mmu_kernel_ssize);
-
-       return true;
-}
-
-void hash__mark_rodata_ro(void)
-{
-       unsigned long start, end;
-
-       start = (unsigned long)_stext;
-       end = (unsigned long)__init_begin;
-
-       WARN_ON(!hash__change_memory_range(start, end, PP_RXXX));
-}
-
-void hash__mark_initmem_nx(void)
-{
-       unsigned long start, end, pp;
-
-       start = (unsigned long)__init_begin;
-       end = (unsigned long)__init_end;
-
-       pp = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL));
-
-       WARN_ON(!hash__change_memory_range(start, end, pp));
-}
-#endif
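The hash__change_memory_range() loop above is essentially "align the range to the linear-mapping page size, then touch one bolted HPTE per step". A standalone sketch of that arithmetic, assuming a 16MB linear page size purely for illustration:

#include <stdio.h>

/* Assumed linear-mapping page size for the sketch: 16MB (shift 24). */
#define LINEAR_SHIFT	24UL
#define STEP		(1UL << LINEAR_SHIFT)

/* Power-of-two alignment helpers, as used by the range loop above. */
#define ALIGN_DOWN(x, a)	((x) & ~((a) - 1))
#define ALIGN_UP(x, a)		(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	unsigned long start = 0x01234567UL;
	unsigned long end   = 0x04abcdefUL;
	unsigned long addr;

	start = ALIGN_DOWN(start, STEP);	/* round start down to a slot */
	end   = ALIGN_UP(end, STEP);		/* round end up to a slot     */

	/* One hpte_updateboltedpp()-style update per linear-mapping slot. */
	for (addr = start; addr < end; addr += STEP)
		printf("would update bolted HPTE covering 0x%08lx\n", addr);

	return 0;
}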
diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c
deleted file mode 100644 (file)
index fcb0169..0000000
+++ /dev/null
@@ -1,1124 +0,0 @@
-/*
- * Page table handling routines for radix page table.
- *
- * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#define pr_fmt(fmt) "radix-mmu: " fmt
-
-#include <linux/kernel.h>
-#include <linux/sched/mm.h>
-#include <linux/memblock.h>
-#include <linux/of_fdt.h>
-#include <linux/mm.h>
-#include <linux/string_helpers.h>
-#include <linux/stop_machine.h>
-
-#include <asm/pgtable.h>
-#include <asm/pgalloc.h>
-#include <asm/mmu_context.h>
-#include <asm/dma.h>
-#include <asm/machdep.h>
-#include <asm/mmu.h>
-#include <asm/firmware.h>
-#include <asm/powernv.h>
-#include <asm/sections.h>
-#include <asm/trace.h>
-#include <asm/uaccess.h>
-
-#include <trace/events/thp.h>
-
-unsigned int mmu_pid_bits;
-unsigned int mmu_base_pid;
-
-static int native_register_process_table(unsigned long base, unsigned long pg_sz,
-                                        unsigned long table_size)
-{
-       unsigned long patb0, patb1;
-
-       patb0 = be64_to_cpu(partition_tb[0].patb0);
-       patb1 = base | table_size | PATB_GR;
-
-       mmu_partition_table_set_entry(0, patb0, patb1);
-
-       return 0;
-}
-
-static __ref void *early_alloc_pgtable(unsigned long size, int nid,
-                       unsigned long region_start, unsigned long region_end)
-{
-       phys_addr_t min_addr = MEMBLOCK_LOW_LIMIT;
-       phys_addr_t max_addr = MEMBLOCK_ALLOC_ANYWHERE;
-       void *ptr;
-
-       if (region_start)
-               min_addr = region_start;
-       if (region_end)
-               max_addr = region_end;
-
-       ptr = memblock_alloc_try_nid(size, size, min_addr, max_addr, nid);
-
-       if (!ptr)
-               panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa max_addr=%pa\n",
-                     __func__, size, size, nid, &min_addr, &max_addr);
-
-       return ptr;
-}
-
-static int early_map_kernel_page(unsigned long ea, unsigned long pa,
-                         pgprot_t flags,
-                         unsigned int map_page_size,
-                         int nid,
-                         unsigned long region_start, unsigned long region_end)
-{
-       unsigned long pfn = pa >> PAGE_SHIFT;
-       pgd_t *pgdp;
-       pud_t *pudp;
-       pmd_t *pmdp;
-       pte_t *ptep;
-
-       pgdp = pgd_offset_k(ea);
-       if (pgd_none(*pgdp)) {
-               pudp = early_alloc_pgtable(PUD_TABLE_SIZE, nid,
-                                               region_start, region_end);
-               pgd_populate(&init_mm, pgdp, pudp);
-       }
-       pudp = pud_offset(pgdp, ea);
-       if (map_page_size == PUD_SIZE) {
-               ptep = (pte_t *)pudp;
-               goto set_the_pte;
-       }
-       if (pud_none(*pudp)) {
-               pmdp = early_alloc_pgtable(PMD_TABLE_SIZE, nid,
-                                               region_start, region_end);
-               pud_populate(&init_mm, pudp, pmdp);
-       }
-       pmdp = pmd_offset(pudp, ea);
-       if (map_page_size == PMD_SIZE) {
-               ptep = pmdp_ptep(pmdp);
-               goto set_the_pte;
-       }
-       if (!pmd_present(*pmdp)) {
-               ptep = early_alloc_pgtable(PAGE_SIZE, nid,
-                                               region_start, region_end);
-               pmd_populate_kernel(&init_mm, pmdp, ptep);
-       }
-       ptep = pte_offset_kernel(pmdp, ea);
-
-set_the_pte:
-       set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
-       smp_wmb();
-       return 0;
-}
-
-/*
- * nid, region_start, and region_end are hints to try to place the page
- * table memory in the same node or region.
- */
-static int __map_kernel_page(unsigned long ea, unsigned long pa,
-                         pgprot_t flags,
-                         unsigned int map_page_size,
-                         int nid,
-                         unsigned long region_start, unsigned long region_end)
-{
-       unsigned long pfn = pa >> PAGE_SHIFT;
-       pgd_t *pgdp;
-       pud_t *pudp;
-       pmd_t *pmdp;
-       pte_t *ptep;
-       /*
-        * Make sure the task size is correct as per the max address
-        */
-       BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE);
-
-#ifdef CONFIG_PPC_64K_PAGES
-       BUILD_BUG_ON(RADIX_KERN_MAP_SIZE != (1UL << MAX_EA_BITS_PER_CONTEXT));
-#endif
-
-       if (unlikely(!slab_is_available()))
-               return early_map_kernel_page(ea, pa, flags, map_page_size,
-                                               nid, region_start, region_end);
-
-       /*
-        * Should make page table allocation functions be able to take a
-        * node, so we can place kernel page tables on the right nodes after
-        * boot.
-        */
-       pgdp = pgd_offset_k(ea);
-       pudp = pud_alloc(&init_mm, pgdp, ea);
-       if (!pudp)
-               return -ENOMEM;
-       if (map_page_size == PUD_SIZE) {
-               ptep = (pte_t *)pudp;
-               goto set_the_pte;
-       }
-       pmdp = pmd_alloc(&init_mm, pudp, ea);
-       if (!pmdp)
-               return -ENOMEM;
-       if (map_page_size == PMD_SIZE) {
-               ptep = pmdp_ptep(pmdp);
-               goto set_the_pte;
-       }
-       ptep = pte_alloc_kernel(pmdp, ea);
-       if (!ptep)
-               return -ENOMEM;
-
-set_the_pte:
-       set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
-       smp_wmb();
-       return 0;
-}
-
-int radix__map_kernel_page(unsigned long ea, unsigned long pa,
-                         pgprot_t flags,
-                         unsigned int map_page_size)
-{
-       return __map_kernel_page(ea, pa, flags, map_page_size, -1, 0, 0);
-}
-
-#ifdef CONFIG_STRICT_KERNEL_RWX
-void radix__change_memory_range(unsigned long start, unsigned long end,
-                               unsigned long clear)
-{
-       unsigned long idx;
-       pgd_t *pgdp;
-       pud_t *pudp;
-       pmd_t *pmdp;
-       pte_t *ptep;
-
-       start = ALIGN_DOWN(start, PAGE_SIZE);
-       end = PAGE_ALIGN(end); // aligns up
-
-       pr_debug("Changing flags on range %lx-%lx removing 0x%lx\n",
-                start, end, clear);
-
-       for (idx = start; idx < end; idx += PAGE_SIZE) {
-               pgdp = pgd_offset_k(idx);
-               pudp = pud_alloc(&init_mm, pgdp, idx);
-               if (!pudp)
-                       continue;
-               if (pud_huge(*pudp)) {
-                       ptep = (pte_t *)pudp;
-                       goto update_the_pte;
-               }
-               pmdp = pmd_alloc(&init_mm, pudp, idx);
-               if (!pmdp)
-                       continue;
-               if (pmd_huge(*pmdp)) {
-                       ptep = pmdp_ptep(pmdp);
-                       goto update_the_pte;
-               }
-               ptep = pte_alloc_kernel(pmdp, idx);
-               if (!ptep)
-                       continue;
-update_the_pte:
-               radix__pte_update(&init_mm, idx, ptep, clear, 0, 0);
-       }
-
-       radix__flush_tlb_kernel_range(start, end);
-}
-
-void radix__mark_rodata_ro(void)
-{
-       unsigned long start, end;
-
-       start = (unsigned long)_stext;
-       end = (unsigned long)__init_begin;
-
-       radix__change_memory_range(start, end, _PAGE_WRITE);
-}
-
-void radix__mark_initmem_nx(void)
-{
-       unsigned long start = (unsigned long)__init_begin;
-       unsigned long end = (unsigned long)__init_end;
-
-       radix__change_memory_range(start, end, _PAGE_EXEC);
-}
-#endif /* CONFIG_STRICT_KERNEL_RWX */
-
-static inline void __meminit
-print_mapping(unsigned long start, unsigned long end, unsigned long size, bool exec)
-{
-       char buf[10];
-
-       if (end <= start)
-               return;
-
-       string_get_size(size, 1, STRING_UNITS_2, buf, sizeof(buf));
-
-       pr_info("Mapped 0x%016lx-0x%016lx with %s pages%s\n", start, end, buf,
-               exec ? " (exec)" : "");
-}
-
-static unsigned long next_boundary(unsigned long addr, unsigned long end)
-{
-#ifdef CONFIG_STRICT_KERNEL_RWX
-       if (addr < __pa_symbol(__init_begin))
-               return __pa_symbol(__init_begin);
-#endif
-       return end;
-}
-
-static int __meminit create_physical_mapping(unsigned long start,
-                                            unsigned long end,
-                                            int nid)
-{
-       unsigned long vaddr, addr, mapping_size = 0;
-       bool prev_exec, exec = false;
-       pgprot_t prot;
-       int psize;
-
-       start = _ALIGN_UP(start, PAGE_SIZE);
-       for (addr = start; addr < end; addr += mapping_size) {
-               unsigned long gap, previous_size;
-               int rc;
-
-               gap = next_boundary(addr, end) - addr;
-               previous_size = mapping_size;
-               prev_exec = exec;
-
-               if (IS_ALIGNED(addr, PUD_SIZE) && gap >= PUD_SIZE &&
-                   mmu_psize_defs[MMU_PAGE_1G].shift) {
-                       mapping_size = PUD_SIZE;
-                       psize = MMU_PAGE_1G;
-               } else if (IS_ALIGNED(addr, PMD_SIZE) && gap >= PMD_SIZE &&
-                          mmu_psize_defs[MMU_PAGE_2M].shift) {
-                       mapping_size = PMD_SIZE;
-                       psize = MMU_PAGE_2M;
-               } else {
-                       mapping_size = PAGE_SIZE;
-                       psize = mmu_virtual_psize;
-               }
-
-               vaddr = (unsigned long)__va(addr);
-
-               if (overlaps_kernel_text(vaddr, vaddr + mapping_size) ||
-                   overlaps_interrupt_vector_text(vaddr, vaddr + mapping_size)) {
-                       prot = PAGE_KERNEL_X;
-                       exec = true;
-               } else {
-                       prot = PAGE_KERNEL;
-                       exec = false;
-               }
-
-               if (mapping_size != previous_size || exec != prev_exec) {
-                       print_mapping(start, addr, previous_size, prev_exec);
-                       start = addr;
-               }
-
-               rc = __map_kernel_page(vaddr, addr, prot, mapping_size, nid, start, end);
-               if (rc)
-                       return rc;
-
-               update_page_count(psize, 1);
-       }
-
-       print_mapping(start, addr, mapping_size, exec);
-       return 0;
-}
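The size selection in create_physical_mapping() picks the largest page that is both aligned at the current address and fits in the remaining gap. A userspace sketch of just that selection, with 64K assumed as the base page size for the demo:

#include <stdio.h>

#define SZ_1G	(1UL << 30)
#define SZ_2M	(1UL << 21)
#define SZ_64K	(1UL << 16)	/* assumed base page size for the sketch */

#define IS_ALIGNED(x, a)	(((x) & ((a) - 1)) == 0)

/* Largest page size that is aligned at 'addr' and fits in 'gap'. */
static unsigned long pick_mapping_size(unsigned long addr, unsigned long gap)
{
	if (IS_ALIGNED(addr, SZ_1G) && gap >= SZ_1G)
		return SZ_1G;
	if (IS_ALIGNED(addr, SZ_2M) && gap >= SZ_2M)
		return SZ_2M;
	return SZ_64K;
}

int main(void)
{
	unsigned long addr, size, n_1g = 0, n_2m = 0, n_64k = 0;
	unsigned long end = SZ_1G + 5 * SZ_2M + 3 * SZ_64K;

	for (addr = 0; addr < end; addr += size) {
		size = pick_mapping_size(addr, end - addr);
		if (size == SZ_1G)
			n_1g++;
		else if (size == SZ_2M)
			n_2m++;
		else
			n_64k++;
	}
	/* Expect 1 x 1G, 5 x 2M and 3 x 64K mappings for this range. */
	printf("1G:%lu 2M:%lu 64K:%lu\n", n_1g, n_2m, n_64k);
	return 0;
}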
-
-void __init radix_init_pgtable(void)
-{
-       unsigned long rts_field;
-       struct memblock_region *reg;
-
-       /* We don't support slb for radix */
-       mmu_slb_size = 0;
-       /*
-        * Create the linear mapping, using standard page size for now
-        */
-       for_each_memblock(memory, reg) {
-               /*
-                * The memblock allocator is up at this point, so the
-                * page tables will be allocated within the range. No
-                * need for a node (which we don't have yet).
-                */
-
-               if ((reg->base + reg->size) >= RADIX_VMALLOC_START) {
-                       pr_warn("Outside the supported range\n");
-                       continue;
-               }
-
-               WARN_ON(create_physical_mapping(reg->base,
-                                               reg->base + reg->size,
-                                               -1));
-       }
-
-       /* Find out how many PID bits are supported */
-       if (cpu_has_feature(CPU_FTR_HVMODE)) {
-               if (!mmu_pid_bits)
-                       mmu_pid_bits = 20;
-#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
-               /*
-                * When KVM is possible, we only use the top half of the
-                * PID space to avoid collisions between host and guest PIDs
-                * which can cause problems due to prefetch when exiting the
-                * guest with AIL=3
-                */
-               mmu_base_pid = 1 << (mmu_pid_bits - 1);
-#else
-               mmu_base_pid = 1;
-#endif
-       } else {
-               /* The guest uses the bottom half of the PID space */
-               if (!mmu_pid_bits)
-                       mmu_pid_bits = 19;
-               mmu_base_pid = 1;
-       }
-
-       /*
-        * Allocate Partition table and process table for the
-        * host.
-        */
-       BUG_ON(PRTB_SIZE_SHIFT > 36);
-       process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT, -1, 0, 0);
-       /*
-        * Fill in the process table.
-        */
-       rts_field = radix__get_tree_size();
-       process_tb->prtb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE);
-       /*
-        * Fill in the partition table. We are supposed to use the effective
-        * address of the process table here, but our linear mapping also
-        * enables us to use the physical address.
-        */
-       register_process_table(__pa(process_tb), 0, PRTB_SIZE_SHIFT - 12);
-       pr_info("Process table %p and radix root for kernel: %p\n", process_tb, init_mm.pgd);
-       asm volatile("ptesync" : : : "memory");
-       asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : :
-                    "r" (TLBIEL_INVAL_SET_LPID), "r" (0));
-       asm volatile("eieio; tlbsync; ptesync" : : : "memory");
-       trace_tlbie(0, 0, TLBIEL_INVAL_SET_LPID, 0, 2, 1, 1);
-
-       /*
-        * The init_mm context is given the first available (non-zero) PID,
-        * which is the "guard PID" and contains no page table. PIDR should
-        * never be set to zero because that duplicates the kernel address
-        * space at the 0x0... offset (quadrant 0)!
-        *
-        * An arbitrary PID that may later be allocated by the PID allocator
-        * for userspace processes must not be used either, because that
-        * would cause stale user mappings for that PID on CPUs outside of
-        * the TLB invalidation scheme (because it won't be in mm_cpumask).
-        *
-        * So permanently carve out one PID for the purpose of a guard PID.
-        */
-       init_mm.context.id = mmu_base_pid;
-       mmu_base_pid++;
-}
-
-static void __init radix_init_partition_table(void)
-{
-       unsigned long rts_field, dw0;
-
-       mmu_partition_table_init();
-       rts_field = radix__get_tree_size();
-       dw0 = rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR;
-       mmu_partition_table_set_entry(0, dw0, 0);
-
-       pr_info("Initializing Radix MMU\n");
-       pr_info("Partition table %p\n", partition_tb);
-}
-
-void __init radix_init_native(void)
-{
-       register_process_table = native_register_process_table;
-}
-
-static int __init get_idx_from_shift(unsigned int shift)
-{
-       int idx = -1;
-
-       switch (shift) {
-       case 0xc:
-               idx = MMU_PAGE_4K;
-               break;
-       case 0x10:
-               idx = MMU_PAGE_64K;
-               break;
-       case 0x15:
-               idx = MMU_PAGE_2M;
-               break;
-       case 0x1e:
-               idx = MMU_PAGE_1G;
-               break;
-       }
-       return idx;
-}
-
-static int __init radix_dt_scan_page_sizes(unsigned long node,
-                                          const char *uname, int depth,
-                                          void *data)
-{
-       int size = 0;
-       int shift, idx;
-       unsigned int ap;
-       const __be32 *prop;
-       const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
-
-       /* We are scanning "cpu" nodes only */
-       if (type == NULL || strcmp(type, "cpu") != 0)
-               return 0;
-
-       /* Find MMU PID size */
-       prop = of_get_flat_dt_prop(node, "ibm,mmu-pid-bits", &size);
-       if (prop && size == 4)
-               mmu_pid_bits = be32_to_cpup(prop);
-
-       /* Grab page size encodings */
-       prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size);
-       if (!prop)
-               return 0;
-
-       pr_info("Page sizes from device-tree:\n");
-       for (; size >= 4; size -= 4, ++prop) {
-
-               struct mmu_psize_def *def;
-
-               /* top 3 bit is AP encoding */
-               shift = be32_to_cpu(prop[0]) & ~(0xe << 28);
-               ap = be32_to_cpu(prop[0]) >> 29;
-               pr_info("Page size shift = %d AP=0x%x\n", shift, ap);
-
-               idx = get_idx_from_shift(shift);
-               if (idx < 0)
-                       continue;
-
-               def = &mmu_psize_defs[idx];
-               def->shift = shift;
-               def->ap  = ap;
-       }
-
-       /* needed ? */
-       cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B;
-       return 1;
-}
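Each 32-bit cell of the ibm,processor-radix-AP-encodings property is split exactly as above: the top three bits carry the AP encoding and the remainder the page-size shift. Below is a standalone decoder over a few hard-coded example cells; the values mirror the common POWER9-style encodings used elsewhere in this file, but they are written out by hand here rather than read from firmware.

#include <stdint.h>
#include <stdio.h>

/* Same split as radix_dt_scan_page_sizes(): top three bits are the AP
 * encoding, the remaining bits are the page-size shift. */
static void decode_ap_cell(uint32_t cell)
{
	unsigned int shift = cell & ~(0xeU << 28);
	unsigned int ap = cell >> 29;

	printf("shift=%2u (%8luK pages)  AP=0x%x\n",
	       shift, (1UL << shift) >> 10, ap);
}

int main(void)
{
	/* Hand-written example cells for 4K, 64K, 2M and 1G pages. */
	uint32_t cells[] = {
		(0U << 29) | 0x0c,
		(5U << 29) | 0x10,
		(1U << 29) | 0x15,
		(2U << 29) | 0x1e,
	};
	unsigned int i;

	for (i = 0; i < sizeof(cells) / sizeof(cells[0]); i++)
		decode_ap_cell(cells[i]);
	return 0;
}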
-
-void __init radix__early_init_devtree(void)
-{
-       int rc;
-
-       /*
-        * Try to find the available page sizes in the device-tree
-        */
-       rc = of_scan_flat_dt(radix_dt_scan_page_sizes, NULL);
-       if (rc != 0)  /* Found */
-               goto found;
-       /*
-        * Let's assume we have 4K and 64K page support
-        */
-       mmu_psize_defs[MMU_PAGE_4K].shift = 12;
-       mmu_psize_defs[MMU_PAGE_4K].ap = 0x0;
-
-       mmu_psize_defs[MMU_PAGE_64K].shift = 16;
-       mmu_psize_defs[MMU_PAGE_64K].ap = 0x5;
-found:
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
-       if (mmu_psize_defs[MMU_PAGE_2M].shift) {
-               /*
-                * map vmemmap using 2M if available
-                */
-               mmu_vmemmap_psize = MMU_PAGE_2M;
-       }
-#endif /* CONFIG_SPARSEMEM_VMEMMAP */
-       return;
-}
-
-static void radix_init_amor(void)
-{
-       /*
-        * In HV mode, we init AMOR (Authority Mask Override Register) so that
-        * the hypervisor and guest can set up IAMR (Instruction Authority Mask
-        * Register), enable key 0 and set it to 1.
-        *
-        * AMOR = 0b1100 .... 0000 (Mask for key 0 is 11)
-        */
-       mtspr(SPRN_AMOR, (3ul << 62));
-}
-
-#ifdef CONFIG_PPC_KUEP
-void setup_kuep(bool disabled)
-{
-       if (disabled || !early_radix_enabled())
-               return;
-
-       if (smp_processor_id() == boot_cpuid)
-               pr_info("Activating Kernel Userspace Execution Prevention\n");
-
-       /*
-        * Radix always uses key0 of the IAMR to determine if an access is
-        * allowed. We set bit 0 (IBM bit 1) of key0, to prevent instruction
-        * fetch.
-        */
-       mtspr(SPRN_IAMR, (1ul << 62));
-}
-#endif
-
-#ifdef CONFIG_PPC_KUAP
-void setup_kuap(bool disabled)
-{
-       if (disabled || !early_radix_enabled())
-               return;
-
-       if (smp_processor_id() == boot_cpuid) {
-               pr_info("Activating Kernel Userspace Access Prevention\n");
-               cur_cpu_spec->mmu_features |= MMU_FTR_RADIX_KUAP;
-       }
-
-       /* Make sure userspace can't change the AMR */
-       mtspr(SPRN_UAMOR, 0);
-       mtspr(SPRN_AMR, AMR_KUAP_BLOCKED);
-       isync();
-}
-#endif
-
-void __init radix__early_init_mmu(void)
-{
-       unsigned long lpcr;
-
-#ifdef CONFIG_PPC_64K_PAGES
-       /* PAGE_SIZE mappings */
-       mmu_virtual_psize = MMU_PAGE_64K;
-#else
-       mmu_virtual_psize = MMU_PAGE_4K;
-#endif
-
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
-       /* vmemmap mapping */
-       mmu_vmemmap_psize = mmu_virtual_psize;
-#endif
-       /*
-        * initialize page table size
-        */
-       __pte_index_size = RADIX_PTE_INDEX_SIZE;
-       __pmd_index_size = RADIX_PMD_INDEX_SIZE;
-       __pud_index_size = RADIX_PUD_INDEX_SIZE;
-       __pgd_index_size = RADIX_PGD_INDEX_SIZE;
-       __pud_cache_index = RADIX_PUD_INDEX_SIZE;
-       __pte_table_size = RADIX_PTE_TABLE_SIZE;
-       __pmd_table_size = RADIX_PMD_TABLE_SIZE;
-       __pud_table_size = RADIX_PUD_TABLE_SIZE;
-       __pgd_table_size = RADIX_PGD_TABLE_SIZE;
-
-       __pmd_val_bits = RADIX_PMD_VAL_BITS;
-       __pud_val_bits = RADIX_PUD_VAL_BITS;
-       __pgd_val_bits = RADIX_PGD_VAL_BITS;
-
-       __kernel_virt_start = RADIX_KERN_VIRT_START;
-       __vmalloc_start = RADIX_VMALLOC_START;
-       __vmalloc_end = RADIX_VMALLOC_END;
-       __kernel_io_start = RADIX_KERN_IO_START;
-       __kernel_io_end = RADIX_KERN_IO_END;
-       vmemmap = (struct page *)RADIX_VMEMMAP_START;
-       ioremap_bot = IOREMAP_BASE;
-
-#ifdef CONFIG_PCI
-       pci_io_base = ISA_IO_BASE;
-#endif
-       __pte_frag_nr = RADIX_PTE_FRAG_NR;
-       __pte_frag_size_shift = RADIX_PTE_FRAG_SIZE_SHIFT;
-       __pmd_frag_nr = RADIX_PMD_FRAG_NR;
-       __pmd_frag_size_shift = RADIX_PMD_FRAG_SIZE_SHIFT;
-
-       if (!firmware_has_feature(FW_FEATURE_LPAR)) {
-               radix_init_native();
-               lpcr = mfspr(SPRN_LPCR);
-               mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
-               radix_init_partition_table();
-               radix_init_amor();
-       } else {
-               radix_init_pseries();
-       }
-
-       memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);
-
-       radix_init_pgtable();
-       /* Switch to the guard PID before turning on MMU */
-       radix__switch_mmu_context(NULL, &init_mm);
-       if (cpu_has_feature(CPU_FTR_HVMODE))
-               tlbiel_all();
-}
-
-void radix__early_init_mmu_secondary(void)
-{
-       unsigned long lpcr;
-       /*
-        * update partition table control register and UPRT
-        */
-       if (!firmware_has_feature(FW_FEATURE_LPAR)) {
-               lpcr = mfspr(SPRN_LPCR);
-               mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
-
-               mtspr(SPRN_PTCR,
-                     __pa(partition_tb) | (PATB_SIZE_SHIFT - 12));
-               radix_init_amor();
-       }
-
-       radix__switch_mmu_context(NULL, &init_mm);
-       if (cpu_has_feature(CPU_FTR_HVMODE))
-               tlbiel_all();
-}
-
-void radix__mmu_cleanup_all(void)
-{
-       unsigned long lpcr;
-
-       if (!firmware_has_feature(FW_FEATURE_LPAR)) {
-               lpcr = mfspr(SPRN_LPCR);
-               mtspr(SPRN_LPCR, lpcr & ~LPCR_UPRT);
-               mtspr(SPRN_PTCR, 0);
-               powernv_set_nmmu_ptcr(0);
-               radix__flush_tlb_all();
-       }
-}
-
-void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base,
-                               phys_addr_t first_memblock_size)
-{
-       /* We don't currently support the first MEMBLOCK not mapping 0
-        * physical on those processors
-        */
-       BUG_ON(first_memblock_base != 0);
-
-       /*
-        * Radix mode is not limited by RMA / VRMA addressing.
-        */
-       ppc64_rma_size = ULONG_MAX;
-}
-
-#ifdef CONFIG_MEMORY_HOTPLUG
-static void free_pte_table(pte_t *pte_start, pmd_t *pmd)
-{
-       pte_t *pte;
-       int i;
-
-       for (i = 0; i < PTRS_PER_PTE; i++) {
-               pte = pte_start + i;
-               if (!pte_none(*pte))
-                       return;
-       }
-
-       pte_free_kernel(&init_mm, pte_start);
-       pmd_clear(pmd);
-}
-
-static void free_pmd_table(pmd_t *pmd_start, pud_t *pud)
-{
-       pmd_t *pmd;
-       int i;
-
-       for (i = 0; i < PTRS_PER_PMD; i++) {
-               pmd = pmd_start + i;
-               if (!pmd_none(*pmd))
-                       return;
-       }
-
-       pmd_free(&init_mm, pmd_start);
-       pud_clear(pud);
-}
-
-struct change_mapping_params {
-       pte_t *pte;
-       unsigned long start;
-       unsigned long end;
-       unsigned long aligned_start;
-       unsigned long aligned_end;
-};
-
-static int __meminit stop_machine_change_mapping(void *data)
-{
-       struct change_mapping_params *params =
-                       (struct change_mapping_params *)data;
-
-       if (!data)
-               return -1;
-
-       spin_unlock(&init_mm.page_table_lock);
-       pte_clear(&init_mm, params->aligned_start, params->pte);
-       create_physical_mapping(params->aligned_start, params->start, -1);
-       create_physical_mapping(params->end, params->aligned_end, -1);
-       spin_lock(&init_mm.page_table_lock);
-       return 0;
-}
-
-static void remove_pte_table(pte_t *pte_start, unsigned long addr,
-                            unsigned long end)
-{
-       unsigned long next;
-       pte_t *pte;
-
-       pte = pte_start + pte_index(addr);
-       for (; addr < end; addr = next, pte++) {
-               next = (addr + PAGE_SIZE) & PAGE_MASK;
-               if (next > end)
-                       next = end;
-
-               if (!pte_present(*pte))
-                       continue;
-
-               if (!PAGE_ALIGNED(addr) || !PAGE_ALIGNED(next)) {
-                       /*
-                        * The vmemmap_free() and remove_section_mapping()
-                        * codepaths call us with aligned addresses.
-                        */
-                       WARN_ONCE(1, "%s: unaligned range\n", __func__);
-                       continue;
-               }
-
-               pte_clear(&init_mm, addr, pte);
-       }
-}
-
-/*
- * Helper to clear the pte and potentially split the mapping.
- */
-static void __meminit split_kernel_mapping(unsigned long addr, unsigned long end,
-                               unsigned long size, pte_t *pte)
-{
-       unsigned long mask = ~(size - 1);
-       unsigned long aligned_start = addr & mask;
-       unsigned long aligned_end = addr + size;
-       struct change_mapping_params params;
-       bool split_region = false;
-
-       if ((end - addr) < size) {
-               /*
-                * We're going to clear the PTE, but we have not flushed
-                * the mapping yet, so it is time to remap and flush. If
-                * the effects are visible outside the processor, or if
-                * we are running in code close to the mapping we
-                * cleared, we are in trouble.
-                */
-               if (overlaps_kernel_text(aligned_start, addr) ||
-                       overlaps_kernel_text(end, aligned_end)) {
-                       /*
-                        * Hack, just return, don't pte_clear
-                        */
-                       WARN_ONCE(1, "Linear mapping %lx->%lx overlaps kernel "
-                                 "text, not splitting\n", addr, end);
-                       return;
-               }
-               split_region = true;
-       }
-
-       if (split_region) {
-               params.pte = pte;
-               params.start = addr;
-               params.end = end;
-               params.aligned_start = addr & ~(size - 1);
-               params.aligned_end = min_t(unsigned long, aligned_end,
-                               (unsigned long)__va(memblock_end_of_DRAM()));
-               stop_machine(stop_machine_change_mapping, &params, NULL);
-               return;
-       }
-
-       pte_clear(&init_mm, addr, pte);
-}
-
-static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
-                            unsigned long end)
-{
-       unsigned long next;
-       pte_t *pte_base;
-       pmd_t *pmd;
-
-       pmd = pmd_start + pmd_index(addr);
-       for (; addr < end; addr = next, pmd++) {
-               next = pmd_addr_end(addr, end);
-
-               if (!pmd_present(*pmd))
-                       continue;
-
-               if (pmd_huge(*pmd)) {
-                       split_kernel_mapping(addr, end, PMD_SIZE, (pte_t *)pmd);
-                       continue;
-               }
-
-               pte_base = (pte_t *)pmd_page_vaddr(*pmd);
-               remove_pte_table(pte_base, addr, next);
-               free_pte_table(pte_base, pmd);
-       }
-}
-
-static void remove_pud_table(pud_t *pud_start, unsigned long addr,
-                            unsigned long end)
-{
-       unsigned long next;
-       pmd_t *pmd_base;
-       pud_t *pud;
-
-       pud = pud_start + pud_index(addr);
-       for (; addr < end; addr = next, pud++) {
-               next = pud_addr_end(addr, end);
-
-               if (!pud_present(*pud))
-                       continue;
-
-               if (pud_huge(*pud)) {
-                       split_kernel_mapping(addr, end, PUD_SIZE, (pte_t *)pud);
-                       continue;
-               }
-
-               pmd_base = (pmd_t *)pud_page_vaddr(*pud);
-               remove_pmd_table(pmd_base, addr, next);
-               free_pmd_table(pmd_base, pud);
-       }
-}
-
-static void __meminit remove_pagetable(unsigned long start, unsigned long end)
-{
-       unsigned long addr, next;
-       pud_t *pud_base;
-       pgd_t *pgd;
-
-       spin_lock(&init_mm.page_table_lock);
-
-       for (addr = start; addr < end; addr = next) {
-               next = pgd_addr_end(addr, end);
-
-               pgd = pgd_offset_k(addr);
-               if (!pgd_present(*pgd))
-                       continue;
-
-               if (pgd_huge(*pgd)) {
-                       split_kernel_mapping(addr, end, PGDIR_SIZE, (pte_t *)pgd);
-                       continue;
-               }
-
-               pud_base = (pud_t *)pgd_page_vaddr(*pgd);
-               remove_pud_table(pud_base, addr, next);
-       }
-
-       spin_unlock(&init_mm.page_table_lock);
-       radix__flush_tlb_kernel_range(start, end);
-}
-
-int __meminit radix__create_section_mapping(unsigned long start, unsigned long end, int nid)
-{
-       if (end >= RADIX_VMALLOC_START) {
-               pr_warn("Outside the supported range\n");
-               return -1;
-       }
-
-       return create_physical_mapping(start, end, nid);
-}
-
-int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end)
-{
-       remove_pagetable(start, end);
-       return 0;
-}
-#endif /* CONFIG_MEMORY_HOTPLUG */
-
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
-static int __map_kernel_page_nid(unsigned long ea, unsigned long pa,
-                                pgprot_t flags, unsigned int map_page_size,
-                                int nid)
-{
-       return __map_kernel_page(ea, pa, flags, map_page_size, nid, 0, 0);
-}
-
-int __meminit radix__vmemmap_create_mapping(unsigned long start,
-                                     unsigned long page_size,
-                                     unsigned long phys)
-{
-       /* Create a PTE encoding */
-       unsigned long flags = _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_KERNEL_RW;
-       int nid = early_pfn_to_nid(phys >> PAGE_SHIFT);
-       int ret;
-
-       if ((start + page_size) >= RADIX_VMEMMAP_END) {
-               pr_warn("Outside the supported range\n");
-               return -1;
-       }
-
-       ret = __map_kernel_page_nid(start, phys, __pgprot(flags), page_size, nid);
-       BUG_ON(ret);
-
-       return 0;
-}
-
-#ifdef CONFIG_MEMORY_HOTPLUG
-void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size)
-{
-       remove_pagetable(start, start + page_size);
-}
-#endif
-#endif
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-
-unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
-                                 pmd_t *pmdp, unsigned long clr,
-                                 unsigned long set)
-{
-       unsigned long old;
-
-#ifdef CONFIG_DEBUG_VM
-       WARN_ON(!radix__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
-       assert_spin_locked(pmd_lockptr(mm, pmdp));
-#endif
-
-       old = radix__pte_update(mm, addr, (pte_t *)pmdp, clr, set, 1);
-       trace_hugepage_update(addr, old, clr, set);
-
-       return old;
-}
-
-pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
-                       pmd_t *pmdp)
-
-{
-       pmd_t pmd;
-
-       VM_BUG_ON(address & ~HPAGE_PMD_MASK);
-       VM_BUG_ON(radix__pmd_trans_huge(*pmdp));
-       VM_BUG_ON(pmd_devmap(*pmdp));
-       /*
-        * khugepaged calls this for normal pmd
-        */
-       pmd = *pmdp;
-       pmd_clear(pmdp);
-
-       /*FIXME!!  Verify whether we need this kick below */
-       serialize_against_pte_lookup(vma->vm_mm);
-
-       radix__flush_tlb_collapsed_pmd(vma->vm_mm, address);
-
-       return pmd;
-}
-
-/*
- * For us pgtable_t is pte_t *. In order to save the deposited
- * page table, we consider the allocated page table as a list
- * head. On withdraw we need to make sure we zero out the used
- * list_head memory area.
- */
-void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
-                                pgtable_t pgtable)
-{
-        struct list_head *lh = (struct list_head *) pgtable;
-
-        assert_spin_locked(pmd_lockptr(mm, pmdp));
-
-        /* FIFO */
-        if (!pmd_huge_pte(mm, pmdp))
-                INIT_LIST_HEAD(lh);
-        else
-                list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
-        pmd_huge_pte(mm, pmdp) = pgtable;
-}
-
-pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
-{
-        pte_t *ptep;
-        pgtable_t pgtable;
-        struct list_head *lh;
-
-        assert_spin_locked(pmd_lockptr(mm, pmdp));
-
-        /* FIFO */
-        pgtable = pmd_huge_pte(mm, pmdp);
-        lh = (struct list_head *) pgtable;
-        if (list_empty(lh))
-                pmd_huge_pte(mm, pmdp) = NULL;
-        else {
-                pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
-                list_del(lh);
-        }
-        ptep = (pte_t *) pgtable;
-        *ptep = __pte(0);
-        ptep++;
-        *ptep = __pte(0);
-        return pgtable;
-}
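The comment above is the whole trick: while a page table is deposited, the storage of its first two PTE slots is reused as a struct list_head, and withdraw scrubs exactly those two slots back to empty PTEs (the pair of __pte(0) stores). A userspace sketch with a small array standing in for the pte fragment:

#include <stdio.h>
#include <string.h>

/* Stand-in for the pte fragment that pgtable_t points at on radix. */
typedef unsigned long demo_pte_t;

struct list_head {
	struct list_head *next, *prev;
};

int main(void)
{
	static demo_pte_t fragment[512];	/* the "deposited" page table */
	struct list_head *lh = (struct list_head *)fragment;

	/* While deposited, the first two PTE slots double as list links. */
	lh->next = lh;
	lh->prev = lh;
	printf("links stored inside the fragment at %p\n", (void *)lh->next);

	/* On withdraw, exactly those two slots are scrubbed back to empty
	 * PTEs, mirroring the two __pte(0) stores above. */
	memset(fragment, 0, sizeof(struct list_head));
	printf("after withdraw: pte[0]=%lx pte[1]=%lx\n",
	       fragment[0], fragment[1]);
	return 0;
}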
-
-
-pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
-                              unsigned long addr, pmd_t *pmdp)
-{
-       pmd_t old_pmd;
-       unsigned long old;
-
-       old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
-       old_pmd = __pmd(old);
-       /*
-        * Serialize against find_current_mm_pte which does lock-less
-        * lookup in page tables with local interrupts disabled. For huge pages
-        * it casts pmd_t to pte_t. Since format of pte_t is different from
-        * pmd_t we want to prevent transit from pmd pointing to page table
-        * to pmd pointing to huge page (and back) while interrupts are disabled.
-        * We clear pmd to possibly replace it with page table pointer in
-        * different code paths. So make sure we wait for the parallel
-        * find_current_mm_pte to finish.
-        */
-       serialize_against_pte_lookup(mm);
-       return old_pmd;
-}
-
-int radix__has_transparent_hugepage(void)
-{
-       /* For radix, 2M at the PMD level means THP */
-       if (mmu_psize_defs[MMU_PAGE_2M].shift == PMD_SHIFT)
-               return 1;
-       return 0;
-}
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-
-void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep,
-                                 pte_t entry, unsigned long address, int psize)
-{
-       struct mm_struct *mm = vma->vm_mm;
-       unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_ACCESSED |
-                                             _PAGE_RW | _PAGE_EXEC);
-
-       unsigned long change = pte_val(entry) ^ pte_val(*ptep);
-       /*
-        * To avoid an NMMU hang while relaxing access, we need to mark
-        * the pte invalid in between.
-        */
-       if ((change & _PAGE_RW) && atomic_read(&mm->context.copros) > 0) {
-               unsigned long old_pte, new_pte;
-
-               old_pte = __radix_pte_update(ptep, _PAGE_PRESENT, _PAGE_INVALID);
-               /*
-                * new value of pte
-                */
-               new_pte = old_pte | set;
-               radix__flush_tlb_page_psize(mm, address, psize);
-               __radix_pte_update(ptep, _PAGE_INVALID, new_pte);
-       } else {
-               __radix_pte_update(ptep, 0, set);
-               /*
-                * Book3S does not require a TLB flush when relaxing access
-                * restrictions when the address space is not attached to a
-                * NMMU, because the core MMU will reload the pte after taking
-                * an access fault, which is defined by the architecture.
-                */
-       }
-       /* See ptesync comment in radix__set_pte_at */
-}
-
-void radix__ptep_modify_prot_commit(struct vm_area_struct *vma,
-                                   unsigned long addr, pte_t *ptep,
-                                   pte_t old_pte, pte_t pte)
-{
-       struct mm_struct *mm = vma->vm_mm;
-
-       /*
-        * To avoid an NMMU hang while relaxing access we need to flush the TLB
-        * before we set the new value. We need to do this only for radix,
-        * because hash translation does the flush when updating the Linux pte.
-        */
-       if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
-           (atomic_read(&mm->context.copros) > 0))
-               radix__flush_tlb_page(vma, addr);
-
-       set_pte_at(mm, addr, ptep, pte);
-}
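The flush-before-commit above only matters when write permission is being added while a coprocessor context is attached. A sketch of the "rw upgrade" test itself, using illustrative permission bits rather than the real _PAGE_* values:

#include <stdbool.h>
#include <stdio.h>

/* Illustrative permission bits, not the kernel's real _PAGE_* values. */
#define DEMO_READ	0x1UL
#define DEMO_WRITE	0x2UL

/* A change is an "rw upgrade" when write permission is being added;
 * only then is the extra TLB flush needed before installing the new
 * value (a sketch of the is_pte_rw_upgrade() idea). */
static bool is_rw_upgrade(unsigned long old_pte, unsigned long new_pte)
{
	return !(old_pte & DEMO_WRITE) && (new_pte & DEMO_WRITE);
}

int main(void)
{
	unsigned long ro = DEMO_READ;
	unsigned long rw = DEMO_READ | DEMO_WRITE;

	printf("ro -> rw: upgrade? %d\n", is_rw_upgrade(ro, rw));	/* 1 */
	printf("rw -> ro: upgrade? %d\n", is_rw_upgrade(rw, ro));	/* 0 */
	printf("rw -> rw: upgrade? %d\n", is_rw_upgrade(rw, rw));	/* 0 */
	return 0;
}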
diff --git a/arch/powerpc/mm/pkeys.c b/arch/powerpc/mm/pkeys.c
deleted file mode 100644 (file)
index ae7fca4..0000000
+++ /dev/null
@@ -1,428 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0+
-/*
- * PowerPC Memory Protection Keys management
- *
- * Copyright 2017, Ram Pai, IBM Corporation.
- */
-
-#include <asm/mman.h>
-#include <asm/mmu_context.h>
-#include <asm/mmu.h>
-#include <asm/setup.h>
-#include <linux/pkeys.h>
-#include <linux/of_device.h>
-
-DEFINE_STATIC_KEY_TRUE(pkey_disabled);
-int  pkeys_total;              /* Total pkeys as per device tree */
-u32  initial_allocation_mask;   /* Bits set for the initially allocated keys */
-u32  reserved_allocation_mask;  /* Bits set for reserved keys */
-static bool pkey_execute_disable_supported;
-static bool pkeys_devtree_defined;     /* property exported by device tree */
-static u64 pkey_amr_mask;              /* Bits in AMR not to be touched */
-static u64 pkey_iamr_mask;             /* Bits in IAMR not to be touched */
-static u64 pkey_uamor_mask;            /* Bits in UAMOR not to be touched */
-static int execute_only_key = 2;
-
-#define AMR_BITS_PER_PKEY 2
-#define AMR_RD_BIT 0x1UL
-#define AMR_WR_BIT 0x2UL
-#define IAMR_EX_BIT 0x1UL
-#define PKEY_REG_BITS (sizeof(u64)*8)
-#define pkeyshift(pkey) (PKEY_REG_BITS - ((pkey+1) * AMR_BITS_PER_PKEY))
-
-static void scan_pkey_feature(void)
-{
-       u32 vals[2];
-       struct device_node *cpu;
-
-       cpu = of_find_node_by_type(NULL, "cpu");
-       if (!cpu)
-               return;
-
-       if (of_property_read_u32_array(cpu,
-                       "ibm,processor-storage-keys", vals, 2))
-               return;
-
-       /*
-        * Since any pkey can be used for data or execute, we will just treat
-        * all keys as equal and track them as one entity.
-        */
-       pkeys_total = vals[0];
-       pkeys_devtree_defined = true;
-}
-
-static inline bool pkey_mmu_enabled(void)
-{
-       if (firmware_has_feature(FW_FEATURE_LPAR))
-               return pkeys_total;
-       else
-               return cpu_has_feature(CPU_FTR_PKEY);
-}
-
-static int pkey_initialize(void)
-{
-       int os_reserved, i;
-
-       /*
-        * We define PKEY_DISABLE_EXECUTE in addition to the arch-neutral
-        * generic defines for PKEY_DISABLE_ACCESS and PKEY_DISABLE_WRITE.
-        * Ensure that the bits are distinct.
-        */
-       BUILD_BUG_ON(PKEY_DISABLE_EXECUTE &
-                    (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE));
-
-       /*
-        * pkey_to_vmflag_bits() assumes that the pkey bits are contiguous
-        * in the vmaflag. Make sure that is really the case.
-        */
-       BUILD_BUG_ON(__builtin_clzl(ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT) +
-                    __builtin_popcountl(ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT)
-                               != (sizeof(u64) * BITS_PER_BYTE));
-
-       /* scan the device tree for pkey feature */
-       scan_pkey_feature();
-
-       /*
-        * Let's assume 32 pkeys on P8 bare metal, if it's not defined by device
-        * tree. We make this exception since skiboot forgot to expose this
-        * property on power8.
-        */
-       if (!pkeys_devtree_defined && !firmware_has_feature(FW_FEATURE_LPAR) &&
-                       cpu_has_feature(CPU_FTRS_POWER8))
-               pkeys_total = 32;
-
-       /*
-        * Adjust the upper limit, based on the number of bits supported by
-        * arch-neutral code.
-        */
-       pkeys_total = min_t(int, pkeys_total,
-                       ((ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT)+1));
-
-       if (!pkey_mmu_enabled() || radix_enabled() || !pkeys_total)
-               static_branch_enable(&pkey_disabled);
-       else
-               static_branch_disable(&pkey_disabled);
-
-       if (static_branch_likely(&pkey_disabled))
-               return 0;
-
-       /*
-        * The device tree cannot be relied upon to indicate support for
-        * execute_disable. Instead we use a PVR check.
-        */
-       if (pvr_version_is(PVR_POWER7) || pvr_version_is(PVR_POWER7p))
-               pkey_execute_disable_supported = false;
-       else
-               pkey_execute_disable_supported = true;
-
-#ifdef CONFIG_PPC_4K_PAGES
-       /*
-        * The OS can manage only 8 pkeys due to its inability to represent them
-        * in the Linux 4K PTE.
-        */
-       os_reserved = pkeys_total - 8;
-#else
-       os_reserved = 0;
-#endif
-       /* Bits are in LE format. */
-       reserved_allocation_mask = (0x1 << 1) | (0x1 << execute_only_key);
-
-       /* register mask is in BE format */
-       pkey_amr_mask = ~0x0ul;
-       pkey_amr_mask &= ~(0x3ul << pkeyshift(0));
-
-       pkey_iamr_mask = ~0x0ul;
-       pkey_iamr_mask &= ~(0x3ul << pkeyshift(0));
-       pkey_iamr_mask &= ~(0x3ul << pkeyshift(execute_only_key));
-
-       pkey_uamor_mask = ~0x0ul;
-       pkey_uamor_mask &= ~(0x3ul << pkeyshift(0));
-       pkey_uamor_mask &= ~(0x3ul << pkeyshift(execute_only_key));
-
-       /* mark the rest of the keys as reserved and hence unavailable */
-       for (i = (pkeys_total - os_reserved); i < pkeys_total; i++) {
-               reserved_allocation_mask |= (0x1 << i);
-               pkey_uamor_mask &= ~(0x3ul << pkeyshift(i));
-       }
-       initial_allocation_mask = reserved_allocation_mask | (0x1 << 0);
-
-       if (unlikely((pkeys_total - os_reserved) <= execute_only_key)) {
-               /*
-                * Insufficient number of keys to support
-                * an execute-only key. Mark it unavailable.
-                * Any AMR, UAMOR, IAMR bit set for
-                * this key is irrelevant since this key
-                * can never be allocated.
-                */
-               execute_only_key = -1;
-       }
-
-       return 0;
-}
-
-arch_initcall(pkey_initialize);
-
-void pkey_mm_init(struct mm_struct *mm)
-{
-       if (static_branch_likely(&pkey_disabled))
-               return;
-       mm_pkey_allocation_map(mm) = initial_allocation_mask;
-       mm->context.execute_only_pkey = execute_only_key;
-}
-
-static inline u64 read_amr(void)
-{
-       return mfspr(SPRN_AMR);
-}
-
-static inline void write_amr(u64 value)
-{
-       mtspr(SPRN_AMR, value);
-}
-
-static inline u64 read_iamr(void)
-{
-       if (!likely(pkey_execute_disable_supported))
-               return 0x0UL;
-
-       return mfspr(SPRN_IAMR);
-}
-
-static inline void write_iamr(u64 value)
-{
-       if (!likely(pkey_execute_disable_supported))
-               return;
-
-       mtspr(SPRN_IAMR, value);
-}
-
-static inline u64 read_uamor(void)
-{
-       return mfspr(SPRN_UAMOR);
-}
-
-static inline void write_uamor(u64 value)
-{
-       mtspr(SPRN_UAMOR, value);
-}
-
-static bool is_pkey_enabled(int pkey)
-{
-       u64 uamor = read_uamor();
-       u64 pkey_bits = 0x3ul << pkeyshift(pkey);
-       u64 uamor_pkey_bits = (uamor & pkey_bits);
-
-       /*
-        * Both the bits in UAMOR corresponding to the key should be set or
-        * reset.
-        */
-       WARN_ON(uamor_pkey_bits && (uamor_pkey_bits != pkey_bits));
-       return !!(uamor_pkey_bits);
-}
-
-static inline void init_amr(int pkey, u8 init_bits)
-{
-       u64 new_amr_bits = (((u64)init_bits & 0x3UL) << pkeyshift(pkey));
-       u64 old_amr = read_amr() & ~((u64)(0x3ul) << pkeyshift(pkey));
-
-       write_amr(old_amr | new_amr_bits);
-}
-
-static inline void init_iamr(int pkey, u8 init_bits)
-{
-       u64 new_iamr_bits = (((u64)init_bits & 0x1UL) << pkeyshift(pkey));
-       u64 old_iamr = read_iamr() & ~((u64)(0x1ul) << pkeyshift(pkey));
-
-       write_iamr(old_iamr | new_iamr_bits);
-}
-
-/*
- * Set the access rights in the AMR, IAMR and UAMOR registers for @pkey to that
- * specified in @init_val.
- */
-int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
-                               unsigned long init_val)
-{
-       u64 new_amr_bits = 0x0ul;
-       u64 new_iamr_bits = 0x0ul;
-
-       if (!is_pkey_enabled(pkey))
-               return -EINVAL;
-
-       if (init_val & PKEY_DISABLE_EXECUTE) {
-               if (!pkey_execute_disable_supported)
-                       return -EINVAL;
-               new_iamr_bits |= IAMR_EX_BIT;
-       }
-       init_iamr(pkey, new_iamr_bits);
-
-       /* Set the bits we need in AMR: */
-       if (init_val & PKEY_DISABLE_ACCESS)
-               new_amr_bits |= AMR_RD_BIT | AMR_WR_BIT;
-       else if (init_val & PKEY_DISABLE_WRITE)
-               new_amr_bits |= AMR_WR_BIT;
-
-       init_amr(pkey, new_amr_bits);
-       return 0;
-}
-
-void thread_pkey_regs_save(struct thread_struct *thread)
-{
-       if (static_branch_likely(&pkey_disabled))
-               return;
-
-       /*
-        * TODO: Skip saving registers if @thread hasn't used any keys yet.
-        */
-       thread->amr = read_amr();
-       thread->iamr = read_iamr();
-       thread->uamor = read_uamor();
-}
-
-void thread_pkey_regs_restore(struct thread_struct *new_thread,
-                             struct thread_struct *old_thread)
-{
-       if (static_branch_likely(&pkey_disabled))
-               return;
-
-       if (old_thread->amr != new_thread->amr)
-               write_amr(new_thread->amr);
-       if (old_thread->iamr != new_thread->iamr)
-               write_iamr(new_thread->iamr);
-       if (old_thread->uamor != new_thread->uamor)
-               write_uamor(new_thread->uamor);
-}
-
-void thread_pkey_regs_init(struct thread_struct *thread)
-{
-       if (static_branch_likely(&pkey_disabled))
-               return;
-
-       thread->amr = pkey_amr_mask;
-       thread->iamr = pkey_iamr_mask;
-       thread->uamor = pkey_uamor_mask;
-
-       write_uamor(pkey_uamor_mask);
-       write_amr(pkey_amr_mask);
-       write_iamr(pkey_iamr_mask);
-}
-
-static inline bool pkey_allows_readwrite(int pkey)
-{
-       int pkey_shift = pkeyshift(pkey);
-
-       if (!is_pkey_enabled(pkey))
-               return true;
-
-       return !(read_amr() & ((AMR_RD_BIT|AMR_WR_BIT) << pkey_shift));
-}
-
-int __execute_only_pkey(struct mm_struct *mm)
-{
-       return mm->context.execute_only_pkey;
-}
-
-static inline bool vma_is_pkey_exec_only(struct vm_area_struct *vma)
-{
-       /* Do this check first since the vm_flags should be hot */
-       if ((vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) != VM_EXEC)
-               return false;
-
-       return (vma_pkey(vma) == vma->vm_mm->context.execute_only_pkey);
-}
-
-/*
- * This should only be called for *plain* mprotect calls.
- */
-int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot,
-                                 int pkey)
-{
-       /*
-        * If the currently associated pkey is execute-only, but the requested
-        * protection is not execute-only, move it back to the default pkey.
-        */
-       if (vma_is_pkey_exec_only(vma) && (prot != PROT_EXEC))
-               return 0;
-
-       /*
-        * The requested protection is execute-only. Hence let's use an
-        * execute-only pkey.
-        */
-       if (prot == PROT_EXEC) {
-               pkey = execute_only_pkey(vma->vm_mm);
-               if (pkey > 0)
-                       return pkey;
-       }
-
-       /* Nothing to override. */
-       return vma_pkey(vma);
-}
-
-static bool pkey_access_permitted(int pkey, bool write, bool execute)
-{
-       int pkey_shift;
-       u64 amr;
-
-       if (!is_pkey_enabled(pkey))
-               return true;
-
-       pkey_shift = pkeyshift(pkey);
-       if (execute && !(read_iamr() & (IAMR_EX_BIT << pkey_shift)))
-               return true;
-
-       amr = read_amr(); /* Delay reading amr until absolutely needed */
-       return ((!write && !(amr & (AMR_RD_BIT << pkey_shift))) ||
-               (write &&  !(amr & (AMR_WR_BIT << pkey_shift))));
-}
-
-bool arch_pte_access_permitted(u64 pte, bool write, bool execute)
-{
-       if (static_branch_likely(&pkey_disabled))
-               return true;
-
-       return pkey_access_permitted(pte_to_pkey_bits(pte), write, execute);
-}
-
-/*
- * We only want to enforce protection keys on the current thread because we
- * effectively have no access to AMR/IAMR for other threads or any way to tell
- * which AMR/IAMR in a threaded process we could use.
- *
- * So do not enforce things if the VMA is not from the current mm, or if we are
- * in a kernel thread.
- */
-static inline bool vma_is_foreign(struct vm_area_struct *vma)
-{
-       if (!current->mm)
-               return true;
-
-       /* if it is not our ->mm, it has to be foreign */
-       if (current->mm != vma->vm_mm)
-               return true;
-
-       return false;
-}
-
-bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write,
-                              bool execute, bool foreign)
-{
-       if (static_branch_likely(&pkey_disabled))
-               return true;
-       /*
-        * Do not enforce our key-permissions on a foreign vma.
-        */
-       if (foreign || vma_is_foreign(vma))
-               return true;
-
-       return pkey_access_permitted(vma_pkey(vma), write, execute);
-}
-
-void arch_dup_pkeys(struct mm_struct *oldmm, struct mm_struct *mm)
-{
-       if (static_branch_likely(&pkey_disabled))
-               return;
-
-       /* Duplicate the oldmm pkey state in mm: */
-       mm_pkey_allocation_map(mm) = mm_pkey_allocation_map(oldmm);
-       mm->context.execute_only_pkey = oldmm->context.execute_only_pkey;
-}
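
For context, the routines above are what back the generic memory-protection-key syscalls on hash-MMU systems. A hedged user-space sketch of how they are typically exercised, using the glibc wrappers pkey_alloc/pkey_mprotect/pkey_free (available since glibc 2.27; the allocation simply fails on kernels or MMU modes without pkey support, e.g. radix, which pkey_initialize() above disables):

#define _GNU_SOURCE
#include <sys/mman.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, page, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED)
		return 1;

	/* Ask for a key whose default rights forbid all data access. */
	int pkey = pkey_alloc(0, PKEY_DISABLE_ACCESS);
	if (pkey < 0) {
		perror("pkey_alloc");	/* no pkey support on this system */
		return 1;
	}

	/*
	 * Tag the mapping with the key; loads and stores to it now fault
	 * until the rights are relaxed again (e.g. with pkey_set()).
	 */
	if (pkey_mprotect(p, page, PROT_READ | PROT_WRITE, pkey))
		perror("pkey_mprotect");

	pkey_free(pkey);
	munmap(p, page);
	return 0;
}
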
diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c
deleted file mode 100644 (file)
index 89e4531..0000000
+++ /dev/null
@@ -1,832 +0,0 @@
-/*
- * PowerPC64 SLB support.
- *
- * Copyright (C) 2004 David Gibson <dwg@au.ibm.com>, IBM
- * Based on earlier code written by:
- * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com
- *    Copyright (c) 2001 Dave Engebretsen
- * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
- *
- *
- *      This program is free software; you can redistribute it and/or
- *      modify it under the terms of the GNU General Public License
- *      as published by the Free Software Foundation; either version
- *      2 of the License, or (at your option) any later version.
- */
-
-#include <asm/asm-prototypes.h>
-#include <asm/pgtable.h>
-#include <asm/mmu.h>
-#include <asm/mmu_context.h>
-#include <asm/paca.h>
-#include <asm/ppc-opcode.h>
-#include <asm/cputable.h>
-#include <asm/cacheflush.h>
-#include <asm/smp.h>
-#include <linux/compiler.h>
-#include <linux/context_tracking.h>
-#include <linux/mm_types.h>
-
-#include <asm/udbg.h>
-#include <asm/code-patching.h>
-
-enum slb_index {
-       LINEAR_INDEX    = 0, /* Kernel linear map  (0xc000000000000000) */
-       KSTACK_INDEX    = 1, /* Kernel stack map */
-};
-
-static long slb_allocate_user(struct mm_struct *mm, unsigned long ea);
-
-#define slb_esid_mask(ssize)   \
-       (((ssize) == MMU_SEGSIZE_256M)? ESID_MASK: ESID_MASK_1T)
-
-static inline unsigned long mk_esid_data(unsigned long ea, int ssize,
-                                        enum slb_index index)
-{
-       return (ea & slb_esid_mask(ssize)) | SLB_ESID_V | index;
-}
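
mk_esid_data() keeps only the segment-sized top bits of the effective address and ORs in the valid bit and the SLB slot index (SLB_ESID_V and the index field positions live in the mmu headers, so they are not spelled out here). The masking itself is plain shift arithmetic: a 256MB segment is identified by EA bits above bit 28 and a 1T segment by bits above bit 40, the same SID_SHIFT values the preload code later in this file relies on. A small standalone sketch of just that part:

#include <stdio.h>
#include <stdint.h>

#define SID_SHIFT	28	/* 256MB segments */
#define SID_SHIFT_1T	40	/* 1T segments */

/* Keep only the effective segment ID bits of an address. */
static uint64_t segment_base(uint64_t ea, int one_t)
{
	int shift = one_t ? SID_SHIFT_1T : SID_SHIFT;

	return (ea >> shift) << shift;
}

int main(void)
{
	uint64_t ea = 0x000000001234abcdULL;

	printf("256M segment base: 0x%016llx\n",
	       (unsigned long long)segment_base(ea, 0));
	printf("  1T segment base: 0x%016llx\n",
	       (unsigned long long)segment_base(ea, 1));
	return 0;
}
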
-
-static inline unsigned long __mk_vsid_data(unsigned long vsid, int ssize,
-                                        unsigned long flags)
-{
-       return (vsid << slb_vsid_shift(ssize)) | flags |
-               ((unsigned long) ssize << SLB_VSID_SSIZE_SHIFT);
-}
-
-static inline unsigned long mk_vsid_data(unsigned long ea, int ssize,
-                                        unsigned long flags)
-{
-       return __mk_vsid_data(get_kernel_vsid(ea, ssize), ssize, flags);
-}
-
-static void assert_slb_presence(bool present, unsigned long ea)
-{
-#ifdef CONFIG_DEBUG_VM
-       unsigned long tmp;
-
-       WARN_ON_ONCE(mfmsr() & MSR_EE);
-
-       if (!cpu_has_feature(CPU_FTR_ARCH_206))
-               return;
-
-       /*
-        * slbfee. requires bit 24 (PPC bit 39) be clear in RB. Hardware
-        * ignores all other bits from 0-27, so just clear them all.
-        */
-       ea &= ~((1UL << 28) - 1);
-       asm volatile(__PPC_SLBFEE_DOT(%0, %1) : "=r"(tmp) : "r"(ea) : "cr0");
-
-       WARN_ON(present == (tmp == 0));
-#endif
-}
-
-static inline void slb_shadow_update(unsigned long ea, int ssize,
-                                    unsigned long flags,
-                                    enum slb_index index)
-{
-       struct slb_shadow *p = get_slb_shadow();
-
-       /*
-        * Clear the ESID first so the entry is not valid while we are
-        * updating it.  No write barriers are needed here, provided
-        * we only update the current CPU's SLB shadow buffer.
-        */
-       WRITE_ONCE(p->save_area[index].esid, 0);
-       WRITE_ONCE(p->save_area[index].vsid, cpu_to_be64(mk_vsid_data(ea, ssize, flags)));
-       WRITE_ONCE(p->save_area[index].esid, cpu_to_be64(mk_esid_data(ea, ssize, index)));
-}
-
-static inline void slb_shadow_clear(enum slb_index index)
-{
-       WRITE_ONCE(get_slb_shadow()->save_area[index].esid, cpu_to_be64(index));
-}
-
-static inline void create_shadowed_slbe(unsigned long ea, int ssize,
-                                       unsigned long flags,
-                                       enum slb_index index)
-{
-       /*
-        * Updating the shadow buffer before writing the SLB ensures
-        * we don't get a stale entry here if we get preempted by PHYP
-        * between these two statements.
-        */
-       slb_shadow_update(ea, ssize, flags, index);
-
-       assert_slb_presence(false, ea);
-       asm volatile("slbmte  %0,%1" :
-                    : "r" (mk_vsid_data(ea, ssize, flags)),
-                      "r" (mk_esid_data(ea, ssize, index))
-                    : "memory" );
-}
-
-/*
- * Insert bolted entries into SLB (which may not be empty, so don't clear
- * slb_cache_ptr).
- */
-void __slb_restore_bolted_realmode(void)
-{
-       struct slb_shadow *p = get_slb_shadow();
-       enum slb_index index;
-
-       /* No isync needed because we are in real mode. */
-       for (index = 0; index < SLB_NUM_BOLTED; index++) {
-               asm volatile("slbmte  %0,%1" :
-                    : "r" (be64_to_cpu(p->save_area[index].vsid)),
-                      "r" (be64_to_cpu(p->save_area[index].esid)));
-       }
-
-       assert_slb_presence(true, local_paca->kstack);
-}
-
-/*
- * Insert the bolted entries into an empty SLB.
- */
-void slb_restore_bolted_realmode(void)
-{
-       __slb_restore_bolted_realmode();
-       get_paca()->slb_cache_ptr = 0;
-
-       get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1;
-       get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap;
-}
-
-/*
- * This flushes all SLB entries including 0, so it must be realmode.
- */
-void slb_flush_all_realmode(void)
-{
-       asm volatile("slbmte %0,%0; slbia" : : "r" (0));
-}
-
-/*
- * This flushes non-bolted entries, it can be run in virtual mode. Must
- * be called with interrupts disabled.
- */
-void slb_flush_and_restore_bolted(void)
-{
-       struct slb_shadow *p = get_slb_shadow();
-
-       BUILD_BUG_ON(SLB_NUM_BOLTED != 2);
-
-       WARN_ON(!irqs_disabled());
-
-       /*
-        * We can't take a PMU exception in the following code, so hard
-        * disable interrupts.
-        */
-       hard_irq_disable();
-
-       asm volatile("isync\n"
-                    "slbia\n"
-                    "slbmte  %0, %1\n"
-                    "isync\n"
-                    :: "r" (be64_to_cpu(p->save_area[KSTACK_INDEX].vsid)),
-                       "r" (be64_to_cpu(p->save_area[KSTACK_INDEX].esid))
-                    : "memory");
-       assert_slb_presence(true, get_paca()->kstack);
-
-       get_paca()->slb_cache_ptr = 0;
-
-       get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1;
-       get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap;
-}
-
-void slb_save_contents(struct slb_entry *slb_ptr)
-{
-       int i;
-       unsigned long e, v;
-
-       /* Save slb_cache_ptr value. */
-       get_paca()->slb_save_cache_ptr = get_paca()->slb_cache_ptr;
-
-       if (!slb_ptr)
-               return;
-
-       for (i = 0; i < mmu_slb_size; i++) {
-               asm volatile("slbmfee  %0,%1" : "=r" (e) : "r" (i));
-               asm volatile("slbmfev  %0,%1" : "=r" (v) : "r" (i));
-               slb_ptr->esid = e;
-               slb_ptr->vsid = v;
-               slb_ptr++;
-       }
-}
-
-void slb_dump_contents(struct slb_entry *slb_ptr)
-{
-       int i, n;
-       unsigned long e, v;
-       unsigned long llp;
-
-       if (!slb_ptr)
-               return;
-
-       pr_err("SLB contents of cpu 0x%x\n", smp_processor_id());
-       pr_err("Last SLB entry inserted at slot %d\n", get_paca()->stab_rr);
-
-       for (i = 0; i < mmu_slb_size; i++) {
-               e = slb_ptr->esid;
-               v = slb_ptr->vsid;
-               slb_ptr++;
-
-               if (!e && !v)
-                       continue;
-
-               pr_err("%02d %016lx %016lx\n", i, e, v);
-
-               if (!(e & SLB_ESID_V)) {
-                       pr_err("\n");
-                       continue;
-               }
-               llp = v & SLB_VSID_LLP;
-               if (v & SLB_VSID_B_1T) {
-                       pr_err("  1T  ESID=%9lx  VSID=%13lx LLP:%3lx\n",
-                              GET_ESID_1T(e),
-                              (v & ~SLB_VSID_B) >> SLB_VSID_SHIFT_1T, llp);
-               } else {
-                       pr_err(" 256M ESID=%9lx  VSID=%13lx LLP:%3lx\n",
-                              GET_ESID(e),
-                              (v & ~SLB_VSID_B) >> SLB_VSID_SHIFT, llp);
-               }
-       }
-       pr_err("----------------------------------\n");
-
-       /* Dump SLB cache entries as well. */
-       pr_err("SLB cache ptr value = %d\n", get_paca()->slb_save_cache_ptr);
-       pr_err("Valid SLB cache entries:\n");
-       n = min_t(int, get_paca()->slb_save_cache_ptr, SLB_CACHE_ENTRIES);
-       for (i = 0; i < n; i++)
-               pr_err("%02d EA[0-35]=%9x\n", i, get_paca()->slb_cache[i]);
-       pr_err("Rest of SLB cache entries:\n");
-       for (i = n; i < SLB_CACHE_ENTRIES; i++)
-               pr_err("%02d EA[0-35]=%9x\n", i, get_paca()->slb_cache[i]);
-}
-
-void slb_vmalloc_update(void)
-{
-       /*
-        * vmalloc is not bolted, so we just have to flush the non-bolted entries.
-        */
-       slb_flush_and_restore_bolted();
-}
-
-static bool preload_hit(struct thread_info *ti, unsigned long esid)
-{
-       unsigned char i;
-
-       for (i = 0; i < ti->slb_preload_nr; i++) {
-               unsigned char idx;
-
-               idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR;
-               if (esid == ti->slb_preload_esid[idx])
-                       return true;
-       }
-       return false;
-}
-
-static bool preload_add(struct thread_info *ti, unsigned long ea)
-{
-       unsigned char idx;
-       unsigned long esid;
-
-       if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) {
-               /* EAs are stored >> 28 so 256MB segments don't need clearing */
-               if (ea & ESID_MASK_1T)
-                       ea &= ESID_MASK_1T;
-       }
-
-       esid = ea >> SID_SHIFT;
-
-       if (preload_hit(ti, esid))
-               return false;
-
-       idx = (ti->slb_preload_tail + ti->slb_preload_nr) % SLB_PRELOAD_NR;
-       ti->slb_preload_esid[idx] = esid;
-       if (ti->slb_preload_nr == SLB_PRELOAD_NR)
-               ti->slb_preload_tail = (ti->slb_preload_tail + 1) % SLB_PRELOAD_NR;
-       else
-               ti->slb_preload_nr++;
-
-       return true;
-}
-
-static void preload_age(struct thread_info *ti)
-{
-       if (!ti->slb_preload_nr)
-               return;
-       ti->slb_preload_nr--;
-       ti->slb_preload_tail = (ti->slb_preload_tail + 1) % SLB_PRELOAD_NR;
-}
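
preload_hit(), preload_add() and preload_age() implement a small ring buffer of recently used ESIDs: lookups walk (tail + i) mod SLB_PRELOAD_NR, adding to a full buffer overwrites the oldest entry by advancing the tail, and ageing simply drops the oldest entry. A standalone sketch of that ring discipline, using 16 as a stand-in for SLB_PRELOAD_NR (the real value is defined in thread_info.h):

#include <stdbool.h>
#include <stdio.h>

#define PRELOAD_NR 16	/* stand-in for SLB_PRELOAD_NR */

struct preload_cache {
	unsigned char tail;		/* index of the oldest entry */
	unsigned char nr;		/* number of valid entries */
	unsigned long esid[PRELOAD_NR];
};

static bool preload_hit(struct preload_cache *c, unsigned long esid)
{
	for (unsigned char i = 0; i < c->nr; i++)
		if (c->esid[(c->tail + i) % PRELOAD_NR] == esid)
			return true;
	return false;
}

static bool preload_add(struct preload_cache *c, unsigned long esid)
{
	if (preload_hit(c, esid))
		return false;

	c->esid[(c->tail + c->nr) % PRELOAD_NR] = esid;
	if (c->nr == PRELOAD_NR)
		c->tail = (c->tail + 1) % PRELOAD_NR;	/* overwrite oldest */
	else
		c->nr++;
	return true;
}

static void preload_age(struct preload_cache *c)
{
	if (!c->nr)
		return;
	c->nr--;
	c->tail = (c->tail + 1) % PRELOAD_NR;	/* drop the oldest entry */
}

int main(void)
{
	struct preload_cache c = { 0 };

	for (unsigned long esid = 1; esid <= 20; esid++)
		preload_add(&c, esid);
	preload_age(&c);
	printf("entries: %u, oldest slot: %u\n", c.nr, c.tail);
	return 0;
}
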
-
-void slb_setup_new_exec(void)
-{
-       struct thread_info *ti = current_thread_info();
-       struct mm_struct *mm = current->mm;
-       unsigned long exec = 0x10000000;
-
-       WARN_ON(irqs_disabled());
-
-       /*
-        * The preload cache can only be used to determine whether an SLB
-        * entry exists if it does not start to overflow.
-        */
-       if (ti->slb_preload_nr + 2 > SLB_PRELOAD_NR)
-               return;
-
-       hard_irq_disable();
-
-       /*
-        * We have no good place to clear the SLB preload cache on exec;
-        * flush_thread is about the earliest arch hook but that happens
-        * after we switch to the mm and have already preloaded the SLBEs.
-        *
-        * For the most part that's probably okay to use entries from the
-        * previous exec, they will age out if unused. It may turn out to
-        * be an advantage to clear the cache before switching to it,
-        * however.
-        */
-
-       /*
-        * Preload some userspace segments into the SLB.
-        * Almost all 32-bit and 64-bit PowerPC executables are linked at
-        * 0x10000000 so it makes sense to preload this segment.
-        */
-       if (!is_kernel_addr(exec)) {
-               if (preload_add(ti, exec))
-                       slb_allocate_user(mm, exec);
-       }
-
-       /* Libraries and mmaps. */
-       if (!is_kernel_addr(mm->mmap_base)) {
-               if (preload_add(ti, mm->mmap_base))
-                       slb_allocate_user(mm, mm->mmap_base);
-       }
-
-       /* see switch_slb */
-       asm volatile("isync" : : : "memory");
-
-       local_irq_enable();
-}
-
-void preload_new_slb_context(unsigned long start, unsigned long sp)
-{
-       struct thread_info *ti = current_thread_info();
-       struct mm_struct *mm = current->mm;
-       unsigned long heap = mm->start_brk;
-
-       WARN_ON(irqs_disabled());
-
-       /* see above */
-       if (ti->slb_preload_nr + 3 > SLB_PRELOAD_NR)
-               return;
-
-       hard_irq_disable();
-
-       /* Userspace entry address. */
-       if (!is_kernel_addr(start)) {
-               if (preload_add(ti, start))
-                       slb_allocate_user(mm, start);
-       }
-
-       /* Top of stack, grows down. */
-       if (!is_kernel_addr(sp)) {
-               if (preload_add(ti, sp))
-                       slb_allocate_user(mm, sp);
-       }
-
-       /* Bottom of heap, grows up. */
-       if (heap && !is_kernel_addr(heap)) {
-               if (preload_add(ti, heap))
-                       slb_allocate_user(mm, heap);
-       }
-
-       /* see switch_slb */
-       asm volatile("isync" : : : "memory");
-
-       local_irq_enable();
-}
-
-
-/* Flush all user entries from the segment table of the current processor. */
-void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
-{
-       struct thread_info *ti = task_thread_info(tsk);
-       unsigned char i;
-
-       /*
-        * We need interrupts hard-disabled here, not just soft-disabled,
-        * so that a PMU interrupt can't occur, which might try to access
-        * user memory (to get a stack trace) and possibly cause an SLB miss
-        * which would update the slb_cache/slb_cache_ptr fields in the PACA.
-        */
-       hard_irq_disable();
-       asm volatile("isync" : : : "memory");
-       if (cpu_has_feature(CPU_FTR_ARCH_300)) {
-               /*
-                * SLBIA IH=3 invalidates all Class=1 SLBEs and their
-                * associated lookaside structures, which matches what
-                * switch_slb wants. So ARCH_300 does not use the slb
-                * cache.
-                */
-               asm volatile(PPC_SLBIA(3));
-       } else {
-               unsigned long offset = get_paca()->slb_cache_ptr;
-
-               if (!mmu_has_feature(MMU_FTR_NO_SLBIE_B) &&
-                   offset <= SLB_CACHE_ENTRIES) {
-                       unsigned long slbie_data = 0;
-
-                       for (i = 0; i < offset; i++) {
-                               unsigned long ea;
-
-                               ea = (unsigned long)
-                                       get_paca()->slb_cache[i] << SID_SHIFT;
-                               /*
-                                * Could assert_slb_presence(true) here, but
-                                * hypervisor or machine check could have come
-                                * in and removed the entry at this point.
-                                */
-
-                               slbie_data = ea;
-                               slbie_data |= user_segment_size(slbie_data)
-                                               << SLBIE_SSIZE_SHIFT;
-                               slbie_data |= SLBIE_C; /* user slbs have C=1 */
-                               asm volatile("slbie %0" : : "r" (slbie_data));
-                       }
-
-                       /* Workaround POWER5 < DD2.1 issue */
-                       if (!cpu_has_feature(CPU_FTR_ARCH_207S) && offset == 1)
-                               asm volatile("slbie %0" : : "r" (slbie_data));
-
-               } else {
-                       struct slb_shadow *p = get_slb_shadow();
-                       unsigned long ksp_esid_data =
-                               be64_to_cpu(p->save_area[KSTACK_INDEX].esid);
-                       unsigned long ksp_vsid_data =
-                               be64_to_cpu(p->save_area[KSTACK_INDEX].vsid);
-
-                       asm volatile(PPC_SLBIA(1) "\n"
-                                    "slbmte    %0,%1\n"
-                                    "isync"
-                                    :: "r"(ksp_vsid_data),
-                                       "r"(ksp_esid_data));
-
-                       get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1;
-               }
-
-               get_paca()->slb_cache_ptr = 0;
-       }
-       get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap;
-
-       copy_mm_to_paca(mm);
-
-       /*
-        * We gradually age out SLBs after a number of context switches to
-        * reduce reload overhead of unused entries (like we do with FP/VEC
-        * reload). Each time we wrap 256 switches, take an entry out of the
-        * SLB preload cache.
-        */
-       tsk->thread.load_slb++;
-       if (!tsk->thread.load_slb) {
-               unsigned long pc = KSTK_EIP(tsk);
-
-               preload_age(ti);
-               preload_add(ti, pc);
-       }
-
-       for (i = 0; i < ti->slb_preload_nr; i++) {
-               unsigned char idx;
-               unsigned long ea;
-
-               idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR;
-               ea = (unsigned long)ti->slb_preload_esid[idx] << SID_SHIFT;
-
-               slb_allocate_user(mm, ea);
-       }
-
-       /*
-        * Synchronize slbmte preloads with possible subsequent user memory
-        * address accesses by the kernel (user mode won't happen until
-        * rfid, which is safe).
-        */
-       asm volatile("isync" : : : "memory");
-}
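
switch_slb() ages the preload cache by letting thread.load_slb wrap: only when the counter rolls over does it drop the oldest preload entry and add the current PC, so the cache slowly empties for tasks that stop touching new segments. A tiny sketch of that wrap-driven trigger (an 8-bit counter is assumed here, consistent with the "wrap 256 switches" comment above):

#include <stdio.h>

int main(void)
{
	unsigned char load_slb = 0;	/* wraps every 256 increments */
	int aged = 0;

	for (int ctx_switch = 1; ctx_switch <= 1000; ctx_switch++) {
		load_slb++;
		if (!load_slb)	/* wrapped: 256 switches elapsed */
			aged++;	/* stand-in for preload_age() + preload_add() */
	}
	printf("aged %d times over 1000 switches\n", aged);
	return 0;
}
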
-
-void slb_set_size(u16 size)
-{
-       mmu_slb_size = size;
-}
-
-void slb_initialize(void)
-{
-       unsigned long linear_llp, vmalloc_llp, io_llp;
-       unsigned long lflags;
-       static int slb_encoding_inited;
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
-       unsigned long vmemmap_llp;
-#endif
-
-       /* Prepare our SLB miss handler based on our page size */
-       linear_llp = mmu_psize_defs[mmu_linear_psize].sllp;
-       io_llp = mmu_psize_defs[mmu_io_psize].sllp;
-       vmalloc_llp = mmu_psize_defs[mmu_vmalloc_psize].sllp;
-       get_paca()->vmalloc_sllp = SLB_VSID_KERNEL | vmalloc_llp;
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
-       vmemmap_llp = mmu_psize_defs[mmu_vmemmap_psize].sllp;
-#endif
-       if (!slb_encoding_inited) {
-               slb_encoding_inited = 1;
-               pr_devel("SLB: linear  LLP = %04lx\n", linear_llp);
-               pr_devel("SLB: io      LLP = %04lx\n", io_llp);
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
-               pr_devel("SLB: vmemmap LLP = %04lx\n", vmemmap_llp);
-#endif
-       }
-
-       get_paca()->stab_rr = SLB_NUM_BOLTED - 1;
-       get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1;
-       get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap;
-
-       lflags = SLB_VSID_KERNEL | linear_llp;
-
-       /* Invalidate the entire SLB (even entry 0) & all the ERATS */
-       asm volatile("isync":::"memory");
-       asm volatile("slbmte  %0,%0"::"r" (0) : "memory");
-       asm volatile("isync; slbia; isync":::"memory");
-       create_shadowed_slbe(PAGE_OFFSET, mmu_kernel_ssize, lflags, LINEAR_INDEX);
-
-       /* For the boot cpu, we're running on the stack in init_thread_union,
-        * which is in the first segment of the linear mapping, and also
-        * get_paca()->kstack hasn't been initialized yet.
-        * For secondary cpus, we need to bolt the kernel stack entry now.
-        */
-       slb_shadow_clear(KSTACK_INDEX);
-       if (raw_smp_processor_id() != boot_cpuid &&
-           (get_paca()->kstack & slb_esid_mask(mmu_kernel_ssize)) > PAGE_OFFSET)
-               create_shadowed_slbe(get_paca()->kstack,
-                                    mmu_kernel_ssize, lflags, KSTACK_INDEX);
-
-       asm volatile("isync":::"memory");
-}
-
-static void slb_cache_update(unsigned long esid_data)
-{
-       int slb_cache_index;
-
-       if (cpu_has_feature(CPU_FTR_ARCH_300))
-               return; /* ISAv3.0B and later does not use slb_cache */
-
-       /*
-        * Now update slb cache entries
-        */
-       slb_cache_index = local_paca->slb_cache_ptr;
-       if (slb_cache_index < SLB_CACHE_ENTRIES) {
-               /*
-                * We have space in slb cache for optimized switch_slb().
-                * Top 36 bits from esid_data as per ISA
-                */
-               local_paca->slb_cache[slb_cache_index++] = esid_data >> 28;
-               local_paca->slb_cache_ptr++;
-       } else {
-               /*
-                * Our cache is full and the current cache content strictly
-                * doesn't indicate the active SLB conents. Bump the ptr
-                * so that switch_slb() will ignore the cache.
-                */
-               local_paca->slb_cache_ptr = SLB_CACHE_ENTRIES + 1;
-       }
-}
-
-static enum slb_index alloc_slb_index(bool kernel)
-{
-       enum slb_index index;
-
-       /*
-        * The allocation bitmaps can become out of sync with the SLB
-        * when the _switch code does slbie when bolting a new stack
-        * segment, which must not be anywhere else in the SLB. This leaves
-        * a kernel allocated entry that is unused in the SLB. With very
-        * large systems or small segment sizes, the bitmaps could slowly
-        * fill with these entries. They will eventually be cleared out
-        * by the round robin allocator in that case, so it's probably not
-        * worth accounting for.
-        */
-
-       /*
-        * SLBs beyond 32 entries are allocated with stab_rr only.
-        * POWER7/8/9 have 32 SLB entries; this could be expanded if a
-        * future CPU has more.
-        */
-       if (local_paca->slb_used_bitmap != U32_MAX) {
-               index = ffz(local_paca->slb_used_bitmap);
-               local_paca->slb_used_bitmap |= 1U << index;
-               if (kernel)
-                       local_paca->slb_kern_bitmap |= 1U << index;
-       } else {
-               /* round-robin replacement of slb starting at SLB_NUM_BOLTED. */
-               index = local_paca->stab_rr;
-               if (index < (mmu_slb_size - 1))
-                       index++;
-               else
-                       index = SLB_NUM_BOLTED;
-               local_paca->stab_rr = index;
-               if (index < 32) {
-                       if (kernel)
-                               local_paca->slb_kern_bitmap |= 1U << index;
-                       else
-                               local_paca->slb_kern_bitmap &= ~(1U << index);
-               }
-       }
-       BUG_ON(index < SLB_NUM_BOLTED);
-
-       return index;
-}
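
alloc_slb_index() hands out SLB slots in two phases: while the used bitmap still has a zero bit it takes the first free slot, and once all 32 tracked slots are busy it falls back to round-robin replacement that cycles through the non-bolted slots. A simplified sketch of that policy, with 32 slots and 2 bolted entries (2 matches the BUILD_BUG_ON on SLB_NUM_BOLTED earlier in this file; the separate kernel/user bitmap bookkeeping is left out):

#include <stdint.h>
#include <stdio.h>

#define NUM_SLOTS	32	/* POWER7/8/9 SLB size, per the comment above */
#define NUM_BOLTED	2	/* stand-in for SLB_NUM_BOLTED */

static uint32_t used_bitmap = (1U << NUM_BOLTED) - 1;	/* bolted slots in use */
static int rr_next = NUM_BOLTED;			/* round-robin cursor */

/* First zero bit, like the kernel's ffz(). Caller must not pass all-ones. */
static int ffz32(uint32_t x)
{
	return __builtin_ctz(~x);
}

static int alloc_slot(void)
{
	int index;

	if (used_bitmap != UINT32_MAX) {
		index = ffz32(used_bitmap);	/* reuse a free slot */
		used_bitmap |= 1U << index;
	} else {
		/* All slots used: round-robin, never touching the bolted ones. */
		index = rr_next;
		rr_next = (index < NUM_SLOTS - 1) ? index + 1 : NUM_BOLTED;
	}
	return index;
}

int main(void)
{
	for (int i = 0; i < 35; i++)
		printf("%d ", alloc_slot());
	printf("\n");
	return 0;
}
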
-
-static long slb_insert_entry(unsigned long ea, unsigned long context,
-                               unsigned long flags, int ssize, bool kernel)
-{
-       unsigned long vsid;
-       unsigned long vsid_data, esid_data;
-       enum slb_index index;
-
-       vsid = get_vsid(context, ea, ssize);
-       if (!vsid)
-               return -EFAULT;
-
-       /*
-        * There must not be a kernel SLB fault in alloc_slb_index or before
-        * slbmte here or the allocation bitmaps could get out of whack with
-        * the SLB.
-        *
-        * User SLB faults or preloads take this path which might get inlined
-        * into the caller, so add compiler barriers here to ensure unsafe
-        * memory accesses do not come between.
-        */
-       barrier();
-
-       index = alloc_slb_index(kernel);
-
-       vsid_data = __mk_vsid_data(vsid, ssize, flags);
-       esid_data = mk_esid_data(ea, ssize, index);
-
-       /*
-        * No need for an isync before or after this slbmte. The exception
-        * we enter with and the rfid we exit with are context synchronizing.
-        * User preloads should add isync afterwards in case the kernel
-        * accesses user memory before it returns to userspace with rfid.
-        */
-       assert_slb_presence(false, ea);
-       asm volatile("slbmte %0, %1" : : "r" (vsid_data), "r" (esid_data));
-
-       barrier();
-
-       if (!kernel)
-               slb_cache_update(esid_data);
-
-       return 0;
-}
-
-static long slb_allocate_kernel(unsigned long ea, unsigned long id)
-{
-       unsigned long context;
-       unsigned long flags;
-       int ssize;
-
-       if (id == LINEAR_MAP_REGION_ID) {
-
-               /* We only support up to MAX_PHYSMEM_BITS */
-               if ((ea & EA_MASK) > (1UL << MAX_PHYSMEM_BITS))
-                       return -EFAULT;
-
-               flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_linear_psize].sllp;
-
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
-       } else if (id == VMEMMAP_REGION_ID) {
-
-               if (ea >= H_VMEMMAP_END)
-                       return -EFAULT;
-
-               flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_vmemmap_psize].sllp;
-#endif
-       } else if (id == VMALLOC_REGION_ID) {
-
-               if (ea >= H_VMALLOC_END)
-                       return -EFAULT;
-
-               flags = local_paca->vmalloc_sllp;
-
-       } else if (id == IO_REGION_ID) {
-
-               if (ea >= H_KERN_IO_END)
-                       return -EFAULT;
-
-               flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_io_psize].sllp;
-
-       } else {
-               return -EFAULT;
-       }
-
-       ssize = MMU_SEGSIZE_1T;
-       if (!mmu_has_feature(MMU_FTR_1T_SEGMENT))
-               ssize = MMU_SEGSIZE_256M;
-
-       context = get_kernel_context(ea);
-
-       return slb_insert_entry(ea, context, flags, ssize, true);
-}
-
-static long slb_allocate_user(struct mm_struct *mm, unsigned long ea)
-{
-       unsigned long context;
-       unsigned long flags;
-       int bpsize;
-       int ssize;
-
-       /*
-        * Consider this a bad access if we take an SLB miss
-        * on an address above the addr limit.
-        */
-       if (ea >= mm_ctx_slb_addr_limit(&mm->context))
-               return -EFAULT;
-
-       context = get_user_context(&mm->context, ea);
-       if (!context)
-               return -EFAULT;
-
-       if (unlikely(ea >= H_PGTABLE_RANGE)) {
-               WARN_ON(1);
-               return -EFAULT;
-       }
-
-       ssize = user_segment_size(ea);
-
-       bpsize = get_slice_psize(mm, ea);
-       flags = SLB_VSID_USER | mmu_psize_defs[bpsize].sllp;
-
-       return slb_insert_entry(ea, context, flags, ssize, false);
-}
-
-long do_slb_fault(struct pt_regs *regs, unsigned long ea)
-{
-       unsigned long id = get_region_id(ea);
-
-       /* IRQs are not reconciled here, so can't check irqs_disabled */
-       VM_WARN_ON(mfmsr() & MSR_EE);
-
-       if (unlikely(!(regs->msr & MSR_RI)))
-               return -EINVAL;
-
-       /*
-        * SLB kernel faults must be very careful not to touch anything
-        * that is not bolted. E.g., PACA and global variables are okay,
-        * mm->context stuff is not.
-        *
-        * SLB user faults can access all of kernel memory, but must be
-        * careful not to touch things like IRQ state because it is not
-        * "reconciled" here. The difficulty is that we must use
-        * fast_exception_return to return from kernel SLB faults without
-        * looking at possible non-bolted memory. We could test user vs
-        * kernel faults in the interrupt handler asm and do a full fault,
-        * reconcile, ret_from_except for user faults which would make them
-        * first class kernel code. But for performance it's probably nicer
-        * if they go via fast_exception_return too.
-        */
-       if (id >= LINEAR_MAP_REGION_ID) {
-               long err;
-#ifdef CONFIG_DEBUG_VM
-               /* Catch recursive kernel SLB faults. */
-               BUG_ON(local_paca->in_kernel_slb_handler);
-               local_paca->in_kernel_slb_handler = 1;
-#endif
-               err = slb_allocate_kernel(ea, id);
-#ifdef CONFIG_DEBUG_VM
-               local_paca->in_kernel_slb_handler = 0;
-#endif
-               return err;
-       } else {
-               struct mm_struct *mm = current->mm;
-               long err;
-
-               if (unlikely(!mm))
-                       return -EFAULT;
-
-               err = slb_allocate_user(mm, ea);
-               if (!err)
-                       preload_add(current_thread_info(), ea);
-
-               return err;
-       }
-}
-
-void do_bad_slb_fault(struct pt_regs *regs, unsigned long ea, long err)
-{
-       if (err == -EFAULT) {
-               if (user_mode(regs))
-                       _exception(SIGSEGV, regs, SEGV_BNDERR, ea);
-               else
-                       bad_page_fault(regs, ea, SIGSEGV);
-       } else if (err == -EINVAL) {
-               unrecoverable_exception(regs);
-       } else {
-               BUG();
-       }
-}
diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c
deleted file mode 100644 (file)
index 473dd43..0000000
+++ /dev/null
@@ -1,289 +0,0 @@
-/*
- * Copyright 2007-2008 Paul Mackerras, IBM Corp.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/gfp.h>
-#include <linux/types.h>
-#include <linux/mm.h>
-#include <linux/hugetlb.h>
-#include <linux/syscalls.h>
-
-#include <asm/pgtable.h>
-#include <linux/uaccess.h>
-
-/*
- * Free all pages allocated for subpage protection maps and pointers.
- * Also makes sure that the subpage_prot_table structure is
- * reinitialized for the next user.
- */
-void subpage_prot_free(struct mm_struct *mm)
-{
-       struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context);
-       unsigned long i, j, addr;
-       u32 **p;
-
-       if (!spt)
-               return;
-
-       for (i = 0; i < 4; ++i) {
-               if (spt->low_prot[i]) {
-                       free_page((unsigned long)spt->low_prot[i]);
-                       spt->low_prot[i] = NULL;
-               }
-       }
-       addr = 0;
-       for (i = 0; i < (TASK_SIZE_USER64 >> 43); ++i) {
-               p = spt->protptrs[i];
-               if (!p)
-                       continue;
-               spt->protptrs[i] = NULL;
-               for (j = 0; j < SBP_L2_COUNT && addr < spt->maxaddr;
-                    ++j, addr += PAGE_SIZE)
-                       if (p[j])
-                               free_page((unsigned long)p[j]);
-               free_page((unsigned long)p);
-       }
-       spt->maxaddr = 0;
-       kfree(spt);
-}
-
-static void hpte_flush_range(struct mm_struct *mm, unsigned long addr,
-                            int npages)
-{
-       pgd_t *pgd;
-       pud_t *pud;
-       pmd_t *pmd;
-       pte_t *pte;
-       spinlock_t *ptl;
-
-       pgd = pgd_offset(mm, addr);
-       if (pgd_none(*pgd))
-               return;
-       pud = pud_offset(pgd, addr);
-       if (pud_none(*pud))
-               return;
-       pmd = pmd_offset(pud, addr);
-       if (pmd_none(*pmd))
-               return;
-       pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
-       arch_enter_lazy_mmu_mode();
-       for (; npages > 0; --npages) {
-               pte_update(mm, addr, pte, 0, 0, 0);
-               addr += PAGE_SIZE;
-               ++pte;
-       }
-       arch_leave_lazy_mmu_mode();
-       pte_unmap_unlock(pte - 1, ptl);
-}
-
-/*
- * Clear the subpage protection map for an address range, allowing
- * all accesses that are allowed by the pte permissions.
- */
-static void subpage_prot_clear(unsigned long addr, unsigned long len)
-{
-       struct mm_struct *mm = current->mm;
-       struct subpage_prot_table *spt;
-       u32 **spm, *spp;
-       unsigned long i;
-       size_t nw;
-       unsigned long next, limit;
-
-       down_write(&mm->mmap_sem);
-
-       spt = mm_ctx_subpage_prot(&mm->context);
-       if (!spt)
-               goto err_out;
-
-       limit = addr + len;
-       if (limit > spt->maxaddr)
-               limit = spt->maxaddr;
-       for (; addr < limit; addr = next) {
-               next = pmd_addr_end(addr, limit);
-               if (addr < 0x100000000UL) {
-                       spm = spt->low_prot;
-               } else {
-                       spm = spt->protptrs[addr >> SBP_L3_SHIFT];
-                       if (!spm)
-                               continue;
-               }
-               spp = spm[(addr >> SBP_L2_SHIFT) & (SBP_L2_COUNT - 1)];
-               if (!spp)
-                       continue;
-               spp += (addr >> PAGE_SHIFT) & (SBP_L1_COUNT - 1);
-
-               i = (addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
-               nw = PTRS_PER_PTE - i;
-               if (addr + (nw << PAGE_SHIFT) > next)
-                       nw = (next - addr) >> PAGE_SHIFT;
-
-               memset(spp, 0, nw * sizeof(u32));
-
-               /* now flush any existing HPTEs for the range */
-               hpte_flush_range(mm, addr, nw);
-       }
-
-err_out:
-       up_write(&mm->mmap_sem);
-}
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr,
-                                 unsigned long end, struct mm_walk *walk)
-{
-       struct vm_area_struct *vma = walk->vma;
-       split_huge_pmd(vma, pmd, addr);
-       return 0;
-}
-
-static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
-                                   unsigned long len)
-{
-       struct vm_area_struct *vma;
-       struct mm_walk subpage_proto_walk = {
-               .mm = mm,
-               .pmd_entry = subpage_walk_pmd_entry,
-       };
-
-       /*
-        * We don't try too hard; we just mark all the VMAs in that range
-        * VM_NOHUGEPAGE and split them.
-        */
-       vma = find_vma(mm, addr);
-       /*
-        * If the range is entirely unmapped, just return.
-        */
-       if (vma && ((addr + len) <= vma->vm_start))
-               return;
-
-       while (vma) {
-               if (vma->vm_start >= (addr + len))
-                       break;
-               vma->vm_flags |= VM_NOHUGEPAGE;
-               walk_page_vma(vma, &subpage_proto_walk);
-               vma = vma->vm_next;
-       }
-}
-#else
-static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
-                                   unsigned long len)
-{
-       return;
-}
-#endif
-
-/*
- * Copy in a subpage protection map for an address range.
- * The map has 2 bits per 4k subpage, so 32 bits per 64k page.
- * Each 2-bit field is 0 to allow any access, 1 to prevent writes,
- * 2 or 3 to prevent all accesses.
- * Note that the normal page protections also apply; the subpage
- * protection mechanism is an additional constraint, so putting 0
- * in a 2-bit field won't allow writes to a page that is otherwise
- * write-protected.
- */
-SYSCALL_DEFINE3(subpage_prot, unsigned long, addr,
-               unsigned long, len, u32 __user *, map)
-{
-       struct mm_struct *mm = current->mm;
-       struct subpage_prot_table *spt;
-       u32 **spm, *spp;
-       unsigned long i;
-       size_t nw;
-       unsigned long next, limit;
-       int err;
-
-       if (radix_enabled())
-               return -ENOENT;
-
-       /* Check parameters */
-       if ((addr & ~PAGE_MASK) || (len & ~PAGE_MASK) ||
-           addr >= mm->task_size || len >= mm->task_size ||
-           addr + len > mm->task_size)
-               return -EINVAL;
-
-       if (is_hugepage_only_range(mm, addr, len))
-               return -EINVAL;
-
-       if (!map) {
-               /* Clear out the protection map for the address range */
-               subpage_prot_clear(addr, len);
-               return 0;
-       }
-
-       if (!access_ok(map, (len >> PAGE_SHIFT) * sizeof(u32)))
-               return -EFAULT;
-
-       down_write(&mm->mmap_sem);
-
-       spt = mm_ctx_subpage_prot(&mm->context);
-       if (!spt) {
-               /*
-                * Allocate subpage prot table if not already done.
-                * Do this with mmap_sem held
-                */
-               spt = kzalloc(sizeof(struct subpage_prot_table), GFP_KERNEL);
-               if (!spt) {
-                       err = -ENOMEM;
-                       goto out;
-               }
-               mm->context.hash_context->spt = spt;
-       }
-
-       subpage_mark_vma_nohuge(mm, addr, len);
-       for (limit = addr + len; addr < limit; addr = next) {
-               next = pmd_addr_end(addr, limit);
-               err = -ENOMEM;
-               if (addr < 0x100000000UL) {
-                       spm = spt->low_prot;
-               } else {
-                       spm = spt->protptrs[addr >> SBP_L3_SHIFT];
-                       if (!spm) {
-                               spm = (u32 **)get_zeroed_page(GFP_KERNEL);
-                               if (!spm)
-                                       goto out;
-                               spt->protptrs[addr >> SBP_L3_SHIFT] = spm;
-                       }
-               }
-               spm += (addr >> SBP_L2_SHIFT) & (SBP_L2_COUNT - 1);
-               spp = *spm;
-               if (!spp) {
-                       spp = (u32 *)get_zeroed_page(GFP_KERNEL);
-                       if (!spp)
-                               goto out;
-                       *spm = spp;
-               }
-               spp += (addr >> PAGE_SHIFT) & (SBP_L1_COUNT - 1);
-
-               local_irq_disable();
-               demote_segment_4k(mm, addr);
-               local_irq_enable();
-
-               i = (addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
-               nw = PTRS_PER_PTE - i;
-               if (addr + (nw << PAGE_SHIFT) > next)
-                       nw = (next - addr) >> PAGE_SHIFT;
-
-               up_write(&mm->mmap_sem);
-               if (__copy_from_user(spp, map, nw * sizeof(u32)))
-                       return -EFAULT;
-               map += nw;
-               down_write(&mm->mmap_sem);
-
-               /* now flush any existing HPTEs for the range */
-               hpte_flush_range(mm, addr, nw);
-       }
-       if (limit > spt->maxaddr)
-               spt->maxaddr = limit;
-       err = 0;
- out:
-       up_write(&mm->mmap_sem);
-       return err;
-}
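
The subpage_prot map described above packs sixteen 2-bit fields into each u32, one per 4K subpage of a 64K page: 0 allows whatever the PTE allows, 1 additionally forbids writes, 2 or 3 forbid all access. A small sketch of building one such word (placing subpage 0 in the top bit pair is an assumption here; the authoritative decoding lives in the hash fault path):

#include <stdint.h>
#include <stdio.h>

/*
 * Build the protection word for one 64K page: 16 subpages of 4K, 2 bits
 * each, always tightening (never loosening) the normal PTE permissions.
 * Subpage 0 in the top bit pair is an assumption for illustration.
 */
static uint32_t subpage_map_word(const unsigned int prot[16])
{
	uint32_t word = 0;

	for (int i = 0; i < 16; i++)
		word |= (uint32_t)(prot[i] & 0x3U) << (30 - 2 * i);
	return word;
}

int main(void)
{
	unsigned int prot[16] = { 0 };

	prot[3] = 1;	/* subpage 3: deny writes */
	prot[7] = 2;	/* subpage 7: deny all access */
	printf("map word: 0x%08x\n", subpage_map_word(prot));
	return 0;
}

A caller would build one such word per 64K page in [addr, addr + len) and pass the array as the map argument of the syscall above.
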
diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
deleted file mode 100644 (file)
index 6a23b9e..0000000
+++ /dev/null
@@ -1,1101 +0,0 @@
-/*
- * TLB flush routines for radix kernels.
- *
- * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <linux/mm.h>
-#include <linux/hugetlb.h>
-#include <linux/memblock.h>
-#include <linux/mmu_context.h>
-#include <linux/sched/mm.h>
-
-#include <asm/ppc-opcode.h>
-#include <asm/tlb.h>
-#include <asm/tlbflush.h>
-#include <asm/trace.h>
-#include <asm/cputhreads.h>
-
-#define RIC_FLUSH_TLB 0
-#define RIC_FLUSH_PWC 1
-#define RIC_FLUSH_ALL 2
-
-/*
- * tlbiel instruction for radix, set invalidation
- * i.e., r=1 and is=01 or is=10 or is=11
- */
-static inline void tlbiel_radix_set_isa300(unsigned int set, unsigned int is,
-                                       unsigned int pid,
-                                       unsigned int ric, unsigned int prs)
-{
-       unsigned long rb;
-       unsigned long rs;
-
-       rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53));
-       rs = ((unsigned long)pid << PPC_BITLSHIFT(31));
-
-       asm volatile(PPC_TLBIEL(%0, %1, %2, %3, 1)
-                    : : "r"(rb), "r"(rs), "i"(ric), "i"(prs)
-                    : "memory");
-}
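
tlbiel_radix_set_isa300() only packs the instruction operands: RB carries the congruence-class set number and the IS field, RS carries the PID, all positioned with IBM (MSB-is-bit-0) numbering via PPC_BITLSHIFT. Assuming PPC_BITLSHIFT(be) expands to 63 - be on 64-bit, the packing reduces to the fixed shifts in this standalone sketch, which prints the operand values rather than issuing the instruction:

#include <stdint.h>
#include <stdio.h>

/* IBM bit numbering: bit 0 is the MSB of the 64-bit register. */
#define BITLSHIFT(be)	(63 - (be))

static void show_tlbiel_operands(unsigned int set, unsigned int is,
				 unsigned int pid)
{
	uint64_t rb = ((uint64_t)set << BITLSHIFT(51)) |
		      ((uint64_t)is << BITLSHIFT(53));
	uint64_t rs = (uint64_t)pid << BITLSHIFT(31);

	printf("set=%u is=%u pid=%u -> rb=0x%016llx rs=0x%016llx\n",
	       set, is, pid,
	       (unsigned long long)rb, (unsigned long long)rs);
}

int main(void)
{
	show_tlbiel_operands(0, 3, 0);	/* first set, IS=3: the global scope used above */
	show_tlbiel_operands(1, 3, 0);	/* a subsequent set of the TLB */
	return 0;
}
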
-
-static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is)
-{
-       unsigned int set;
-
-       asm volatile("ptesync": : :"memory");
-
-       /*
-        * Flush the first set of the TLB, and the entire Page Walk Cache
-        * and partition table entries. Then flush the remaining sets of the
-        * TLB.
-        */
-       tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 0);
-       for (set = 1; set < num_sets; set++)
-               tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 0);
-
-       /* Do the same for process scoped entries. */
-       tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 1);
-       for (set = 1; set < num_sets; set++)
-               tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 1);
-
-       asm volatile("ptesync": : :"memory");
-}
-
-void radix__tlbiel_all(unsigned int action)
-{
-       unsigned int is;
-
-       switch (action) {
-       case TLB_INVAL_SCOPE_GLOBAL:
-               is = 3;
-               break;
-       case TLB_INVAL_SCOPE_LPID:
-               is = 2;
-               break;
-       default:
-               BUG();
-       }
-
-       if (early_cpu_has_feature(CPU_FTR_ARCH_300))
-               tlbiel_all_isa300(POWER9_TLB_SETS_RADIX, is);
-       else
-               WARN(1, "%s called on pre-POWER9 CPU\n", __func__);
-
-       asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
-}
-
-static inline void __tlbiel_pid(unsigned long pid, int set,
-                               unsigned long ric)
-{
-       unsigned long rb,rs,prs,r;
-
-       rb = PPC_BIT(53); /* IS = 1 */
-       rb |= set << PPC_BITLSHIFT(51);
-       rs = ((unsigned long)pid) << PPC_BITLSHIFT(31);
-       prs = 1; /* process scoped */
-       r = 1;   /* radix format */
-
-       asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
-                    : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
-       trace_tlbie(0, 1, rb, rs, ric, prs, r);
-}
-
-static inline void __tlbie_pid(unsigned long pid, unsigned long ric)
-{
-       unsigned long rb,rs,prs,r;
-
-       rb = PPC_BIT(53); /* IS = 1 */
-       rs = pid << PPC_BITLSHIFT(31);
-       prs = 1; /* process scoped */
-       r = 1;   /* radix format */
-
-       asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
-                    : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
-       trace_tlbie(0, 0, rb, rs, ric, prs, r);
-}
-
-static inline void __tlbiel_lpid(unsigned long lpid, int set,
-                               unsigned long ric)
-{
-       unsigned long rb,rs,prs,r;
-
-       rb = PPC_BIT(52); /* IS = 2 */
-       rb |= set << PPC_BITLSHIFT(51);
-       rs = 0;  /* LPID comes from LPIDR */
-       prs = 0; /* partition scoped */
-       r = 1;   /* radix format */
-
-       asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
-                    : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
-       trace_tlbie(lpid, 1, rb, rs, ric, prs, r);
-}
-
-static inline void __tlbie_lpid(unsigned long lpid, unsigned long ric)
-{
-       unsigned long rb,rs,prs,r;
-
-       rb = PPC_BIT(52); /* IS = 2 */
-       rs = lpid;
-       prs = 0; /* partition scoped */
-       r = 1;   /* radix format */
-
-       asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
-                    : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
-       trace_tlbie(lpid, 0, rb, rs, ric, prs, r);
-}
-
-static inline void __tlbiel_lpid_guest(unsigned long lpid, int set,
-                               unsigned long ric)
-{
-       unsigned long rb,rs,prs,r;
-
-       rb = PPC_BIT(52); /* IS = 2 */
-       rb |= set << PPC_BITLSHIFT(51);
-       rs = 0;  /* LPID comes from LPIDR */
-       prs = 1; /* process scoped */
-       r = 1;   /* radix format */
-
-       asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
-                    : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
-       trace_tlbie(lpid, 1, rb, rs, ric, prs, r);
-}
-
-
-static inline void __tlbiel_va(unsigned long va, unsigned long pid,
-                              unsigned long ap, unsigned long ric)
-{
-       unsigned long rb,rs,prs,r;
-
-       rb = va & ~(PPC_BITMASK(52, 63));
-       rb |= ap << PPC_BITLSHIFT(58);
-       rs = pid << PPC_BITLSHIFT(31);
-       prs = 1; /* process scoped */
-       r = 1;   /* radix format */
-
-       asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
-                    : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
-       trace_tlbie(0, 1, rb, rs, ric, prs, r);
-}
-
-static inline void __tlbie_va(unsigned long va, unsigned long pid,
-                             unsigned long ap, unsigned long ric)
-{
-       unsigned long rb,rs,prs,r;
-
-       rb = va & ~(PPC_BITMASK(52, 63));
-       rb |= ap << PPC_BITLSHIFT(58);
-       rs = pid << PPC_BITLSHIFT(31);
-       prs = 1; /* process scoped */
-       r = 1;   /* radix format */
-
-       asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
-                    : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
-       trace_tlbie(0, 0, rb, rs, ric, prs, r);
-}
-
-static inline void __tlbie_lpid_va(unsigned long va, unsigned long lpid,
-                             unsigned long ap, unsigned long ric)
-{
-       unsigned long rb,rs,prs,r;
-
-       rb = va & ~(PPC_BITMASK(52, 63));
-       rb |= ap << PPC_BITLSHIFT(58);
-       rs = lpid;
-       prs = 0; /* partition scoped */
-       r = 1;   /* radix format */
-
-       asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
-                    : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
-       trace_tlbie(lpid, 0, rb, rs, ric, prs, r);
-}
-
-static inline void fixup_tlbie(void)
-{
-       unsigned long pid = 0;
-       unsigned long va = ((1UL << 52) - 1);
-
-       if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) {
-               asm volatile("ptesync": : :"memory");
-               __tlbie_va(va, pid, mmu_get_ap(MMU_PAGE_64K), RIC_FLUSH_TLB);
-       }
-}
-
-static inline void fixup_tlbie_lpid(unsigned long lpid)
-{
-       unsigned long va = ((1UL << 52) - 1);
-
-       if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) {
-               asm volatile("ptesync": : :"memory");
-               __tlbie_lpid_va(va, lpid, mmu_get_ap(MMU_PAGE_64K), RIC_FLUSH_TLB);
-       }
-}
-
-/*
- * We use 128 sets in radix mode and 256 sets in HPT mode.
- */
-static inline void _tlbiel_pid(unsigned long pid, unsigned long ric)
-{
-       int set;
-
-       asm volatile("ptesync": : :"memory");
-
-       /*
-        * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL,
-        * also flush the entire Page Walk Cache.
-        */
-       __tlbiel_pid(pid, 0, ric);
-
-       /* For PWC, only one flush is needed */
-       if (ric == RIC_FLUSH_PWC) {
-               asm volatile("ptesync": : :"memory");
-               return;
-       }
-
-       /* For the remaining sets, just flush the TLB */
-       for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++)
-               __tlbiel_pid(pid, set, RIC_FLUSH_TLB);
-
-       asm volatile("ptesync": : :"memory");
-       asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
-}
-
-static inline void _tlbie_pid(unsigned long pid, unsigned long ric)
-{
-       asm volatile("ptesync": : :"memory");
-
-       /*
-        * Work around the fact that the "ric" argument to __tlbie_pid
-        * must be a compile-time constant to match the "i" constraint
-        * in the asm statement.
-        */
-       switch (ric) {
-       case RIC_FLUSH_TLB:
-               __tlbie_pid(pid, RIC_FLUSH_TLB);
-               break;
-       case RIC_FLUSH_PWC:
-               __tlbie_pid(pid, RIC_FLUSH_PWC);
-               break;
-       case RIC_FLUSH_ALL:
-       default:
-               __tlbie_pid(pid, RIC_FLUSH_ALL);
-       }
-       fixup_tlbie();
-       asm volatile("eieio; tlbsync; ptesync": : :"memory");
-}
-
-static inline void _tlbiel_lpid(unsigned long lpid, unsigned long ric)
-{
-       int set;
-
-       VM_BUG_ON(mfspr(SPRN_LPID) != lpid);
-
-       asm volatile("ptesync": : :"memory");
-
-       /*
-        * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL,
-        * also flush the entire Page Walk Cache.
-        */
-       __tlbiel_lpid(lpid, 0, ric);
-
-       /* For PWC, only one flush is needed */
-       if (ric == RIC_FLUSH_PWC) {
-               asm volatile("ptesync": : :"memory");
-               return;
-       }
-
-       /* For the remaining sets, just flush the TLB */
-       for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++)
-               __tlbiel_lpid(lpid, set, RIC_FLUSH_TLB);
-
-       asm volatile("ptesync": : :"memory");
-       asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
-}
-
-static inline void _tlbie_lpid(unsigned long lpid, unsigned long ric)
-{
-       asm volatile("ptesync": : :"memory");
-
-       /*
-        * Work around the fact that the "ric" argument to __tlbie_lpid
-        * must be a compile-time constant to match the "i" constraint
-        * in the asm statement.
-        */
-       switch (ric) {
-       case RIC_FLUSH_TLB:
-               __tlbie_lpid(lpid, RIC_FLUSH_TLB);
-               break;
-       case RIC_FLUSH_PWC:
-               __tlbie_lpid(lpid, RIC_FLUSH_PWC);
-               break;
-       case RIC_FLUSH_ALL:
-       default:
-               __tlbie_lpid(lpid, RIC_FLUSH_ALL);
-       }
-       fixup_tlbie_lpid(lpid);
-       asm volatile("eieio; tlbsync; ptesync": : :"memory");
-}
-
-static inline void _tlbiel_lpid_guest(unsigned long lpid, unsigned long ric)
-{
-       int set;
-
-       VM_BUG_ON(mfspr(SPRN_LPID) != lpid);
-
-       asm volatile("ptesync": : :"memory");
-
-       /*
-        * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL,
-        * also flush the entire Page Walk Cache.
-        */
-       __tlbiel_lpid_guest(lpid, 0, ric);
-
-       /* For PWC, only one flush is needed */
-       if (ric == RIC_FLUSH_PWC) {
-               asm volatile("ptesync": : :"memory");
-               return;
-       }
-
-       /* For the remaining sets, just flush the TLB */
-       for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++)
-               __tlbiel_lpid_guest(lpid, set, RIC_FLUSH_TLB);
-
-       asm volatile("ptesync": : :"memory");
-       asm volatile(PPC_INVALIDATE_ERAT : : :"memory");
-}
-
-
-static inline void __tlbiel_va_range(unsigned long start, unsigned long end,
-                                   unsigned long pid, unsigned long page_size,
-                                   unsigned long psize)
-{
-       unsigned long addr;
-       unsigned long ap = mmu_get_ap(psize);
-
-       for (addr = start; addr < end; addr += page_size)
-               __tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
-}
-
-static inline void _tlbiel_va(unsigned long va, unsigned long pid,
-                             unsigned long psize, unsigned long ric)
-{
-       unsigned long ap = mmu_get_ap(psize);
-
-       asm volatile("ptesync": : :"memory");
-       __tlbiel_va(va, pid, ap, ric);
-       asm volatile("ptesync": : :"memory");
-}
-
-static inline void _tlbiel_va_range(unsigned long start, unsigned long end,
-                                   unsigned long pid, unsigned long page_size,
-                                   unsigned long psize, bool also_pwc)
-{
-       asm volatile("ptesync": : :"memory");
-       if (also_pwc)
-               __tlbiel_pid(pid, 0, RIC_FLUSH_PWC);
-       __tlbiel_va_range(start, end, pid, page_size, psize);
-       asm volatile("ptesync": : :"memory");
-}
-
-static inline void __tlbie_va_range(unsigned long start, unsigned long end,
-                                   unsigned long pid, unsigned long page_size,
-                                   unsigned long psize)
-{
-       unsigned long addr;
-       unsigned long ap = mmu_get_ap(psize);
-
-       for (addr = start; addr < end; addr += page_size)
-               __tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
-}
-
-static inline void _tlbie_va(unsigned long va, unsigned long pid,
-                             unsigned long psize, unsigned long ric)
-{
-       unsigned long ap = mmu_get_ap(psize);
-
-       asm volatile("ptesync": : :"memory");
-       __tlbie_va(va, pid, ap, ric);
-       fixup_tlbie();
-       asm volatile("eieio; tlbsync; ptesync": : :"memory");
-}
-
-static inline void _tlbie_lpid_va(unsigned long va, unsigned long lpid,
-                             unsigned long psize, unsigned long ric)
-{
-       unsigned long ap = mmu_get_ap(psize);
-
-       asm volatile("ptesync": : :"memory");
-       __tlbie_lpid_va(va, lpid, ap, ric);
-       fixup_tlbie_lpid(lpid);
-       asm volatile("eieio; tlbsync; ptesync": : :"memory");
-}
-
-static inline void _tlbie_va_range(unsigned long start, unsigned long end,
-                                   unsigned long pid, unsigned long page_size,
-                                   unsigned long psize, bool also_pwc)
-{
-       asm volatile("ptesync": : :"memory");
-       if (also_pwc)
-               __tlbie_pid(pid, RIC_FLUSH_PWC);
-       __tlbie_va_range(start, end, pid, page_size, psize);
-       fixup_tlbie();
-       asm volatile("eieio; tlbsync; ptesync": : :"memory");
-}
-
-/*
- * Base TLB flushing operations:
- *
- *  - flush_tlb_mm(mm) flushes the specified mm context TLBs
- *  - flush_tlb_page(vma, vmaddr) flushes one page
- *  - flush_tlb_range(vma, start, end) flushes a range of pages
- *  - flush_tlb_kernel_range(start, end) flushes kernel pages
- *
- *  - local_* variants of page and mm only apply to the current
- *    processor
- */
-void radix__local_flush_tlb_mm(struct mm_struct *mm)
-{
-       unsigned long pid;
-
-       preempt_disable();
-       pid = mm->context.id;
-       if (pid != MMU_NO_CONTEXT)
-               _tlbiel_pid(pid, RIC_FLUSH_TLB);
-       preempt_enable();
-}
-EXPORT_SYMBOL(radix__local_flush_tlb_mm);
-
-#ifndef CONFIG_SMP
-void radix__local_flush_all_mm(struct mm_struct *mm)
-{
-       unsigned long pid;
-
-       preempt_disable();
-       pid = mm->context.id;
-       if (pid != MMU_NO_CONTEXT)
-               _tlbiel_pid(pid, RIC_FLUSH_ALL);
-       preempt_enable();
-}
-EXPORT_SYMBOL(radix__local_flush_all_mm);
-#endif /* CONFIG_SMP */
-
-void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
-                                      int psize)
-{
-       unsigned long pid;
-
-       preempt_disable();
-       pid = mm->context.id;
-       if (pid != MMU_NO_CONTEXT)
-               _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
-       preempt_enable();
-}
-
-void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
-{
-#ifdef CONFIG_HUGETLB_PAGE
-       /* need the return fix for nohash.c */
-       if (is_vm_hugetlb_page(vma))
-               return radix__local_flush_hugetlb_page(vma, vmaddr);
-#endif
-       radix__local_flush_tlb_page_psize(vma->vm_mm, vmaddr, mmu_virtual_psize);
-}
-EXPORT_SYMBOL(radix__local_flush_tlb_page);
-
-static bool mm_is_singlethreaded(struct mm_struct *mm)
-{
-       if (atomic_read(&mm->context.copros) > 0)
-               return false;
-       if (atomic_read(&mm->mm_users) <= 1 && current->mm == mm)
-               return true;
-       return false;
-}
-
-static bool mm_needs_flush_escalation(struct mm_struct *mm)
-{
-       /*
-        * P9 nest MMU has issues with the page walk cache
-        * caching PTEs and not flushing them properly when
-        * RIC = 0 for a PID/LPID invalidate
-        */
-       if (atomic_read(&mm->context.copros) > 0)
-               return true;
-       return false;
-}
-
-#ifdef CONFIG_SMP
-static void do_exit_flush_lazy_tlb(void *arg)
-{
-       struct mm_struct *mm = arg;
-       unsigned long pid = mm->context.id;
-
-       if (current->mm == mm)
-               return; /* Local CPU */
-
-       if (current->active_mm == mm) {
-               /*
-                * Must be a kernel thread because sender is single-threaded.
-                */
-               BUG_ON(current->mm);
-               mmgrab(&init_mm);
-               switch_mm(mm, &init_mm, current);
-               current->active_mm = &init_mm;
-               mmdrop(mm);
-       }
-       _tlbiel_pid(pid, RIC_FLUSH_ALL);
-}
-
-static void exit_flush_lazy_tlbs(struct mm_struct *mm)
-{
-       /*
-        * It would be nice if this were async so it could be run in
-        * parallel with our local flush, but generic code does not
-        * give a good API for it. Could extend the generic code or
-        * make a special powerpc IPI for flushing TLBs.
-        * For now it's not too performance critical.
-        */
-       smp_call_function_many(mm_cpumask(mm), do_exit_flush_lazy_tlb,
-                               (void *)mm, 1);
-       mm_reset_thread_local(mm);
-}
-
-void radix__flush_tlb_mm(struct mm_struct *mm)
-{
-       unsigned long pid;
-
-       pid = mm->context.id;
-       if (unlikely(pid == MMU_NO_CONTEXT))
-               return;
-
-       preempt_disable();
-       /*
-        * Order loads of mm_cpumask vs previous stores to clear ptes before
-        * the invalidate. See barrier in switch_mm_irqs_off
-        */
-       smp_mb();
-       if (!mm_is_thread_local(mm)) {
-               if (unlikely(mm_is_singlethreaded(mm))) {
-                       exit_flush_lazy_tlbs(mm);
-                       goto local;
-               }
-
-               if (mm_needs_flush_escalation(mm))
-                       _tlbie_pid(pid, RIC_FLUSH_ALL);
-               else
-                       _tlbie_pid(pid, RIC_FLUSH_TLB);
-       } else {
-local:
-               _tlbiel_pid(pid, RIC_FLUSH_TLB);
-       }
-       preempt_enable();
-}
-EXPORT_SYMBOL(radix__flush_tlb_mm);
-
-static void __flush_all_mm(struct mm_struct *mm, bool fullmm)
-{
-       unsigned long pid;
-
-       pid = mm->context.id;
-       if (unlikely(pid == MMU_NO_CONTEXT))
-               return;
-
-       preempt_disable();
-       smp_mb(); /* see radix__flush_tlb_mm */
-       if (!mm_is_thread_local(mm)) {
-               if (unlikely(mm_is_singlethreaded(mm))) {
-                       if (!fullmm) {
-                               exit_flush_lazy_tlbs(mm);
-                               goto local;
-                       }
-               }
-               _tlbie_pid(pid, RIC_FLUSH_ALL);
-       } else {
-local:
-               _tlbiel_pid(pid, RIC_FLUSH_ALL);
-       }
-       preempt_enable();
-}
-void radix__flush_all_mm(struct mm_struct *mm)
-{
-       __flush_all_mm(mm, false);
-}
-EXPORT_SYMBOL(radix__flush_all_mm);
-
-void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr)
-{
-       tlb->need_flush_all = 1;
-}
-EXPORT_SYMBOL(radix__flush_tlb_pwc);
-
-void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
-                                int psize)
-{
-       unsigned long pid;
-
-       pid = mm->context.id;
-       if (unlikely(pid == MMU_NO_CONTEXT))
-               return;
-
-       preempt_disable();
-       smp_mb(); /* see radix__flush_tlb_mm */
-       if (!mm_is_thread_local(mm)) {
-               if (unlikely(mm_is_singlethreaded(mm))) {
-                       exit_flush_lazy_tlbs(mm);
-                       goto local;
-               }
-               _tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
-       } else {
-local:
-               _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
-       }
-       preempt_enable();
-}
-
-void radix__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
-{
-#ifdef CONFIG_HUGETLB_PAGE
-       if (is_vm_hugetlb_page(vma))
-               return radix__flush_hugetlb_page(vma, vmaddr);
-#endif
-       radix__flush_tlb_page_psize(vma->vm_mm, vmaddr, mmu_virtual_psize);
-}
-EXPORT_SYMBOL(radix__flush_tlb_page);
-
-#else /* CONFIG_SMP */
-#define radix__flush_all_mm radix__local_flush_all_mm
-#endif /* CONFIG_SMP */
-
-void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end)
-{
-       _tlbie_pid(0, RIC_FLUSH_ALL);
-}
-EXPORT_SYMBOL(radix__flush_tlb_kernel_range);
-
-#define TLB_FLUSH_ALL -1UL
-
-/*
- * Number of pages above which we invalidate the entire PID rather than
- * flush individual pages, for local and global flushes respectively.
- *
- * tlbie goes out to the interconnect and individual ops are more costly.
- * It also does not iterate over sets like the local tlbiel variant when
- * invalidating a full PID, so it has a far lower threshold to change from
- * individual page flushes to full-pid flushes.
- */
-static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
-static unsigned long tlb_local_single_page_flush_ceiling __read_mostly = POWER9_TLB_SETS_RADIX * 2;
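
For concreteness, a small user-space sketch of what these defaults mean in bytes with a 64K base page size (33 is the global default above; 128 sets comes from the "128 sets in radix mode" comment earlier in this file, so the local ceiling is 256 pages — illustration only, the values are tunable):

#include <stdio.h>

int main(void)
{
        unsigned long page_size = 64 * 1024;    /* 64K base pages */
        unsigned long global_ceiling = 33;      /* tlb_single_page_flush_ceiling */
        unsigned long local_ceiling = 128 * 2;  /* POWER9_TLB_SETS_RADIX * 2 */

        /* Ranges larger than these are promoted to a full-PID flush. */
        printf("global: > %lu pages (~%lu KiB) => full-PID tlbie\n",
               global_ceiling, global_ceiling * page_size / 1024);
        printf("local:  > %lu pages (~%lu KiB) => full-PID tlbiel\n",
               local_ceiling, local_ceiling * page_size / 1024);
        return 0;
}
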
-
-static inline void __radix__flush_tlb_range(struct mm_struct *mm,
-                                       unsigned long start, unsigned long end,
-                                       bool flush_all_sizes)
-{
-       unsigned long pid;
-       unsigned int page_shift = mmu_psize_defs[mmu_virtual_psize].shift;
-       unsigned long page_size = 1UL << page_shift;
-       unsigned long nr_pages = (end - start) >> page_shift;
-       bool local, full;
-
-       pid = mm->context.id;
-       if (unlikely(pid == MMU_NO_CONTEXT))
-               return;
-
-       preempt_disable();
-       smp_mb(); /* see radix__flush_tlb_mm */
-       if (!mm_is_thread_local(mm)) {
-               if (unlikely(mm_is_singlethreaded(mm))) {
-                       if (end != TLB_FLUSH_ALL) {
-                               exit_flush_lazy_tlbs(mm);
-                               goto is_local;
-                       }
-               }
-               local = false;
-               full = (end == TLB_FLUSH_ALL ||
-                               nr_pages > tlb_single_page_flush_ceiling);
-       } else {
-is_local:
-               local = true;
-               full = (end == TLB_FLUSH_ALL ||
-                               nr_pages > tlb_local_single_page_flush_ceiling);
-       }
-
-       if (full) {
-               if (local) {
-                       _tlbiel_pid(pid, RIC_FLUSH_TLB);
-               } else {
-                       if (mm_needs_flush_escalation(mm))
-                               _tlbie_pid(pid, RIC_FLUSH_ALL);
-                       else
-                               _tlbie_pid(pid, RIC_FLUSH_TLB);
-               }
-       } else {
-               bool hflush = flush_all_sizes;
-               bool gflush = flush_all_sizes;
-               unsigned long hstart, hend;
-               unsigned long gstart, gend;
-
-               if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
-                       hflush = true;
-
-               if (hflush) {
-                       hstart = (start + PMD_SIZE - 1) & PMD_MASK;
-                       hend = end & PMD_MASK;
-                       if (hstart == hend)
-                               hflush = false;
-               }
-
-               if (gflush) {
-                       gstart = (start + PUD_SIZE - 1) & PUD_MASK;
-                       gend = end & PUD_MASK;
-                       if (gstart == gend)
-                               gflush = false;
-               }
-
-               asm volatile("ptesync": : :"memory");
-               if (local) {
-                       __tlbiel_va_range(start, end, pid, page_size, mmu_virtual_psize);
-                       if (hflush)
-                               __tlbiel_va_range(hstart, hend, pid,
-                                               PMD_SIZE, MMU_PAGE_2M);
-                       if (gflush)
-                               __tlbiel_va_range(gstart, gend, pid,
-                                               PUD_SIZE, MMU_PAGE_1G);
-                       asm volatile("ptesync": : :"memory");
-               } else {
-                       __tlbie_va_range(start, end, pid, page_size, mmu_virtual_psize);
-                       if (hflush)
-                               __tlbie_va_range(hstart, hend, pid,
-                                               PMD_SIZE, MMU_PAGE_2M);
-                       if (gflush)
-                               __tlbie_va_range(gstart, gend, pid,
-                                               PUD_SIZE, MMU_PAGE_1G);
-                       fixup_tlbie();
-                       asm volatile("eieio; tlbsync; ptesync": : :"memory");
-               }
-       }
-       preempt_enable();
-}
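
The hstart/hend (and gstart/gend) trimming above only issues 2M/1G invalidations for the part of the range fully covered by those page sizes. A stand-alone illustration of the 2M case, assuming the usual 2 MiB PMD size; the sample addresses are made up:

#include <stdio.h>

#define EX_PMD_SHIFT 21
#define EX_PMD_SIZE  (1UL << EX_PMD_SHIFT)
#define EX_PMD_MASK  (~(EX_PMD_SIZE - 1))

int main(void)
{
        unsigned long start = 0x10030000UL;
        unsigned long end   = 0x10600000UL;
        /* Round start up and end down to 2 MiB boundaries, as above. */
        unsigned long hstart = (start + EX_PMD_SIZE - 1) & EX_PMD_MASK;
        unsigned long hend   = end & EX_PMD_MASK;

        if (hstart == hend)
                printf("range too small, no 2M invalidations\n");
        else
                printf("2M invalidations cover [%#lx, %#lx)\n", hstart, hend);
        return 0;
}
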
-
-void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
-                    unsigned long end)
-{
-#ifdef CONFIG_HUGETLB_PAGE
-       if (is_vm_hugetlb_page(vma))
-               return radix__flush_hugetlb_tlb_range(vma, start, end);
-#endif
-
-       __radix__flush_tlb_range(vma->vm_mm, start, end, false);
-}
-EXPORT_SYMBOL(radix__flush_tlb_range);
-
-static int radix_get_mmu_psize(int page_size)
-{
-       int psize;
-
-       if (page_size == (1UL << mmu_psize_defs[mmu_virtual_psize].shift))
-               psize = mmu_virtual_psize;
-       else if (page_size == (1UL << mmu_psize_defs[MMU_PAGE_2M].shift))
-               psize = MMU_PAGE_2M;
-       else if (page_size == (1UL << mmu_psize_defs[MMU_PAGE_1G].shift))
-               psize = MMU_PAGE_1G;
-       else
-               return -1;
-       return psize;
-}
-
-/*
- * Flush partition scoped LPID address translation for all CPUs.
- */
-void radix__flush_tlb_lpid_page(unsigned int lpid,
-                                       unsigned long addr,
-                                       unsigned long page_size)
-{
-       int psize = radix_get_mmu_psize(page_size);
-
-       _tlbie_lpid_va(addr, lpid, psize, RIC_FLUSH_TLB);
-}
-EXPORT_SYMBOL_GPL(radix__flush_tlb_lpid_page);
-
-/*
- * Flush partition scoped PWC from LPID for all CPUs.
- */
-void radix__flush_pwc_lpid(unsigned int lpid)
-{
-       _tlbie_lpid(lpid, RIC_FLUSH_PWC);
-}
-EXPORT_SYMBOL_GPL(radix__flush_pwc_lpid);
-
-/*
- * Flush partition scoped translations from LPID (=LPIDR)
- */
-void radix__flush_tlb_lpid(unsigned int lpid)
-{
-       _tlbie_lpid(lpid, RIC_FLUSH_ALL);
-}
-EXPORT_SYMBOL_GPL(radix__flush_tlb_lpid);
-
-/*
- * Flush partition scoped translations from LPID (=LPIDR)
- */
-void radix__local_flush_tlb_lpid(unsigned int lpid)
-{
-       _tlbiel_lpid(lpid, RIC_FLUSH_ALL);
-}
-EXPORT_SYMBOL_GPL(radix__local_flush_tlb_lpid);
-
-/*
- * Flush process scoped translations from LPID (=LPIDR).
- * Important difference: the guest normally manages its own translations,
- * but some cases, e.g. vCPU migration between CPUs, require KVM to flush.
- */
-void radix__local_flush_tlb_lpid_guest(unsigned int lpid)
-{
-       _tlbiel_lpid_guest(lpid, RIC_FLUSH_ALL);
-}
-EXPORT_SYMBOL_GPL(radix__local_flush_tlb_lpid_guest);
-
-
-static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start,
-                                 unsigned long end, int psize);
-
-void radix__tlb_flush(struct mmu_gather *tlb)
-{
-       int psize = 0;
-       struct mm_struct *mm = tlb->mm;
-       int page_size = tlb->page_size;
-       unsigned long start = tlb->start;
-       unsigned long end = tlb->end;
-
-       /*
-        * If the page size is not something we understand, do a full mm flush
-        *
-        * A "fullmm" flush must always do a flush_all_mm (RIC=2) flush
-        * that flushes the process table entry cache upon process teardown.
-        * See the comment for radix in arch_exit_mmap().
-        */
-       if (tlb->fullmm) {
-               __flush_all_mm(mm, true);
-#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE)
-       } else if (mm_tlb_flush_nested(mm)) {
-               /*
-                * If there is a concurrent invalidation that is clearing ptes,
-                * then it's possible this invalidation will miss one of those
-                * cleared ptes and miss flushing the TLB. If this invalidate
-                * returns before the other one flushes TLBs, that can result
-                * in it returning while there are still valid TLBs inside the
-                * range to be invalidated.
-                *
-                * See mm/memory.c:tlb_finish_mmu() for more details.
-                *
-                * The solution to this is ensure the entire range is always
-                * flushed here. The problem for powerpc is that the flushes
-                * are page size specific, so this "forced flush" would not
-                * do the right thing if there are a mix of page sizes in
-                * the range to be invalidated. So use __flush_tlb_range
-                * which invalidates all possible page sizes in the range.
-                *
-                * A PWC flush is probably not required because the core code
-                * shouldn't free page tables in this path, but accounting
-                * for the possibility makes us a bit more robust.
-                *
-                * need_flush_all is an uncommon case because page table
-                * teardown should be done with exclusive locks held (but
-                * after locks are dropped another invalidate could come
-                * in); it could be optimized further if necessary.
-                */
-               if (!tlb->need_flush_all)
-                       __radix__flush_tlb_range(mm, start, end, true);
-               else
-                       radix__flush_all_mm(mm);
-#endif
-       } else if ((psize = radix_get_mmu_psize(page_size)) == -1) {
-               if (!tlb->need_flush_all)
-                       radix__flush_tlb_mm(mm);
-               else
-                       radix__flush_all_mm(mm);
-       } else {
-               if (!tlb->need_flush_all)
-                       radix__flush_tlb_range_psize(mm, start, end, psize);
-               else
-                       radix__flush_tlb_pwc_range_psize(mm, start, end, psize);
-       }
-       tlb->need_flush_all = 0;
-}
-
-static inline void __radix__flush_tlb_range_psize(struct mm_struct *mm,
-                               unsigned long start, unsigned long end,
-                               int psize, bool also_pwc)
-{
-       unsigned long pid;
-       unsigned int page_shift = mmu_psize_defs[psize].shift;
-       unsigned long page_size = 1UL << page_shift;
-       unsigned long nr_pages = (end - start) >> page_shift;
-       bool local, full;
-
-       pid = mm->context.id;
-       if (unlikely(pid == MMU_NO_CONTEXT))
-               return;
-
-       preempt_disable();
-       smp_mb(); /* see radix__flush_tlb_mm */
-       if (!mm_is_thread_local(mm)) {
-               if (unlikely(mm_is_singlethreaded(mm))) {
-                       if (end != TLB_FLUSH_ALL) {
-                               exit_flush_lazy_tlbs(mm);
-                               goto is_local;
-                       }
-               }
-               local = false;
-               full = (end == TLB_FLUSH_ALL ||
-                               nr_pages > tlb_single_page_flush_ceiling);
-       } else {
-is_local:
-               local = true;
-               full = (end == TLB_FLUSH_ALL ||
-                               nr_pages > tlb_local_single_page_flush_ceiling);
-       }
-
-       if (full) {
-               if (local) {
-                       _tlbiel_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB);
-               } else {
-                       if (mm_needs_flush_escalation(mm))
-                               also_pwc = true;
-
-                       _tlbie_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB);
-               }
-       } else {
-               if (local)
-                       _tlbiel_va_range(start, end, pid, page_size, psize, also_pwc);
-               else
-                       _tlbie_va_range(start, end, pid, page_size, psize, also_pwc);
-       }
-       preempt_enable();
-}
-
-void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
-                                 unsigned long end, int psize)
-{
-       return __radix__flush_tlb_range_psize(mm, start, end, psize, false);
-}
-
-static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start,
-                                 unsigned long end, int psize)
-{
-       __radix__flush_tlb_range_psize(mm, start, end, psize, true);
-}
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
-{
-       unsigned long pid, end;
-
-       pid = mm->context.id;
-       if (unlikely(pid == MMU_NO_CONTEXT))
-               return;
-
-       /* 4k page size, just blow the world */
-       if (PAGE_SIZE == 0x1000) {
-               radix__flush_all_mm(mm);
-               return;
-       }
-
-       end = addr + HPAGE_PMD_SIZE;
-
-       /* Otherwise first do the PWC, then iterate the pages. */
-       preempt_disable();
-       smp_mb(); /* see radix__flush_tlb_mm */
-       if (!mm_is_thread_local(mm)) {
-               if (unlikely(mm_is_singlethreaded(mm))) {
-                       exit_flush_lazy_tlbs(mm);
-                       goto local;
-               }
-               _tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
-       } else {
-local:
-               _tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
-       }
-
-       preempt_enable();
-}
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-
-void radix__flush_pmd_tlb_range(struct vm_area_struct *vma,
-                               unsigned long start, unsigned long end)
-{
-       radix__flush_tlb_range_psize(vma->vm_mm, start, end, MMU_PAGE_2M);
-}
-EXPORT_SYMBOL(radix__flush_pmd_tlb_range);
-
-void radix__flush_tlb_all(void)
-{
-       unsigned long rb, prs, r, rs;
-       unsigned long ric = RIC_FLUSH_ALL;
-
-       rb = 0x3 << PPC_BITLSHIFT(53); /* IS = 3 */
-       prs = 0; /* partition scoped */
-       r = 1;   /* radix format */
-       rs = 1 & ((1UL << 32) - 1); /* any LPID value to flush guest mappings */
-
-       asm volatile("ptesync": : :"memory");
-       /*
-        * now flush guest entries by passing PRS = 1 and LPID != 0
-        */
-       asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
-                    : : "r"(rb), "i"(r), "i"(1), "i"(ric), "r"(rs) : "memory");
-       /*
-        * now flush host entries by passing PRS = 0 and LPID == 0
-        */
-       asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
-                    : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(0) : "memory");
-       asm volatile("eieio; tlbsync; ptesync": : :"memory");
-}
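
The RB value above uses IBM (MSB-0) bit numbering; assuming the usual PPC_BITLSHIFT(be) == 63 - be definition from the kernel headers, the shift works out as follows (illustration only):

#include <stdio.h>

int main(void)
{
        /* IBM bit 53 is LSB-0 bit 63 - 53 = 10, so the two-bit IS field
         * (IBM bits 52:53) occupies LSB-0 bits 11:10 of RB. */
        unsigned long rb = 0x3UL << (63 - 53);

        printf("rb = %#lx\n", rb);      /* 0xc00: IS = 3, flush all entries */
        return 0;
}
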
-
-#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
-extern void radix_kvm_prefetch_workaround(struct mm_struct *mm)
-{
-       unsigned long pid = mm->context.id;
-
-       if (unlikely(pid == MMU_NO_CONTEXT))
-               return;
-
-       /*
-        * If this context hasn't run on that CPU before and KVM is
-        * around, there's a slim chance that the guest on another
-        * CPU just brought in an obsolete translation into the TLB of
-        * this CPU due to a bad prefetch using the guest PID on
-        * the way into the hypervisor.
-        *
-        * We work around this here. If KVM is possible, we check if
-        * any sibling thread is in KVM. If it is, the window may exist
-        * and thus we flush that PID from the core.
-        *
-        * A potential future improvement would be to mark which PIDs
-        * have never been used on the system and avoid it if the PID
-        * is new and the process has no other cpumask bit set.
-        */
-       if (cpu_has_feature(CPU_FTR_HVMODE) && radix_enabled()) {
-               int cpu = smp_processor_id();
-               int sib = cpu_first_thread_sibling(cpu);
-               bool flush = false;
-
-               for (; sib <= cpu_last_thread_sibling(cpu) && !flush; sib++) {
-                       if (sib == cpu)
-                               continue;
-                       if (!cpu_possible(sib))
-                               continue;
-                       if (paca_ptrs[sib]->kvm_hstate.kvm_vcpu)
-                               flush = true;
-               }
-               if (flush)
-                       _tlbiel_pid(pid, RIC_FLUSH_ALL);
-       }
-}
-EXPORT_SYMBOL_GPL(radix_kvm_prefetch_workaround);
-#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c
deleted file mode 100644 (file)
index 87d71dd..0000000
+++ /dev/null
@@ -1,259 +0,0 @@
-/*
- * This file contains the routines for flushing entries from the
- * TLB and MMU hash table.
- *
- *  Derived from arch/ppc64/mm/init.c:
- *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
- *
- *  Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
- *  and Cort Dougan (PReP) (cort@cs.nmt.edu)
- *    Copyright (C) 1996 Paul Mackerras
- *
- *  Derived from "arch/i386/mm/init.c"
- *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
- *
- *  Dave Engebretsen <engebret@us.ibm.com>
- *      Rework for PPC64 port.
- *
- *  This program is free software; you can redistribute it and/or
- *  modify it under the terms of the GNU General Public License
- *  as published by the Free Software Foundation; either version
- *  2 of the License, or (at your option) any later version.
- */
-
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/percpu.h>
-#include <linux/hardirq.h>
-#include <asm/pgalloc.h>
-#include <asm/tlbflush.h>
-#include <asm/tlb.h>
-#include <asm/bug.h>
-#include <asm/pte-walk.h>
-
-
-#include <trace/events/thp.h>
-
-DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch);
-
-/*
- * A linux PTE was changed and the corresponding hash table entry
- * needs to be flushed. This function will either perform the flush
- * immediately or will batch it up if the current CPU has an active
- * batch on it.
- */
-void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
-                    pte_t *ptep, unsigned long pte, int huge)
-{
-       unsigned long vpn;
-       struct ppc64_tlb_batch *batch = &get_cpu_var(ppc64_tlb_batch);
-       unsigned long vsid;
-       unsigned int psize;
-       int ssize;
-       real_pte_t rpte;
-       int i, offset;
-
-       i = batch->index;
-
-       /* Get page size (maybe move back to caller).
-        *
-        * NOTE: when using special 64K mappings in a 4K environment like
-        * for SPEs, we obtain the page size from the slice, which thus
-        * must still exist (and thus the VMA not reused) at the time
-        * of this call
-        */
-       if (huge) {
-#ifdef CONFIG_HUGETLB_PAGE
-               psize = get_slice_psize(mm, addr);
-               /* Mask the address for the correct page size */
-               addr &= ~((1UL << mmu_psize_defs[psize].shift) - 1);
-               if (unlikely(psize == MMU_PAGE_16G))
-                       offset = PTRS_PER_PUD;
-               else
-                       offset = PTRS_PER_PMD;
-#else
-               BUG();
-               psize = pte_pagesize_index(mm, addr, pte); /* shutup gcc */
-#endif
-       } else {
-               psize = pte_pagesize_index(mm, addr, pte);
-               /* Mask the address for the standard page size.  If we
-                * have a 64k page kernel, but the hardware does not
-                * support 64k pages, this might be different from the
-                * hardware page size encoded in the slice table. */
-               addr &= PAGE_MASK;
-               offset = PTRS_PER_PTE;
-       }
-
-
-       /* Build full vaddr */
-       if (!is_kernel_addr(addr)) {
-               ssize = user_segment_size(addr);
-               vsid = get_user_vsid(&mm->context, addr, ssize);
-       } else {
-               vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
-               ssize = mmu_kernel_ssize;
-       }
-       WARN_ON(vsid == 0);
-       vpn = hpt_vpn(addr, vsid, ssize);
-       rpte = __real_pte(__pte(pte), ptep, offset);
-
-       /*
-        * Check if we have an active batch on this CPU. If not, just
-        * flush now and return.
-        */
-       if (!batch->active) {
-               flush_hash_page(vpn, rpte, psize, ssize, mm_is_thread_local(mm));
-               put_cpu_var(ppc64_tlb_batch);
-               return;
-       }
-
-       /*
-        * This can happen when we are in the middle of a TLB batch and
-        * we encounter memory pressure (e.g. copy_page_range when it tries
-        * to allocate a new pte). If we have to reclaim memory and end
-        * up scanning and resetting referenced bits then our batch context
-        * will change mid stream.
-        *
-        * We also need to ensure only one page size is present in a given
-        * batch
-        */
-       if (i != 0 && (mm != batch->mm || batch->psize != psize ||
-                      batch->ssize != ssize)) {
-               __flush_tlb_pending(batch);
-               i = 0;
-       }
-       if (i == 0) {
-               batch->mm = mm;
-               batch->psize = psize;
-               batch->ssize = ssize;
-       }
-       batch->pte[i] = rpte;
-       batch->vpn[i] = vpn;
-       batch->index = ++i;
-       if (i >= PPC64_TLB_BATCH_NR)
-               __flush_tlb_pending(batch);
-       put_cpu_var(ppc64_tlb_batch);
-}
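
hpte_need_flush() above either flushes immediately or queues the entry in a per-CPU batch that is drained when it fills up or when the mm/page-size/segment-size context changes. A stripped-down user-space sketch of that batch-or-flush decision, with hypothetical types and a made-up capacity standing in for the kernel structures:

#include <stdio.h>

#define EX_BATCH_MAX 8                  /* made-up capacity */

struct ex_batch {
        int index;
        const void *owner;              /* stands in for batch->mm */
        int psize, ssize;
};

static void ex_flush(struct ex_batch *b)
{
        printf("flushing %d queued entries\n", b->index);
        b->index = 0;
}

static void ex_queue(struct ex_batch *b, const void *owner, int psize, int ssize)
{
        /* A batch may only hold entries for one owner and one size. */
        if (b->index && (b->owner != owner ||
                         b->psize != psize || b->ssize != ssize))
                ex_flush(b);
        if (b->index == 0) {
                b->owner = owner;
                b->psize = psize;
                b->ssize = ssize;
        }
        b->index++;                     /* the entry itself would be stored here */
        if (b->index >= EX_BATCH_MAX)
                ex_flush(b);
}

int main(void)
{
        struct ex_batch b = { 0 };
        int mm_a, mm_b;

        ex_queue(&b, &mm_a, 0, 0);
        ex_queue(&b, &mm_a, 0, 0);
        ex_queue(&b, &mm_b, 0, 0);      /* owner changed: drains first */
        ex_flush(&b);                   /* final drain */
        return 0;
}
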
-
-/*
- * This function is called when terminating an mmu batch or when a batch
- * is full. It will perform the flush of all the entries currently stored
- * in a batch.
- *
- * Must be called from within some kind of spinlock/non-preempt region...
- */
-void __flush_tlb_pending(struct ppc64_tlb_batch *batch)
-{
-       int i, local;
-
-       i = batch->index;
-       local = mm_is_thread_local(batch->mm);
-       if (i == 1)
-               flush_hash_page(batch->vpn[0], batch->pte[0],
-                               batch->psize, batch->ssize, local);
-       else
-               flush_hash_range(i, local);
-       batch->index = 0;
-}
-
-void hash__tlb_flush(struct mmu_gather *tlb)
-{
-       struct ppc64_tlb_batch *tlbbatch = &get_cpu_var(ppc64_tlb_batch);
-
-       /* If there's a TLB batch pending, then we must flush it because the
-        * pages are going to be freed and we really don't want to have a CPU
-        * access a freed page because it has a stale TLB
-        */
-       if (tlbbatch->index)
-               __flush_tlb_pending(tlbbatch);
-
-       put_cpu_var(ppc64_tlb_batch);
-}
-
-/**
- * __flush_hash_table_range - Flush all HPTEs for a given address range
- *                            from the hash table (and the TLB). But keeps
- *                            the linux PTEs intact.
- *
- * @mm         : mm_struct of the target address space (generally init_mm)
- * @start      : starting address
- * @end         : ending address (not included in the flush)
- *
- * This function is mostly to be used by some IO hotplug code in order
- * to remove all hash entries from a given address range used to map IO
- * space on a removed PCI-PCI bridge without tearing down the full mapping
- * since 64K pages may overlap with other bridges when using 64K pages
- * with 4K HW pages on IO space.
- *
- * Because of that usage pattern, it is implemented for small size rather
- * than speed.
- */
-void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
-                             unsigned long end)
-{
-       bool is_thp;
-       int hugepage_shift;
-       unsigned long flags;
-
-       start = _ALIGN_DOWN(start, PAGE_SIZE);
-       end = _ALIGN_UP(end, PAGE_SIZE);
-
-       BUG_ON(!mm->pgd);
-
-       /* Note: Normally, we should only ever use a batch within a
-        * PTE locked section. This violates the rule, but will work
-        * since we don't actually modify the PTEs, we just flush the
-        * hash while leaving the PTEs intact (including their reference
-        * to being hashed). This is not the most performance oriented
-        * way to do things but is fine for our needs here.
-        */
-       local_irq_save(flags);
-       arch_enter_lazy_mmu_mode();
-       for (; start < end; start += PAGE_SIZE) {
-               pte_t *ptep = find_current_mm_pte(mm->pgd, start, &is_thp,
-                                                 &hugepage_shift);
-               unsigned long pte;
-
-               if (ptep == NULL)
-                       continue;
-               pte = pte_val(*ptep);
-               if (is_thp)
-                       trace_hugepage_invalidate(start, pte);
-               if (!(pte & H_PAGE_HASHPTE))
-                       continue;
-               if (unlikely(is_thp))
-                       hpte_do_hugepage_flush(mm, start, (pmd_t *)ptep, pte);
-               else
-                       hpte_need_flush(mm, start, ptep, pte, hugepage_shift);
-       }
-       arch_leave_lazy_mmu_mode();
-       local_irq_restore(flags);
-}
-
-void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr)
-{
-       pte_t *pte;
-       pte_t *start_pte;
-       unsigned long flags;
-
-       addr = _ALIGN_DOWN(addr, PMD_SIZE);
-       /* Note: Normally, we should only ever use a batch within a
-        * PTE locked section. This violates the rule, but will work
-        * since we don't actually modify the PTEs, we just flush the
-        * hash while leaving the PTEs intact (including their reference
-        * to being hashed). This is not the most performance oriented
-        * way to do things but is fine for our needs here.
-        */
-       local_irq_save(flags);
-       arch_enter_lazy_mmu_mode();
-       start_pte = pte_offset_map(pmd, addr);
-       for (pte = start_pte; pte < start_pte + PTRS_PER_PTE; pte++) {
-               unsigned long pteval = pte_val(*pte);
-               if (pteval & H_PAGE_HASHPTE)
-                       hpte_need_flush(mm, addr, pte, pteval, 0);
-               addr += PAGE_SIZE;
-       }
-       arch_leave_lazy_mmu_mode();
-       local_irq_restore(flags);
-}
diff --git a/arch/powerpc/mm/vphn.c b/arch/powerpc/mm/vphn.c
deleted file mode 100644 (file)
index f83044f..0000000
+++ /dev/null
@@ -1,71 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <asm/byteorder.h>
-#include "vphn.h"
-
-/*
- * The associativity domain numbers are returned from the hypervisor as a
- * stream of mixed 16-bit and 32-bit fields. The stream is terminated by the
- * special value of "all ones" (aka. 0xffff) and its size may not exceed 48
- * bytes.
- *
- *    --- 16-bit fields -->
- *  _________________________
- *  |  0  |  1  |  2  |  3  |   be_packed[0]
- *  ------+-----+-----+------
- *  _________________________
- *  |  4  |  5  |  6  |  7  |   be_packed[1]
- *  -------------------------
- *            ...
- *  _________________________
- *  | 20  | 21  | 22  | 23  |   be_packed[5]
- *  -------------------------
- *
- * Convert to the sequence they would appear in the ibm,associativity property.
- */
-int vphn_unpack_associativity(const long *packed, __be32 *unpacked)
-{
-       __be64 be_packed[VPHN_REGISTER_COUNT];
-       int i, nr_assoc_doms = 0;
-       const __be16 *field = (const __be16 *) be_packed;
-       u16 last = 0;
-       bool is_32bit = false;
-
-#define VPHN_FIELD_UNUSED      (0xffff)
-#define VPHN_FIELD_MSB         (0x8000)
-#define VPHN_FIELD_MASK                (~VPHN_FIELD_MSB)
-
-       /* Let's fix the values returned by plpar_hcall9() */
-       for (i = 0; i < VPHN_REGISTER_COUNT; i++)
-               be_packed[i] = cpu_to_be64(packed[i]);
-
-       for (i = 1; i < VPHN_ASSOC_BUFSIZE; i++) {
-               u16 new = be16_to_cpup(field++);
-
-               if (is_32bit) {
-                       /* Let's concatenate the 16 bits of this field to the
-                        * 15 lower bits of the previous field
-                        */
-                       unpacked[++nr_assoc_doms] =
-                               cpu_to_be32(last << 16 | new);
-                       is_32bit = false;
-               } else if (new == VPHN_FIELD_UNUSED)
-                       /* This is the list terminator */
-                       break;
-               else if (new & VPHN_FIELD_MSB) {
-                       /* Data is in the lower 15 bits of this field */
-                       unpacked[++nr_assoc_doms] =
-                               cpu_to_be32(new & VPHN_FIELD_MASK);
-               } else {
-                       /* Data is in the lower 15 bits of this field
-                        * concatenated with the next 16 bit field
-                        */
-                       last = new;
-                       is_32bit = true;
-               }
-       }
-
-       /* The first cell contains the length of the property */
-       unpacked[0] = cpu_to_be32(nr_assoc_doms);
-
-       return nr_assoc_doms;
-}
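
A self-contained user-space sketch of the unpacking rules described above, fed with a made-up field stream (the kernel version additionally byte-swaps the raw plpar_hcall9() registers and emits big-endian cells):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        /* Made-up stream: one 15-bit domain (0x8002 -> 0x2), one 32-bit
         * domain split across two fields (0x0001, 0x2345 -> 0x12345),
         * then the 0xffff terminator. */
        uint16_t fields[] = { 0x8002, 0x0001, 0x2345, 0xffff };
        uint32_t doms[8];
        uint16_t last = 0;
        int i, n = 0, is_32bit = 0;

        for (i = 0; i < (int)(sizeof(fields) / sizeof(fields[0])); i++) {
                uint16_t f = fields[i];

                if (is_32bit) {
                        doms[n++] = (uint32_t)last << 16 | f;
                        is_32bit = 0;
                } else if (f == 0xffff) {
                        break;                          /* terminator */
                } else if (f & 0x8000) {
                        doms[n++] = f & 0x7fff;         /* 15-bit value */
                } else {
                        last = f;                       /* high half of a 32-bit value */
                        is_32bit = 1;
                }
        }

        for (i = 0; i < n; i++)
                printf("associativity domain %d: 0x%x\n", i, doms[i]);
        return 0;
}
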
diff --git a/arch/powerpc/mm/vphn.h b/arch/powerpc/mm/vphn.h
deleted file mode 100644 (file)
index f9ffdb3..0000000
+++ /dev/null
@@ -1,17 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ARCH_POWERPC_MM_VPHN_H_
-#define _ARCH_POWERPC_MM_VPHN_H_
-
-/* The H_HOME_NODE_ASSOCIATIVITY h_call returns 6 64-bit registers.
- */
-#define VPHN_REGISTER_COUNT 6
-
-/*
- * 6 64-bit registers unpacked into up to 24 be32 associativity values. To
- * form the complete property we have to add the length in the first cell.
- */
-#define VPHN_ASSOC_BUFSIZE (VPHN_REGISTER_COUNT*sizeof(u64)/sizeof(u16) + 1)
-
-extern int vphn_unpack_associativity(const long *packed, __be32 *unpacked);
-
-#endif
index 186b906e66d5a7043d6d06cb5cbff48e7692ed57..1d1f5f2be3b295bdc199c3329b05ec8ecddea293 120000 (symlink)
@@ -1 +1 @@
-../../../../../arch/powerpc/mm/vphn.c
\ No newline at end of file
+../../../../../arch/powerpc/mm/book3s64/vphn.c
\ No newline at end of file
index 7131efe38c65fd98fd18fe4a0d5a25a792b0fd72..45fe160f8288152084ea74be3f54026eb35921c3 120000 (symlink)
@@ -1 +1 @@
-../../../../../arch/powerpc/mm/vphn.h
\ No newline at end of file
+../../../../../arch/powerpc/mm/book3s64/vphn.h
\ No newline at end of file